## Mount to gdrive to read data, adjacency matrix, p_link and labels. 

In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Install pandas to read csv files from gdrive. Install torch and numpy which are packages that are needed durind coding.

In [2]:
!pip install pandas
import pandas as pd
import torch
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## It is time to prepare x, edge_index, y, train_mask and test_mask which are needed for PyG. 

**Step1 (preparing x)**: Read multivariate dataset from gdrive and convert the type to what is needed in PyG.

In [3]:
x = pd.read_csv('/gdrive/MyDrive/7classes.sim/df1_t_7classes.csv', sep =',',header = None)
x = x.astype(np.float32) #try to cast all DataFrame columns to specified numpy.dtype
x = torch.tensor(x.values) #convert numpy.dtype to a tensor
print(x.sum(dim = 1).unique().size()) #to see if there is a duplicate data in x or not
print(x.dtype)
print(x.type())
print(x)
print(x.size())

torch.Size([70])
torch.float32
torch.FloatTensor
tensor([[-0.6680,  0.3545,  0.1382,  ...,  2.1196,  0.2813,  1.3384],
        [ 0.0282,  1.6481,  1.9474,  ...,  0.8241,  0.9470,  1.9798],
        [-0.4644,  0.5799,  0.0618,  ..., -0.5351, -0.2698,  0.9294],
        ...,
        [-0.2195, -0.8809,  0.0466,  ..., -0.5610,  2.1045, -0.2778],
        [-0.7321,  0.0596,  1.4736,  ...,  0.7870,  0.7216, -1.5205],
        [ 1.0196,  0.5744,  0.3943,  ...,  1.7039, -1.3692,  0.6619]])
torch.Size([70, 500])


**Step2 (preparing edge_index)**: Read adjacency matrix from gdrive and convert the type to what is needed in PyG. 

In [4]:
from scipy.sparse import coo_matrix
edge_index_csv = pd.read_csv('/gdrive/MyDrive/7classes.sim/adj_7classes.csv',sep =',',header = None)
#print(edge_index_csv)
edge_index_numpy_ndarry = edge_index_csv.values #convert xlsx file to numpy.ndarry
edge_index_coo = coo_matrix(edge_index_numpy_ndarry) #convert symmetric matrix to coo_matrix
print(edge_index_coo)

  (0, 21)	1
  (0, 22)	1
  (0, 26)	1
  (0, 36)	1
  (0, 38)	1
  (0, 65)	1
  (1, 12)	1
  (1, 19)	1
  (1, 63)	1
  (2, 3)	1
  (2, 4)	1
  (2, 19)	1
  (2, 30)	1
  (2, 42)	1
  (2, 67)	1
  (3, 2)	1
  (3, 16)	1
  (4, 2)	1
  (4, 10)	1
  (4, 16)	1
  (4, 23)	1
  (4, 31)	1
  (4, 49)	1
  (4, 51)	1
  (4, 62)	1
  :	:
  (61, 45)	1
  (61, 53)	1
  (62, 4)	1
  (62, 15)	1
  (62, 39)	1
  (62, 63)	1
  (63, 1)	1
  (63, 21)	1
  (63, 62)	1
  (64, 34)	1
  (65, 0)	1
  (65, 44)	1
  (65, 54)	1
  (66, 18)	1
  (66, 39)	1
  (66, 51)	1
  (67, 2)	1
  (67, 4)	1
  (68, 24)	1
  (68, 26)	1
  (68, 28)	1
  (68, 29)	1
  (69, 38)	1
  (69, 42)	1
  (69, 58)	1


In [5]:
edge_index_numpy = np.vstack((edge_index_coo.row, edge_index_coo.col)) #convert coo_matrix to numpy.ndarray
edge_index_torch_int32 = torch.from_numpy(edge_index_numpy) #convert numpy.ndarray to torch.int32
edge_index = edge_index_torch_int32.to(torch.int64) #convert torch.int32 to torch.long
print(edge_index)
print(edge_index.dtype)
print(edge_index.type())
print(edge_index.size())

tensor([[ 0,  0,  0,  0,  0,  0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  3,  3,  4,
          4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
          6,  7,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,
          9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 16, 16,
         17, 17, 17, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21,
         21, 21, 22, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 26, 26, 26, 26, 27,
         27, 27, 28, 28, 29, 29, 29, 30, 30, 30, 31, 32, 32, 32, 32, 32, 32, 33,
         33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, 39,
         39, 39, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 43, 43,
         43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 46,
         46, 46, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 50, 50, 50, 51, 51, 51,
         51, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 54, 54, 54, 55, 55,
         56, 56, 56, 56, 56,

**Step3 (preparing y)**: Read labels from gdrive and convert the type to what is needed in PyG.

In [6]:
labels_csv = pd.read_csv('/gdrive/MyDrive/7classes.sim/labels_7classes.csv',sep =',',header=None)
y = torch.tensor(labels_csv.values)
y.resize_((70))
print(y)

tensor([0, 5, 6, 6, 6, 3, 1, 4, 1, 1, 6, 4, 5, 0, 1, 5, 6, 0, 5, 3, 1, 5, 4, 6,
        2, 3, 4, 3, 4, 4, 6, 6, 1, 0, 3, 4, 2, 3, 0, 5, 2, 1, 0, 5, 2, 1, 2, 4,
        2, 6, 0, 6, 4, 1, 2, 3, 2, 2, 0, 3, 3, 1, 5, 5, 3, 2, 5, 0, 4, 0])


**Step4 (preparing train_mask, test_mask)**: split x and y into train and test set.

In [7]:
from sklearn.model_selection import train_test_split
main_mask, test_mask, y_main, y_test = train_test_split(x, y,  test_size = 39, random_state = 0, shuffle = True, stratify = y)
train_mask, val_mask, y_train, y_val = train_test_split(main_mask, y_main,  test_size = 10, random_state = 0, shuffle = True, stratify = y_main)
#train_mask = train_mask.to(torch.long)
#test_mask = test_mask.to(torch.long)
print(train_mask)
print(test_mask)
print(train_mask.size())
print(val_mask.size())
print(test_mask.size())

tensor([[-1.0609,  2.1436, -0.1678,  ...,  0.4928,  0.4604,  1.2071],
        [-0.4210,  1.0202, -0.0709,  ..., -1.3458, -0.7098, -0.6120],
        [-0.2456, -1.2005, -2.0871,  ...,  1.1912, -0.8355, -0.3074],
        ...,
        [-0.6386,  0.3763,  0.4909,  ..., -1.3751, -0.3818,  1.0854],
        [ 0.2696,  1.7279,  0.5187,  ...,  0.2844, -0.1213,  0.3580],
        [-0.5474,  0.1105, -1.1235,  ...,  1.8091, -1.0986,  0.8451]])
tensor([[ 1.1026,  1.3767,  1.3339,  ..., -0.6261, -1.1796,  1.2935],
        [ 0.8492,  0.3485, -0.2633,  ..., -1.0214,  1.6695,  0.7288],
        [ 1.1908, -0.8032,  0.7891,  ..., -0.5562, -1.4553, -0.8918],
        ...,
        [ 0.8753,  2.1445,  1.9234,  ...,  0.7838,  0.9486,  1.0234],
        [ 1.0196,  0.5744,  0.3943,  ...,  1.7039, -1.3692,  0.6619],
        [-1.0942,  1.5473,  0.8378,  ...,  1.2226, -0.5648,  0.7292]])
torch.Size([21, 500])
torch.Size([10, 500])
torch.Size([39, 500])


## To see how many percent of data belong to each class.

In [8]:
labels, counts = np.unique(y, return_counts = True)
print(counts/float(len(y)))

[0.14285714 0.14285714 0.14285714 0.14285714 0.14285714 0.14285714
 0.14285714]


In [9]:
labels, counts = np.unique(y_train, return_counts = True)
print(counts/float(len(y_train)))

[0.14285714 0.14285714 0.14285714 0.14285714 0.14285714 0.14285714
 0.14285714]


In [10]:
labels, counts = np.unique(y_test, return_counts = True)
print(counts/float(len(y_val)))

[0.6 0.5 0.5 0.6 0.6 0.6 0.5]


In [11]:
labels, counts = np.unique(y_test, return_counts = True)
print(counts/float(len(y_test)))

[0.15384615 0.12820513 0.12820513 0.15384615 0.15384615 0.15384615
 0.12820513]


## Preparing train_mask and test_mask based on PyG.

In [12]:
train_mask = (x.unsqueeze(0) == train_mask.unsqueeze(1)).all(dim = 2).any(dim = 0)
print(train_mask)
print(train_mask.size())
print(sum(train_mask))

val_mask = (x.unsqueeze(0) == val_mask.unsqueeze(1)).all(dim = 2).any(dim = 0)
print(val_mask)
print(val_mask.size())
print(sum(val_mask))

test_mask = (x.unsqueeze(0) == test_mask.unsqueeze(1)).all(dim = 2).any(dim = 0)
print(test_mask)
print(test_mask.size())
print(sum(test_mask))

tensor([False, False,  True, False,  True, False, False, False, False,  True,
        False,  True,  True,  True,  True,  True, False, False,  True, False,
        False, False, False,  True, False, False, False, False, False,  True,
        False, False,  True, False,  True, False,  True,  True, False, False,
         True, False, False, False, False, False, False, False, False, False,
        False, False,  True, False,  True, False, False, False,  True,  True,
        False, False, False, False, False, False, False,  True, False, False])
torch.Size([70])
tensor(21)
tensor([False, False, False, False, False, False, False, False,  True, False,
         True, False, False, False, False, False, False, False, False, False,
         True, False, False, False, False,  True, False, False, False, False,
        False, False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True, False,
        False,  True, False, False,

## It is time to use p_link as attentions.

In [13]:
p_link = pd.read_csv('/gdrive/MyDrive/7classes.sim/p_links_7classes.csv', sep =',',header = None)
p_link = p_link.astype(np.float32)
p_link = torch.tensor(p_link.values)
#p_link = p_link.to(torch.float32)
print(p_link.size())
print(p_link.dtype)
print(p_link.type())

torch.Size([70, 70])
torch.float32
torch.FloatTensor


# Now everything is ready to develop our GNNs.

# Install torch_geometric.

In [14]:
import os
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.13.0+cu116
[K     |████████████████████████████████| 9.4 MB 2.9 MB/s 
[K     |████████████████████████████████| 4.6 MB 2.9 MB/s 
[K     |████████████████████████████████| 280 kB 5.4 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


# Save data in order to use it. 

In [15]:
from torch_geometric.data import Data
Data = Data(x = x, edge_index = edge_index, y = y, train_mask = train_mask, val_mask = val_mask, test_mask = test_mask)
print(Data)

Data(x=[70, 500], edge_index=[2, 260], y=[70], train_mask=[70], val_mask=[70], test_mask=[70])


In [16]:
data_save_name = 'data_7classes.pt'  
path = F"/gdrive/MyDrive/7classes.sim/{data_save_name}" 
torch.save(Data.to_dict(), path)

In [17]:
p_link_save_name = 'p_links_7classes.pt'  
path = F"/gdrive/MyDrive/7classes.sim/{p_link_save_name}" 
torch.save(p_link, path)