## Mount to gdrive to read data, adjacency matrix, p_link and labels. 


In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Install pandas to read csv files from gdrive. Install torch and numpy which are packages that are needed durind coding.

In [2]:
!pip install pandas
import pandas as pd
import torch
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## It is time to prepare x, edge_index, y, train_mask and test_mask which are needed for PyG. 

**Step1 (preparing x)**: Read multivariate dataset from gdrive and convert the type to what is needed in PyG.

In [3]:
x = pd.read_csv('/gdrive/MyDrive/Gene/gene_t.csv', sep=',',header=None)
x = x.astype(np.float32) #try to cast all DataFrame columns to specified numpy.dtype
x = torch.tensor(x.values) #convert numpy.dtype to a tensor
print(x.sum(dim = 1).unique().size()) #to see if there is a duplicate data in x or not
print(x.dtype)
print(x.type())
print(x)
print(x.size())

torch.Size([65])
torch.float32
torch.FloatTensor
tensor([[12.2066,  8.5495, 10.7969,  ...,  8.4967,  8.5979,  8.8691],
        [ 9.6155,  8.9314,  9.5201,  ...,  9.6052, 11.1579, 10.5243],
        [ 6.2624, 10.6610,  8.8817,  ...,  6.8272,  6.2315,  6.5671],
        ...,
        [ 6.0040, 11.8118, 10.3843,  ...,  7.5903, 12.6739, 12.8235],
        [ 8.8085, 12.8148, 15.4529,  ...,  9.6657, 13.1217, 10.3099],
        [ 7.3143, 13.6116, 12.9968,  ...,  8.7592,  9.8862,  9.2184]])
torch.Size([65, 60])


**Step2 (preparing edge_index)**: Read adjacency matrix from gdrive and convert the type to what is needed in PyG. 

In [4]:
from scipy.sparse import coo_matrix
edge_index_csv = pd.read_csv('/gdrive/MyDrive/Gene/adj_gene.csv',sep =',',header = None)
edge_index_numpy_ndarry = edge_index_csv.values #convert xlsx file to numpy.ndarry
edge_index_coo = coo_matrix(edge_index_numpy_ndarry) #convert symmetric matrix to coo_matrix
print(edge_index_coo)

  (0, 5)	1
  (0, 8)	1
  (0, 9)	1
  (0, 12)	1
  (0, 21)	1
  (0, 32)	1
  (1, 36)	1
  (2, 10)	1
  (2, 13)	1
  (2, 16)	1
  (2, 40)	1
  (2, 62)	1
  (3, 17)	1
  (3, 25)	1
  (4, 7)	1
  (4, 15)	1
  (4, 18)	1
  (4, 19)	1
  (4, 38)	1
  (5, 0)	1
  (5, 7)	1
  (5, 9)	1
  (5, 11)	1
  (5, 21)	1
  (5, 33)	1
  :	:
  (54, 59)	1
  (55, 36)	1
  (55, 46)	1
  (56, 15)	1
  (56, 42)	1
  (56, 64)	1
  (57, 8)	1
  (57, 23)	1
  (57, 25)	1
  (57, 38)	1
  (57, 54)	1
  (58, 21)	1
  (58, 59)	1
  (59, 54)	1
  (59, 58)	1
  (60, 13)	1
  (60, 44)	1
  (61, 35)	1
  (62, 2)	1
  (62, 48)	1
  (62, 64)	1
  (63, 34)	1
  (64, 18)	1
  (64, 56)	1
  (64, 62)	1


In [5]:
edge_index_numpy = np.vstack((edge_index_coo.row, edge_index_coo.col)) #convert coo_matrix to numpy.ndarray
edge_index_torch_int32 = torch.from_numpy(edge_index_numpy) #convert numpy.ndarray to torch.int32
edge_index = edge_index_torch_int32.to(torch.int64) #convert torch.int32 to torch.long
print(edge_index)
print(edge_index.dtype)
print(edge_index.type())
print(edge_index.size())

tensor([[ 0,  0,  0,  0,  0,  0,  1,  2,  2,  2,  2,  2,  3,  3,  4,  4,  4,  4,
          4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,
          8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
         10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14,
         14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17,
         17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19,
         19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
         21, 21, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 24, 25, 25, 25, 25,
         25, 26, 26, 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 29, 29, 29, 29, 30,
         30, 30, 30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 33, 33, 33, 33, 33,
         34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36,
         36, 37, 37, 37, 38, 38, 38, 38, 39, 40, 40, 41, 41, 42, 42, 42, 43, 43,
         43, 44, 44, 44, 45,

**Step3 (preparing y)**: Read labels from gdrive and convert the type to what is needed in PyG.

In [6]:
labels_csv = pd.read_csv('/gdrive/MyDrive/Gene/labels_gene.csv',sep=',',header=None)
y = torch.tensor(labels_csv.values)
y.resize_(65)
print(y)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])


**Step4 (preparing train_mask, test_mask)**: split x and y into train and test set.

In [7]:
from sklearn.model_selection import train_test_split
main_mask, test_mask, y_main, y_test= train_test_split(x, y,  test_size = 15, random_state = 0, shuffle = True, stratify = y)
train_mask, val_mask, y_train, y_val= train_test_split(main_mask, y_main,  test_size= 10, random_state = 0, shuffle = True, stratify = y_main)
print(train_mask)
print(test_mask)
print(train_mask.size())
print(val_mask.size())
print(test_mask.size())
print(train_mask.type())
print(train_mask.type())

tensor([[10.3728,  9.5330, 10.5377,  ...,  7.4238,  6.7241,  8.8813],
        [10.7661, 14.7948,  6.6166,  ...,  9.0051,  5.7202,  9.6194],
        [ 6.3410,  6.3753,  6.3720,  ..., 11.3862,  6.2982, 11.8073],
        ...,
        [ 5.9675,  5.8466,  6.0180,  ...,  9.0986,  6.0203,  8.9446],
        [11.8756,  6.9120,  6.6157,  ..., 11.1637, 13.5823, 15.8961],
        [ 6.0380,  5.8927,  6.0503,  ...,  5.9728,  5.9658,  6.9342]])
tensor([[ 8.2385,  8.0172,  8.6788,  5.8845,  7.5658,  9.9586,  7.6901,  9.1844,
          7.8667,  7.3978,  6.1370,  6.2020,  7.4180,  8.9171,  7.2701,  7.8511,
          5.9772,  9.4203,  8.0586, 10.9160,  6.7655, 10.1374, 10.4430,  8.2356,
          7.6936,  7.6223,  8.2016,  8.4035,  8.9126,  8.4068,  8.5712,  9.5520,
          8.6565,  8.8856, 10.1591,  6.0529,  8.3346,  8.1694,  7.0149,  8.4008,
          9.2275,  8.0515,  7.6185,  8.2873,  7.3589,  7.4822,  6.9951,  7.7066,
          7.0688,  8.1530, 10.7130,  7.1828,  5.9988,  6.9255,  6.4817,  6.7041,

## To see how many percent of data belong to each class.

In [8]:
labels, counts = np.unique(y, return_counts = True)
print(counts/float(len(y)))

[0.33846154 0.32307692 0.33846154]


In [9]:
labels, counts = np.unique(y_train, return_counts = True)
print(counts/float(len(y_train)))

[0.325 0.325 0.35 ]


In [10]:
labels, counts = np.unique(y_test, return_counts = True)
print(counts/float(len(y_val)))

[0.5 0.5 0.5]


In [11]:
labels, counts = np.unique(y_test, return_counts = True)
print(counts/float(len(y_test)))

[0.33333333 0.33333333 0.33333333]


## Preparing train_mask and test_mask based on PyG.

In [12]:
train_mask = (x.unsqueeze(0) == train_mask.unsqueeze(1)).all(dim=2).any(dim=0)
print(train_mask)
print(train_mask.size())
print(sum(train_mask))

val_mask = (x.unsqueeze(0) == val_mask.unsqueeze(1)).all(dim=2).any(dim=0)
print(val_mask)
print(val_mask.size())
print(sum(val_mask))

test_mask = (x.unsqueeze(0) == test_mask.unsqueeze(1)).all(dim=2).any(dim=0)
print(test_mask)
print(test_mask.size())
print(sum(test_mask))

tensor([False,  True,  True, False,  True,  True, False, False, False, False,
         True,  True,  True,  True,  True, False,  True, False,  True, False,
         True,  True, False,  True,  True,  True,  True, False, False, False,
        False, False,  True,  True,  True,  True,  True, False, False,  True,
         True, False, False,  True, False,  True, False,  True, False, False,
         True,  True,  True,  True, False, False,  True,  True,  True,  True,
         True,  True,  True,  True,  True])
torch.Size([65])
tensor(40)
tensor([False, False, False, False, False, False,  True, False,  True,  True,
        False, False, False, False, False, False, False,  True, False, False,
        False, False,  True, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False,  True, False, False,
        False, False, False, False,  True, False, False, False,  True, False,
        False, False, False, False, False,  True, False, False, False,

## It is time to use p_link as attentions.

In [13]:
p_link = pd.read_csv('/gdrive/MyDrive/Gene/p_links_gene.csv', sep=',',header=None)
p_link = p_link.astype(np.float32)
p_link = torch.tensor(p_link.values)
#p_link = p_link.to(torch.float32)
print(p_link.size())
print(p_link.dtype)
print(p_link.type())

torch.Size([65, 65])
torch.float32
torch.FloatTensor


# Now everything is ready to develop our GNNs.

# Install torch_geometric.

In [14]:
import os
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.13.0+cu116
[K     |████████████████████████████████| 9.4 MB 12.5 MB/s 
[K     |████████████████████████████████| 4.6 MB 13.9 MB/s 
[K     |████████████████████████████████| 280 kB 10.0 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [15]:
from torch_geometric.data import Data
Data = Data(x=x, edge_index=edge_index, y = y, train_mask = train_mask, val_mask = val_mask, test_mask = test_mask)
print(Data)

Data(x=[65, 60], edge_index=[2, 266], y=[65], train_mask=[65], val_mask=[65], test_mask=[65])


# Save data in order to use it. 

In [16]:
data_save_name = 'gene.pt'  
path = F"/gdrive/MyDrive/Gene/{data_save_name}" 
torch.save(Data.to_dict(), path)

In [17]:
p_link_save_name = 'p_links_gene.pt'  
path = F"/gdrive/MyDrive/Gene/{p_link_save_name}" 
torch.save(p_link, path)