## Mount Google Drive (gdrive) to read the data, adjacency matrix, p_link and labels.

In [1]:
from google.colab import drive

# Attach Google Drive at /gdrive; the input files read below live under /gdrive/MyDrive.
GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Install pandas to read CSV files from gdrive, then import pandas, torch and numpy, which are needed throughout the notebook.

In [2]:
!pip install pandas
import pandas as pd
import torch
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Prepare x, edge_index, y, train_mask, val_mask and test_mask, which are needed for PyG.


**Step1 (preparing x)**: Read multivariate dataset from gdrive and convert the type to what is needed in PyG.

In [3]:
# Load the feature table (CSV without a header row) and convert it in one pass:
# DataFrame -> float32 columns -> numpy array -> tensor.
features_frame = pd.read_csv('/gdrive/MyDrive/Cancer/cancer_t.csv', sep=',', header=None)
x = torch.tensor(features_frame.astype(np.float32).values)

# Duplicate check: if every row sum is distinct, no two rows of x can be identical.
print(x.sum(dim=1).unique().size())

# Inspect dtype, tensor type, contents and shape of the feature tensor.
print(x.dtype)
print(x.type())
print(x)
print(x.size())

torch.Size([80])
torch.float32
torch.FloatTensor
tensor([[1.7990e+01, 1.0380e+01, 1.2280e+02,  ..., 2.6540e-01, 4.6010e-01,
         1.1890e-01],
        [2.0570e+01, 1.7770e+01, 1.3290e+02,  ..., 1.8600e-01, 2.7500e-01,
         8.9020e-02],
        [1.9690e+01, 2.1250e+01, 1.3000e+02,  ..., 2.4300e-01, 3.6130e-01,
         8.7580e-02],
        ...,
        [1.3030e+01, 1.8420e+01, 8.2610e+01,  ..., 5.0130e-02, 1.9870e-01,
         6.1690e-02],
        [1.3080e+01, 1.5710e+01, 8.5630e+01,  ..., 7.2830e-02, 3.1840e-01,
         8.1830e-02],
        [9.5040e+00, 1.2440e+01, 6.0340e+01,  ..., 6.2270e-02, 2.4500e-01,
         7.7730e-02]])
torch.Size([80, 30])


**Step2 (preparing edge_index)**: Read adjacency matrix from gdrive and convert the type to what is needed in PyG. 

In [4]:
from scipy.sparse import coo_matrix

# Read the adjacency matrix (CSV without a header row) from gdrive.
adjacency_path = '/gdrive/MyDrive/Cancer/adj_cancer.csv'
edge_index_csv = pd.read_csv(adjacency_path, sep=',', header=None)

# DataFrame -> numpy.ndarray.
edge_index_numpy_ndarry = edge_index_csv.values

# Dense matrix -> sparse COO matrix; printing it lists the (row, col) position
# and value of every stored entry.
edge_index_coo = coo_matrix(edge_index_numpy_ndarry)
print(edge_index_coo)

  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 11)	1
  (0, 20)	1
  (0, 24)	1
  (0, 25)	1
  (0, 31)	1
  (0, 38)	1
  (0, 45)	1
  (0, 56)	1
  (0, 72)	1
  (0, 77)	1
  (1, 6)	1
  (1, 7)	1
  (1, 10)	1
  (1, 14)	1
  (1, 17)	1
  (1, 20)	1
  (1, 36)	1
  (1, 46)	1
  (1, 60)	1
  (1, 62)	1
  (1, 64)	1
  (1, 68)	1
  :	:
  (78, 35)	1
  (78, 48)	1
  (78, 56)	1
  (78, 64)	1
  (78, 65)	1
  (78, 76)	1
  (79, 5)	1
  (79, 10)	1
  (79, 15)	1
  (79, 16)	1
  (79, 17)	1
  (79, 35)	1
  (79, 36)	1
  (79, 43)	1
  (79, 45)	1
  (79, 46)	1
  (79, 47)	1
  (79, 52)	1
  (79, 58)	1
  (79, 61)	1
  (79, 62)	1
  (79, 70)	1
  (79, 72)	1
  (79, 74)	1
  (79, 76)	1


In [5]:
# Stack the COO row indices on top of the column indices: one column per edge.
coo_rows, coo_cols = edge_index_coo.row, edge_index_coo.col
edge_index_numpy = np.vstack((coo_rows, coo_cols))

# Wrap the numpy array as a tensor (dtype is inherited from numpy) ...
edge_index_torch_int32 = torch.from_numpy(edge_index_numpy)
# ... and cast it to int64 (torch.long).
edge_index = edge_index_torch_int32.to(torch.int64)

# Inspect contents, dtype, tensor type and shape.
for _edge_summary in (edge_index, edge_index.dtype, edge_index.type(), edge_index.size()):
    print(_edge_summary)

tensor([[ 0,  0,  0,  ..., 79, 79, 79],
        [ 2,  3,  4,  ..., 72, 74, 76]])
torch.int64
torch.LongTensor
torch.Size([2, 872])


**Step3 (preparing y)**: Read labels from gdrive and convert the type to what is needed in PyG.

In [6]:
# Read the class labels (CSV without a header row) from gdrive.
labels_csv = pd.read_csv('/gdrive/MyDrive/Cancer/labels_cancer.csv', sep=',', header=None)

# Flatten the labels into a 1-D vector. reshape(-1) takes the length from the
# data itself; an in-place resize_ to a hard-coded 80 would silently truncate a
# longer label file and leave uninitialized values for a shorter one.
y = torch.tensor(labels_csv.values).reshape(-1)

# Every row of the feature tensor x needs exactly one label.
assert y.size(0) == x.size(0), f"expected {x.size(0)} labels, got {y.size(0)}"
print(y)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])


**Step4 (preparing train_mask, val_mask, test_mask)**: split x and y into train, validation and test sets.

In [7]:
from sklearn.model_selection import train_test_split

# Both splits use the same seed, are shuffled, and are stratified on the labels.
# NOTE: despite their names, the *_mask variables produced here hold feature
# rows of x, not boolean masks.
_split_options = dict(random_state=0, shuffle=True)

# First hold out 48 samples for testing ...
main_mask, test_mask, y_main, y_test = train_test_split(
    x, y, test_size=48, stratify=y, **_split_options
)
# ... then take 10 validation samples out of the remainder.
train_mask, val_mask, y_train, y_val = train_test_split(
    main_mask, y_main, test_size=10, stratify=y_main, **_split_options
)

print(train_mask)
print(test_mask)
for _split_part in (train_mask, val_mask, test_mask):
    print(_split_part.size())

tensor([[1.1760e+01, 2.1600e+01, 7.4720e+01, 4.2790e+02, 8.6370e-02, 4.9660e-02,
         1.6570e-02, 1.1150e-02, 1.4950e-01, 5.8880e-02, 4.0620e-01, 1.2100e+00,
         2.6350e+00, 2.8470e+01, 5.8570e-03, 9.7580e-03, 1.1680e-02, 7.4450e-03,
         2.4060e-02, 1.7690e-03, 1.2980e+01, 2.5720e+01, 8.2980e+01, 5.1650e+02,
         1.0850e-01, 8.6150e-02, 5.5230e-02, 3.7150e-02, 2.4330e-01, 6.5630e-02],
        [1.0490e+01, 1.9290e+01, 6.7410e+01, 3.3610e+02, 9.9890e-02, 8.5780e-02,
         2.9950e-02, 1.2010e-02, 2.2170e-01, 6.4810e-02, 3.5500e-01, 1.5340e+00,
         2.3020e+00, 2.3130e+01, 7.5950e-03, 2.2190e-02, 2.8800e-02, 8.6140e-03,
         2.7100e-02, 3.4510e-03, 1.1540e+01, 2.3310e+01, 7.4220e+01, 4.0280e+02,
         1.2190e-01, 1.4860e-01, 7.9870e-02, 3.2030e-02, 2.8260e-01, 7.5520e-02],
        [1.6020e+01, 2.3240e+01, 1.0270e+02, 7.9780e+02, 8.2060e-02, 6.6690e-02,
         3.2990e-02, 3.3230e-02, 1.5280e-01, 5.6970e-02, 3.7950e-01, 1.1870e+00,
         2.4660e+00, 4.051

## Check what fraction of each split belongs to each class.

In [8]:
# Fraction of the training labels that falls into each class.
labels, counts = np.unique(y_train, return_counts=True)
train_class_fraction = counts / float(len(y_train))
print(train_class_fraction)

[0.5 0.5]


In [9]:
# Fraction of the test labels that falls into each class.
labels, counts = np.unique(y_test, return_counts=True)
test_class_fraction = counts / float(len(y_test))
print(test_class_fraction)

[0.5 0.5]


In [10]:
# Fraction of the validation labels that falls into each class.
labels, counts = np.unique(y_val, return_counts=True)
val_class_fraction = counts / float(len(y_val))
print(val_class_fraction)

[0.5 0.5]


## Convert the train, validation and test splits into boolean node masks for PyG.

In [11]:
def _rows_to_mask(features: torch.Tensor, subset: torch.Tensor) -> torch.Tensor:
    """Return a boolean mask over the rows of `features` marking those present in `subset`.

    Args:
        features: 2-D tensor of shape (num_nodes, num_features).
        subset: 2-D tensor of shape (k, num_features) whose rows were drawn
            from `features`.

    Returns:
        1-D bool tensor of length num_nodes with exactly k True entries.

    Raises:
        ValueError: if `subset` is not a 2-D tensor with the same number of
            columns as `features` (for example when this cell is re-run after
            the split variables were already replaced by masks), or if the
            number of matched rows differs from k (duplicate or missing rows).
    """
    if subset.ndim != 2 or subset.size(1) != features.size(1):
        raise ValueError(
            "expected a (k, num_features) tensor of feature rows, got shape "
            f"{tuple(subset.shape)}; re-run the train/val/test split cell first"
        )
    # Broadcast to (k, num_nodes, num_features): entry [i, j] of `row_matches`
    # is True when subset row i equals feature row j in every column.
    row_matches = (features.unsqueeze(0) == subset.unsqueeze(1)).all(dim=2)
    mask = row_matches.any(dim=0)
    matched = int(mask.sum())
    if matched != subset.size(0):
        raise ValueError(
            f"{subset.size(0)} rows requested but {matched} rows of the feature "
            "matrix matched; duplicate or missing rows make the mask ambiguous"
        )
    return mask


# Replace each split (currently feature rows) by its boolean mask over the rows of x.
train_mask = _rows_to_mask(x, train_mask)
print(train_mask)
print(train_mask.size())
print(train_mask.sum())

val_mask = _rows_to_mask(x, val_mask)
print(val_mask)
print(val_mask.size())
print(val_mask.sum())

test_mask = _rows_to_mask(x, test_mask)
print(test_mask)
print(test_mask.size())
print(test_mask.sum())

tensor([False, False,  True, False,  True, False, False, False, False, False,
         True,  True, False, False, False,  True, False, False, False, False,
        False, False,  True, False, False,  True, False,  True, False,  True,
         True, False, False, False, False, False, False, False, False,  True,
        False, False,  True,  True, False, False, False, False, False, False,
        False, False, False, False,  True, False,  True, False, False, False,
        False, False,  True, False,  True,  True,  True, False, False, False,
        False, False, False, False,  True, False,  True, False,  True, False])
torch.Size([80])
tensor(22)
tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True, False,
         True, False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False,  True, False,  True, False, False,
        False, False, False, False,

## Load p_link, which will be used as attention weights.

In [12]:
# Load the p_link matrix (CSV without a header row) and convert it in one pass:
# DataFrame -> float32 columns -> numpy array -> tensor.
p_link_frame = pd.read_csv('/gdrive/MyDrive/Cancer/p_links_cancer.csv', sep=',', header=None)
p_link = torch.tensor(p_link_frame.astype(np.float32).values)

# Inspect shape, dtype and tensor type.
print(p_link.size())
print(p_link.dtype)
print(p_link.type())

torch.Size([80, 80])
torch.float32
torch.FloatTensor


# Now everything is ready to develop our GNNs.

# Install torch_geometric.

In [13]:
import os
# Export the installed torch version so the shell commands below can expand ${TORCH}.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# Install torch-scatter and torch-sparse, using the wheel index page for this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# Install torch_geometric straight from its GitHub repository.
# NOTE(review): this install is not pinned to a release, so the installed code can
# differ between runs — consider pinning a version or commit.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.13.0+cu116
[K     |████████████████████████████████| 9.4 MB 5.5 MB/s 
[K     |████████████████████████████████| 4.6 MB 5.1 MB/s 
[K     |████████████████████████████████| 280 kB 4.4 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [14]:
from torch_geometric.data import Data

# Bundle node features, edges, labels and the three split masks into one graph object.
# Note: the assignment rebinds the name `Data` from the imported class to the
# instance; the save cell further down uses `Data` as the graph object.
Data = Data(
    x=x,
    edge_index=edge_index,
    y=y,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
)
print(Data)

Data(x=[80, 30], edge_index=[2, 872], y=[80], train_mask=[80], val_mask=[80], test_mask=[80])


# Save the graph data and p_link to gdrive so they can be loaded later.

In [15]:
# Write the graph to gdrive as the dictionary returned by Data.to_dict().
data_save_name = 'cancer.pt'
path = f"/gdrive/MyDrive/Cancer/{data_save_name}"
graph_as_dict = Data.to_dict()
torch.save(graph_as_dict, path)

In [16]:
# Write the p_link tensor to gdrive next to the graph file.
p_link_save_name = 'p_links_cancer.pt'
path = f"/gdrive/MyDrive/Cancer/{p_link_save_name}"
torch.save(p_link, path)