# Imports


In [None]:
# Notebook shell commands: install the PyTorch Geometric stack. The two
# compiled extensions are fetched from the PyG wheel index for
# torch 1.10.0 + CUDA 11.3, so they have to match the runtime's torch build.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-geometric


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
import math
from torch_geometric.nn import GCNConv
from scipy.sparse import identity
from torch_geometric.utils import from_scipy_sparse_matrix

from random import sample

# Every model in this notebook is moved to CUDA, so stop right away when the
# runtime has no GPU attached instead of failing later.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    raise Exception("You should enable GPU runtime")


# Data preprocessing


In [None]:
import pandas as pd
import numpy as np
# Raw interaction matrix (drug x protein, judging by the file name),
# stored as space-separated text with ',' as the decimal mark.
url = 'https://raw.githubusercontent.com/vfayosp/project_aidl2022/main/data/mat_drug_protein.txt'

data = pd.read_csv(url, sep=' ',decimal = ',')

In [None]:
# Switch from a DataFrame to a plain ndarray; the last expression displays
# the number of rows of the matrix.
data = np.array(data)
data.shape[0]

707

In [None]:
# Display the (rows, columns) shape of the interaction matrix.
data.shape

(707, 1512)

In [None]:
import numpy as np
import pandas as pd
import torch.utils.data
from tqdm import tqdm
import scipy.sparse as sp

def build_adj_mx(dims, interactions):
    """Build a symmetric ``dims x dims`` adjacency matrix from index pairs.

    For every record in ``interactions`` the values at positions 0 and 1 are
    taken as node indices and both ``(a, b)`` and ``(b, a)`` are set to 1.0.

    :param dims: side length of the square matrix.
    :param interactions: iterable of indexable records whose first two
        positions hold the node indices of an edge.
    :return: ``scipy.sparse.dok_matrix`` of dtype ``float32``.
    """
    adjacency = sp.dok_matrix((dims, dims), dtype=np.float32)
    for edge in tqdm(interactions, desc="BUILDING ADJACENCY MATRIX..."):
        src, dst = edge[0], edge[1]
        # Store the edge in both directions so the graph is undirected.
        adjacency[src, dst] = 1.0
        adjacency[dst, src] = 1.0

    return adjacency

# Dataset serving the training interactions (with sampled negatives) and the per-pair ranking test set
class DiseaseDrugDataset(torch.utils.data.Dataset):
    """Interaction dataset with negative sampling for training and ranking tests.

    The train/test split is NOT computed here: it is read from two
    pre-computed CSV files. ``full_dataset`` is only used for its number of
    rows, which becomes the offset added to the ids of the second column so
    that both columns live in one shared node-index space.

    Attributes set by ``__init__``:
        nitems        number of rows of ``full_dataset`` (offset for column 1)
        data          raw training rows as a numpy array
        test_data     raw test rows as a numpy array
        items         training (col0, col1 + nitems) index pairs
        targets       third column of ``data`` (labels of the training rows)
        field_dims    per-column ``max + 1`` of ``items``; ``field_dims[-1]``
                      is used as the total number of nodes
        train_mat     symmetric sparse adjacency matrix of training pairs
        interactions  ``(N, 3)`` array of positives followed by their sampled
                      negatives, as served by ``__getitem__``
        test_set      list with one array per test pair: row 0 is the ground
                      truth pair, the remaining rows are sampled negatives
    """

    def __init__(self, full_dataset, n_train = 0.95 ,n_val = 0, num_negatives_train=4, num_negatives_test=100):
        """
        :param full_dataset: 2-D array; only ``shape[0]`` is read.
        :param n_train: unused (kept for backward compatibility; the split
            comes from the CSV files below).
        :param n_val: unused (kept for backward compatibility).
        :param num_negatives_train: negatives sampled per training positive.
        :param num_negatives_test: negatives sampled per test pair.
        """
        self.nitems = full_dataset.shape[0]

        # The split is fixed and downloaded, so results are comparable
        # between runs; sampling it from ``full_dataset`` is not implemented.
        self.data = pd.read_csv("https://raw.githubusercontent.com/luciapp97/upc/main/train2.csv", sep=' ',decimal = ',')
        self.test_data = pd.read_csv("https://raw.githubusercontent.com/luciapp97/upc/main/test2.csv", sep=' ',decimal = ',')

        self.data = pd.DataFrame(self.data).to_numpy()
        self.test_data = pd.DataFrame(self.test_data).to_numpy()
        self.items = self.preprocess_items(self.data)
        self.targets = self.data[:, 2]

        self.field_dims = np.max(self.items, axis = 0) + 1
        print(len(self.items))
        self.train_mat = build_adj_mx(self.field_dims[-1], self.items.copy())
        self.negative_sampling(num_negatives = num_negatives_train)

        self.test_set = self.build_test_set(self.preprocess_items(self.test_data),
                                            num_neg_samples_test = num_negatives_test)

    def __len__(self):
        # Must match the index space of __getitem__: positives AND sampled
        # negatives. Returning only the number of positives would hide most
        # of ``self.interactions`` from the DataLoader.
        return len(self.interactions)

    def __getitem__(self, index):
        return self.interactions[index]

    def negative_sampling(self, num_negatives = 4):
        """Rebuild ``self.interactions`` as positives plus sampled negatives.

        Each training triplet ``(u, i, target)`` is followed by
        ``num_negatives`` rows ``(u, j, 0)`` where ``(u, j)`` is not an edge
        of ``self.train_mat``.
        """
        self.interactions = []
        data = np.c_[(self.items, self.targets)].astype(int)
        # NOTE(review): the sampling range starts at max(col0) + 1 rather than
        # at ``self.nitems``; these coincide only if the highest col0 id
        # appears in the training split -- confirm.
        max_users, max_items = self.field_dims[:2]

        for x in tqdm(data, desc="Performing negative sampling on training data..."):  # x are triplets (u, i, 1)
            # Append positive interaction
            self.interactions.append(x)
            if num_negatives <= 0:
                # np.vstack([]) would raise; nothing to sample.
                continue
            # Copy the positive row, zero the label, then overwrite column 1
            # with the sampled negative ids.
            neg_triplet = np.vstack([x, ] * (num_negatives))
            neg_triplet[:, 2] = 0

            for idx in range(num_negatives):
                j = np.random.randint(max_users, max_items)
                # Re-draw while (u, j) is a known training interaction.
                while (x[0], j) in self.train_mat:
                    j = np.random.randint(max_users, max_items)
                neg_triplet[idx, 1] = j
            self.interactions.append(neg_triplet.copy())

        self.interactions = np.vstack(self.interactions)

    def preprocess_items(self, data):
        """Return the first two columns of ``data`` as ints in a shared index space.

        Column 1 is shifted by ``self.nitems`` so its ids do not collide with
        those of column 0. ``astype`` copies, so ``data`` is left untouched.
        """
        # ``int`` replaces the ``np.int`` alias (deprecated in NumPy 1.20 and
        # removed in later releases); the resulting dtype is the same.
        reindexed_items = data[:, :2].astype(int)
        reindexed_items[:, 1] = reindexed_items[:, 1] + self.nitems

        return reindexed_items

    def build_test_set(self, gt_test_interactions, num_neg_samples_test=99):
        """Build one ranking candidate array per ground-truth test pair.

        :param gt_test_interactions: array of reindexed (col0, col1) test pairs.
        :param num_neg_samples_test: negatives sampled for every pair.
        :return: list of ``(num_neg_samples_test + 1, 2)`` arrays; row 0 is
            the ground-truth pair, the other rows share col0 and carry a
            sampled col1 that is neither a training edge nor the ground truth.
        """
        max_users, max_items = self.field_dims[:2]
        test_set = []
        for pair in tqdm(gt_test_interactions, desc="BUILDING TEST SET..."):
            negatives = []
            for t in range(num_neg_samples_test):
                j = np.random.randint(max_users, max_items)
                while (pair[0], j) in self.train_mat or j == pair[1]:
                    j = np.random.randint(max_users, max_items)
                negatives.append(j)

            single_user_test_set = np.vstack([pair, ] * (len(negatives)+1))
            single_user_test_set[:, 1][1:] = negatives
            test_set.append(single_user_test_set.copy())
        return test_set

In [None]:
# Build the dataset; every held-out test pair is ranked against 709 sampled negatives.
dataset = DiseaseDrugDataset(data, num_negatives_test=709)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1742


BUILDING ADJACENCY MATRIX...: 100%|██████████| 1742/1742 [00:00<00:00, 35559.31it/s]
Performing negative sampling on test data...: 100%|██████████| 1742/1742 [00:00<00:00, 17885.90it/s]
BUILDING TEST SET...: 100%|██████████| 368/368 [00:01<00:00, 226.28it/s]


In [None]:
# Display the first column of the second raw training row.
dataset.data[1][0]

623

In [None]:
# Sanity check: print a marker for every raw training row whose first
# column equals 8, then confirm the scan finished.
for row in dataset.data:
    if row[0] == 8:
        print ('hola')

print("todo ok")


todo ok


In [None]:
# TensorBoard logging setup: one SummaryWriter per model variant, all under
# the same base directory so the runs can be compared side by side.
import os
logs_base_dir = "runs"
os.makedirs(logs_base_dir, exist_ok=True)

# Global switch read by the training loops before writing scalars.
tb = True 

%load_ext tensorboard 
from torch.utils.tensorboard import SummaryWriter

# Writers for: plain FM, FM + GCN embeddings, FM + attention (GAT) embeddings.
tb_fm = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_FM/')
tb_gcn = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_GCN/')
tb_gcn_attention = SummaryWriter(log_dir=f'{logs_base_dir}/{logs_base_dir}_GCN_att/')

# FactorizationMachineModel

In [None]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 256 interaction rows, loaded in the main process.
data_loader = DataLoader(dataset, batch_size=256, shuffle=True, num_workers=0)


def getHitRatio(recommend_list, gt_item):
    """Return 1 when ``gt_item`` is among ``recommend_list``, otherwise 0."""
    return 1 if gt_item in recommend_list else 0

def getNDCG(recommend_list, gt_item):
    """Return the NDCG of a ranked list that has a single relevant item.

    :param recommend_list: ranked item ids (best first); any array-like.
    :param gt_item: the ground-truth item id.
    :return: ``log(2) / log(rank + 2)`` with ``rank`` the 0-based position of
        the first occurrence of ``gt_item``, or ``0.0`` when it is absent.
    """
    # np.asarray makes the comparison element-wise for plain lists as well;
    # ``list == scalar`` would be a single False and always score 0.
    positions = np.where(np.asarray(recommend_list) == gt_item)[0]
    if len(positions) > 0:
        # Use the first hit as a plain int: math.log must not receive an
        # array (deprecated for size-1 arrays, TypeError for larger ones).
        rank = int(positions[0])
        return math.log(2) / math.log(rank + 2)
    return 0.0

# Linear part of the equation
class FeaturesLinear(torch.nn.Module):
    """First-order term: a learned weight per feature index plus a bias."""

    def __init__(self, field_dims, output_dim=1):
        """
        :param field_dims: number of distinct feature indices (embedding rows).
        :param output_dim: size of the output per sample.
        """
        super().__init__()

        # An embedding table doubles as a sparse linear layer: looking up an
        # index returns that feature's weight(s).
        self.fc = torch.nn.Embedding(field_dims, output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: tensor of size ``(batch_size, output_dim)``
        """
        per_field_weights = self.fc(x)  # (batch_size, num_fields, output_dim)
        return per_field_weights.sum(dim=1) + self.bias

# FM part of the equation
class FM_operation(torch.nn.Module):
    """Second-order (pairwise interaction) term of a factorization machine.

    Uses the identity ``sum_{i<j} v_i * v_j = 0.5 * ((sum v)^2 - sum v^2)``
    applied per embedding dimension.
    """

    def __init__(self, reduce_sum=True):
        """
        :param reduce_sum: when true, sum over the embedding dimension so the
            output has size ``(batch_size, 1)``.
        """
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        field_sum = x.sum(dim=1)
        interactions = field_sum.pow(2) - x.pow(2).sum(dim=1)
        if self.reduce_sum:
            interactions = interactions.sum(dim=1, keepdim=True)
        return 0.5 * interactions

class FactorizationMachineModel(torch.nn.Module):
    """
    Factorization Machine: a first-order linear term plus pairwise
    interactions between learned feature embeddings.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, field_dims, embed_dim):
        """
        :param field_dims: total number of feature indices (nodes).
        :param embed_dim: size of each latent factor vector.
        """
        super().__init__()
        self.linear = FeaturesLinear(field_dims)
        self.embedding = torch.nn.Embedding(field_dims, embed_dim, sparse=False)
        self.fm = FM_operation(reduce_sum=True)

        # Replace the default embedding initialisation with Xavier uniform.
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs):
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        :return: one raw score (logit) per row, size ``(batch_size,)``
        """
        first_order = self.linear(interaction_pairs)
        second_order = self.fm(self.embedding(interaction_pairs))
        return (first_order + second_order).squeeze(1)

    def predict(self, interactions, device):
        """Score a numpy array of index pairs and return a tensor of scores."""
        pairs = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        return self.forward(pairs)

from statistics import mean

def train_one_epoch(model, optimizer, data_loader, criterion, device, log_interval=100):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    :param model: module called with the first two columns of every batch.
    :param optimizer: optimizer stepping the model's parameters.
    :param data_loader: iterable of ``(batch, 3)`` integer tensors; the third
        column holds the target.
    :param criterion: loss called as ``criterion(predictions, float_targets)``.
    :param device: device every batch is moved to.
    :param log_interval: accepted but not used.
    :return: arithmetic mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        batch = batch.to(device)
        pairs, labels = batch[:, :2], batch[:, 2]

        batch_loss = criterion(model(pairs), labels.float())

        # Clear stale gradients, back-propagate, update.
        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return mean(batch_losses)

def test(model, full_dataset, device, topk=11):
    """Evaluate ``model`` with Hit Ratio and NDCG at ``topk``.

    For every candidate array in ``full_dataset.test_set`` (row 0 holds the
    ground-truth pair) the model scores all rows, the ``topk`` best are kept
    and both metrics are computed for the ground-truth item.

    :param model: object exposing ``eval()`` and ``predict(array, device)``.
    :param full_dataset: object exposing ``test_set``, a list of 2-column
        numpy arrays.
    :param device: device forwarded to ``model.predict``.
    :param topk: length of the recommendation list.
    :return: ``(mean HR, mean NDCG)`` over all test arrays.
    """
    model.eval()

    HR, NDCG = [], []
    # Evaluation needs no gradients: skip building the autograd graph.
    with torch.no_grad():
        for user_test in full_dataset.test_set:
            gt_item = user_test[0][1]
            predictions = model.predict(user_test, device)
            # torch.topk raises when k exceeds the number of candidates.
            k = min(topk, predictions.shape[0])
            _, indices = torch.topk(predictions, k)
            indices = indices.cpu().detach().numpy()
            recommend_list = user_test[indices][:, 1]
            HR.append(getHitRatio(recommend_list, gt_item))
            NDCG.append(getNDCG(recommend_list, gt_item))
    return mean(HR), mean(NDCG)

# Plain factorization machine with 32-dimensional embeddings.
model = FactorizationMachineModel(dataset.field_dims[-1], 32).to(device)
criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Train for 150 epochs, evaluating HR/NDCG@topk after every epoch.
tb = True
topk = 10
for epoch_i in range(150):
    # Negatives are sampled once at dataset construction; per-epoch
    # re-sampling (data_loader.dataset.negative_sampling()) is disabled.
    train_loss = train_one_epoch(model, optimizer, data_loader, criterion, device)
    hr, ndcg = test(model, dataset, device, topk=topk)

    print('\n')

    print(f'epoch {epoch_i}:')
    print(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} ')
    print('\n')

    if tb:
        tb_fm.add_scalar('train/loss', train_loss, epoch_i)
        # f-strings so the tag carries the actual cutoff (e.g. eval/HR@10)
        # instead of the literal text "{topk}".
        tb_fm.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
        tb_fm.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)



epoch 0:
training loss = 0.9399 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0051 




epoch 1:
training loss = 0.9320 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 2:
training loss = 0.9273 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 3:
training loss = 0.9200 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 4:
training loss = 0.9138 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 5:
training loss = 0.9076 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 6:
training loss = 0.8996 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 7:
training loss = 0.8920 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0052 




epoch 8:
training loss = 0.8834 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0051 




epoch 9:
training loss = 0.8738 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0051 




epoch 10:
training loss = 0.8670 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0051 




epoch 11:
training loss = 0.8584 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0050 




epoch 12:
training loss = 0.8493 | Eval: HR@10 = 0.0136, NDC

In [None]:
# Open TensorBoard inside the notebook on the "runs" log directory.
%tensorboard --logdir runs

<IPython.core.display.Javascript object>

# FM with GCN

In [None]:
from scipy.sparse import identity
from torch_geometric.utils import from_scipy_sparse_matrix


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    :param sparse_mx: any scipy sparse matrix (converted with ``tocoo()``).
    :return: sparse COO tensor with the same shape and non-zero entries.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # Row 0 holds the row indices, row 1 the column indices.
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse_coo_tensor is the supported factory; the legacy
    # torch.sparse.FloatTensor constructor is deprecated.
    return torch.sparse_coo_tensor(indices, values, shape)

from torch_geometric.nn import GCNConv, GATConv # https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html?highlight=GCNConv#torch_geometric.nn.conv.GCNConv
from torch_geometric.utils import from_scipy_sparse_matrix

class GraphModel(torch.nn.Module):
    """Node-embedding layer backed by a single graph convolution.

    The layer is either a ``GCNConv`` or, with ``attention=True``, a
    ``GATConv`` with 8 heads and dropout 0.6.
    """

    def __init__(self, field_dims, embed_dim, features, train_mat, attention=False):
        """
        :param field_dims: input feature size of the convolution.
        :param embed_dim: output channels requested from the convolution.
        :param features: passed as the first argument of the convolution
            on every forward call.
        :param train_mat: passed as the second argument of the convolution
            on every forward call.
        :param attention: choose ``GATConv`` instead of ``GCNConv``.
        """
        super().__init__()

        # Stored as plain attributes (not buffers): the caller is responsible
        # for placing them on the right device.
        self.A = train_mat
        self.features = features
        # NOTE(review): with 8 heads the attention variant may emit
        # 8 * embed_dim features per node if the heads are concatenated
        # -- confirm against the GATConv defaults.
        if attention:
            conv = GATConv(int(field_dims), embed_dim, heads=8, dropout=0.6)
        else:
            conv = GCNConv(field_dims, embed_dim)
        self.GCN_module = conv

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        # The convolution runs over the stored graph on every call; the
        # requested rows are then gathered from its output.
        node_embeddings = self.GCN_module(self.features, self.A)
        return node_embeddings[x]

class FactorizationMachineModel_withGCN(torch.nn.Module):
    """
    Factorization Machine whose feature embeddings come from a graph
    convolution (``GraphModel``) instead of a plain embedding table.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, field_dims, embed_dim, X, A, attention=False):
        """
        :param field_dims: total number of feature indices (nodes).
        :param embed_dim: output channels requested from the graph layer.
        :param X: forwarded to ``GraphModel`` as ``features``.
        :param A: forwarded to ``GraphModel`` as ``train_mat``.
        :param attention: forwarded to ``GraphModel``.
        """
        super().__init__()

        self.linear = FeaturesLinear(field_dims)
        self.embedding = GraphModel(field_dims, embed_dim, X, A, attention=attention)
        self.fm = FM_operation(reduce_sum=True)

    def forward(self, interaction_pairs):
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        :return: one raw score (logit) per row, size ``(batch_size,)``
        """
        first_order = self.linear(interaction_pairs)
        second_order = self.fm(self.embedding(interaction_pairs))
        return (first_order + second_order).squeeze(1)

    def predict(self, interactions, device):
        """Score a numpy array of index pairs and return a tensor of scores."""
        pairs = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        return self.forward(pairs)

In [None]:
# Identity node features (one one-hot row per node of the training graph).
X = sparse_mx_to_torch_sparse_tensor(identity(dataset.train_mat.shape[0]))
# Edge list of the training adjacency matrix; edge_attr is not used below.
edge_idx, edge_attr = from_scipy_sparse_matrix(dataset.train_mat)

# FM on top of GCN embeddings with 64 output channels.
model_gcn = FactorizationMachineModel_withGCN(
    dataset.field_dims[-1], 64, X.to(device), edge_idx.to(device)
).to(device)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model_gcn.parameters(), lr=0.001)


In [None]:
# Train the GCN-based model for 150 epochs, evaluating after every epoch.
topk = 10
for epoch_i in range(150):
    # Negatives are sampled once at dataset construction; per-epoch
    # re-sampling (data_loader.dataset.negative_sampling()) is disabled.
    train_loss = train_one_epoch(model_gcn, optimizer, data_loader, criterion, device)
    hr, ndcg = test(model_gcn, dataset, device, topk=topk)

    print('\n')

    print(f'epoch {epoch_i}:')
    print(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} ')
    print('\n')
    if tb:
        tb_gcn.add_scalar('train/loss', train_loss, epoch_i)
        # f-strings so the tag carries the actual cutoff (e.g. eval/HR@10)
        # instead of the literal text "{topk}".
        tb_gcn.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
        tb_gcn.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)



epoch 0:
training loss = 0.8761 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 1:
training loss = 0.8722 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 2:
training loss = 0.8643 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 3:
training loss = 0.8560 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 4:
training loss = 0.8507 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 5:
training loss = 0.8400 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 6:
training loss = 0.8315 | Eval: HR@10 = 0.0027, NDCG@10 = 0.0027 




epoch 7:
training loss = 0.8203 | Eval: HR@10 = 0.0054, NDCG@10 = 0.0035 




epoch 8:
training loss = 0.8084 | Eval: HR@10 = 0.0082, NDCG@10 = 0.0043 




epoch 9:
training loss = 0.7970 | Eval: HR@10 = 0.0136, NDCG@10 = 0.0061 




epoch 10:
training loss = 0.7797 | Eval: HR@10 = 0.0190, NDCG@10 = 0.0079 




epoch 11:
training loss = 0.7647 | Eval: HR@10 = 0.0299, NDCG@10 = 0.0124 




epoch 12:
training loss = 0.7468 | Eval: HR@10 = 0.0353, NDC

# FM with GCN and attention (GAT)

In [None]:
# Same FM-on-graph model, but with the attention-based graph layer.
model_gcn_att = FactorizationMachineModel_withGCN(dataset.field_dims[-1],
                                                  64,
                                                  X.to(device),
                                                  edge_idx.to(device),
                                                  attention=True
                                                  ).to(device)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model_gcn_att.parameters(), lr=0.001)
# ``topk`` and ``tb`` are the notebook-level values set in earlier cells.
for epoch_i in range(150):
    # Negatives are sampled once at dataset construction; per-epoch
    # re-sampling (data_loader.dataset.negative_sampling()) is disabled.
    train_loss = train_one_epoch(model_gcn_att, optimizer, data_loader, criterion, device)
    hr, ndcg = test(model_gcn_att, dataset, device, topk=topk)

    print('\n')

    print(f'epoch {epoch_i}:')
    print(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} ')
    print('\n')
    if tb:
        tb_gcn_attention.add_scalar('train/loss', train_loss, epoch_i)
        # f-strings so the tag carries the actual cutoff (e.g. eval/HR@10)
        # instead of the literal text "{topk}".
        tb_gcn_attention.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
        tb_gcn_attention.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)




epoch 0:
training loss = 0.7947 | Eval: HR@10 = 0.0245, NDCG@10 = 0.0083 




epoch 1:
training loss = 0.7884 | Eval: HR@10 = 0.0245, NDCG@10 = 0.0084 




epoch 2:
training loss = 0.7793 | Eval: HR@10 = 0.0272, NDCG@10 = 0.0092 




epoch 3:
training loss = 0.7711 | Eval: HR@10 = 0.0272, NDCG@10 = 0.0093 




epoch 4:
training loss = 0.7606 | Eval: HR@10 = 0.0272, NDCG@10 = 0.0094 




epoch 5:
training loss = 0.7515 | Eval: HR@10 = 0.0272, NDCG@10 = 0.0096 




epoch 6:
training loss = 0.7378 | Eval: HR@10 = 0.0272, NDCG@10 = 0.0096 




epoch 7:
training loss = 0.7276 | Eval: HR@10 = 0.0299, NDCG@10 = 0.0106 




epoch 8:
training loss = 0.7127 | Eval: HR@10 = 0.0326, NDCG@10 = 0.0118 




epoch 9:
training loss = 0.6987 | Eval: HR@10 = 0.0353, NDCG@10 = 0.0129 




epoch 10:
training loss = 0.6764 | Eval: HR@10 = 0.0462, NDCG@10 = 0.0174 




epoch 11:
training loss = 0.6562 | Eval: HR@10 = 0.0679, NDCG@10 = 0.0299 




epoch 12:
training loss = 0.6315 | Eval: HR@10 = 0.0707, NDC