In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

C:\Users\Milosz\thesis-recsys


In [2]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.tensorboard import SummaryWriter
import torch_geometric
import torch_geometric.transforms as T
from torch_geometric.sampler import NegativeSampling
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_scipy_sparse_matrix

from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

from models.gnn.sage import GraphSAGE
from scripts.train_graph import train_epoch, test

torch.set_printoptions(precision=2, sci_mode=False)
torch.manual_seed(0)

<torch._C.Generator at 0x16302d6f810>

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
with open(os.path.join("data/steam", 'graph.pkl'), "rb") as f:
    graph = pd.read_pickle(f)

train_data = graph['train_data']
valid_data = graph['valid_data']

user_shape = train_data['user'].x.shape
app_shape = train_data['app'].x.shape

In [None]:
next(iter(train_loader))[('user', 'recommends', 'app')]['edge_label'].to(torch.float).mean()

In [None]:
arr = mp_matrix.sum(axis=1).getA()
unique_values, counts = np.unique(arr, return_counts=True)
normalized_counts = counts / len(arr)

# Create a dictionary with normalized value counts
value_counts_normalized = {value: count for value, count in zip(unique_values, normalized_counts)}

# Print the result
print(value_counts_normalized)

In [None]:
# real_cols = ['positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount']

# scaler = StandardScaler()
# app_features_norm = scaler.fit_transform(app_features[real_cols].numpy())

In [None]:
# Dataloader:
#  - user: x->attributes of sampled nodes, n_id->mapping of sampled nodes to ids from whole graph
#  - app: x->attributes of sampled nodes, n_id->mapping of sampled nodes to ids from whole graph
#  - (user recommends app): 
#      edge_index -> sampled edges with batch ids with neighbors
#      edge_label -> labels of edges which will be evaluated, size of batch size
#      e_id -> mapping of sampled edges to ids from whole graph, refers to ?????
#      input_id -> mapping of sampled edges to ids from whole graph, refers to edge_label_index
#      edge_label_index -> edge index, ids of nodes in sampled graph which will be evaluated


# To validate nodes first get sampled nodes ids from edge_label_index, then map them to whole graph
# using n_ids of user and app and then check if such edge exists in dataframe

In [None]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = nn.SAGEConv((hidden_channels, hidden_channels), hidden_channels, normalize=True)
        self.conv2 = nn.SAGEConv((hidden_channels, hidden_channels), out_channels, normalize=False)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x
    
    
class Classifier(torch.nn.Module):
    def forward(self, x_user, x_app, edge_label_index):
        x_user = x_user[edge_label_index[0]]
        x_app = x_app[edge_label_index[1]]
        return (x_user * x_app).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, entities, hidden_channels, out_channels, metadata):
        super().__init__()
        
        self.user_emb = torch.nn.Embedding(entities[0].x.shape[0], hidden_channels)
        self.app_emb = torch.nn.Embedding(entities[1].x.shape[0], hidden_channels)
        self.app_lin = torch.nn.Linear(entities[1].x.shape[1], hidden_channels)
        
        self.gnn = GNN(hidden_channels=hidden_channels, out_channels=out_channels)
        self.gnn = nn.to_hetero(self.gnn, metadata=metadata, aggr='sum')
        
        self.clf = Classifier()
        
    def forward(self, batch):  
        x_dict = {
          "user": self.user_emb(batch['user'].n_id),
          "app": self.app_emb(batch['app'].n_id) + self.app_lin(batch['app'].x),
        } 
        
        x_dict = self.gnn(x_dict, batch.edge_index_dict)
        pred = self.clf(
            x_dict["user"],
            x_dict["app"],
            batch['user', 'recommends', 'app'].edge_label_index,
        )
        return pred
    
    def evaluate(self, batch):
        x_dict = {
          "user": self.user_emb(batch['user'].n_id),
          "app": self.app_emb(batch['app'].n_id) + self.app_lin(batch['app'].x),
        } 

        x_dict = self.gnn(x_dict, batch.edge_index_dict)

        return x_dict

def xavier_init(m):
    if isinstance(m, torch.nn.Linear) or isinstance(m, torch_geometric.nn.dense.linear.Linear):
        torch.nn.init.xavier_normal_(m.weight, gain=1.41)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)
    
model = Model(entities=(train_data['user'], train_data['app']), 
              hidden_channels=32, out_channels=32, metadata=train_data.metadata())
model.apply(xavier_init)
model = model.to(device)

In [5]:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(('user', 'recommends', 'app'), train_data['user', 'recommends', 'app'].edge_label_index),
    edge_label=train_data['user', 'recommends', 'app'].edge_label,
    batch_size=1024,
    shuffle=True,
    drop_last=True
)
valid_loader = LinkNeighborLoader(
    data=valid_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(('user', 'recommends', 'app'), valid_data['user', 'recommends', 'app'].edge_label_index),
    edge_label=valid_data['user', 'recommends', 'app'].edge_label,
    batch_size=1024,
    shuffle=True,
    drop_last=True
)

In [6]:
model = GraphSAGE(
    entities_shapes={"user": user_shape, "app": app_shape},
    hidden_channels=32,
    out_channels=32,
    metadata=train_data.metadata()
).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=1e-4, momentum=0.9)

In [7]:
n_epochs = 10
for epoch in tqdm(range(n_epochs)):
    train_loss, train_roc_auc = train_epoch(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=valid_loader,
        device=device
    )
    test_loss, test_roc_auc = test(
        model=model,
        criterion=criterion,
        val_loader=valid_loader,
        device=device
    )
    print(f"""Epoch <{epoch}>\ntrain_loss: {train_loss} - train_roc_auc: {train_roc_auc}
test_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n""")

  0%|                                                                                                                                                                                       | 0/10 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:02<21:13,  2.38s/it][A
  0%|▋                                                                                                                                                                             | 2/537 [00:02<09:41,  1.09s/it][A
  1%|▉                                                                                                                                         

Epoch <0>
train_loss: 0.4491476302262347 - train_roc_auc: 0.8478392530311663
test_loss: 0.36380964849514985 - test_roc_auc: 0.9089817166280234




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<02:04,  4.32it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<01:55,  4.63it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<01:54,  4.67it/s][A
  1%|█▎                                                                                                                                    

Epoch <1>
train_loss: 0.3601565227273234 - train_roc_auc: 0.9088201024153
test_loss: 0.3371437845213153 - test_roc_auc: 0.9325937432625452




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<04:15,  2.10it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<02:46,  3.22it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<02:21,  3.77it/s][A
  1%|█▎                                                                                                                                    

Epoch <2>
train_loss: 0.321069308554882 - train_roc_auc: 0.9283290979509333
test_loss: 0.29611034585401347 - test_roc_auc: 0.9401597950832374




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<04:28,  2.00it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<02:45,  3.22it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<02:22,  3.74it/s][A
  1%|█▎                                                                                                                                    

Epoch <3>
train_loss: 0.30043698777921596 - train_roc_auc: 0.9373530886152405
test_loss: 0.279517573344199 - test_roc_auc: 0.9460874201862057




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<04:56,  1.81it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<02:59,  2.98it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<02:27,  3.62it/s][A
  1%|█▎                                                                                                                                    

Epoch <4>
train_loss: 0.287307278903503 - train_roc_auc: 0.9425962293389337
test_loss: 0.28450332356007746 - test_roc_auc: 0.9489820295970038




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<03:25,  2.61it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<02:23,  3.74it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<02:09,  4.13it/s][A
  1%|█▎                                                                                                                                    

Epoch <5>
train_loss: 0.27435687525645314 - train_roc_auc: 0.9475462972199894
test_loss: 0.26212141285010426 - test_roc_auc: 0.9525156925637159




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<04:28,  2.00it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<02:53,  3.09it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<02:26,  3.64it/s][A
  1%|█▎                                                                                                                                    

Epoch <6>
train_loss: 0.2710418008004265 - train_roc_auc: 0.948764814635892
test_loss: 0.27023594664030165 - test_roc_auc: 0.9513456812444926




  0%|                                                                                                                                                                                      | 0/537 [00:00<?, ?it/s][A
  0%|▎                                                                                                                                                                             | 1/537 [00:00<04:42,  1.90it/s][A
  0%|▋                                                                                                                                                                             | 2/537 [00:00<02:54,  3.07it/s][A
  1%|▉                                                                                                                                                                             | 3/537 [00:00<02:30,  3.55it/s][A
  1%|█▎                                                                                                                                    

In [20]:
# from datetime import datetime
# torch.save(model.state_dict(), f"models/{model.__class__.__name__}/{datetime.now().strftime('%Y%m%d%H%M%S')}.pth")

In [None]:
def train(n_epochs=5, print_loss=500):
    model.train()
    
    for epoch in range(n_epochs):
        running_loss = 0.
        preds, ground_truths = [], []
        print(f"Metrics: {evaluate_nn(model, mp_matrix, val_matrix, k=10)}")
        
        for i_batch, batch in enumerate(tqdm(train_loader)):
            batch = batch.to(device)
            
            y_pred = model(batch)
            y_true = batch['user', 'recommends', 'app'].edge_label
            loss = criterion(y_pred, y_true.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            preds.append(y_pred)
            ground_truths.append(y_true)
            running_loss += loss.item()
            
            if not ((i_batch+1) % print_loss):
                pred = torch.cat(preds, dim=0).detach().sigmoid().cpu().numpy()
                ground_truth = torch.cat(ground_truths, dim=0).detach().cpu().numpy()
                last_loss = running_loss / print_loss
                #writer.add_scalar("Loss/train", last_loss, epoch*len(train_loader) + i_batch + 1)
                train_roc_auc = roc_auc_score(ground_truth, pred)
                test_loss, test_roc_auc = test()
                
                
                preds, ground_truths = [], []
                running_loss = 0.
                
                print(f"batch <{i_batch}>\ntrain_loss: {last_loss} - train_roc_auc: {train_roc_auc}\ntest_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n")
        print(f"Epoch: {epoch}, Loss: {running_loss / len(train_loader):.4f}")

In [None]:
@torch.no_grad()
def test():
    model.eval()
    running_loss = 0.
    preds, ground_truths = [], []

    for i_batch, batch in enumerate(val_loader):
        batch = batch.to(device)
        y_pred = model(batch)
        y_true = batch['user', 'recommends', 'app'].edge_label
        loss = criterion(y_pred, y_true.float())
        
        preds.append(y_pred)
        ground_truths.append(y_true)
        running_loss += loss.item()
        
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    
    test_loss = running_loss / len(val_loader)
    test_score = roc_auc_score(ground_truth, pred)

    return test_loss, test_score

In [None]:
def precision_k(reco_relevance, relevance, k=10):
    v = np.asarray(relevance.sum(axis=1).flatten(), dtype=int)[0].clip(1, k)
    bool_2d = np.vstack([np.concatenate((np.ones(i), np.zeros(k - i))) for i in v]).astype(bool)
    
    prec_k = (reco_relevance.getA().sum(axis=1, where=bool_2d) / v).mean()
    return prec_k

# def mean_average_prec(reco_relevance):
#     K = reco_relevance.shape[1]
    
#     mean_ap = 0.0
#     for k in range(1, K+1):
#         mean_ap += prec_k(reco_relevance[:, :k]) # DODAC MNOŻNIK 1/0 GDY ITEM JEST RELEWANTNY!!!
#     return mean_ap / K

def recall_k(reco_relevance, relevance, k=10):
    sum_relevant = relevance.sum(axis=1)
    return (reco_relevance.sum(axis=1) / sum_relevant).mean()

def ndcg_k(reco_relevance, relevance, k=10):
    v = np.asarray(relevance.sum(axis=1).flatten(), dtype=int)[0].clip(1, k)
    ideal_relevance = np.vstack([np.concatenate((np.ones(i), np.zeros(k - i))) for i in v])
    
    discount = 1 / np.log2(np.arange(2, k+2))
    idcg = (ideal_relevance * discount).sum(axis=1)
    dcg = (reco_relevance * discount).sum(axis=1)
    ndcg = (dcg / idcg).mean()
    
    return ndcg



@torch.no_grad()
def generate_embeddings(model, data):
    model.eval()
    data = data.to(device)
    x_dict = model.evaluate(data)
    return x_dict

@torch.no_grad()
def recommend_k(user_emb, item_emb, past_interactions=None, k=10, user_batch_size=1000):
    def remove_past_interactions(prob, user_batch):
        id_x = np.repeat(np.arange(user_batch.shape[0]), np.diff(past_interactions[user_batch].indptr))
        id_y = past_interactions[user_batch].indices
        prob[id_x, id_y] = -torch.inf
        return prob
    
    recommended_batches = []
    user_batches = torch.arange(user_emb.shape[0]).split(user_batch_size)
    for user_batch in user_batches:
        prob = (user_emb[user_batch] @ item_emb.T).sigmoid()
        prob = remove_past_interactions(prob, user_batch)
        recommended_batches.append(prob.topk(k, 1)[1])
    
    recommendations = torch.cat(recommended_batches, 0)
    return recommendations

def recommendation_relevance(recommendations, ground_truth):
    """
    Computes the relevance matrix of recommended items based on ground truth data.

    This function takes a matrix of recommended items and a ground truth sparse matrix, and calculates
    binary relevance of recommended items for each user. The relevance is determined by
    comparing the recommended items with the actual items in the ground truth.

    Args:
        recommendations (numpy.ndarray): A 2D matrix of shape (n_users, k) where k is the number of 
            recommended items per user. Each row contains indices representing the recommended 
            items for a user.
        ground_truth (scipy.csr_matrix): A sparse matrix of shape (n_users, n_items). The matrix 
            contains binary values indicating whether an item is relevant (1) or not (0) for each user.

    Returns:
        numpy.matrix: A 2D matrix of shape (n_users, k) containing the relevance scores of the
        recommended items for each user.
        
    Raises:
        ValueError: If the dimensions of 'recommendations' and 'ground_truth' do not match or
            are incompatible for matrix operations.
    """
    n_users, n_items = ground_truth.shape
    k = recommendations.shape[1]
    
    if recommendations.shape[0] != n_users:
        raise ValueError("Number of users in 'recommendations' should match 'ground_truth'.")
    
    user_idx = np.repeat(np.arange(n_users), k)
    item_idx = recommendations.flatten()
    relevance = ground_truth[user_idx, item_idx].reshape((n_users, k))  # get values under arrays of indices 
                                                                        # (user_idx and item_idx) from ground truth
    relevance_mask = np.asarray((ground_truth.sum(axis=1) != 0)).ravel()
    
    return relevance, relevance_mask

def evaluate_nn(model, mp_matrix, val_matrix, k):
    x_emb = generate_embeddings(model, val_data)
    recommendations = recommend_k(x_emb['user'], x_emb['app'], past_interactions=mp_matrix, 
                                  k=10, user_batch_size=10000).cpu().numpy()
    reco_relevance, relevance_mask = recommendation_relevance(recommendations, val_matrix)
    
    prec_k = precision_k(reco_relevance[relevance_mask], val_matrix[relevance_mask], k)
    rec_k = recall_k(reco_relevance[relevance_mask], val_matrix[relevance_mask], k)
    n_k = ndcg_k(reco_relevance[relevance_mask].getA(), val_matrix[relevance_mask], k)

    return {f"precision@{k}": prec_k, f"recall@{k}": rec_k, f"ndcg@{k}": n_k}

In [None]:
train(n_epochs=5, print_loss=30)

In [None]:
evaluate_nn(model, mp_matrix, val_matrix, k=10)

In [None]:
p2 = retrieval.RetrievalPrecision(top_k=5)
r2 = retrieval.RetrievalRecall(top_k=2)
ndcg = retrieval.RetrievalNormalizedDCG(top_k=7)

In [None]:
preds = torch.tensor([0.7, 0.8, 0.1, 0.2, 0.4, 0.6, 0.5, 0.9, 0.3, 0.15])
targets = torch.tensor([True, True, False, False, False, False, False, False, False, False])
indices = torch.tensor([0,0,0,0,0,0,0,0,0,0])

mask = torch.ones(preds.shape, dtype=torch.bool)
mask[[4, 5, 7]] = False
print(preds[mask])
print(targets[mask])

In [None]:
preds = torch.tensor([1.0, 0.85, 0.8, 0.7, 0.65])
targets = torch.tensor([False, False, False, False, False])
indices = torch.tensor([0,0,0,0,0])

In [None]:
p2(preds, targets, indexes=indices)

In [None]:
r2(preds, targets, indexes=indices)

In [None]:
ndcg(preds, targets, indexes=indices)

In [None]:
def dcg(rel):
    g = 0.
    for i in range(1,6):
        g+= (2**rel[i-1] - 1)/np.log2(i+1)
    return g

In [None]:
rel = [1,0,1,0,1]
rel_idcg = [1,1,1,0,0]
print(dcg(rel))
print(dcg(rel_idcg))
print(dcg(rel)/dcg(rel_idcg))

In [None]:
torch.tensor([0.9, 0.7, 0.6])

In [None]:
(preds[i][:3] / torch.log2(torch.arange(3)+2)).sum() / (torch.tensor([0.9, 0.7, 0.6]) / torch.log2(torch.arange(3)+2)).sum()

In [None]:
torch.log2(torch.arange(3)+2)

In [None]:
cm = confusion_matrix(y_true.detach().cpu().numpy(), y_pred.detach().cpu().numpy().round())
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[False, True])
cm_display.plot()
plt.show()

In [None]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    
def load_model(path):
    model = Model(hidden_channels=32, out_channels=32, metadata=train_data.metadata())
    model.load_state_dict(torch.load(path))
    model = model.to(device)
    return model

In [None]:
#save_model(model, "models/gnn_03.pth")

In [None]:
model = load_model("models/gnn_03.pth")

In [None]:
print(nn.summary(model, next(iter(train_loader)).to(device)))

In [None]:
model