In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

C:\Users\Milosz\Desktop\python\thesis-recsys


In [21]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import torch_geometric
import torch_geometric.transforms as T
from torch_geometric import nn
from torch_geometric.sampler import NegativeSampling
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_scipy_sparse_matrix

from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

torch.set_printoptions(precision=2, sci_mode=False)
torch.manual_seed(0)

<torch._C.Generator at 0x1c31fbfd5b0>

In [96]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [97]:
with open("data/steam/graph.pkl", "rb") as f:
    graph = pickle.load(f)
    
train_data = graph['train_data']
train_loader = graph['train_loader']
val_data = graph['valid_data']
val_loader = graph['valid_loader']
mp_matrix = graph['mp_matrix']
val_matrix = graph['val_matrix']

n_users, n_items = train_data['user'].x.shape[0], train_data['app'].x.shape[0] 

In [98]:
arr = mp_matrix.sum(axis=1).getA()
unique_values, counts = np.unique(arr, return_counts=True)
normalized_counts = counts / len(arr)

# Create a dictionary with normalized value counts
value_counts_normalized = {value: count for value, count in zip(unique_values, normalized_counts)}

# Print the result
print(value_counts_normalized)

{1.0: 0.6645077918728179, 2.0: 0.1796214914887921, 3.0: 0.070225820999041, 4.0: 0.03321006377821784, 5.0: 0.017800771573286255, 6.0: 0.010632202929448099, 7.0: 0.006738467568455037, 8.0: 0.004447421203298246, 9.0: 0.0030941843095606022, 10.0: 0.0021632225429049465, 11.0: 0.001598449940506489, 12.0: 0.0011729140016323624, 13.0: 0.0009277009548635171, 14.0: 0.0007020527788475053, 15.0: 0.0005295558350433574, 16.0: 0.0004369487801466126, 17.0: 0.0003619501089274179, 18.0: 0.0002849949506329399, 19.0: 0.0002249960136575841, 20.0: 0.00018097505446370895, 21.0: 0.0001633666707861589, 22.0: 0.00014478004357096715, 23.0: 0.00010923719503665315, 24.0: 8.967232428381976e-05, 25.0: 8.15202948034725e-05, 26.0: 6.293366758828077e-05, 27.0: 5.9346774616927985e-05, 28.0: 5.282515103265018e-05, 29.0: 3.652109207195568e-05, 30.0: 3.652109207195568e-05, 31.0: 2.6738656695538982e-05, 32.0: 2.5434331978683422e-05, 33.0: 2.510825079946953e-05, 34.0: 2.2825682544972302e-05, 35.0: 2.1521357828116743e-05, 36.

In [None]:
# real_cols = ['positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount']

# scaler = StandardScaler()
# app_features_norm = scaler.fit_transform(app_features[real_cols].numpy())

In [None]:
# Dataloader:
#  - user: x->attributes of sampled nodes, n_id->mapping of sampled nodes to ids from whole graph
#  - app: x->attributes of sampled nodes, n_id->mapping of sampled nodes to ids from whole graph
#  - (user recommends app): 
#      edge_index -> sampled edges with batch ids with neighbors
#      edge_label -> labels of edges which will be evaluated, size of batch size
#      e_id -> mapping of sampled edges to ids from whole graph, refers to ?????
#      input_id -> mapping of sampled edges to ids from whole graph, refers to edge_label_index
#      edge_label_index -> edge index, ids of nodes in sampled graph which will be evaluated


# To validate nodes first get sampled nodes ids from edge_label_index, then map them to whole graph
# using n_ids of user and app and then check if such edge exists in dataframe

In [112]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = nn.SAGEConv((hidden_channels, hidden_channels), hidden_channels, normalize=True)
        self.conv2 = nn.SAGEConv((hidden_channels, hidden_channels), out_channels, normalize=False)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x
    
    
class Classifier(torch.nn.Module):
    def forward(self, x_user, x_app, edge_label_index):
        x_user = x_user[edge_label_index[0]]
        x_app = x_app[edge_label_index[1]]
        return (x_user * x_app).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, entities, hidden_channels, out_channels, metadata):
        super().__init__()
        
        self.user_emb = torch.nn.Embedding(entities[0].x.shape[0], hidden_channels)
        self.app_emb = torch.nn.Embedding(entities[1].x.shape[0], hidden_channels)
        self.app_lin = torch.nn.Linear(entities[1].x.shape[1], hidden_channels)
        
        self.gnn = GNN(hidden_channels=hidden_channels, out_channels=out_channels)
        self.gnn = nn.to_hetero(self.gnn, metadata=metadata, aggr='sum')
        
        self.clf = Classifier()
        
    def forward(self, batch):  
        x_dict = {
          "user": self.user_emb(batch['user'].n_id),
          "app": self.app_emb(batch['app'].n_id) + self.app_lin(batch['app'].x),
        } 
        
        x_dict = self.gnn(x_dict, batch.edge_index_dict)
        pred = self.clf(
            x_dict["user"],
            x_dict["app"],
            batch['user', 'recommends', 'app'].edge_label_index,
        )
        return pred
    
    def evaluate(self, batch):
        x_dict = {
          "user": self.user_emb(batch['user'].n_id),
          "app": self.app_emb(batch['app'].n_id) + self.app_lin(batch['app'].x),
        } 

        x_dict = self.gnn(x_dict, batch.edge_index_dict)

        return x_dict

def xavier_init(m):
    if isinstance(m, torch.nn.Linear) or isinstance(m, torch_geometric.nn.dense.linear.Linear):
        torch.nn.init.xavier_normal_(m.weight, gain=1.41)
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)
    
model = Model(entities=(train_data['user'], train_data['app']), 
              hidden_channels=32, out_channels=32, metadata=train_data.metadata())
model.apply(xavier_init)
model = model.to(device)

In [113]:
criterion = torch.nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
#optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-1, momentum=0.9)
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=0.001, momentum=0.9)
#writer = SummaryWriter()

In [114]:
def train(n_epochs=5, print_loss=500):
    model.train()
    
    for epoch in range(n_epochs):
        running_loss = 0.
        preds, ground_truths = [], []
        print(f"Metrics: {evaluate_nn(model, mp_matrix, val_matrix, k=10)}")
        
        for i_batch, batch in enumerate(tqdm(train_loader)):
            batch = batch.to(device)
            
            y_pred = model(batch)
            y_true = batch['user', 'recommends', 'app'].edge_label
            loss = criterion(y_pred, y_true.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            preds.append(y_pred)
            ground_truths.append(y_true)
            running_loss += loss.item()
            
            if not ((i_batch+1) % print_loss):
                pred = torch.cat(preds, dim=0).detach().sigmoid().cpu().numpy()
                ground_truth = torch.cat(ground_truths, dim=0).detach().cpu().numpy()
                last_loss = running_loss / print_loss
                #writer.add_scalar("Loss/train", last_loss, epoch*len(train_loader) + i_batch + 1)
                train_roc_auc = roc_auc_score(ground_truth, pred)
                test_loss, test_roc_auc = test()
                
                
                preds, ground_truths = [], []
                running_loss = 0.
                
                print(f"batch <{i_batch}>\ntrain_loss: {last_loss} - train_roc_auc: {train_roc_auc}\ntest_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n")
        print(f"Epoch: {epoch}, Loss: {running_loss / len(train_loader):.4f}")

In [115]:
@torch.no_grad()
def test():
    model.eval()
    running_loss = 0.
    preds, ground_truths = [], []

    for i_batch, batch in enumerate(tqdm(val_loader)):
        batch = batch.to(device)
        y_pred = model(batch)
        y_true = batch['user', 'recommends', 'app'].edge_label
        loss = criterion(y_pred, y_true.float())
        
        preds.append(y_pred)
        ground_truths.append(y_true)
        running_loss += loss.item()
        
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    
    test_loss = running_loss / len(val_loader)
    test_score = roc_auc_score(ground_truth, pred)

    return test_loss, test_score

In [116]:
def precision_k(reco_relevance, relevance, k=10):
    v = np.asarray(relevance.sum(axis=1).flatten(), dtype=int)[0].clip(1, k)
    bool_2d = np.vstack([np.concatenate((np.ones(i), np.zeros(k - i))) for i in v]).astype(bool)
    
    prec_k = (reco_relevance.getA().sum(axis=1, where=bool_2d) / v).mean()
    return prec_k

# def mean_average_prec(reco_relevance):
#     K = reco_relevance.shape[1]
    
#     mean_ap = 0.0
#     for k in range(1, K+1):
#         mean_ap += prec_k(reco_relevance[:, :k]) # DODAC MNOŻNIK 1/0 GDY ITEM JEST RELEWANTNY!!!
#     return mean_ap / K

def recall_k(reco_relevance, relevance, k=10):
    sum_relevant = relevance.sum(axis=1)
    return (reco_relevance.sum(axis=1) / sum_relevant).mean()

def ndcg_k(reco_relevance, relevance, k=10):
    v = np.asarray(relevance.sum(axis=1).flatten(), dtype=int)[0].clip(1, k)
    ideal_relevance = np.vstack([np.concatenate((np.ones(i), np.zeros(k - i))) for i in v])
    
    discount = 1 / np.log2(np.arange(2, k+2))
    idcg = (ideal_relevance * discount).sum(axis=1)
    dcg = (reco_relevance * discount).sum(axis=1)
    ndcg = (dcg / idcg).mean()
    
    return ndcg



@torch.no_grad()
def generate_embeddings(model, data):
    model.eval()
    data = data.to(device)
    x_dict = model.evaluate(data)
    return x_dict

@torch.no_grad()
def recommend_k(user_emb, item_emb, past_interactions=None, k=10, user_batch_size=1000):
    def remove_past_interactions(prob, user_batch):
        id_x = np.repeat(np.arange(user_batch.shape[0]), np.diff(past_interactions[user_batch].indptr))
        id_y = past_interactions[user_batch].indices
        prob[id_x, id_y] = -torch.inf
        return prob
    
    recommended_batches = []
    user_batches = torch.arange(user_emb.shape[0]).split(user_batch_size)
    for user_batch in user_batches:
        prob = (user_emb[user_batch] @ item_emb.T).sigmoid()
        prob = remove_past_interactions(prob, user_batch)
        recommended_batches.append(prob.topk(k, 1)[1])
    
    recommendations = torch.cat(recommended_batches, 0)
    return recommendations

def recommendation_relevance(recommendations, ground_truth):
    """
    Computes the relevance matrix of recommended items based on ground truth data.

    This function takes a matrix of recommended items and a ground truth sparse matrix, and calculates
    binary relevance of recommended items for each user. The relevance is determined by
    comparing the recommended items with the actual items in the ground truth.

    Args:
        recommendations (numpy.ndarray): A 2D matrix of shape (n_users, k) where k is the number of 
            recommended items per user. Each row contains indices representing the recommended 
            items for a user.
        ground_truth (scipy.csr_matrix): A sparse matrix of shape (n_users, n_items). The matrix 
            contains binary values indicating whether an item is relevant (1) or not (0) for each user.

    Returns:
        numpy.matrix: A 2D matrix of shape (n_users, k) containing the relevance scores of the
        recommended items for each user.
        
    Raises:
        ValueError: If the dimensions of 'recommendations' and 'ground_truth' do not match or
            are incompatible for matrix operations.
    """
    n_users, n_items = ground_truth.shape
    k = recommendations.shape[1]
    
    if recommendations.shape[0] != n_users:
        raise ValueError("Number of users in 'recommendations' should match 'ground_truth'.")
    
    user_idx = np.repeat(np.arange(n_users), k)
    item_idx = recommendations.flatten()
    relevance = ground_truth[user_idx, item_idx].reshape((n_users, k))  # get values under arrays of indices 
                                                                        # (user_idx and item_idx) from ground truth
    relevance_mask = np.asarray((ground_truth.sum(axis=1) != 0)).ravel()
    
    return relevance, relevance_mask

def evaluate_nn(model, mp_matrix, val_matrix, k):
    x_emb = generate_embeddings(model, val_data)
    recommendations = recommend_k(x_emb['user'], x_emb['app'], past_interactions=mp_matrix, 
                                  k=10, user_batch_size=10000).cpu().numpy()
    reco_relevance, relevance_mask = recommendation_relevance(recommendations, val_matrix)
    
    prec_k = precision_k(reco_relevance[relevance_mask], val_matrix[relevance_mask], k)
    rec_k = recall_k(reco_relevance[relevance_mask], val_matrix[relevance_mask], k)
    n_k = ndcg_k(reco_relevance[relevance_mask].getA(), val_matrix[relevance_mask], k)

    return {f"precision@{k}": prec_k, f"recall@{k}": rec_k, f"ndcg@{k}": n_k}

In [117]:
train(n_epochs=5, print_loss=100)

Metrics: {'precision@10': 8.782875193861385e-07, 'recall@10': 5.5057648e-05, 'ndcg@10': 2.0807841897096314e-05}


 18%|██████████████▉                                                                  | 99/538 [01:10<05:09,  1.42it/s]
  0%|                                                                                          | 0/423 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/423 [00:00<01:11,  5.88it/s][A
  0%|▍                                                                                 | 2/423 [00:00<01:14,  5.65it/s][A
  1%|▌                                                                                 | 3/423 [00:00<01:15,  5.55it/s][A
  1%|▊                                                                                 | 4/423 [00:00<01:16,  5.51it/s][A
  1%|▉                                                                                 | 5/423 [00:00<01:14,  5.63it/s][A
  1%|█▏                                                                                | 6/423 [00:01<01:14,  5.57it/s][A
  2%|█▎            

batch <99>
train_loss: 0.5226540392637253 - train_roc_auc: 0.7755805668115615
test_loss: 0.4642387435509522 - test_roc_auc: 0.8393375689479461



 37%|█████████████████████████████▌                                                  | 199/538 [03:47<03:57,  1.43it/s]
  0%|                                                                                          | 0/423 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/423 [00:00<01:19,  5.32it/s][A
  0%|▍                                                                                 | 2/423 [00:00<01:12,  5.82it/s][A
  1%|▌                                                                                 | 3/423 [00:00<01:02,  6.68it/s][A
  1%|▊                                                                                 | 4/423 [00:00<01:00,  6.91it/s][A
  1%|▉                                                                                 | 5/423 [00:00<00:59,  7.07it/s][A
  1%|█▏                                                                                | 6/423 [00:00<01:00,  6.93it/s][A
  2%|█▎            

batch <199>
train_loss: 0.4545658805966377 - train_roc_auc: 0.8412296646356583
test_loss: 0.4436995692850568 - test_roc_auc: 0.855152691186845



 56%|████████████████████████████████████████████▍                                   | 299/538 [06:14<02:50,  1.40it/s]
  0%|                                                                                          | 0/423 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/423 [00:00<01:29,  4.69it/s][A
  0%|▍                                                                                 | 2/423 [00:00<01:18,  5.34it/s][A
  1%|▌                                                                                 | 3/423 [00:00<01:14,  5.63it/s][A
  1%|▊                                                                                 | 4/423 [00:00<01:12,  5.80it/s][A
  1%|▉                                                                                 | 5/423 [00:00<01:13,  5.70it/s][A
  1%|█▏                                                                                | 6/423 [00:01<01:12,  5.77it/s][A
  2%|█▎            

batch <299>
train_loss: 0.44561696231365205 - train_roc_auc: 0.8486367516517639
test_loss: 0.4351776857325371 - test_roc_auc: 0.8588360267188204



 74%|███████████████████████████████████████████████████████████▎                    | 399/538 [08:52<01:40,  1.38it/s]
  0%|                                                                                          | 0/423 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/423 [00:00<01:11,  5.93it/s][A
  0%|▍                                                                                 | 2/423 [00:00<01:03,  6.64it/s][A
  1%|▌                                                                                 | 3/423 [00:00<01:03,  6.64it/s][A
  1%|▊                                                                                 | 4/423 [00:00<00:59,  7.05it/s][A
  1%|▉                                                                                 | 5/423 [00:00<01:02,  6.64it/s][A
  1%|█▏                                                                                | 6/423 [00:00<01:02,  6.66it/s][A
  2%|█▎            

batch <399>
train_loss: 0.43658892512321473 - train_roc_auc: 0.8556494982004166
test_loss: 0.4312614073567357 - test_roc_auc: 0.8617084037328624



 93%|██████████████████████████████████████████████████████████████████████████▏     | 499/538 [11:21<00:28,  1.38it/s]
  0%|                                                                                          | 0/423 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/423 [00:00<01:30,  4.65it/s][A
  0%|▍                                                                                 | 2/423 [00:00<01:16,  5.47it/s][A
  1%|▌                                                                                 | 3/423 [00:00<01:16,  5.47it/s][A
  1%|▊                                                                                 | 4/423 [00:00<01:13,  5.74it/s][A
  1%|▉                                                                                 | 5/423 [00:00<01:27,  4.78it/s][A
  1%|█▏                                                                                | 6/423 [00:01<01:26,  4.81it/s][A
  2%|█▎            

batch <499>
train_loss: 0.4302865853905678 - train_roc_auc: 0.860192322421074
test_loss: 0.4234188738021445 - test_roc_auc: 0.8714076123837073



100%|████████████████████████████████████████████████████████████████████████████████| 538/538 [13:22<00:00,  1.49s/it]


Epoch: 0, Loss: 0.0299
Metrics: {'precision@10': 0.003407697937599757, 'recall@10': 0.046660796, 'ndcg@10': 0.018845766975732}


 18%|██████████████▉                                                                  | 99/538 [01:13<05:19,  1.37it/s]
  0%|                                                                                          | 0/423 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/423 [00:00<01:28,  4.76it/s][A
  0%|▍                                                                                 | 2/423 [00:00<01:25,  4.95it/s][A
  1%|▌                                                                                 | 3/423 [00:00<01:32,  4.56it/s][A
  1%|▊                                                                                 | 4/423 [00:00<01:24,  4.95it/s][A
  1%|▉                                                                                 | 5/423 [00:00<01:19,  5.23it/s][A
  1%|█▏                                                                                | 6/423 [00:01<01:20,  5.19it/s][A
  2%|█▎            

batch <99>
train_loss: 0.42036893039941786 - train_roc_auc: 0.8680521760225296
test_loss: 0.41401761727975617 - test_roc_auc: 0.8738778201530017



 29%|███████████████████████▏                                                        | 156/538 [03:25<08:23,  1.32s/it]


KeyboardInterrupt: 

In [118]:
evaluate_nn(model, mp_matrix, val_matrix, k=10)

{'precision@10': 0.017306350913520577,
 'recall@10': 0.10469692,
 'ndcg@10': 0.05372430134708632}

In [None]:
p2 = retrieval.RetrievalPrecision(top_k=5)
r2 = retrieval.RetrievalRecall(top_k=2)
ndcg = retrieval.RetrievalNormalizedDCG(top_k=7)

In [None]:
preds = torch.tensor([0.7, 0.8, 0.1, 0.2, 0.4, 0.6, 0.5, 0.9, 0.3, 0.15])
targets = torch.tensor([True, True, False, False, False, False, False, False, False, False])
indices = torch.tensor([0,0,0,0,0,0,0,0,0,0])

mask = torch.ones(preds.shape, dtype=torch.bool)
mask[[4, 5, 7]] = False
print(preds[mask])
print(targets[mask])

In [None]:
preds = torch.tensor([1.0, 0.85, 0.8, 0.7, 0.65])
targets = torch.tensor([False, False, False, False, False])
indices = torch.tensor([0,0,0,0,0])

In [None]:
p2(preds, targets, indexes=indices)

In [None]:
r2(preds, targets, indexes=indices)

In [None]:
ndcg(preds, targets, indexes=indices)

In [None]:
def dcg(rel):
    g = 0.
    for i in range(1,6):
        g+= (2**rel[i-1] - 1)/np.log2(i+1)
    return g

In [None]:
rel = [1,0,1,0,1]
rel_idcg = [1,1,1,0,0]
print(dcg(rel))
print(dcg(rel_idcg))
print(dcg(rel)/dcg(rel_idcg))

In [None]:
torch.tensor([0.9, 0.7, 0.6])

In [None]:
(preds[i][:3] / torch.log2(torch.arange(3)+2)).sum() / (torch.tensor([0.9, 0.7, 0.6]) / torch.log2(torch.arange(3)+2)).sum()

In [None]:
torch.log2(torch.arange(3)+2)

In [None]:
cm = confusion_matrix(y_true.detach().cpu().numpy(), y_pred.detach().cpu().numpy().round())
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[False, True])
cm_display.plot()
plt.show()

In [None]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    
def load_model(path):
    model = Model(hidden_channels=32, out_channels=32, metadata=train_data.metadata())
    model.load_state_dict(torch.load(path))
    model = model.to(device)
    return model

In [None]:
#save_model(model, "models/gnn_03.pth")

In [None]:
model = load_model("models/gnn_03.pth")

In [None]:
print(nn.summary(model, next(iter(train_loader)).to(device)))

In [None]:
model