In [11]:
import torch
torch.manual_seed(500)

from torch import Tensor
import csv
import pandas as pd


def read_movies():
    # read csv with movies for budget and imdb_id
    columns_of_interest = ['budget', 'imdb_id', 'revenue', 'vote_average', 'directors', 'vote_count']
    data = []
    with open('./data/movie_data_tmbd.csv', 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='|')
        for row in reader:
            extracted_row = {col: row[col] for col in columns_of_interest}
            data.append(extracted_row)

    movies_budget_df = pd.DataFrame(data)
    movies_budget_df = movies_budget_df.fillna({
        'budget': 0,
        'imdb_id': '',
        'title': '',
        'director': '',
        'revenue': 0,
        'vote_count': 0,
    })

    # merge movie budget with id
    link_df = pd.read_csv("./data/small/links.csv")
    link_df['imdbId'] = link_df['imdbId'].apply(lambda x: f'tt0{int(x)}')

    movies_id_df = pd.merge(movies_budget_df, link_df, left_on='imdb_id', right_on='imdbId', how='inner')
    movies_id_df['budget'] = pd.to_numeric(movies_id_df['budget'])
    movies_id_df['revenue'] = pd.to_numeric(movies_id_df['revenue'])
    movies_id_df['vote_count'] = pd.to_numeric(movies_id_df['vote_count'])
    movies_id_df = movies_id_df[movies_id_df.budget != 0]
    movies_id_df = movies_id_df[movies_id_df.revenue != 0]

    movies_info_df = pd.read_csv("./data/small/movies.csv")
    movies_df = pd.merge(movies_id_df, movies_info_df, on="movieId", how="inner")

    ratings_df = pd.read_csv("./data/small/ratings.csv")
    #ratings_df = ratings_df.iloc[:ratings_df.shape[0]//10]
    ratings_df = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])]

    return movies_df, ratings_df

movies_df, ratings_df = read_movies()
#print(movies_df.head())
print(ratings_df.head())
print(len(movies_df))
print(len(ratings_df))

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
5       1       70     3.0  964982400
2427
55445


In [12]:
genres = movies_df['genres'].str.get_dummies('|')
movie_feat = torch.from_numpy(genres.values).to(torch.float)
movies_df['vote_average'].fillna(5.0, inplace=True)

vote_average = torch.from_numpy(movies_df['vote_average'].astype(float).values).to(torch.float).unsqueeze(-1)
revenue = torch.from_numpy(movies_df['revenue'].values).to(torch.float).unsqueeze(-1)
budget = torch.from_numpy(movies_df['budget'].values).to(torch.float).unsqueeze(-1)
vote_count = torch.from_numpy(movies_df['vote_count'].values).to(torch.float).unsqueeze(-1)

standardize = lambda x: (x - x.mean()) / x.std()

vote_average = standardize(vote_average)
revenue = standardize(revenue)
budget = standardize(budget)
vote_count = standardize(vote_count)

movie_feat = torch.cat([movie_feat, vote_count, vote_average, revenue, budget], dim=1)

print(movie_feat.shape)

torch.Size([2427, 23])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['vote_average'].fillna(5.0, inplace=True)


In [13]:
import pandas as pd
from ast import literal_eval

# Check if conversion is needed and perform it safely
def safe_literal_eval(s):
    try:
        if isinstance(s, str):
            return literal_eval(s)
        else:
            return s  # assuming s is already the correct format
    except (ValueError, SyntaxError):
        return None

movies_df['directors'] = movies_df['directors'].apply(safe_literal_eval)

# Explode the DataFrame on the 'directors' column if the data is correctly formatted as lists of dictionaries
directors_expanded = movies_df.explode('directors').dropna()

# Normalize the directors' dictionary data into separate DataFrame columns
if not directors_expanded['directors'].isnull().all():  # Check if all values are not None
    directors_expanded[['director_id', 'director_name']] = directors_expanded['directors'].apply(lambda d: pd.Series({'id': d['id'], 'name': d['name']}) if pd.notna(d) else pd.Series({'id': None, 'name': None}))
else:
    print("No valid director data available.")

directors_expanded['vote_average'] = pd.to_numeric(directors_expanded['vote_average'], errors='coerce')

# Group by director name and calculate mean ratings and count of movies
director_stats = directors_expanded.groupby('director_name').agg({
    'vote_average': ['mean', 'count']
}).reset_index()

# Flatten the multi-level column headers
director_stats.columns = ['director', 'average_rating', 'movie_counts']
director_stats = director_stats[['average_rating','movie_counts']]

director_stats['average_rating'] = standardize(director_stats['average_rating'])
director_stats['movie_counts'] = standardize(director_stats['movie_counts'])
director_stats = torch.from_numpy(director_stats.values).to(torch.float)

directors_expanded = directors_expanded[['movieId', 'director_name', 'director_id']]
movies_extended_df = pd.merge(movies_df, directors_expanded, on='movieId', how='left')
movies_extended_df = movies_extended_df.dropna(subset=['director_name'])


In [14]:
unique_director_id = movies_extended_df['director_id'].unique()
unique_director_id = pd.DataFrame(data={
    'director_id': unique_director_id,
    'mappedId': pd.RangeIndex(len(unique_director_id))
})
unique_movie_id = movies_extended_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedId': pd.RangeIndex(len(unique_movie_id))
})

director_director_id = pd.merge(movies_extended_df['director_id'], unique_director_id, on='director_id', how='left')
director_director_id = torch.from_numpy(director_director_id['mappedId'].values)
director_movie_id = pd.merge(movies_extended_df['movieId'], unique_movie_id, on='movieId', how='left')
director_movie_id = torch.from_numpy(director_movie_id['mappedId'].values)

edge_index_director_to_movie = torch.stack([director_director_id, director_movie_id], dim=0)

In [15]:
# construct a compact representation of the data
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedId': pd.RangeIndex(len(unique_user_id)),
})
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedId': pd.RangeIndex(len(unique_movie_id)),
})

ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id, on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedId'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id, on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedId'].values)

edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)

In [16]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
# save node indices
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(unique_movie_id))
data['director'].node_id = torch.arange(len(unique_director_id))

# add node features
data['movie'].x = movie_feat
data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie

data['user', 'rates', 'movie'].edge_label = torch.from_numpy(ratings_df['rating'].values).to(torch.long) # TODO: this cuts off the .5 steps
mask = data['user', 'rates', 'movie'].edge_label >= 4
del data['user', 'rates', 'movie'].edge_label 
data['user', 'rates', 'movie'].edge_index = data['user', 'rates', 'movie'].edge_index[:, mask]

data['director'].x = director_stats
data['director', 'directs', 'movie'].edge_index = edge_index_director_to_movie

data = T.ToUndirected()(data)

print(data)

HeteroData(
  user={ node_id=[609] },
  movie={
    node_id=[2427],
    x=[2427, 23],
  },
  director={
    node_id=[1229],
    x=[1229, 2],
  },
  (user, rates, movie)={ edge_index=[2, 25903] },
  (director, directs, movie)={ edge_index=[2, 2605] },
  (movie, rev_rates, user)={ edge_index=[2, 25903] },
  (movie, rev_directs, director)={ edge_index=[2, 2605] }
)


In [38]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=[("user", "rates", "movie"), ("director", "directs", "movie")],
    rev_edge_types=[("movie", "rev_rates", "user"), ("movie", "rev_directs", "director")],
)
train_data, val_data, test_data = transform(data)

from torch_geometric.loader import LinkNeighborLoader
# Example configuration with correct tuple formatting for edge_label_index:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[10,10,10],
    edge_label_index=(('user', 'rates', 'movie'), train_data['user', 'rates', 'movie'].edge_label_index),  # Correct tuple format
    edge_label=train_data['user', 'rates', 'movie'].edge_label,  # Labels for the edges
    batch_size=128,
    shuffle=True,
    neg_sampling_ratio=2.0  # Example negative sampling ratio
)

# Define the validation seed edges:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[10,10,10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128*3,
    shuffle=False,
)
print(next(iter(train_loader))['user', 'rates', 'movie'].edge_label)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [39]:
from torch_geometric.nn import GraphConv, to_hetero, SAGEConv, GraphSAGE

class GNN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, layers):
        super().__init__()
        input_dim = num_features
        self.conv = torch.nn.ModuleList()
        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim)
        for _ in range(layers):
            self.conv.append(SAGEConv(input_dim, hidden_dim))
            input_dim = hidden_dim
    
    def forward(self, x, edge_index):#, edge_weight):
        for layer in self.conv:
            x = layer(x, edge_index)
            # x = torch.dropout(x, 0.1, True)
            # x = self.batch_norm(x)
            x = torch.relu(x)
        return x

class Classifier(torch.nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(2*hidden_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, 1)  # Final layer for link prediction

    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        # Extract features for users and movies according to the edge indices
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]

        # Concatenate user and movie features
        x = torch.cat((edge_feat_user, edge_feat_movie), dim=1)  # Concatenation along the feature dimension

        # Pass the concatenated vector through the dense layers
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x.view(-1)


class Model(torch.nn.Module):
    # TODO: add data in function header?
    def __init__(self, num_features, hidden_dim, layers):
        super().__init__()
        self.movie_lin = torch.nn.Linear(23, hidden_dim) # TODO: thats the number of neighbors?
        self.director_lin = torch.nn.Linear(2, hidden_dim)
        self.user_emb = torch.nn.Embedding(data['user'].num_nodes, hidden_dim)
        self.movie_emb = torch.nn.Embedding(data['movie'].num_nodes, hidden_dim)
        self.director_emb = torch.nn.Embedding(data['director'].num_nodes, hidden_dim)
        self.gnn = GNN(num_features, hidden_dim, layers)
        self.gnn = to_hetero(self.gnn, metadata=data.metadata(), aggr='sum')
        self.classifier = Classifier(hidden_dim)

    def forward(self, data:HeteroData) -> Tensor:
        x_dict = {
            "user": self.user_emb(data['user'].node_id),
            "movie": self.movie_lin(data['movie'].x) + self.movie_emb(data['movie'].node_id),
            "director": self.director_lin(data['director'].x) + self.director_emb(data['director'].node_id) 
        }
        x_dict = self.gnn(x_dict, data.edge_index_dict)#, data.edge_weight_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred

model = Model(128, 128, 2)
print(model)

Model(
  (movie_lin): Linear(in_features=23, out_features=128, bias=True)
  (director_lin): Linear(in_features=2, out_features=128, bias=True)
  (user_emb): Embedding(609, 128)
  (movie_emb): Embedding(2427, 128)
  (director_emb): Embedding(1229, 128)
  (gnn): GraphModule(
    (conv): ModuleList(
      (0-1): 2 x ModuleDict(
        (user__rates__movie): SAGEConv(128, 128, aggr=mean)
        (director__directs__movie): SAGEConv(128, 128, aggr=mean)
        (movie__rev_rates__user): SAGEConv(128, 128, aggr=mean)
        (movie__rev_directs__director): SAGEConv(128, 128, aggr=mean)
      )
    )
    (batch_norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (classifier): Classifier(
    (fc1): Linear(in_features=256, out_features=128, bias=True)
    (relu): ReLU()
    (fc2): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [40]:
import tqdm
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, confusion_matrix

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

#model = torch.load("./model/auc_8092.pt")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
save = False


for epoch in range(150):
    total_loss = 0
    total_examples = 0
    model.train()
    for train_sample in train_loader:
        optimizer.zero_grad()
        train_sample.to(device)
        pred = model(train_sample)
        label = train_sample['user', 'rates', 'movie'].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * label.size(0) # TODO: numel?!
        total_examples += label.size(0)

    
    # Validation phase
    model.eval()
    preds = []
    ground_truths = []
    for sampled_data in val_loader:
        with torch.no_grad():
            sampled_data.to(device)
            preds.append(model(sampled_data))
            ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)
    #pred = torch.sigmoid(pred)
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    auc = roc_auc_score(ground_truth, pred)
    if epoch % 10 == 0:
        print(f'Epoch {epoch + 1:03d}, Loss: {total_loss / len(train_loader.dataset):.4f}, Validation AUC: {auc:.4f}')



if save: torch.save(model, f"./model/auc_{int(auc*10000)}.pt")

Device: cuda
Epoch 001, Loss: 1.5788, Validation AUC: 0.8575
Epoch 011, Loss: 0.9019, Validation AUC: 0.8861
Epoch 021, Loss: 0.7846, Validation AUC: 0.8888
Epoch 031, Loss: 0.7224, Validation AUC: 0.8956
Epoch 041, Loss: 0.6396, Validation AUC: 0.8954
Epoch 051, Loss: 0.5799, Validation AUC: 0.8985
Epoch 061, Loss: 0.5139, Validation AUC: 0.8985
Epoch 071, Loss: 0.5000, Validation AUC: 0.8975
Epoch 081, Loss: 0.4428, Validation AUC: 0.8994
Epoch 091, Loss: 0.4433, Validation AUC: 0.8964
Epoch 101, Loss: 0.4171, Validation AUC: 0.8978
Epoch 111, Loss: 0.4316, Validation AUC: 0.8974
Epoch 121, Loss: 0.4043, Validation AUC: 0.8978
Epoch 131, Loss: 0.3561, Validation AUC: 0.8957
Epoch 141, Loss: 0.3458, Validation AUC: 0.8947
