In [1]:
import torch
from torch import Tensor

In [2]:
import csv
import pandas as pd

def read_movies():
    # read csv with movies for budget and imdb_id
    columns_of_interest = ['budget', 'imdb_id', 'revenue', 'vote_average']
    data = []
    with open('./data/movie_data_tmbd.csv', 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='|')
        for row in reader:
            extracted_row = {col: row[col] for col in columns_of_interest}
            data.append(extracted_row)

    movies_budget_df = pd.DataFrame(data)
    movies_budget_df = movies_budget_df.fillna({
        'budget': 0,
        'imdb_id': '',
        'title': ''
    })

    # merge movie budget with id
    link_df = pd.read_csv("./data/links.csv")
    link_df['imdbId'] = link_df['imdbId'].apply(lambda x: f'tt0{int(x)}')

    movies_id_df = pd.merge(movies_budget_df, link_df, left_on='imdb_id', right_on='imdbId', how='inner')
    movies_id_df['budget'] = pd.to_numeric(movies_id_df['budget'])
    movies_id_df['revenue'] = pd.to_numeric(movies_id_df['revenue'])
    movies_id_df = movies_id_df[movies_id_df.budget != 0]
    movies_id_df = movies_id_df[movies_id_df.revenue != 0]

    movies_info_df = pd.read_csv("./data/movies.csv")
    movies_df = pd.merge(movies_id_df, movies_info_df, on="movieId", how="inner")

    ratings_df = pd.read_csv("./data/ratings.csv")
    ratings_df = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])]

    return movies_df, ratings_df

movies_df, ratings_df = read_movies()
print(movies_df.head())
print(ratings_df.head())
print(len(movies_df))
print(len(ratings_df))

     budget    imdb_id      revenue vote_average  movieId     imdbId   tmdbId  \
0  30000000  tt0758752  102820008.0          7.0    82167  tt0758752  43347.0   
1   4600000  tt0382383    6700000.0          6.3    48239  tt0382383  43410.0   
2  24000000  tt0464154   83188165.0          5.3    79879  tt0464154  43593.0   
3    600000  tt0281680    1023156.0          5.2    59366  tt0281680  43664.0   
4  16000000  tt0339727     174318.0          5.6     8996  tt0339727  43670.0   

                                               title  \
0                        Love and Other Drugs (2010)   
1                                        Yuva (2004)   
2                        Piranha (Piranha 3D) (2010)   
3  Bread, My Sweet, The (a.k.a. Wedding for Bella...   
4                                   Stateside (2004)   

                         genres  
0          Comedy|Drama|Romance  
1  Action|Adventure|Crime|Drama  
2        Action|Horror|Thriller  
3                 Drama|Romance  
4     

In [3]:
genres = movies_df['genres'].str.get_dummies('|')
movie_feat = torch.from_numpy(genres.values).to(torch.float)
print(movie_feat.shape)

torch.Size([2986, 20])


In [4]:
print(len(ratings_df['movieId'].unique()))
print(len(movies_df))

2986
2986


In [5]:
# construct a compact representation of the data
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedId': pd.RangeIndex(len(unique_user_id)),
})
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedId': pd.RangeIndex(len(unique_movie_id)),
})

ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id, on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedId'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id, on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedId'].values)

edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
print(edge_index_user_to_movie.shape)

torch.Size([2, 17627731])


In [21]:
ratings_movie_id[:5]

tensor([0, 1, 2, 3, 4])

In [18]:
edge_index_user_to_movie[:5, :5]

tensor([[0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4]])

In [6]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
# save node indices
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(movies_df))

# add node features
data['movie'].x = movie_feat
data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie

# we also need reverse edges from movies to users TODO: why?
data = T.ToUndirected()(data)
print(data)

HeteroData(
  user={ node_id=[323487] },
  movie={
    node_id=[2986],
    x=[2986, 20],
  },
  (user, rates, movie)={ edge_index=[2, 17627731] },
  (movie, rev_rates, user)={ edge_index=[2, 17627731] }
)


In [17]:
data = HeteroData()# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(movies_df))# Add the node features and edge indices:
data["movie"].x = movie_feat
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)
print(data)

NameError: name 'T' is not defined

In [7]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"), 
)
train_data, val_data, test_data = transform(data)

In [8]:
# create mini-batching so it fits on the gpu
from torch_geometric.loader import LinkNeighborLoader

edge_label_index = train_data['user', 'rates', 'movie'].edge_label_index
edge_label = train_data['user', 'rates', 'movie'].edge_label


train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10], #TODO: this can and should be changed
    neg_sampling_ratio=2.0,
    edge_label_index=(('user', 'rates', 'movie'), edge_label_index),
    edge_label=edge_label,
    batch_size = 64,
    shuffle=True
)

sampled_data = next(iter(train_loader))
print(sampled_data)



HeteroData(
  user={
    node_id=[10921],
    n_id=[10921],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2391],
    x=[2391, 20],
    n_id=[2391],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 12182],
    edge_label=[192],
    edge_label_index=[2, 192],
    e_id=[12182],
    num_sampled_edges=[2],
    input_id=[64],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 33757],
    e_id=[33757],
    num_sampled_edges=[2],
  }
)


In [16]:
sampled_data['user', 'rates', 'movie'].edge_label.shape


torch.Size([192])

In [31]:
from torch_geometric.nn import SAGEConv, to_hetero

class GNN(torch.nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.conv1 = SAGEConv(hidden_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
    
    def forward(self, x, edge_index):
        x = torch.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

class Classifier(torch.nn.Module):
    # final classifier, applies dot product on tail and head embeddings for prediction
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)

class Model(torch.nn.Module):
    # TODO: add data in function header?
    def __init__(self, hidden_dim):
        super().__init__()
        self.movie_lin = torch.nn.Linear(20, hidden_dim) # TODO: thats the number of neighbors?
        self.user_emb = torch.nn.Embedding(data['user'].num_nodes, hidden_dim)
        self.movie_emb = torch.nn.Embedding(data['movie'].num_nodes, hidden_dim)
        self.gnn = GNN(hidden_dim)
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data:HeteroData) -> Tensor:
        x_dict = {
            "user": self.user_emb(data['user'].node_id),
            "movie": self.movie_lin(data['movie'].x) + self.movie_emb(data['movie'].node_id),
        }
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred

model = Model(hidden_dim=64)
print(model)


Model(
  (movie_lin): Linear(in_features=20, out_features=64, bias=True)
  (user_emb): Embedding(323487, 64)
  (movie_emb): Embedding(2986, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [25]:
sampled_data['user', 'rates', 'movie'].edge_label

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
import tqdm
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(2):
    total_loss = 0
    total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data)
        label = sampled_data['user', 'rates', 'movie'].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, label)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) + pred.numel() # TODO: numel?!
        total_examples += pred.numel()
    print(f'Epoch: {epoch:03d}, Loss:{total_loss/total_examples:.4f}')

Device: cpu


  1%|▏         | 968/66104 [01:45<1:58:36,  9.15it/s]


KeyboardInterrupt: 

In [36]:
# Define the validation seed edges:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)
sampled_data = next(iter(val_loader))   

In [37]:
from sklearn.metrics import roc_auc_score
preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

  1%|          | 81/13772 [00:01<04:47, 47.63it/s]


KeyboardInterrupt: 