In [1]:
import csv
import pandas as pd

def read_movies():
    # read csv with movies for budget and imdb_id
    columns_of_interest = ['budget', 'imdb_id', 'revenue', 'vote_average']
    data = []
    with open('./data/movie_data_tmbd.csv', 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='|')
        for row in reader:
            extracted_row = {col: row[col] for col in columns_of_interest}
            data.append(extracted_row)

    movies_budget_df = pd.DataFrame(data)
    movies_budget_df = movies_budget_df.fillna({
        'budget': 0,
        'imdb_id': '',
        'title': ''
    })

    # merge movie budget with id
    link_df = pd.read_csv("./data/links.csv")
    link_df['imdbId'] = link_df['imdbId'].apply(lambda x: f'tt0{int(x)}')

    movies_id_df = pd.merge(movies_budget_df, link_df, left_on='imdb_id', right_on='imdbId', how='inner')
    movies_id_df['budget'] = pd.to_numeric(movies_id_df['budget'])
    movies_id_df['revenue'] = pd.to_numeric(movies_id_df['revenue'])
    movies_id_df = movies_id_df[movies_id_df.budget != 0]
    movies_id_df = movies_id_df[movies_id_df.revenue != 0]

    movies_info_df = pd.read_csv("./data/movies.csv")
    movies_df = pd.merge(movies_id_df, movies_info_df, on="movieId", how="inner")

    ratings_df = pd.read_csv("./data/ratings.csv")
    ratings_df = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])]

    return movies_df, ratings_df

movies_df, ratings_df = read_movies()
print(movies_df.head())
print(ratings_df.head())
print(len(movies_df))
print(len(ratings_df))

     budget    imdb_id      revenue vote_average  movieId     imdbId   tmdbId  \
0  30000000  tt0758752  102820008.0          7.0    82167  tt0758752  43347.0   
1   4600000  tt0382383    6700000.0          6.3    48239  tt0382383  43410.0   
2  24000000  tt0464154   83188165.0          5.3    79879  tt0464154  43593.0   
3    600000  tt0281680    1023156.0          5.2    59366  tt0281680  43664.0   
4  16000000  tt0339727     174318.0          5.6     8996  tt0339727  43670.0   

                                               title  \
0                        Love and Other Drugs (2010)   
1                                        Yuva (2004)   
2                        Piranha (Piranha 3D) (2010)   
3  Bread, My Sweet, The (a.k.a. Wedding for Bella...   
4                                   Stateside (2004)   

                         genres  
0          Comedy|Drama|Romance  
1  Action|Adventure|Crime|Drama  
2        Action|Horror|Thriller  
3                 Drama|Romance  
4     

In [2]:
# Construct the relations
user_movie_rating_triples = []
for index, row in ratings_df.iterrows():
    user_movie_rating_triples.append((f"user_{row['userId']}", f"rated_{row['rating']}", f"movie_{row['movieId']}"))

In [3]:
movie_budget_triples = []
movie_revenue_triples = []
movie_ratings_triples = []
for index, row in movies_df.iterrows():
    movie_budget_triples .append((f"movie_{row['movieId']}", "used_budget", f"budget_{row['budget']}"))
    movie_revenue_triples.append((f"movie_{row['movieId']}", "used_revenue", f"revenue_{row['revenue']}"))
    movie_ratings_triples.append((f"movie_{row['movieId']}", "used_ratings", f"ratings_{row['vote_average']}"))

In [10]:
def triple_df(head, relation, tail):
    return pd.DataFrame({
        'head': head,
        'relation': relation,
        'tail': tail
    })

# Correct way to create the Series for head and tail
head_series = 'movie_' + movies_df['movieId'].astype(str)
tail_series_budget = 'budget_' + movies_df['budget'].astype(str)
tail_series_revenue = 'revenue_' + movies_df['revenue'].astype(str)
tail_series_ratings = 'avgrating_' + movies_df['vote_average'].astype(str)
head_series_users = 'user_' + ratings_df['userId'].astype(str)
tail_series_individual_movie_rating = 'rating'

# Use the function with the corrected series
movie_budget_triples = triple_df(head_series, 'used_budget', tail_series_budget)
movie_revenue_triples = triple_df(head_series, 'used_revenue', tail_series_revenue)
movie_ratings_triples = triple_df(head_series, 'used_avgrating', tail_series_ratings)

# If you want to concatenate them into one DataFrame:
all_triples_df = pd.concat([movie_budget_triples, movie_revenue_triples, movie_ratings_triples], ignore_index=True)

# Display the DataFrame
print(all_triples_df.head())

          head     relation             tail
0  movie_82167  used_budget  budget_30000000
1  movie_48239  used_budget   budget_4600000
2  movie_79879  used_budget  budget_24000000
3  movie_59366  used_budget    budget_600000
4   movie_8996  used_budget  budget_16000000


In [14]:
import numpy as np
from pykeen.triples import TriplesFactory

# the split should be done not over all the triples but for each relation triples. So that the training data is kinda balanced
triples_array = all_triples_df[['head', 'relation', 'tail']].to_numpy()

# Create a TriplesFactory from the numpy array
tf = TriplesFactory.from_labeled_triples(triples_array)

# Optionally, split the data into training, testing, and validation sets
training, testing, validation = tf.split([0.8, 0.1, 0.1])

using automatically assigned random_state=440209930


In [15]:
from pykeen.pipeline import pipeline

# Configure and run the pipeline
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='TransE',  # You can choose other models like DistMult, ComplEx, etc.
    training_kwargs={'num_epochs': 1, 'batch_size': 256},  # Adjust these parameters as needed
    random_seed=42,
    device='cuda'  # Use 'cuda' if you have a GPU, otherwise use 'cpu'
)

# Access the results
print(result)

No cuda devices were available. The model runs on CPU
Training epochs on cpu:   4%|▍         | 4/100 [00:03<01:34,  1.02epoch/s, loss=0.734, prev_loss=0.838]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f0c532ca0c0>>
Traceback (most recent call last):
  File "/home/piragi/Documents/Uni/knowledge_graphs/kg-env/lib64/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Training epochs on cpu: 100%|██████████| 100/100 [01:37<00:00,  1.03epoch/s, loss=0.0144, prev_loss=0.0114]
Evaluating on cpu: 100%|██████████| 896/896 [00:01<00:00, 557triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 1.63s seconds


PipelineResult(random_seed=42, model=TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(6357, 50)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(3, 50)
    )
  )
  (weight_regularizers): ModuleList()
), training=TriplesFactory(num_entities=6357, num_relations=3, create_inverse_triples=False, num_triples=7166), training_loop=<pykeen.training.slcwa.SLCWATrainingLoop object at 0x7f0a283e6bd0>, losses=[1.0749339248452867, 0.940182055745806, 0.8377121942383903, 0.7343105013881411, 0.6503036575657981, 0.5685225884829249, 0.4995496847799846, 0.450326963194779, 0.41334578501326696, 0.3652692937425205, 0.3340562273349081, 0.2986153555767877, 0.2762088844818728, 0.2546400363956179, 0.2304127620799201, 0.2083234116435051, 0.1927810843501772, 0.17360497372491018, 0.1675507857331208, 0.1484218596