### **Recommender**

In [None]:
import re
import pickle
import os

from collections import namedtuple
import networkx as nx
from node2vec import Node2Vec
from gensim.models import Word2Vec

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from pprint import pprint

# 1. Download data
#   1.1 Movies
#   1.2 Stars
#   1.3 Producers

# https://triplydb.com/Triply/linkedmdb/insights/classFrequency?graph=https%3A%2F%2Ftriplydb.com%2FTriply%2Flinkedmdb%2Fgraphs%2Fdata

#### Download data

##### **recommender.py** to extract the movies and serialize them.

In [None]:
def load_movies_from_files(folder_path='./movies_data'):
    """Collect the contents of every ``.pkl`` file in a folder.

    Every directory entry whose name ends in ``.pkl`` is unpickled and its
    items are appended to one combined list. Entries are visited in the
    order reported by ``os.listdir``; sub-folders are not searched.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.

    Args:
        folder_path: Directory that holds the pickled movie lists.

    Returns:
        One list with the items of all unpickled files.
    """
    collected = []
    pickle_names = (name for name in os.listdir(folder_path) if name.endswith('.pkl'))
    for name in pickle_names:
        with open(os.path.join(folder_path, name), 'rb') as handle:
            collected.extend(pickle.load(handle))
    return collected

def extract_objects(movies: list) -> dict[str, set[str]]:
    """Collect the distinct actors, directors, genres and subjects of all movies.

    Every movie must expose ``actors``, ``directors``, ``genres`` and
    ``subjects`` attributes holding comma-separated strings. Empty or
    ``None`` fields contribute nothing. Values are split on "," only and
    are not stripped, so they stay identical to the names produced by the
    same split when edges are added to the graph.

    Args:
        movies: Movie records (objects with the four attributes above).

    Returns:
        A dict with the keys "actors", "directors", "genres" and
        "subjects", each mapping to the set of unique values seen.
    """
    fields = ("actors", "directors", "genres", "subjects")
    objects = {field: set() for field in fields}

    for movie in movies:
        for field in fields:
            raw = getattr(movie, field)
            # Truthiness mirrors the checks used when the graph is built and
            # also tolerates ``None`` for a missing field.
            if raw:
                objects[field].update(raw.split(","))

    return objects


def get_year(movie):
    """Return the first four-digit number found in ``movie`` as an int.

    Args:
        movie: Text to search (the value is handed to ``re.search``).

    Returns:
        The first run of four consecutive digits as an integer, or 0 when
        the text contains none.
    """
    match = re.search(r"\d{4}", movie)
    # The pattern has no capture group, so the year is the whole match
    # (group 0); asking for group 1 would raise IndexError.
    return int(match.group(0)) if match else 0

In [None]:
# Record type for one film (presumably the type stored in the pickled movie files).
Movie = namedtuple(
    "Movie",
    [
        "film_id", "title", "release_date", "genres", "subjects",
        "runtime", "actors", "sequel_id", "prequel_id", "directors",
    ],
)

# Load every movie record from disk.
movies = load_movies_from_files()

# Release dates of all movies; entries equal to 0 are left out.
release_dates = [date for date in (movie.release_date for movie in movies) if date != 0]

# Unique attribute values across all movies, one list per category.
other_objects = extract_objects(movies)
actors, directors, genres, subjects = (
    list(other_objects[key]) for key in ("actors", "directors", "genres", "subjects")
)

In [None]:
# Summary table: number of movies and of unique values per attribute category.
column_names = ["Movies", "Subjects", "Actors", "Directors", "Genres"]
data = len(movies), len(subjects), len(actors), len(directors), len(genres)
# Each column is as wide as its longer cell plus two characters of padding.
widths = [max(len(str(count)), len(heading)) + 2 for count, heading in zip(data, column_names)]

# Header row
print("".join(heading.center(width) for heading, width in zip(column_names, widths)))

# Data row
print("".join(str(count).center(width) for count, width in zip(data, widths)))

#### Construct Graph

In [None]:
def create_helper_nodes(G, nodes:list[str], node_type:str):
    """Add one node per entry of ``nodes`` to ``G``.

    Every node receives ``node_type`` as both its ``type`` and its
    ``label`` attribute.

    Args:
        G: Graph object providing ``add_node``.
        nodes: Node identifiers to add.
        node_type: Category name stored on each added node.
    """
    attributes = {"type": node_type, "label": node_type}
    for identifier in nodes:
        G.add_node(identifier, **attributes)

In [None]:
# Turn every unique attribute value from the knowledge graph into its own node.
# Idea: movies that share an attribute value are connected through that node,
# which acts as a bridge that can be traversed from one movie to the other.

# Note: the graph is built from IDs only and no further information is taken
# into account. Hence every node needs a UNIQUE ID.

G = nx.Graph()  # preferably bi-directional

helper_nodes = zip([actors, directors, genres, subjects], ["actor", "director", "genre", "subject"])
for category, node_type in helper_nodes:
    print(category, node_type)
    create_helper_nodes(G, category, node_type=node_type)

# Two bucket nodes for runtime; movies are attached to them further down.
G.add_node("runtime_short", type="runtime", label="short runtime")
G.add_node("runtime_long", type="runtime", label="long runtime")

# (attribute on the movie record, relationship label), in the order the edges are added
attribute_edges = (
    ("genres", "HAS_GENRE"),
    ("subjects", "HAS_SUBJECT"),
    ("actors", "HAS_ACTOR"),
    ("directors", "HAS_DIRECTOR"),
)
# (attribute holding another film's id, relationship label)
film_links = (
    ("sequel_id", "SEQUEL"),
    ("prequel_id", "PREQUEL"),
)

for movie in movies:
    movie_id = movie.film_id

    # The movie itself
    G.add_node(movie_id, type='movie', label=movie.title)

    # Connect the movie to each value of its comma-separated attributes
    for attribute, relationship in attribute_edges:
        values = getattr(movie, attribute)
        if values:
            for value in values.split(","):
                G.add_edge(movie_id, value, relationship=relationship)

    # "predicate" nodes
    # TODO: release-date edges are not created yet (unfinished draft below)
    # if movie.release_dates != 0 and movie.release_dates < :
    #     G.add_edge(movie.film_id, movie.release_date, relationship='RELEASED_IN')

    for attribute, relationship in film_links:
        linked_id = getattr(movie, attribute)
        if linked_id != "":
            G.add_edge(movie_id, linked_id, relationship=relationship)

    if movie.runtime == 0:
        continue

    # Only short and very long movies are of interest; everything in
    # between gets no runtime edge.
    runtime = int(float(movie.runtime))
    if runtime < 50:
        G.add_edge(movie_id, "runtime_short", relationship='RUNTIME')
    elif runtime > 170:
        G.add_edge(movie_id, "runtime_long", relationship='RUNTIME')




In [None]:
import pickle

# Persist the finished graph to graph.pkl so it can be loaded again later.
with open('graph.pkl', 'wb') as graph_file:
    pickle.dump(G, graph_file)



In [None]:
# Make space: drop the raw records and attribute lists now that the graph is built.
del movies, actors, release_dates, directors, subjects

In [None]:
# Summary table: size of the graph.
column_names = ["# Nodes", "# Edges"]
data = len(G.nodes()), len(G.edges())
# Each column is as wide as its longer cell plus two characters of padding.
widths = [max(len(str(count)), len(heading)) + 2 for count, heading in zip(data, column_names)]

# Header row
print("".join(heading.center(width) for heading, width in zip(column_names, widths)))

# Data row
print("".join(str(count).center(width) for count, width in zip(data, widths)))

In [None]:
# # Using the previous graph G
# pos = nx.spring_layout(G)  # Positioning of nodes
# plt.figure(figsize=(12, 12))  # Set figure size

# # Draw nodes based on their type for differentiated visualization
# nx.draw_networkx_nodes(G, pos, nodelist=[node for node, attr in G.nodes(data=True) if attr['type'] == 'movie'], node_color='blue', node_size=500, label='Movies')
# nx.draw_networkx_nodes(G, pos, nodelist=[node for node, attr in G.nodes(data=True) if attr['type'] == 'genre'], node_color='red', node_size=300, label='Genres')
# nx.draw_networkx_nodes(G, pos, nodelist=[node for node, attr in G.nodes(data=True) if attr['type'] == 'actor'], node_color='yellow', node_size=300, label='Actors')
# nx.draw_networkx_nodes(G, pos, nodelist=[node for node, attr in G.nodes(data=True) if attr['type'] == 'director'], node_color='green', node_size=300, label='Directors')
# nx.draw_networkx_nodes(G, pos, nodelist=[node for node, attr in G.nodes(data=True) if attr['type'] == 'subject'], node_color='purple', node_size=300, label='Subjects')

# # Draw edges
# nx.draw_networkx_edges(G, pos)

# # Draw labels
# labels = {node: attr['label'] for node, attr in G.nodes(data=True)}
# nx.draw_networkx_labels(G, pos, labels=labels, font_size=10)

# # Add legend
# plt.legend()

# plt.title("Movie Graph")
# plt.axis('off')  # Hide axis
# plt.show()

#### Train Model

In [None]:
# Node2Vec settings, defined once and passed by name so the values listed
# here are the ones the walk generation really uses.
dimensions = 64    # size of each embedding vector
walk_length = 40   # number of nodes in each random walk
num_walks = 200    # number of walks started per node

node2vec = Node2Vec(
    G,
    dimensions=dimensions,
    walk_length=walk_length,
    num_walks=num_walks,
    workers=4,
    temp_folder="./model",
)

In [None]:
# Drop the graph and the leftover loop variable before training.
# NOTE: any later code that needs G has to restore it first (it was pickled to graph.pkl).
del G, category

In [None]:
# Store the Node2Vec object as a gzip-compressed pickle.

import gzip
import pickle

with gzip.open('./node2vec.pkl.gz', 'wb') as archive:
    pickle.dump(node2vec, archive, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Train the embedding model from the Node2Vec object. The result is
# presumably a gensim Word2Vec model, since it is later reloaded with
# Word2Vec.load and queried through model.wv.
model = node2vec.fit(window=10, min_count=1)

In [17]:
# Save embeddings for later use
model.save("./embeddings.model")
# Reload from disk; "embeddings.model" is the same file as "./embeddings.model" above.
model = Word2Vec.load("embeddings.model")

In [None]:
# Get vector for a specific node

# '65517' is used as a node key of the trained model (presumably a film_id — TODO confirm).
vector = model.wv['65517']

print(vector)

In [None]:
# labels = []
# tokens = []

# labels = model.wv.index_to_key  
# tokens = [model.wv[word] for word in labels]

# # Reduce dimensionality with t-SNE
# tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23, verbose=1)
# new_values = tsne_model.fit_transform(np.array(tokens))

# x = []
# y = []
# for value in new_values:
#     x.append(value[0])
#     y.append(value[1])

# node_types = [G.nodes[node]['type'] for node in labels]

# plt.figure(figsize=(12, 12))
# for i in range(len(x)):
#     plt.scatter(x[i], y[i])
#     plt.annotate(node_types[i], xy=(x[i], y[i]), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
# plt.show()

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when
        either vector has zero norm.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # A zero vector has no direction. Report "no similarity" instead of
        # dividing by zero, which would yield NaN and break any ranking
        # that sorts by this value.
        return 0.0
    return np.dot(vec1, vec2) / denominator

def get_nearest_neighbors(node, model, G, k=10):
    """Return the ``k`` movie nodes whose embeddings are most similar to ``node``.

    Args:
        node: Key of the query node in ``model.wv``.
        model: Embedding model exposing ``wv`` with item lookup
            (``wv[key]``) and the key list ``wv.index_to_key``.
        G: Graph whose nodes carry a ``type`` attribute; only nodes with
            ``type == 'movie'`` are considered as neighbours.
        k: Maximum number of neighbours to return.

    Returns:
        Up to ``k`` node keys ordered by descending cosine similarity to
        ``node``. The query node itself is never included.
    """
    target_embedding = model.wv[node]

    similarities = {}
    for other_node in model.wv.index_to_key:
        if other_node == node:
            continue
        # Nodes that were created implicitly by add_edge (e.g. a sequel or
        # prequel id without its own movie record) have no "type"
        # attribute. .get() treats them as non-movies instead of raising
        # KeyError.
        if G.nodes[other_node].get('type') != 'movie':
            continue
        similarities[other_node] = cosine_similarity(target_embedding, model.wv[other_node])

    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return [key for key, _ in ranked[:k]]


In [None]:
# The graph is deleted before training to free memory, so restore it from
# the pickle written right after graph construction when it is missing.
if "G" not in globals():
    with open('graph.pkl', 'rb') as graph_file:
        G = pickle.load(graph_file)

# Look up the five movies closest to the example node and show their keys.
target_node = "65517"
neighbors = get_nearest_neighbors(target_node, model, G, k=5)
print(neighbors)