In [1]:
import pandas as pd

# Load the ratings table from the "ml-latest-small" data folder; it is joined
# with the movies table on `movieId` further down.
ratings = pd.read_csv("../ml-latest-small/ratings.csv")

In [2]:
# Load the movies table from the same "ml-latest-small" data folder.
movies = pd.read_csv("../ml-latest-small/movies.csv")

In [3]:
# Attach each rating to its movie's metadata through the shared `movieId` column
# (pandas' default join type is used).
user_movie_matrix = ratings.merge(movies, on="movieId")
user_movie_matrix.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [4]:
# Maps a raw rating to the edge weight used in the user-movie graph.
# Ratings below 3 become negative weights, ratings of 3 and above positive ones.
mapping_score = {
    # negative side
    0.5: -1.2,
    1: -1.1,
    1.5: -1,
    2: -0.5,
    2.5: -0.1,
    # positive side
    3: 0.1,
    3.5: 0.5,
    4: 1,
    4.5: 1.1,
    5: 1.2,
}

In [5]:
import networkx as nx
import time

# Bipartite graph: user nodes (bipartite=0) connected to movie nodes (bipartite=1).
# NOTE(review): movie nodes are keyed by title, so two different movieIds that
# share a title would end up as a single node — confirm this is acceptable.
user_movie_graph = nx.Graph()

start_time = time.time()
for _, row in user_movie_matrix.iterrows():
    user_node, movie_node = row["userId"], row["title"]
    user_movie_graph.add_node(user_node, bipartite=0)
    user_movie_graph.add_node(
        movie_node,
        bipartite=1,
        genre=row["genres"],
        movieId=row["movieId"],
    )
    # The edge weight is the rating translated through `mapping_score`.
    user_movie_graph.add_edge(user_node, movie_node, weight=mapping_score[row["rating"]])
print("--- %s seconds ---" % (time.time() - start_time))

--- 5.702256441116333 seconds ---


In [6]:
# Show the first ten nodes and edges, with their attribute dicts, as a sanity check.
sample_nodes = list(user_movie_graph.nodes(data=True))[:10]
sample_edges = list(user_movie_graph.edges(data=True))[:10]
print(f"Nodes in the graph: {sample_nodes}")
print(f"Edges in the graph: {sample_edges}")

Nodes in the graph: [(1, {'bipartite': 0}), ('Toy Story (1995)', {'bipartite': 1, 'genre': 'Adventure|Animation|Children|Comedy|Fantasy', 'movieId': 1}), ('Grumpier Old Men (1995)', {'bipartite': 1, 'genre': 'Comedy|Romance', 'movieId': 3}), ('Heat (1995)', {'bipartite': 1, 'genre': 'Action|Crime|Thriller', 'movieId': 6}), ('Seven (a.k.a. Se7en) (1995)', {'bipartite': 1, 'genre': 'Mystery|Thriller', 'movieId': 47}), ('Usual Suspects, The (1995)', {'bipartite': 1, 'genre': 'Crime|Mystery|Thriller', 'movieId': 50}), ('From Dusk Till Dawn (1996)', {'bipartite': 1, 'genre': 'Action|Comedy|Horror|Thriller', 'movieId': 70}), ('Bottle Rocket (1996)', {'bipartite': 1, 'genre': 'Adventure|Comedy|Crime|Romance', 'movieId': 101}), ('Braveheart (1995)', {'bipartite': 1, 'genre': 'Action|Drama|War', 'movieId': 110}), ('Rob Roy (1995)', {'bipartite': 1, 'genre': 'Action|Drama|Romance|War', 'movieId': 151})]
Edges in the graph: [(1, 'Toy Story (1995)', {'weight': 1}), (1, 'Grumpier Old Men (1995)', {

In [6]:
# User nodes are the ones that were tagged bipartite=0 while building the graph.
users = {
    node
    for node, attrs in user_movie_graph.nodes(data=True)
    if attrs["bipartite"] == 0
}
print(f"Users: {list(users)[:10]}")
print(f"Number of users: {len(users)}")

Users: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Number of users: 610


In [14]:
# Movie nodes are the ones that were tagged bipartite=1 while building the graph.
# They are stored under their own name so that the `movies` DataFrame loaded at
# the top of the notebook is not overwritten: rebinding `movies` to a set would
# make the merge cell fail if it were re-run after this one.
movie_nodes = {
    node
    for node, attrs in user_movie_graph.nodes(data=True)
    if attrs["bipartite"] == 1
}
print(f"Movies: {list(movie_nodes)[:10]}")
print(f"Number of movies: {len(movie_nodes)}")

Movies: ["Draughtsman's Contract, The (1982)", 'Saints and Soldiers (2003)', 'Treasure of the Sierra Madre, The (1948)', 'Yes Man (2008)', "It's a Wonderful Life (1946)", 'Angus (1995)', 'Nuremberg (2000)', 'Charlie, the Lonesome Cougar (1967)', 'Awful Truth, The (1937)', 'Fire in the Sky (1993)']
Number of movies: 9719


In [8]:
# Sanity checks: verify that the user-movie graph is bipartite and that it
# forms a single connected component.
print(nx.is_bipartite(user_movie_graph))
print(nx.is_connected(user_movie_graph))

True
True


In [7]:
# Project the bipartite graph onto the user side: two users become neighbours
# when they are linked to at least one common movie.
# NOTE(review): per the networkx documentation the projected edge weight counts
# shared neighbours; the rating-derived weights do not appear to be used here —
# confirm this is intended.
start_time = time.time()
user_user_graph = nx.bipartite.weighted_projected_graph(user_movie_graph, users)
print("--- %s seconds ---" % (time.time() - start_time))

print(user_user_graph.number_of_nodes())
print(user_user_graph.number_of_edges())

--- 5.238262176513672 seconds ---
610
164054


In [None]:
# Show the first ten nodes and edges of the projected user-user graph.
sample_user_nodes = list(user_user_graph.nodes(data=True))[:10]
sample_user_edges = list(user_user_graph.edges(data=True))[:10]
print(f"Nodes in the user-user graph: {sample_user_nodes}")
print(f"Edges in the user-user graph: {sample_user_edges}")

In [34]:
# Check memory consumption
import sys

# NOTE: sys.getsizeof is shallow (it does not follow references), so these
# totals only add up the edge tuples and node keys yielded by the graph views.
# They are a rough indication, not the full footprint of the graph object.
edge_mem = sum(sys.getsizeof(edge) for edge in user_user_graph.edges)
node_mem = sum(sys.getsizeof(node) for node in user_user_graph.nodes)

bytes_per_mb = 1024**2
print("Edge memory:", edge_mem / bytes_per_mb, "MB")
print("Node memory:", node_mem / bytes_per_mb, "MB")
print("Total memory:", (edge_mem + node_mem) / bytes_per_mb, "MB")

Edge memory: 8.761428833007812 MB
Node memory: 0.01628875732421875 MB
Total memory: 8.777717590332031 MB


In [35]:
# Verify that the projected user-user graph is a single connected component.
print(nx.is_connected(user_user_graph))

True


In [8]:
# 0: User, 1: Movie
def filter_nodes(graph: nx.Graph, node_type: int):
    """Return the nodes of ``graph`` whose ``bipartite`` attribute equals ``node_type``.

    Every node is expected to carry a ``bipartite`` attribute; a node without
    it raises ``KeyError``. The result is a list in the graph's node order.
    """
    selected = []
    for node, attrs in graph.nodes(data=True):
        if attrs["bipartite"] == node_type:
            selected.append(node)
    return selected

In [9]:
def find_similar_users_with_graph(user_id, user_user_graph, top_n=10):
    """Return the ``top_n`` users ranked highest by a PageRank personalized on ``user_id``.

    PageRank is run on ``user_user_graph`` with all the personalization mass on
    ``user_id`` (alpha=0.1, edge attribute ``weight``). The target user is
    removed from the ranking and the remaining user ids are returned in
    descending score order.
    """
    scores = nx.pagerank(
        user_user_graph,
        alpha=0.1,
        weight="weight",
        personalization={user_id: 1},
    )

    # Drop the target itself, then order the others from best to worst score.
    candidates = [(other, score) for other, score in scores.items() if other != user_id]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [other for other, _ in candidates[:top_n]]


In [12]:
# Target user for the walkthrough; the lookup is timed to show its cost.
user = 3
start_time = time.time()
similar_users = find_similar_users_with_graph(user, user_user_graph)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)
print(similar_users)

--- 0.3963148593902588 seconds ---
[414, 599, 288, 474, 68, 274, 555, 448, 590, 561]


In [13]:
def get_highest_rated_movies(user_id, user_movie_graph, already_seen, top_n=5):
    """Return up to ``top_n`` movies linked to ``user_id``, best edge weight first.

    Movies contained in ``already_seen`` are skipped. Movies with equal weight
    keep the order in which the graph yields their edges.
    """
    candidates = []
    for _, movie, weight in user_movie_graph.edges(user_id, data="weight"):
        if movie not in already_seen:
            candidates.append((movie, weight))

    # Stable sort, highest weight first.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [movie for movie, _ in candidates[:top_n]]

In [14]:
def recommend_movies(user_id, user_movie_graph, similar_users, already_seen, top_n=10):
    """Collect up to ``top_n`` recommendations, at most one per similar user.

    For each user in ``similar_users`` (in the given order), their five
    highest-rated movies not in ``already_seen`` are fetched, and the first one
    not yet recommended is added to the result.

    Args:
        user_id: The target user (kept for interface compatibility; not used).
        user_movie_graph: Bipartite user-movie graph with weighted edges.
        similar_users: Iterable of user ids, most similar first.
        already_seen: Movies to exclude from the recommendations.
        top_n: Maximum number of movies to return.

    Returns:
        A list of movie titles, in the order they were picked.
    """
    recommended_movies = []

    for similar_user in similar_users:
        top_movies = get_highest_rated_movies(similar_user, user_movie_graph, already_seen, top_n=5)

        # Iterate over what was actually returned: a similar user may have
        # fewer than five unseen movies, and indexing a fixed range(5) raised
        # IndexError in that case.
        for movie in top_movies:
            if movie not in recommended_movies:
                recommended_movies.append(movie)
                break

        if len(recommended_movies) >= top_n:
            break

    return recommended_movies[:top_n]

In [15]:
# Titles the target user has already rated, i.e. the neighbours of the user's node.
already_seen = [movie for _, movie in user_movie_graph.edges(user)]
print(f"Movies already seen by user {user}: {already_seen[:10]}")

Movies already seen by user 3: ['Dangerous Minds (1995)', "Schindler's List (1993)", 'Courage Under Fire (1996)', 'Operation Dumbo Drop (1995)', 'Wallace & Gromit: The Best of Aardman Animation (1996)', 'Escape from L.A. (1996)', 'My Fair Lady (1964)', 'Doors, The (1991)', 'On Golden Pond (1981)', 'Deer Hunter, The (1978)']


In [16]:
# Build the recommendations for the target user and report how long it took.
start_time = time.time()
recommendation = recommend_movies(user, user_movie_graph, similar_users, already_seen)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)
print(f"Recommended movies for user {user}: {recommendation}")

--- 0.018771886825561523 seconds ---
Recommended movies for user 3: ['American President, The (1995)', 'Rumble in the Bronx (Hont faan kui) (1995)', 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Sense and Sensibility (1995)', 'Star Wars: Episode IV - A New Hope (1977)', 'Pulp Fiction (1994)', 'Grumpier Old Men (1995)', 'Toy Story (1995)', 'Forrest Gump (1994)', 'Braveheart (1995)']


In [67]:
# Connect to Neo4j
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment when present, so real
# credentials never have to be written into the notebook. The fallbacks are the
# local test values the notebook used before.
uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "testtest")

driver = GraphDatabase.driver(uri, auth=(username, password))

In [68]:
# Toggle: when True, the following cells delete and re-create the
# UU_RECOMMENDED relationships of the target user in Neo4j.
upload_test = True

In [81]:
# Delete old test
if upload_test:
    def delete_test(tx, userId):
        """Delete every UU_RECOMMENDED relationship going out of the given user."""
        tx.run("MATCH (u:User {userId: $userId})-[r:UU_RECOMMENDED]->(m:Movie) DELETE r",
               userId=userId)

    # The query removes all of the user's UU_RECOMMENDED relationships at once,
    # so a single write transaction is enough. It used to be repeated once per
    # recommended title, and was skipped entirely when the list was empty.
    with driver.session() as session:
        session.execute_write(delete_test, user)

In [80]:
# Upload the test
if upload_test:
    def create_recommendations(tx, userId, recs):
        """Merge one UU_RECOMMENDED relationship from the user to each title in ``recs``."""
        query = (
            "MATCH (u:User {userId: $userId}), (m:Movie {title: $title})"
            "MERGE (u)-[:UU_RECOMMENDED]->(m)"
        )
        for title in recs:
            tx.run(query, userId=userId, title=title)

    # All relationships for the target user are written in one transaction.
    with driver.session() as session:
        session.execute_write(create_recommendations, user, recommendation)

In [83]:
# Toggle: when True, the next cell computes recommendations for every user node
# and uploads them to Neo4j.
upload_predictions = False

In [None]:
def create_recommendations(tx, userId, recs):
    """Merge one UU_RECOMMENDED relationship from the user to each title in ``recs``.

    ``tx`` is the transaction object handed over by ``session.execute_write``;
    one query is run per recommended title.
    """
    query = (
        "MATCH (u:User {userId: $userId}), (m:Movie {title: $title})"
        "MERGE (u)-[:UU_RECOMMENDED]->(m)"
    )
    for title in recs:
        tx.run(query, userId=userId, title=title)
            
if upload_predictions:
    with driver.session() as session:
        start_time = time.time()
        # Loop-local names are used on purpose: reusing `user`, `similar_users`,
        # `already_seen` and `recommendation` here would overwrite the values
        # computed for the target user, which other cells still read.
        for target_user in filter_nodes(user_movie_graph, 0):
            target_similar_users = find_similar_users_with_graph(target_user, user_user_graph)
            target_seen = [movie for _, movie, _ in user_movie_graph.edges(target_user, data="weight")]
            target_recs = recommend_movies(target_user, user_movie_graph, target_similar_users, target_seen)
            # One write transaction per user.
            session.execute_write(create_recommendations, target_user, target_recs)
        print("--- %s seconds ---" % (time.time() - start_time))

In [85]:
# Close the Neo4j driver once the uploads are finished.
driver.close()

Di seguito è riportato un approccio alternativo, che non è stato portato avanti a causa delle dimensioni del problema e dei tempi richiesti per l'esecuzione.

L'idea era eseguire più ricerche PageRank, una per ciascun utente simile al target, per ottenere i film migliori di ognuno di essi.

In [None]:
# Normalize movie popularity to better recommend using similar users
# (the former module-level `global` statement was a no-op and has been dropped).
alpha = 0.2

# penalty = alpha * number of connections (neighbours) of the movie node
popularity_penalty = {
    movie: alpha * len(user_movie_graph[movie])
    for movie in filter_nodes(user_movie_graph, 1)
}

def create_preference_vector(debug: bool, user_id: int, user_movie_graph: nx.Graph):
    """Build a personalization vector over all movie nodes for one user.

    Args:
        debug: When True, print the user's edges and the sum of their weights.
        user_id: Node id of the user in ``user_movie_graph``.
        user_movie_graph: Bipartite user-movie graph with a ``weight`` edge attribute.

    Returns:
        A tuple ``(n_rated, vector)``. ``n_rated`` is the number of movies
        linked to the user. ``vector`` maps every movie node to a weight:
        the user's edge weight divided by the sum of their weights when that
        sum is positive (0 for unrated movies), otherwise a uniform
        ``1 / number_of_movies``.
    """
    edges = {m: v for _, m, v in user_movie_graph.edges(user_id, data="weight")}
    movie_nodes = filter_nodes(user_movie_graph, 1)  # 1 : Movie

    if debug:
        print(f"Edges for user {user_id}: {list(edges)[:10]}")
        print(f"Number of edges for user {user_id}: {len(edges)}")

        for k, v in edges.items():
            print(k, v)

    tot = sum(edges.values())

    if debug:
        print(f"Total for user {user_id}: {tot}")

    if tot > 0:
        print(f"User {user_id} has rated movies")
        # NOTE(review): individual entries can still be negative when the user
        # has low-rated movies; confirm the consumer of this vector accepts that.
        return len(edges), {movie: edges.get(movie, 0) / tot for movie in movie_nodes}

    print(f"User {user_id} has not rated any movies or the sum of all weighted ratings is zero / negative. All movies will have the same (uniform) weight")
    # Uniform over the movie nodes only. The previous code divided by the total
    # node count (users included), so the vector did not sum to 1.
    uniform = 1 / len(movie_nodes) if movie_nodes else 0
    return len(edges), {movie: uniform for movie in movie_nodes}

def predict_user_with_similars(user_id, user_movie_graph, movie_movie_graph, similar_users, already_seen):
    """Rank the nodes of ``movie_movie_graph`` for ``user_id`` with a popularity-penalized PageRank.

    Args:
        user_id: User whose ratings define the personalization vector.
        user_movie_graph: Bipartite user-movie graph used to build that vector.
        movie_movie_graph: Graph over movies on which PageRank is run.
        similar_users: Not used by this function (kept for interface compatibility).
        already_seen: Movies to leave out of the result.

    Returns:
        Movies not in ``already_seen``, ordered by decreasing penalized rank.
        An empty list is returned when the personalization vector or
        ``already_seen`` is empty.
    """
    _, p_vec = create_preference_vector(False, user_id, user_movie_graph)

    if len(p_vec) < 1 or len(already_seen) < 1:
        return []

    item_rank = nx.pagerank(movie_movie_graph, personalization=p_vec, alpha=0.95, weight="weight")

    # Penalty for popular movies: the higher the penalty, the lower the rank.
    penalized_rank = {
        movie: score / (1 + popularity_penalty[movie])
        for movie, score in item_rank.items()
    }

    # A set gives constant-time membership checks. Testing against the original
    # list for every movie made the filtering quadratic.
    seen = set(already_seen)

    ranked = sorted(
        movie_movie_graph.nodes(),
        key=lambda movie: penalized_rank.get(movie, 0),
        reverse=True,
    )
    return [movie for movie in ranked if movie not in seen]

# Merge the ten best predictions of every similar user, keeping first-seen order
# and skipping duplicates.
# NOTE(review): `movie_movie_graph` is not defined in the visible cells — it
# must be built before this cell can run.
recommendation = []
for u in similar_users:
    s_t = predict_user_with_similars(u, user_movie_graph, movie_movie_graph, similar_users, already_seen)
    top_ten = s_t[:10]
    print(f"Predicted movies for user {u}: {top_ten}")
    for m in top_ten:
        if m in recommendation:
            continue
        recommendation.append(m)

print(f"Recommended movies for user {user}: {recommendation[:10]}")
print(f"The number of movies recommended is: {len(recommendation)}")