In [9]:
import torch
torch.manual_seed(500)

from torch import Tensor
import csv
import pandas as pd


def read_movies():
    # read csv with movies for budget and imdb_id
    columns_of_interest = ['budget', 'imdb_id', 'revenue', 'vote_average', 'directors', 'vote_count', 'production_companies']
    data = []
    with open('./data/movie_data_tmbd.csv', 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='|')
        for row in reader:
            extracted_row = {col: row[col] for col in columns_of_interest}
            data.append(extracted_row)

    movies_budget_df = pd.DataFrame(data)
    movies_budget_df = movies_budget_df.fillna({
        'budget': 0,
        'imdb_id': '',
        'title': '',
        'director': '',
        'revenue': 0,
        'vote_count': 0,
    })

    # merge movie budget with id
    link_df = pd.read_csv("./data/small/links.csv")
    link_df['imdbId'] = link_df['imdbId'].apply(lambda x: f'tt0{int(x)}')

    movies_id_df = pd.merge(movies_budget_df, link_df, left_on='imdb_id', right_on='imdbId', how='inner')
    movies_id_df['budget'] = pd.to_numeric(movies_id_df['budget'])
    movies_id_df['revenue'] = pd.to_numeric(movies_id_df['revenue'])
    movies_id_df['vote_count'] = pd.to_numeric(movies_id_df['vote_count'])
    movies_id_df = movies_id_df[movies_id_df.budget != 0]
    movies_id_df = movies_id_df[movies_id_df.revenue != 0]

    movies_info_df = pd.read_csv("./data/small/movies.csv")
    movies_df = pd.merge(movies_id_df, movies_info_df, on="movieId", how="inner")

    ratings_df = pd.read_csv("./data/small/ratings.csv")
    print(len(ratings_df['userId'].unique()))
    #ratings_df = ratings_df.iloc[:ratings_df.shape[0]//5]
    ratings_df = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])]
    print(len(ratings_df['userId'].unique()))

    return movies_df, ratings_df

movies_df, ratings_df = read_movies()

610
609


In [10]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Load data
# Assume ratings_df is pre-loaded as your full dataset
random_seed = 42
data_subset = ratings_df.sample(frac=0.001, random_state=random_seed)  # Takes 10% of the data randomly

# Create user-item matrix from the subset
user_item_matrix = data_subset.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

# Configure Truncated SVD
n_components = 50
svd = TruncatedSVD(n_components=n_components)

# Fit SVD to the data
user_features = svd.fit_transform(user_item_matrix)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(user_features)

# Convert to DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

def get_similar_users(user_id, similarity_df, top_n=5):
    if user_id not in similarity_df.index:
        print("User not found.")
        return
    
    # Get the similarity scores for this user with all others
    sim_scores = similarity_df[user_id]

    # Sort the scores in descending order
    sorted_scores = sim_scores.sort_values(ascending=False)

    # Get the top n most similar users, skipping the first entry since it's the user themselves
    most_similar_users = sorted_scores.iloc[1:top_n+1]

    return most_similar_users

# Create a new DataFrame for similar users
def create_similar_users_df(similarity_df, top_n=5):
    similar_users_list = []
    
    # Iterate over all user IDs in the similarity matrix
    for user_id in similarity_df.index:
        most_similar = get_similar_users(user_id, similarity_df, top_n)
        similar_users_list.append({
            "user_id": user_id,
            "similar_users": ', '.join(map(str, most_similar.index))
        })
    
    return pd.DataFrame(similar_users_list)

# Example usage
similar_users_df = create_similar_users_df(similarity_df)
print(similar_users_df.head())

# Test with a specific user ID
user_id = 1
print(f"Top 5 similar users to User {user_id}:\n{get_similar_users(user_id, similarity_df)}")

   user_id            similar_users
0       28  434, 432, 387, 189, 427
1       58  382, 414, 570, 275, 250
2       87  597, 414, 147, 259, 570
3       96  372, 447, 456, 387, 580
4      103  594, 339, 250, 490, 318
User not found.
Top 5 similar users to User 1:
None


In [11]:
similar_users_df.to_csv('./data/similar_users.csv', index=False)

In [12]:
# Read the CSV file
similar_users_df = pd.read_csv('./data/similar_users.csv')

# Convert 'user_id' to numeric
similar_users_df['user_id'] = pd.to_numeric(similar_users_df['user_id'])

# Convert 'similar_users' from a comma-separated string to a list of numerics
similar_users_df['similar_users'] = similar_users_df['similar_users'].apply(lambda x: list(map(int, x.split(','))))

# Display the updated DataFrame
print(similar_users_df.head())

   user_id              similar_users
0       28  [434, 432, 387, 189, 427]
1       58  [382, 414, 570, 275, 250]
2       87  [597, 414, 147, 259, 570]
3       96  [372, 447, 456, 387, 580]
4      103  [594, 339, 250, 490, 318]
