In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Define file paths
movies_path = 'dataset/movies.dat'
ratings_path = 'dataset/ratings.dat'
users_path = 'dataset/users.dat'

# Load datasets
movies = pd.read_csv(movies_path, delimiter='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')
ratings = pd.read_csv(ratings_path, delimiter='::', header=None, names=['UserID', 'MovieID', 'Rating'], engine='python', encoding='latin1', usecols=[0,1,2])
users = pd.read_csv(users_path, delimiter='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation'], engine='python', encoding='latin1', usecols=[0,1,2,3])

# Merge dataframes
movies_ratings = pd.merge(movies, ratings, how='inner', on='MovieID')
movielens_data = pd.merge(movies_ratings, users, how='inner', on='UserID')

# Display movielens_data
print(movielens_data)

# Define IMDb file paths
basics_path = 'dataset/title.basics.tsv'
ratings_path = 'dataset/title.ratings.tsv'

# Load IMDb datasets
imdb_basics = pd.read_csv(basics_path, delimiter='\t', usecols=['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear', 'genres'])
imdb_ratings = pd.read_csv(ratings_path, delimiter='\t', usecols=['tconst', 'averageRating'])

# Merge IMDb dataframes
imdb_data = pd.merge(imdb_basics, imdb_ratings, how='inner', on='tconst')

# Cleaning and transformations
imdb_data.replace({'\\N': np.nan}, inplace=True)
imdb_data['startYear'] = imdb_data['startYear'].astype(float)
imdb_data['averageRating'] = imdb_data['averageRating'].astype(float)
imdb_data['genres'].fillna('', inplace=True)
imdb_data['titleType'].fillna('', inplace=True)
imdb_data['primaryTitle'].fillna('', inplace=True)
imdb_data['originalTitle'].fillna('', inplace=True)
imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)
imdb_data['startYear'].fillna(0, inplace=True)
imdb_data['startYear'] = imdb_data['startYear'].astype(int)

# Print cleaned IMDb data and check datatypes
print(imdb_data[['startYear', 'averageRating', 'genres', 'titleType', 'primaryTitle', 'originalTitle']].head())
print(imdb_data.dtypes)

# Prepare for data merging by standardizing titles and years
movielens_data['NewTitle'] = movielens_data['Title'].str.extract(r'^(.*?) \(\d{4}\)$', expand=False).str.lower().str.strip()
imdb_data['NewTitle'] = imdb_data['primaryTitle'].str.lower().str.strip()
movielens_data['Year'] = movielens_data['Title'].str.extract(r'\((\d{4})\)', expand=False).astype(int)
imdb_data['Year'] = imdb_data['startYear']

# Merge the datasets based on clean titles and year
merged_data = pd.merge(movielens_data, imdb_data, on=['NewTitle', 'Year'], how='inner')

# Output the result
print(merged_data)

         MovieID                  Title                       Genres  UserID  \
0              1       Toy Story (1995)  Animation|Children's|Comedy       1   
1              1       Toy Story (1995)  Animation|Children's|Comedy       6   
2              1       Toy Story (1995)  Animation|Children's|Comedy       8   
3              1       Toy Story (1995)  Animation|Children's|Comedy       9   
4              1       Toy Story (1995)  Animation|Children's|Comedy      10   
...          ...                    ...                          ...     ...   
1000204     3952  Contender, The (2000)               Drama|Thriller    5812   
1000205     3952  Contender, The (2000)               Drama|Thriller    5831   
1000206     3952  Contender, The (2000)               Drama|Thriller    5837   
1000207     3952  Contender, The (2000)               Drama|Thriller    5927   
1000208     3952  Contender, The (2000)               Drama|Thriller    5998   

         Rating Gender  Age  Occupation

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['genres'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['titleType'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

   startYear  averageRating                    genres titleType  \
0       1894            5.7         Documentary,Short     short   
1       1892            5.6           Animation,Short     short   
2       1892            6.5  Animation,Comedy,Romance     short   
3       1892            5.4           Animation,Short     short   
4       1893            6.2              Comedy,Short     short   

             primaryTitle           originalTitle  
0              Carmencita              Carmencita  
1  Le clown et ses chiens  Le clown et ses chiens  
2          Pauvre Pierrot          Pauvre Pierrot  
3             Un bon bock             Un bon bock  
4        Blacksmith Scene        Blacksmith Scene  
tconst            object
titleType         object
primaryTitle      object
originalTitle     object
startYear          int32
genres            object
averageRating    float64
dtype: object
        MovieID                    Title                       Genres  UserID  \
0             1

In [6]:
# Group by UserID and Title to handle duplicates by averaging the ratings
ratings_aggregated = merged_data.groupby(['UserID', 'Title'], as_index=False)['Rating'].mean()

# Pivot the DataFrame to create the user-item interaction matrix
interaction_matrix = ratings_aggregated.pivot(index='UserID', columns='Title', values='Rating').fillna(0)

print(interaction_matrix)

Title   'Til There Was You (1997)  1-900 (1994)  \
UserID                                            
1                             0.0           0.0   
2                             0.0           0.0   
3                             0.0           0.0   
4                             0.0           0.0   
5                             0.0           0.0   
...                           ...           ...   
6036                          0.0           0.0   
6037                          0.0           0.0   
6038                          0.0           0.0   
6039                          0.0           0.0   
6040                          0.0           0.0   

Title   10 Things I Hate About You (1999)  101 Dalmatians (1996)  \
UserID                                                             
1                                     0.0                    0.0   
2                                     0.0                    0.0   
3                                     0.0                    0.0

In [None]:
def preprocess_data(all_data):
    # Create label encoders
    gender_encoder = LabelEncoder()
    genres_encoder = LabelEncoder()

    # Fill missing values (example: fill with median for Age and Occupation)
    all_data['Age'] = pd.to_numeric(all_data['Age'], errors='coerce')
    all_data['Occupation'] = pd.to_numeric(all_data['Occupation'], errors='coerce')
    all_data['Year'] = pd.to_numeric(all_data['Year'], errors='coerce')
    
    # Fill NaN values with 0 in numeric columns
    numeric_columns = ['Age', 'Occupation', 'Year']
    all_data[numeric_columns] = all_data[numeric_columns].fillna(0)

    # Ensure that 'Genres' are strings before encoding
    all_data['Genres'] = all_data['Genres'].astype(str)

    # Label encode 'Gender'
    all_data['Gender'] = gender_encoder.fit_transform(all_data['Gender'])

    # Label encode 'Genres'
    all_data['Genres'] = genres_encoder.fit_transform(all_data['Genres'])

    # Standardize 'Age', 'Occupation', 'Year', and 'runtimeMinutes'
    scaler = StandardScaler()
    all_data[['Age', 'Occupation', 'Year', ]] = scaler.fit_transform(all_data[['Age', 'Occupation', 'Year', ]])

    # Combine all features
    X = np.concatenate([all_data[['Gender', 'Genres', 'Age', 'Occupation', 'Year', 'runtimeMinutes', 'isAdult']].values], axis=1)
    
    return X

In [None]:
X = preprocess_data(merged_data)
y = merged_data['Rating'].values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict ratings
predictions = model.predict(X_test)

In [None]:
def get_user_info(user_id, ratings_movies_users):
    user_info = ratings_movies_users[ratings_movies_users['UserID'] == user_id]
    return user_info[['Gender', 'Age', 'Occupation']].drop_duplicates()

def get_movie_info(title, ratings_movies_users):
    movie_info = ratings_movies_users[ratings_movies_users['Title'] == title]
    return movie_info[['Genres', 'Year', 'runtimeMinutes', 'isAdult']].drop_duplicates()

In [None]:
# Main process to get and preprocess movie information
def get_unrated_movie_features(user_info, interaction_matrix, ratings_movies_users, user_id):
    # Get the list of unrated movies for the user
    unrated_movies = interaction_matrix.loc[user_id][interaction_matrix.loc[user_id] == 0].index.tolist()
    
    # Initialize an empty list to store features
    movie_features_list = []

    for movie in tqdm(unrated_movies, desc="Processing unrated movies"):  # Add tqdm here
        # Get movie information
        movie_info = get_movie_info(movie, ratings_movies_users)
        
        # Add user demographic information (for preprocessing purposes)
        all_data = pd.concat([user_info, movie_info], axis=1)
        
        # Preprocess the data
        movie_features = preprocess_data(all_data)
        
        # Append the features to the list
        movie_features_list.append(movie_features)
    
    # Combine all features into a single numpy array
    X = np.concatenate(movie_features_list, axis=0)
    
    return X, unrated_movies

In [None]:
def get_top_recommendations(user1_id, user2_id, model, interaction_matrix, ratings_movies_users):
    # Get user information
    user1_info = get_user_info(user1_id, ratings_movies_users)
    user2_info = get_user_info(user2_id, ratings_movies_users)

    # Get combination of user information and movie features
    user1_combination, user1_unrated_movies = get_unrated_movie_features(user1_info, interaction_matrix, ratings_movies_users, user1_id)
    user2_combination, user2_unrated_movies = get_unrated_movie_features(user2_info, interaction_matrix, ratings_movies_users, user2_id)

    # Get predictions
    user1_predictions = model.predict(user1_combination)
    user2_predictions = model.predict(user2_combination)

    # Find common movies
    common_movies = set(user1_unrated_movies).intersection(set(user2_unrated_movies))

    # Initialize list to store averaged predictions
    averaged_predictions = []

    # Calculate average predictions for common movies
    for movie in common_movies:
        idx1 = user1_unrated_movies.index(movie)
        idx2 = user2_unrated_movies.index(movie)

        avg_prediction = (user1_predictions[idx1] + user2_predictions[idx2]) / 2
        averaged_predictions.append((movie, avg_prediction))

    # Convert to DataFrame for better readability
    averaged_predictions_df = pd.DataFrame(averaged_predictions, columns=['Movie', 'Average_Prediction'])

    # Sort the averaged predictions and get the top 5
    top_prediction = averaged_predictions_df.sort_values(by='Average_Prediction', ascending=False).head(1)

    return top_prediction