In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF

plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
ratings = pd.read_csv("C:/Users/User/Downloads/u.data", sep='\t', header=None)
ratings.columns = ['userid','movieId','rating','timestamp']
train_df, valid_df = train_test_split(ratings, test_size=0.2)

train_df = train_df.reset_index(drop=True)
test_df = valid_df.reset_index(drop=True)

In [2]:
column_names = [
    "movieId", "title", "ReleaseDate", "VideoReleaseDate", "IMDbURL",
    "Unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_df = pd.read_csv("C:/Users/User/Downloads/u.item", sep='|', names = column_names, header=None, encoding='latin-1')

In [3]:
user_item_matrix = train_df.pivot_table(index='userid', columns='movieId', values='rating')
user_item_matrix = user_item_matrix.fillna(0)

model = NMF(n_components=10)
W = model.fit_transform(user_item_matrix)
H = model.components_



In [4]:
user_id = 5
user_row_number = user_id - 1
user_predicted_ratings = np.dot(W[user_row_number], H)
sorted_user_predictions = np.argsort(user_predicted_ratings)[::-1]

# movies_df = pd.read_csv("C:/Users/User/Downloads/movies.csv")
user_data = ratings[ratings.userid == user_id]
user_full = (user_data.merge(movies_df, how='left', left_on='movieId', right_on='movieId')
             .sort_values(['rating'], ascending=False))
print('User {0} has already rated {1} movies.'.format(user_id, user_full.shape[0]))
print('Top 5 recommended movies:')
for movie_id in sorted_user_predictions[:10]:
    try:
        title = movies_df[movies_df.movieId == movie_id].title.values[0]
        print(title)
    except IndexError:
        print(f"Movie title not found for movie_id {movie_id}")


User 5 has already rated 175 movies.
Top 5 recommended movies:
I.Q. (1994)
Private Benjamin (1980)
Empire Strikes Back, The (1980)
Apocalypse Now (1979)
Sleepless in Seattle (1993)
Breaking the Waves (1996)
Movie title not found for movie_id 0
Princess Bride, The (1987)
Delicatessen (1991)
Star Trek VI: The Undiscovered Country (1991)


In [5]:
movies_df[movies_df.movieId == movie_id].title.values

array(['Star Trek VI: The Undiscovered Country (1991)'], dtype=object)

## Hyperparameter tuning 1: n_components

In [6]:
from sklearn.metrics import mean_squared_error
n_components_values = [5, 10, 15, 20, 25]
best_n_components = None
best_mse = float('inf')

for n_components in n_components_values:
    model = NMF(n_components=n_components)
    W = model.fit_transform(user_item_matrix)
    H = model.components_
    
    user_row_number = user_id - 1
    user_predicted_ratings = np.dot(W[user_row_number], H)
    
    # Calculate Mean Squared Error between real ratings and predicted ratings
    mse = mean_squared_error(user_item_matrix.iloc[user_row_number], user_predicted_ratings)
    
    if mse < best_mse:
        best_mse = mse
        best_n_components = n_components




In [7]:
best_model = NMF(n_components=best_n_components, random_state = 32)
W = best_model.fit_transform(user_item_matrix)
H = best_model.components_

user_id = 5
user_row_number = user_id - 1
user_predicted_ratings = np.dot(W[user_row_number], H)
sorted_user_predictions = np.argsort(user_predicted_ratings)[::-1]

user_data = ratings[ratings.userid == user_id]
user_full = (user_data.merge(movies_df, how='left', left_on='movieId', right_on='movieId')
             .sort_values(['rating'], ascending=False))
print('User {0} has already rated {1} movies.'.format(user_id, user_full.shape[0]))
print('Top 5 recommended movies:')
for movie_id in sorted_user_predictions[:10]:
    try:
        title = movies_df[movies_df.movieId == movie_id].title.values[0]
        print(title)
    except IndexError:
        print(f"Movie title not found for movie_id {movie_id}")


User 5 has already rated 175 movies.
Top 5 recommended movies:
Monty Python and the Holy Grail (1974)
Private Benjamin (1980)
Empire Strikes Back, The (1980)
Sleepless in Seattle (1993)
Movie title not found for movie_id 0
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
I.Q. (1994)
Raiders of the Lost Ark (1981)
Brazil (1985)
Spy Hard (1996)




## Hyperparameter tuning 2: beta loss and solver

In [8]:
beta_loss_values = ['frobenius', 'kullback-leibler']
solvers = ['cd', 'mu']

best_beta_loss = None
best_solver = None
best_mse = float('inf')

for beta_loss in beta_loss_values:
    for solver in solvers:
        if solver == 'cd' and beta_loss == 'kullback-leibler':
            continue  # Skip invalid combination

        model = NMF(n_components=best_n_components, beta_loss=beta_loss, solver=solver, random_state = 32)
        W = model.fit_transform(user_item_matrix)
        H = model.components_

        user_row_number = user_id - 1
        user_predicted_ratings = np.dot(W[user_row_number], H)

        mse = mean_squared_error(user_item_matrix.iloc[user_row_number], user_predicted_ratings)
        print("beta loss: ", beta_loss)
        print("Solver: ", solvers)
        print("mse: ", mse, "\n")
        if mse < best_mse:
            best_mse = mse
            best_beta_loss = beta_loss
            best_solver = solver

print(f"Best beta loss value: {best_beta_loss}")
print(f"Best solver: {best_solver}")




beta loss:  frobenius
Solver:  ['cd', 'mu']
mse:  0.46318584214427894 

beta loss:  frobenius
Solver:  ['cd', 'mu']
mse:  0.45232067979690865 

beta loss:  kullback-leibler
Solver:  ['cd', 'mu']
mse:  0.45327713222174476 

Best beta loss value: frobenius
Best solver: mu


In [9]:
best_model = NMF(n_components=best_n_components, beta_loss=best_beta_loss, solver=best_solver, random_state = 32)
W = best_model.fit_transform(user_item_matrix)
H = best_model.components_

user_id = 5
user_row_number = user_id - 1
user_predicted_ratings = np.dot(W[user_row_number], H)
sorted_user_predictions = np.argsort(user_predicted_ratings)[::-1]

user_data = ratings[ratings.userid == user_id]
user_full = (user_data.merge(movies_df, how='left', left_on='movieId', right_on='movieId')
             .sort_values(['rating'], ascending=False))
print('User {0} has already rated {1} movies.'.format(user_id, user_full.shape[0]))
print('Top 5 recommended movies:')
for movie_id in sorted_user_predictions[:10]:
    try:
        title = movies_df[movies_df.movieId == movie_id].title.values[0]
        print(title)
    except IndexError:
        print(f"Movie title not found for movie_id {movie_id}")


User 5 has already rated 175 movies.
Top 5 recommended movies:
Private Benjamin (1980)
Sleepless in Seattle (1993)
I.Q. (1994)
Breaking the Waves (1996)
Empire Strikes Back, The (1980)
Movie title not found for movie_id 0
Star Trek VI: The Undiscovered Country (1991)
Brazil (1985)
Sleeper (1973)
Monty Python and the Holy Grail (1974)


## Hyperparameter tuning 3: initialisation

In [10]:
init_methods = ['nndsvd', 'nndsvda', 'nndsvdar', 'random']
best_init = None
best_mse = float('inf')

for init in init_methods:
    model = NMF(n_components=best_n_components, beta_loss=best_beta_loss, solver=best_solver, random_state = 32, init=init)
    W = model.fit_transform(user_item_matrix)
    H = model.components_

    user_row_number = user_id - 1
    user_predicted_ratings = np.dot(W[user_row_number], H)

    mse = mean_squared_error(user_item_matrix.iloc[user_row_number], user_predicted_ratings)

    print("init method: ", init)
    print("mse: ", mse, "\n")
    if mse < best_mse:
        best_mse = mse
        best_beta_loss = beta_loss
        best_solver = solver
        best_init = init

print(f"Best initialization method: {best_init}")



init method:  nndsvd
mse:  0.4575481717950192 

init method:  nndsvda
mse:  0.45327713222174476 





init method:  nndsvdar
mse:  0.47513822752902546 

init method:  random
mse:  0.48959390650236945 

Best initialization method: nndsvda


In [11]:
best_model = NMF(n_components=best_n_components, beta_loss=best_beta_loss, solver=best_solver, init = best_init, random_state = 32)
W = best_model.fit_transform(user_item_matrix)
H = best_model.components_

user_id = 5
user_row_number = user_id - 1
user_predicted_ratings = np.dot(W[user_row_number], H)
sorted_user_predictions = np.argsort(user_predicted_ratings)[::-1]

user_data = ratings[ratings.userid == user_id]
user_full = (user_data.merge(movies_df, how='left', left_on='movieId', right_on='movieId')
             .sort_values(['rating'], ascending=False))
print('User {0} has already rated {1} movies.'.format(user_id, user_full.shape[0]))
print('Top 5 recommended movies:')
for movie_id in sorted_user_predictions[:10]:
    try:
        title = movies_df[movies_df.movieId == movie_id].title.values[0]
        print(title)
    except IndexError:
        print(f"Movie title not found for movie_id {movie_id}")


User 5 has already rated 175 movies.
Top 5 recommended movies:
Monty Python and the Holy Grail (1974)
Star Trek VI: The Undiscovered Country (1991)
I.Q. (1994)
Breaking the Waves (1996)
Private Benjamin (1980)
Swingers (1996)
Empire Strikes Back, The (1980)
Movie title not found for movie_id 0
So I Married an Axe Murderer (1993)
Sleepless in Seattle (1993)


In [12]:
temp = ratings[ratings['userid'] == 5]
groupby = temp.groupby('movieId')['rating'].count().reset_index()
groupby = groupby.rename(columns = {'rating': 'rating count'})
groupby[groupby['rating count'] > 1]

Unnamed: 0,movieId,rating count


No repeated ratings from each user for each movie.

In [13]:
sorted_user_predictions[:10]

array([168, 227,  49, 221, 167, 150, 172,   0,  90,  88], dtype=int64)