In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: surprise
Successfully installed surprise-0.1


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
import requests
import zipfile
import io
import os

In [11]:
def download_and_extract_dataset():
    if not os.path.exists('ml-100k'):
        print("Downloading MovieLens 100k dataset...")
        url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
        r = requests.get(url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall()
        print("Movielens 100k dataset downloaded and extracted successfully.")
    else:
        print("The dataset already exists. Download skipped.")

In [13]:
download_and_extract_dataset()

Downloading MovieLens 100k dataset...
Movielens 100k dataset downloaded and extracted successfully.


In [17]:
ratings_df = pd.read_csv("ml-100k/u.data", sep="\t", names = ['user_id', 'item_id', 'rating', 'timestamp'])

print(f"Dataset shape: {ratings_df.shape}")
print(f"Number of unique users: {ratings_df['user_id'].nunique()}")
print(f"Number of unique movies: {ratings_df['item_id'].nunique()}")
print(f"Range of ratings: {ratings_df['rating'].min()} to {ratings_df['rating'].max()}")

Dataset shape: (100000, 4)
Number of unique users: 943
Number of unique movies: 1682
Range of ratings: 1 to 5


In [19]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [31]:
model = SVD(n_factors = 20, lr_all = 0.01, reg_all = 0.01, n_epochs = 20, random_state = 42)
model.fit(train)

predictions = model.test(test)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.9576
MAE:  0.7455


In [41]:
cv_results = cross_validate(model, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

print(f"Average RMSE: {cv_results['test_rmse'].mean(): .4f}")
print(f"Average MAE: {cv_results['test_mae'].mean(): .4f}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9574  0.9587  0.9601  0.9560  0.9649  0.9594  0.0030  
MAE (testset)     0.7471  0.7483  0.7488  0.7488  0.7532  0.7492  0.0021  
Fit time          0.59    0.48    0.53    0.53    0.56    0.54    0.04    
Test time         0.19    0.10    0.07    0.16    0.10    0.13    0.04    
Average RMSE:  0.9594
Average MAE:  0.7492


In [43]:
model = SVD(n_factors = 50, lr_all = 0.005, reg_all = 0.05, n_epochs = 50, random_state = 42)
model.fit(train)

predictions = model.test(test)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.9242
MAE:  0.7264


In [45]:
cv_results = cross_validate(model, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

print(f"Average RMSE: {cv_results['test_rmse'].mean(): .4f}")
print(f"Average MAE: {cv_results['test_mae'].mean(): .4f}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9265  0.9238  0.9223  0.9249  0.9231  0.9241  0.0014  
MAE (testset)     0.7276  0.7247  0.7255  0.7235  0.7272  0.7257  0.0015  
Fit time          1.57    1.77    1.65    1.65    1.98    1.72    0.14    
Test time         0.14    0.11    0.12    0.17    0.10    0.13    0.03    
Average RMSE:  0.9241
Average MAE:  0.7257


In [47]:
from surprise import SVDpp

model = SVDpp(n_epochs = 30, random_state = 42)
results = cross_validate(model, data, measures = ["RMSE", "MAE"], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9331  0.9410  0.9337  0.9361  0.9346  0.9357  0.0028  
MAE (testset)     0.7312  0.7367  0.7257  0.7303  0.7311  0.7310  0.0035  
Fit time          25.17   24.38   23.43   31.92   25.71   26.12   3.00    
Test time         3.13    3.15    3.07    3.30    3.08    3.15    0.08    


In [49]:
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [20, 50],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.05]
}

gs = GridSearchCV(SVDpp, param_grid, measures = ['rmse', 'mae'], cv=3)
gs.fit(data)

best_model = gs.best_estimator['rmse']
best_model.fit(train)

predictions = best_model.test(test)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.9147
MAE:  0.7196


In [51]:
def get_movie_names():
    movies_df = pd.read_csv('ml-100k/u.item', sep = '|', encoding = 'latin-1', header = None, usecols = [0, 1], names = ['item_id', 'title'])
    return movies_df

movies_df = get_movie_names()

In [83]:
def recommend_movies(user_id, n = 10):
    all_movies = movies_df['item_id'].unique()

    rated_movies = ratings_df[ratings_df['user_id'] == user_id]['item_id'].values

    unrated_movies = np.setdiff1d(all_movies, rated_movies)

    predictions = []
    for item_id in unrated_movies:
        predicted_rating = best_model.predict(user_id, item_id).est
        predictions.append((item_id, predicted_rating))
    # print(predictions)
    predictions.sort(key= lambda x: x[1], reverse = True)

    top_n_recommendations = predictions[:n]

    recommendations = pd.DataFrame(top_n_recommendations, columns = ['item_id', 'predicted_rating'])
    recommendations = recommendations.merge(movies_df, on='item_id')

    return recommendations

In [109]:
user_id = 42
recommendations = recommend_movies(user_id, n = 10)

print(f"Top 10 recommended movies for user {user_id}")
print(recommendations[['title', 'predicted_rating']])

Top 10 recommended movies for user 42
                                   title  predicted_rating
0                      Braveheart (1995)          4.948321
1                         Titanic (1997)          4.681675
2                      Casablanca (1942)          4.474301
3          Affair to Remember, An (1957)          4.354818
4             Wrong Trousers, The (1993)          4.353312
5                   Air Force One (1997)          4.305944
6              African Queen, The (1951)          4.305092
7  Day the Earth Stood Still, The (1951)          4.287849
8                    Pretty Woman (1990)          4.283626
9                  Rainmaker, The (1997)          4.279505


In [89]:
movies_df

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [107]:
rated_movies = ratings_df[ratings_df['user_id'] == 42]['item_id'].values
for items in rated_movies:
    print(movies_df[movies_df['item_id'] == items].title)

422    E.T. the Extra-Terrestrial (1982)
Name: title, dtype: object
402    Batman (1989)
Name: title, dtype: object
95    Terminator 2: Judgment Day (1991)
Name: title, dtype: object
793    It Could Happen to You (1994)
Name: title, dtype: object
545    Broken Arrow (1996)
Name: title, dtype: object
273    Sabrina (1995)
Name: title, dtype: object
587    Beauty and the Beast (1991)
Name: title, dtype: object
43    Dolores Claiborne (1994)
Name: title, dtype: object
1027    Grumpier Old Men (1995)
Name: title, dtype: object
624    Sword in the Stone, The (1963)
Name: title, dtype: object
97    Silence of the Lambs, The (1991)
Name: title, dtype: object
952    Unstrung Heroes (1995)
Name: title, dtype: object
684    Executive Decision (1996)
Name: title, dtype: object
175    Aliens (1986)
Name: title, dtype: object
194    Terminator, The (1984)
Name: title, dtype: object
184    Psycho (1960)
Name: title, dtype: object
68    Forrest Gump (1994)
Name: title, dtype: object
683    In the Lin