In [79]:
# #Downloading the data
# import zipfile
# import urllib.request

# url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
# urllib.request.urlretrieve(url, "ml-100k.zip")

# with zipfile.ZipFile("ml-100k.zip", "r") as zip_ref:
#     zip_ref.extractall("data/raw")


In [4]:
%load_ext autoreload
%autoreload 2

from data_loader import MoviesLensLoader
from content_based import content_based
from collaborative.item_based import (
    build_user_item_matrix,
    compute_item_similarity,
    recommend_items_item_based
)
from collaborative.funk_svd import (
    create_id_mappings,
    create_latent_matrix,
    gradient_update,
    funk_svd_recommend
)
from hybrid_rec import hybrid_recommendation

import pandas as pd
import numpy as np


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
loader = MoviesLensLoader()
ratings, items = loader.load_all()


In [83]:
print(f"The ratings counts is {ratings['rating'].value_counts().sort_index()}")
print(f"Number of unique users are: {ratings['user_id'].nunique()}")
print(f"Number of unique movies are : {ratings['item_id'].nunique()}")

The ratings counts is rating
1     6110
2    11370
3    27145
4    34174
5    21201
Name: count, dtype: int64
Number of unique users are: 943
Number of unique movies are : 1682


In [84]:
items.isna().sum()
items['release_date'] = items['release_date'].ffill()
items = items.drop(columns = ['video_release'])
items = items.set_index('item_id')
items

Unnamed: 0_level_0,title,release_date,imdb_url,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,Mat' i syn (1997),06-Feb-1998,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1679,B. Monkey (1998),06-Feb-1998,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1680,Sliding Doors (1998),01-Jan-1998,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1681,You So Crazy (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
items.iloc[1]

title                                            GoldenEye (1995)
release_date                                          01-Jan-1995
imdb_url        http://us.imdb.com/M/title-exact?GoldenEye%20(...
genre_0                                                         0
genre_1                                                         1
genre_2                                                         1
genre_3                                                         0
genre_4                                                         0
genre_5                                                         0
genre_6                                                         0
genre_7                                                         0
genre_8                                                         0
genre_9                                                         0
genre_10                                                        0
genre_11                                                        0
genre_12  

### Implementing Content Based Recommedation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words="english")
tf_title = tf.fit_transform(items["title"].str.lower())

tf_df = pd.DataFrame(
    tf_title.toarray(),
    columns=tf.get_feature_names_out()
).reset_index(drop=True)

genre_cols = [f"genre_{i}" for i in range(19)]
content_features = pd.concat(
    [tf_df, items[genre_cols].reset_index(drop=True)],
    axis=1
)


### Implementing Item based Collaboratie

In [7]:
user_item_matrix = build_user_item_matrix(ratings)
item_similarity_df = compute_item_similarity(user_item_matrix)


In [9]:
uid = int(input("User id for Item-CF: "))
ids = recommend_items_item_based(uid, user_item_matrix, item_similarity_df)
items[items["item_id"].isin(ids)][["item_id", "title"]]


Unnamed: 0,item_id,title
55,56,Pulp Fiction (1994)
81,82,Jurassic Park (1993)
95,96,Terminator 2: Judgment Day (1991)
194,195,"Terminator, The (1984)"
201,202,Groundhog Day (1993)


### Implementing FunkSVD Recommedation

In [10]:
user_map, item_map, u_len, i_len = create_id_mappings(ratings)
U, I = create_latent_matrix(u_len, i_len)

U, I, user_bias, item_bias, global_mean = gradient_update(
    U, user_map, I, item_map, ratings
)


In [11]:
uid = int(input("User id for FunkSVD: "))
ids = funk_svd_recommend(
    U, user_map, I, item_map,
    uid, ratings,
    user_bias, item_bias, global_mean
)

items[items["item_id"].isin(ids)][["item_id", "title"]]


Unnamed: 0,item_id,title
6,7,Twelve Monkeys (1995)
113,114,Wallace & Gromit: The Best of Aardman Animatio...
174,175,Brazil (1985)
245,246,Chasing Amy (1997)
886,887,Eve's Bayou (1997)


### Hybrid

In [12]:
uid = int(input("User id: "))
movie = input("Movie you liked: ")

seed_idx = items[items["title"] == movie].index[0]

ids = hybrid_recommendation(
    uid, seed_idx,
    items, ratings,
    content_features,
    U, I, user_map, item_map,
    user_bias, item_bias, global_mean
)

items[items["item_id"].isin(ids)][["item_id", "title"]]


Unnamed: 0,item_id,title
90,91,"Nightmare Before Christmas, The (1993)"
239,240,Beavis and Butt-head Do America (1996)
624,625,"Sword in the Stone, The (1963)"
945,946,"Fox and the Hound, The (1981)"
968,969,Winnie the Pooh and the Blustery Day (1968)


### Save the Models

In [13]:
import pickle
import numpy as np

np.save("artifacts/U.npy", U)
np.save("artifacts/I.npy", I)
np.save("artifacts/user_bias.npy", user_bias)
np.save("artifacts/item_bias.npy", item_bias)
np.save("artifacts/global_mean.npy", global_mean)

with open("artifacts/user_map.pkl", "wb") as f:
    pickle.dump(user_map, f)

with open("artifacts/item_map.pkl", "wb") as f:
    pickle.dump(item_map, f)
