In [1]:
#Loading Data
import pandas as pd
ratings = pd.read_csv(
    "/content/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"]
)
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [2]:
movies = pd.read_csv(
    "/content/u.item",
    sep="|",
    encoding="latin-1",
    names=[
        "movie_id","title","release_date","video_release_date","IMDb_URL",
        "unknown","Action","Adventure","Animation","Children","Comedy","Crime",
        "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
        "Romance","Sci-Fi","Thriller","War","Western"
    ]
)

movies = movies[["movie_id", "title"]]
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [3]:
ratings = ratings.merge(movies, on="movie_id")
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)
...,...,...,...,...,...
99995,880,476,3,880175444,"First Wives Club, The (1996)"
99996,716,204,5,879795543,Back to the Future (1985)
99997,276,1090,1,874795795,Sliver (1993)
99998,13,225,2,882399156,101 Dalmatians (1996)


In [4]:
#EDA
ratings.isnull().sum()
ratings.head()
ratings.shape

(100000, 5)

In [5]:
#Creating user item matrix
user_item = ratings.pivot_table(
    index="user_id",
    columns="movie_id",
    values="rating"
).fillna(0)
user_item

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#Implementing Similarity Model
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(user_item)
predicted_ratings = np.dot(user_similarity, user_item.values) / np.sum(user_similarity, axis=1).reshape(-1,1)

In [16]:
#Making Recommendations
import numpy as np
def recommend_movies(user_id, k=5):
    user_index = user_id - 1
    user_predictions = predicted_ratings[user_index]

    user_rated = user_item.iloc[user_index]
    user_predictions[user_rated > 0] = 0

    top_movie_ids = np.argsort(user_predictions)[-k:][::-1]
    return movies[movies.movie_id.isin(top_movie_ids)][["movie_id","title"]]
recommend_movies(10, 5)

Unnamed: 0,movie_id,title
77,78,Free Willy (1993)
170,171,Delicatessen (1991)
179,180,Apocalypse Now (1979)
202,203,Unforgiven (1992)
316,317,In the Name of the Father (1993)


In [17]:
#Evaluation using K value
def precision_at_k(user_id, k=5):
    recs = recommend_movies(user_id, k)
    actual = ratings[(ratings.user_id == user_id) & (ratings.rating >= 4)].movie_id.values

    hits = sum([1 for m in recs.movie_id if m in actual])
    return hits / k

In [9]:
item_similarity = cosine_similarity(user_item.T)

In [10]:
#Matrix Factorization
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20)
latent = svd.fit_transform(user_item)
latent

array([[ 4.21564507e+01, -1.46291249e+00, -1.33595568e+00, ...,
        -1.54387023e+00, -2.21411367e+00,  2.78188938e+00],
       [ 8.98235172e+00,  1.14157456e+01,  1.14540431e+01, ...,
        -3.44437644e-01,  9.37375722e-01, -1.17890183e+00],
       [ 3.62469291e+00,  6.27232590e+00,  5.08928830e+00, ...,
         8.33095648e-01, -1.66568558e+00,  9.20411084e-03],
       ...,
       [ 4.76921060e+00,  6.12612138e+00,  1.34309118e+00, ...,
         3.24590514e+00,  2.68244241e+00,  4.99004688e-01],
       [ 1.53951858e+01, -1.98222402e+00,  4.98592566e+00, ...,
         1.38328777e+00,  1.92555330e-01, -1.10599804e+00],
       [ 2.70617041e+01,  2.67536209e+00, -1.27540356e+01, ...,
        -1.73397752e+00, -3.36338461e+00, -2.44130380e+00]])