<a href="https://colab.research.google.com/github/mohammedaadhilkma/Movie-Recommendation-System/blob/main/MR_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset CSVs stored there can be read.
drive.mount ('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
# Path of the movies CSV that is loaded with pd.read_csv in the next cell.
file_path='/content/drive/MyDrive/Dataset/movies.csv'

In [52]:
import pandas as pd
# Load the movie table from file_path; the bare name on the last line displays it.
movies = pd.read_csv(file_path)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [53]:
import re

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed."""
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)

In [54]:
# Add a punctuation-free copy of every title (via clean_title) as a new "clean_title" column.
movies["clean_title"] = movies["title"].apply(clean_title)

In [55]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): index both single words and two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix (one row per movie).
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [57]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title, top_n=5):
  """Return the movies whose cleaned titles are most similar to ``title``.

  The query is normalised with ``clean_title``, vectorised with the fitted
  ``vectorizer`` and compared against ``tfidf`` by cosine similarity.

  Args:
    title: Free-text movie title to look up.
    top_n: Maximum number of matches to return (defaults to 5).

  Returns:
    Rows of ``movies`` ordered from most to least similar, so ``.iloc[0]``
    is the best match.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  # Clamp so argpartition does not raise when there are fewer than top_n titles.
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]
  # argpartition only guarantees *which* entries are the top_n largest, not
  # their order, so sort that small slice explicitly by descending similarity.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranked = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[ranked]

In [62]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the search query, pre-filled with "Toy Story".
movie_input = widgets.Text(
    value="Toy Story",
    description='Movie Title:',
    disabled=False
)
# Output area that on_type clears and refills with search results.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the title text box changes.

    Args:
        data: Change dictionary passed by the widget's ``observe``; the
            updated text is read from ``data["new"]``.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # Only search once the query is longer than 5 characters.
        if len(title) > 5:
            display(search(title))

# Call on_type every time the text box's value changes.
movie_input.observe(on_type, names="value")


# Render the text box and the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [63]:
# Load the ratings table (userId, movieId, rating, timestamp) used for the recommendations below.
ratings = pd.read_csv('/content/drive/MyDrive/Dataset/ratings.csv')

In [64]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
433953,3000,5952,4.0,1111264588
433954,3000,6016,5.0,1111264838
433955,3000,7199,5.0,1111264863
433956,3000,8368,4.0,1111265040


In [65]:
ratings.dtypes

Unnamed: 0,0
userId,int64
movieId,int64
rating,float64
timestamp,int64


In [66]:
# Example movie for the step-by-step walkthrough; movieId 1 is "Toy Story (1995)" in the movies table.
movie_id = 1

In [67]:
# Users who rated the chosen movie above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [68]:
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  111,  120,  127,
        143,  152,  158,  160,  162,  171,  186,  188,  211,  217,  229,
        230,  235,  249,  257,  259,  297,  298,  302,  323,  329,  355,
        359,  369,  371,  381,  392,  402,  411,  428,  435,  439,  447,
        449,  468,  469,  477,  484,  513,  519,  537,  540,  541,  548,
        551,  553,  561,  567,  573,  582,  593,  607,  609,  611,  623,
        624,  626,  628,  631,  644,  653,  654,  670,  683,  686,  694,
        697,  702,  709,  727,  733,  741,  749,  752,  765,  768,  773,
        785,  791,  793,  796,  803,  805,  807,  811,  830,  834,  839,
        848,  856,  896,  904,  905,  911,  927,  947,  950,  956,  966,
        969,  986,  997, 1007, 1010, 1013, 1036, 1038, 1042, 1065, 1079,
       1092, 1096, 1101, 1118, 1123, 1131, 1138, 1140, 1141, 1143, 1146,
       1150, 1159, 1166, 1167, 1169, 1171, 1176, 1179, 1192, 1196, 1198,
       1199, 1200, 1228, 1230, 1232, 1240, 1242, 12

In [69]:
# Every movie those similar users rated above 4 (one entry per rating).
similar_users_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [70]:
similar_users_recs

Unnamed: 0,movieId
5101,1
5105,34
5111,110
5114,150
5127,260
...,...
433421,26865
433422,31878
433430,35836
433431,40819


In [71]:
# Turn raw counts into the fraction of similar users who liked each movie,
# then keep only movies liked by more than 10% of them.
similar_users_recs = (
    similar_users_recs.value_counts() / len(similar_users)
).loc[lambda share: share > .1]

In [72]:
similar_users_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,1.000000
318,0.449704
260,0.396450
593,0.325444
296,0.322485
...,...
89745,0.100592
2542,0.100592
1028,0.100592
3996,0.100592


In [73]:
# Ratings above 4, from any user, for the candidate movies found above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_users_recs.index) & (ratings["rating"] > 4)
]

In [74]:
# Fraction of the distinct users in all_users who rated each candidate movie above 4.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [76]:
all_users_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,0.342476
296,0.283652
2571,0.238542
527,0.222663
593,0.220859
...,...
2115,0.038975
50872,0.038253
78499,0.033201
1028,0.026344


In [77]:
# Side-by-side shares per movieId: among similar users vs. among all users.
rec_persentages = pd.concat(
    [similar_users_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

In [78]:
rec_persentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.121978
318,0.449704,0.342476
260,0.396450,0.217250
593,0.325444,0.220859
296,0.322485,0.283652
...,...,...
89745,0.100592,0.045110
2542,0.100592,0.050884
1028,0.100592,0.026344
3996,0.100592,0.067124


In [79]:
# A score above 1 means the movie is liked more often by similar users than by users overall.
rec_persentages["score"] = rec_persentages["similar"] / rec_persentages["all"]

In [80]:
# Put the highest-scoring movies first.
rec_persentages = rec_persentages.sort_values("score", ascending=False)

In [81]:
rec_persentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.121978,8.198225
3114,0.269231,0.053049,5.075092
2355,0.121302,0.025983,4.668434
78499,0.150888,0.033201,4.544668
1028,0.100592,0.026344,3.818351
...,...,...,...
1221,0.156805,0.127391,1.230895
7361,0.121302,0.104655,1.159059
296,0.322485,0.283652,1.136904
2858,0.177515,0.163118,1.088260


In [82]:
# Attach title and genres to the ten highest-scoring movie ids (the index holds movieId).
rec_persentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.121978,8.198225,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.269231,0.053049,5.075092,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.121302,0.025983,4.668434,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.150888,0.033201,4.544668,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
1005,0.100592,0.026344,3.818351,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
580,0.224852,0.066763,3.367919,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
4780,0.227811,0.068206,3.340018,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
587,0.213018,0.064958,3.27929,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
11361,0.118343,0.038253,3.09367,50872,Ratatouille (2007),Animation|Children|Drama,Ratatouille 2007
1047,0.153846,0.050884,3.023459,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971


In [83]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like more than users in general.

    Args:
        movie_id: ``movieId`` of the movie to base recommendations on.
        min_rating: A rating counts as a "like" only if it is strictly
            greater than this value.
        min_share: Keep only candidates liked by more than this fraction of
            the users who liked ``movie_id``.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with ``score``, ``title`` and ``genres`` columns, highest
        score first. ``score`` is the share of similar users who liked a
        movie divided by the share among all users who liked any candidate.
    """
    # Every step only looks at "liked" ratings, so filter once up front.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend (and avoids dividing by zero).
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users that liked each movie, keeping the common ones.
    similar_users_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs > min_share]

    # Share of all users (who liked any candidate) that liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_users_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_users_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [84]:
# Text box for the movie to base recommendations on, pre-filled with 'Toy Story'.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type clears and refills with recommendations.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the first search hit of the typed title.

    Args:
        data: Change dictionary passed by the widget's ``observe``; the
            updated text is read from ``data["new"]``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Queries of 5 characters or fewer are ignored.
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))
# Call on_type every time the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box and the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()