<a href="https://colab.research.google.com/github/niikkkhiil/Portfolio/blob/main/Movie_Recommandation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
movie = pd.read_csv('/content/movies.csv')
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
def clean_title(title):
  return re.sub("[^a-zA-Z0-9 ]","",title)


In [4]:
movie['clean_title'] = movie['title'].apply(clean_title)

In [None]:
movie.clean_title

Unnamed: 0,clean_title
0,Toy Story 1995
1,Jumanji 1995
2,Grumpier Old Men 1995
3,Waiting to Exhale 1995
4,Father of the Bride Part II 1995
...,...
62418,We 2018
62419,Window of the Soul 2001
62420,Bad Poems 2018
62421,A Girl Thing 2001


In [5]:
movie

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movie["clean_title"])


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title):
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  indices = np.argpartition(similarity, -5)[-5:]
  results = movie.iloc[indices].iloc[::-1]
  return results

In [8]:
import ipywidgets as widgets
from IPython.display import display
movie_input = widgets.Text(
    values = "Toy Story",
    description = "Movie Title",
    disabled = False
)
movie_list = widgets.Output()

def on_type(data):
  with movie_list:
    movie_list.clear_output()
    title = data["new"]
    if len(title) > 5:
      display(search(title))

movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='', description='Movie Title')

Output()

In [9]:
movie[movie['movieId'] == 7153].drop(columns = 'genres')

Unnamed: 0,movieId,title,clean_title
7028,7153,"Lord of the Rings: The Return of the King, The...",Lord of the Rings The Return of the King The 2003


In [10]:
ratings = pd.read_csv('/content/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [11]:
ratings.shape

(5143911, 4)

In [12]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
5143906,33415,1252,4.0,862576189
5143907,33415,1258,4.0,862576706
5143908,33415,1261,3.0,862577628
5143909,33415,1263,4.0,862576264


In [13]:
movie_Id = 296

In [14]:
similar_user = ratings[(ratings["movieId"] == movie_Id)&(ratings['movieId'] > 4)]["userId"].unique()

In [15]:
similar_user

array([    1,     3,     4, ..., 33412, 33414, 33415])

In [34]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_user)) & (ratings["rating"] > 4)]["movieId"]

In [35]:

similar_user_recs

Unnamed: 0,movieId
0,296
2,307
3,665
8,1237
16,2351
...,...
5143893,1232
5143895,1235
5143900,1245
5143903,1248


In [36]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_user)
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
296,0.541947
318,0.441396
593,0.323882
50,0.299081
2959,0.298347
...,...
6016,0.103062
364,0.102756
7438,0.102633
3147,0.101960


In [38]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index))&(ratings["rating"] > 4)]

In [39]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
37,1,6016,5.0,1147869090
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
...,...,...,...,...
5143568,33414,7438,5.0,1546201468
5143594,33414,79132,5.0,1540053877
5143676,33415,50,5.0,862576814
5143682,33415,110,5.0,862578722


In [41]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_recs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,0.352165
296,0.291877
2571,0.250684
356,0.238284
593,0.231754
...,...
778,0.073678
1617,0.072293
6016,0.071732
1222,0.067115


In [43]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
296,0.541947,0.291877
318,0.441396,0.352165
593,0.323882,0.231754
50,0.299081,0.207084
2959,0.298347,0.220606
...,...,...
6016,0.103062,0.071732
364,0.102756,0.087530
7438,0.102633,0.066225
3147,0.101960,0.083309


In [44]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
296,0.541947,0.291877,1.856767
1089,0.170484,0.102009,1.671270
778,0.115860,0.073678,1.572517
1213,0.162890,0.104680,1.556080
1222,0.104287,0.067115,1.553845
...,...,...,...
1197,0.135028,0.121797,1.108630
1,0.141029,0.129976,1.085038
260,0.241396,0.223640,1.079395
1210,0.157808,0.147851,1.067341


In [46]:
rec_percentages.head(10).merge(movie, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
292,0.541947,0.291877,1.856767,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
1062,0.170484,0.102009,1.67127,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller,Reservoir Dogs 1992
762,0.11586,0.073678,1.572517,778,Trainspotting (1996),Comedy|Crime|Drama,Trainspotting 1996
1182,0.16289,0.10468,1.55608,1213,Goodfellas (1990),Crime|Drama,Goodfellas 1990
1191,0.104287,0.067115,1.553845,1222,Full Metal Jacket (1987),Drama|War,Full Metal Jacket 1987
109,0.128475,0.082715,1.553228,111,Taxi Driver (1976),Crime|Drama|Thriller,Taxi Driver 1976
7299,0.102633,0.066225,1.549771,7438,Kill Bill: Vol. 2 (2004),Action|Drama|Thriller,Kill Bill Vol 2 2004
3907,0.11384,0.074008,1.538204,4011,Snatch (2000),Comedy|Crime|Thriller,Snatch 2000
46,0.227373,0.147851,1.537848,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Seven aka Se7en 1995
6751,0.130741,0.085716,1.525278,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,Kill Bill Vol 1 2003


In [47]:
def find_similar_movies(movie_id):
  similar_user = ratings[(ratings["movieId"] == movie_id)&(ratings["rating"] > 4)]["userId"].unique()
  similar_user_recs = ratings[(ratings["userId"].isin(similar_user)) & (ratings["rating"] > 4)]["movieId"]
  similar_user_recs = similar_user_recs.value_counts() / len(similar_user)
  similar_user_recs = similar_user_recs[similar_user_recs > .1]
  all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
  all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
  rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
  rec_percentages.columns = ["similar", "all"]
  rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
  rec_percentages =rec_percentages.sort_values("score", ascending=False)
  return rec_percentages.head(10).merge(movie, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [50]:
movie_input_name = widgets.Text(
    value='Toy Story',
    description = 'Movie Title:',
    disabled = False
)

recommandation_list = widgets.Output()

def on_type(data):
  with recommandation_list:
    recommandation_list.clear_output()
    title = data["new"]
    if len(title) > 5:
      results = search(title)
      movie_id = results.iloc[0]["movieId"]
      display(find_similar_movies(movie_id))

movie_input_name.observe(on_type, names='value')

display(movie_input_name, recommandation_list)

Text(value='Toy Story', description='Movie Title:')

Output()