<a href="https://colab.research.google.com/github/madhumithadasarathy/BharatIntern/blob/main/Bharat_Intern_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
#Dataset Link - https://files.grouplens.org/datasets/movielens/ml-25m.zip

# Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Loading the Dataset

In [10]:
# Load the MovieLens movie metadata (columns: movieId, title, genres).
# NOTE(review): "/content/" looks like the Colab upload directory — the file
# must be placed there before this cell runs; confirm for other environments.
df = pd.read_csv("/content/movies.csv")
df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [11]:
df.shape

(62423, 3)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [13]:
df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [14]:
import re
def clean(title):
  """Strip every character that is not an ASCII letter, digit, or space.

  Removed characters are dropped, not replaced, so the spaces that
  surrounded them are kept as-is (e.g. "A & B" becomes "A  B").
  """
  disallowed = re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("", title)

In [15]:
# Add a punctuation-free copy of every title (see clean) to use for text matching.
# Note: this mutates df in place by adding the "New Title" column.
df['New Title'] = df["title"].apply(clean)
df.head(2)

Unnamed: 0,movieId,title,genres,New Title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995


In [16]:
# Fit a TF-IDF vectorizer on the cleaned titles, using single words and
# two-word phrases (ngram_range=(1,2)) as features.
tfidf = TfidfVectorizer(ngram_range=(1,2))
# One row per movie in df; search() compares queries against this matrix.
tfidf_mat = tfidf.fit_transform(df["New Title"])

In [18]:
def search(title, top_n=5):
  """Return the movies whose cleaned titles best match ``title``.

  The query is cleaned with ``clean``, vectorized with the fitted ``tfidf``
  model, and compared to every row of ``tfidf_mat`` by cosine similarity.

  Parameters
  ----------
  title : str
      Free-text movie title typed by the user.
  top_n : int, default 5
      Maximum number of matches to return.

  Returns
  -------
  pandas.DataFrame
      Up to ``top_n`` rows of ``df``, ordered from most to least similar,
      so ``result.iloc[0]`` is the best match.
  """
  title = clean(title)
  query = tfidf.transform([title])
  sty = cosine_similarity(query,tfidf_mat).flatten()

  # argpartition raises when asked for more elements than exist, so cap k.
  k = min(top_n, len(sty))
  if k <= 0:
    return df.iloc[0:0]

  # argpartition only guarantees that the k largest scores end up in the last
  # k slots; their relative order is unspecified. Sort those k explicitly so
  # the best match really comes first.
  top = np.argpartition(sty, -k)[-k:]
  top = top[np.argsort(sty[top])[::-1]]
  return df.iloc[top]

In [21]:
import ipywidgets as widgets
from IPython.display import display
# Interactive title search: typing in the box lists the closest title matches.
user_input = widgets.Text(value = "Toy Story", description = "Movie Title:", disabled = False)
output = widgets.Output()
def on_type(data, out=output):
  """Show search results for the text currently in the search box.

  ``data`` is the change dict passed by ``observe``; ``data["new"]`` is the
  new text. ``out`` is bound to this cell's Output widget at definition
  time: a later cell rebinds the global name ``output``, and looking it up
  at call time would render these results in that other cell's output area.
  """
  with out:
    out.clear_output()
    title = data["new"]
    # Only search once more than 5 characters have been typed.
    if len(title)>5:
      display(search(title))
user_input.observe(on_type,names='value')
display(user_input,output)

Text(value='Toy Story', description='Movie Title:')

Output()

In [22]:
# Load the MovieLens user ratings (columns: userId, movieId, rating, timestamp).
# NOTE(review): "/content/" looks like the Colab upload directory — confirm
# the file is present there before running.
ratings = pd.read_csv("/content/ratings.csv")
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828


In [24]:
# Reference movie for the step-by-step walkthrough below
# (movieId 1 is "Toy Story (1995)" in movies.csv).
movie_id = 1

In [42]:
# Users who gave the reference movie a rating of 5 ("similar" users).
smlr = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5),
    "userId",
].unique()

# For every movie those users rated above 4: the share of similar users who
# did so, keeping only movies liked by more than 10% of them.
recs = (
    ratings.loc[
        ratings["userId"].isin(smlr) & (ratings["rating"] > 4),
        "movieId",
    ]
    .value_counts()
    .div(len(smlr))
    .loc[lambda share: share > .1]
)

In [35]:
recs

1       1.000000
318     0.412639
260     0.397026
356     0.343123
296     0.336803
          ...   
111     0.102602
1307    0.102230
2324    0.102230
1259    0.101859
2997    0.101115
Name: movieId, Length: 89, dtype: float64

In [43]:
# Every rating above 4 given to one of the candidate movies, by any user.
all_users = ratings.loc[
    ratings["movieId"].isin(recs.index) & (ratings["rating"] > 4)
]

# Share of those users who liked each candidate movie (the baseline popularity).
all_users_rec = (
    all_users["movieId"]
    .value_counts()
    .div(len(all_users["userId"].unique()))
)

In [38]:
all_users_rec

318      0.348036
296      0.288963
2571     0.248313
356      0.234513
593      0.229136
           ...   
1307     0.047190
380      0.045341
745      0.036917
78499    0.035786
2355     0.024999
Name: movieId, Length: 89, dtype: float64

In [39]:
# Put the two shares side by side, aligned on the movieId index:
# "similar" = share among similar users, "all" = share among all users.
percentages = pd.concat([recs,all_users_rec],axis=1)
percentages.columns = ["similar","all"]
percentages

Unnamed: 0,similar,all
1,1.000000,0.128455
318,0.412639,0.348036
260,0.397026,0.221191
356,0.343123,0.234513
296,0.336803,0.288963
...,...,...
111,0.102602,0.081436
1307,0.102230,0.047190
2324,0.102230,0.083011
1259,0.101859,0.049313


In [40]:
# Score = how much more often similar users liked a movie than users overall;
# a ratio above 1 means the movie is over-represented among similar users.
percentages["score"] = percentages["similar"]/percentages["all"]
# Highest ratio first.
percentages = percentages.sort_values("score",ascending=False)
percentages

Unnamed: 0,similar,all,score
1,1.000000,0.128455,7.784857
3114,0.297770,0.055135,5.400725
2355,0.123792,0.024999,4.951842
78499,0.130483,0.035786,3.646165
595,0.210409,0.062909,3.344666
...,...,...,...
4226,0.144238,0.134071,1.075834
58559,0.152045,0.143831,1.057108
7153,0.179554,0.171878,1.044661
79132,0.125279,0.133660,0.937296


In [41]:
# Attach titles and genres to the ten best-scoring movies; the index of
# percentages holds the movieId values that are matched against df["movieId"].
percentages.head(10).merge(df,left_index=True,right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,New Title
0,1.0,0.128455,7.784857,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.29777,0.055135,5.400725,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.123792,0.024999,4.951842,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.130483,0.035786,3.646165,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
587,0.210409,0.062909,3.344666,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.179554,0.0538,3.337462,34,Babe (1995),Children|Drama,Babe 1995
580,0.227881,0.068628,3.320536,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
729,0.111896,0.036917,3.031051,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995
1047,0.145353,0.050649,2.869816,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
359,0.237918,0.086812,2.740611,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [44]:
def sim_movies(movie_id, min_share=0.10, top_n=10, ratings_df=None, movies_df=None):
  """Recommend movies liked disproportionately by fans of ``movie_id``.

  "Similar" users are those who rated ``movie_id`` 5. For every movie that
  more than ``min_share`` of them rated above 4, the score is the share of
  similar users who liked it divided by the share of all users who liked it.

  Parameters
  ----------
  movie_id : int
      movieId of the reference movie.
  min_share : float, default 0.10
      Minimum share of similar users that must have liked a movie for it to
      be considered (strictly greater than).
  top_n : int, default 10
      Maximum number of recommendations to return.
  ratings_df : pandas.DataFrame, optional
      Ratings with columns userId, movieId, rating. Defaults to the
      notebook-level ``ratings`` frame.
  movies_df : pandas.DataFrame, optional
      Movie metadata with columns movieId, title, genres. Defaults to the
      notebook-level ``df`` frame.

  Returns
  -------
  pandas.DataFrame
      Columns ``score``, ``title``, ``genres``, highest score first. Empty
      when no user rated ``movie_id`` 5.
  """
  ratings_df = ratings if ratings_df is None else ratings_df
  movies_df = df if movies_df is None else movies_df

  # Ratings above 4 count as "liked"; computed once and reused below.
  liked = ratings_df[ratings_df["rating"] > 4]

  smlr = ratings_df[(ratings_df["movieId"] == movie_id) & (ratings_df["rating"] >= 5)]["userId"].unique()
  if len(smlr) == 0:
    # Nobody gave this movie a 5: there are no similar users to learn from,
    # and dividing by len(smlr) below would be a division by zero.
    return pd.DataFrame(columns=["score", "title", "genres"])

  # Share of similar users who liked each movie.
  recs = liked[liked["userId"].isin(smlr)]["movieId"].value_counts() / len(smlr)
  recs = recs[recs > min_share]

  # Share of all users (who liked any candidate) that liked each candidate.
  all_users = liked[liked["movieId"].isin(recs.index)]
  all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  # Align both shares on movieId and rank by their ratio.
  percentages = pd.concat([recs, all_users_rec], axis=1)
  percentages.columns = ["similar", "all"]
  percentages["score"] = percentages["similar"] / percentages["all"]
  percentages = percentages.sort_values("score", ascending=False)
  return percentages.head(top_n).merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [52]:
# Interactive recommender: typing a title shows movies liked by fans of the
# best-matching movie.
name = widgets.Text(value="Toy Story",description = "Movie Title: ", disabled=False)
output = widgets.Output()
def on_type(data, out=output):
  """Show recommendations for the movie that best matches the typed title.

  ``data`` is the change dict passed by ``observe``; ``data["new"]`` is the
  new text. ``out`` is bound to this cell's Output widget at definition
  time, so re-running another cell that rebinds the global name ``output``
  cannot redirect where these results are rendered.
  """
  with out:
    out.clear_output()
    title = data["new"]
    # Only search once more than 5 characters have been typed.
    if len(title)>5:
      results = search(title)
      # The first row returned by search is treated as the intended movie.
      movie_id = results.iloc[0]["movieId"]
      display (sim_movies(movie_id))
name.observe(on_type, names="value")
display(name,output)

Text(value='Toy Story', description='Movie Title: ')

Output()