In [1]:
import pandas as pd 
import re
import numpy as np
import ipywidgets as widgets 
from IPython.display  import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie table (movieId, title, genres) from a CSV in the working directory.
movies = pd.read_csv("movies.csv")

In [3]:
# Preview the loaded frame; as the cell's last expression it gets the rich display.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [4]:
# (rows, columns) of the movie table.
movies.shape

(10329, 3)

In [5]:
# Column dtypes and non-null counts, to spot missing values before text processing.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): movieId looks like an identifier, so mean/std are probably not
# meaningful here; only count/min/max are informative — confirm.
movies.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [7]:
def clean_title(title):
    """Replace every character that is not an ASCII letter or digit with a space.

    Args:
        title: Raw title string, e.g. "Toy Story (1995)".

    Returns:
        A string of the same length in which each non-alphanumeric character
        has been replaced by a single space (runs of spaces are not collapsed).
    """
    # The upper-case range must be A-Z, not A-z: A-z also spans the ASCII
    # punctuation that sits between "Z" and "a" ([ \ ] ^ _ `), which would
    # otherwise be kept in the "clean" title.
    return re.sub(r"[^a-zA-Z0-9]", " ", title)
    

In [8]:
# Add a normalised copy of each title; the TF-IDF index is fitted on this column.
movies["Clean_title"] = movies["title"].apply(clean_title)

In [9]:
# Check the new Clean_title column next to the raw title.
movies

Unnamed: 0,movieId,title,genres,Clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy,Cosmic Scrat tastrophe 2015
10325,146878,Le Grand Restaurant (1966),Comedy,Le Grand Restaurant 1966
10326,148238,A Very Murray Christmas (2015),Comedy,A Very Murray Christmas 2015
10327,148626,The Big Short (2015),Drama,The Big Short 2015


In [10]:
# Keep only the id, cleaned title and genres; the raw `title` column is dropped.
# NOTE(review): this rebinds `movies`, so re-running the cell that builds
# Clean_title afterwards raises KeyError on "title" unless movies.csv is reloaded.
movies = movies[['movieId', 'Clean_title','genres']]

In [11]:
# Confirm the reduced column set.
movies

Unnamed: 0,movieId,Clean_title,genres
0,1,Toy Story 1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji 1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men 1995,Comedy|Romance
3,4,Waiting to Exhale 1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II 1995,Comedy
...,...,...,...
10324,146684,Cosmic Scrat tastrophe 2015,Animation|Children|Comedy
10325,146878,Le Grand Restaurant 1966,Comedy
10326,148238,A Very Murray Christmas 2015,Comedy
10327,148626,The Big Short 2015,Drama


In [12]:
# Fit TF-IDF on the cleaned titles using unigrams and bigrams (ngram_range=(1, 2)).
# `tfidf` is the resulting document-term matrix, one row per row of `movies`,
# so positions in it can be used directly with `movies.iloc`.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies['Clean_title'])

In [13]:
def search(title, n_results=8):
    """Return the rows of `movies` whose titles best match `title`, best first.

    The query is normalised with `clean_title`, vectorised with the fitted
    `vectorizer`, and compared against every row of `tfidf` by cosine
    similarity.

    Args:
        title: Free-text movie title typed by the user.
        n_results: Maximum number of matches to return (default 8).

    Returns:
        A DataFrame holding at most `n_results` rows of `movies`, ordered by
        descending similarity to the query.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of range.
    top_n = min(n_results, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only selects WHICH entries are the top_n; their order inside
    # that slice is unspecified, so sort the selected few explicitly.
    top = np.argpartition(similarity, -top_n)[-top_n:]
    best_first = top[np.argsort(similarity[top])[::-1]]
    return movies.iloc[best_first]

In [14]:
# Try the search with a known title to eyeball the matches.
search('Toy Story 1995')

Unnamed: 0,movieId,Clean_title,genres
0,1,Toy Story 1995,Adventure|Animation|Children|Comedy|Fantasy
8599,78499,Toy Story 3 2010,Adventure|Animation|Children|Comedy|Fantasy|IMAX
2496,3114,Toy Story 2 1999,Adventure|Animation|Children|Comedy|Fantasy
1667,2108,L A Story 1991,Comedy|Romance
4403,5843,Toy Soldiers 1991,Action|Drama
3838,4929,Toy The 1982,Comedy
3372,4296,Love Story 1970,Drama|Romance
2245,2804,Christmas Story A 1983,Children|Comedy


In [15]:
# Interactive title search: a text box whose edits refresh a results pane.
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title :",
    disabled=False,
)
movie_list = widgets.Output()


def on_type(data):
    """Redraw the search results whenever the text box value changes.

    `data` is the change dict passed by `observe`; its "new" entry is the
    current text. Text of 5 characters or fewer only clears the pane.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


# Only react to changes of the widget's `value` trait.
movie_input.observe(on_type, names="value")

display(movie_input, movie_list)



Text(value='Toy Story', description='Movie Title :')

Output()

In [16]:
# Load the user ratings (userId, movieId, rating, timestamp) from a CSV in the working directory.
ratings = pd.read_csv("ratings.csv")

In [17]:
# Preview the ratings frame.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


In [18]:
# Column dtypes and non-null counts for the ratings.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [19]:
# Summary statistics; the min/max of `rating` show the rating scale used in this file.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [20]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of `movie_id` like more than users in general do.

    Steps:
      1. "Similar users" are those who rated `movie_id` above `min_rating`.
      2. For every movie those users also rated above `min_rating`, compute the
         share of similar users who liked it and keep shares above `min_share`.
      3. For the same movies, compute the share among all users who liked at
         least one of them.
      4. score = similar share / overall share; a higher score means the movie
         is liked disproportionately by the similar users.

    Args:
        movie_id: `movieId` of the seed movie.
        min_rating: A rating must be strictly greater than this to count as a
            "like" (default 4).
        min_share: Minimum share of similar users who must like a movie for it
            to be a candidate (default 0.10, compared with strict `>`).
        top_n: Number of highest-scoring movies to return (default 10).

    Returns:
        DataFrame with columns "score", "Clean_title" and "genres", highest
        score first; empty when no user liked `movie_id`.
    """
    # Every step only looks at "liked" ratings, so build that subset once.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked the seed movie: return an explicit empty result rather
        # than dividing by a zero user count below.
        return pd.DataFrame(columns=["score", "Clean_title", "genres"])

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_share = similar_user_recs.value_counts() / len(similar_users)
    similar_share = similar_share[similar_share > min_share]

    # Baseline popularity of the candidates among everyone who liked any of them.
    candidates = liked[liked["movieId"].isin(similar_share.index)]
    all_share = candidates["movieId"].value_counts() / len(candidates["userId"].unique())

    rec_percentages = pd.concat([similar_share, all_share], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The frame is indexed by movieId, so join on the index to attach titles and genres.
    return rec_percentages.head(top_n).merge(
        movies, left_index=True, right_on="movieId"
    )[["score", "Clean_title", "genres"]]

In [21]:
# Interactive recommender: typing a title looks up the closest movie and shows
# what its fans also liked.
movie_name_input = widgets.Text(
    value="Toy Story",
    description="Movie Title :",
    disabled=False,
)
recommendation_list = widgets.Output()


# Named differently from the search box's `on_type` handler so that this cell
# does not silently rebind that earlier definition.
def on_recommendation_type(data):
    """Refresh the recommendations whenever the text box value changes.

    `data` is the change dict passed by `observe`; its "new" entry is the
    current text. Text of 5 characters or fewer only clears the pane.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return
        results = search(title)
        if results.empty:
            return
        # Assumes `search` puts the best match in the first row.
        # Read the id from the column so it keeps its integer dtype instead of
        # going through a mixed-dtype row Series.
        movie_id = results["movieId"].iloc[0]
        display(find_similar_movies(movie_id))


# Only react to changes of the widget's `value` trait.
movie_name_input.observe(on_recommendation_type, names="value")

display(movie_name_input, recommendation_list)



Text(value='Toy Story', description='Movie Title :')

Output()