# Recomendation System

In [351]:
import pandas as pd

## Import Datasets

In [352]:
def import_csv(csv):
    return pd.read_csv(csv)

df_movies = import_csv("movies.csv")
df_raking = import_csv("ratings.csv")

We need the movies and the ratings to build the recomendation

## Clean datasets

In [353]:
# to build the search engine we will use
import re # python regular expression library

In [354]:
# this will filter values that contains only contains a-zA-Z0-9
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [355]:
df_movies["title"] = df_movies["title"].apply(clean_title)

In [356]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji 1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men 1995,Comedy|Romance
3,4,Waiting to Exhale 1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II 1995,Comedy
...,...,...,...
62418,209157,We 2018,Drama
62419,209159,Window of the Soul 2001,Documentary
62420,209163,Bad Poems 2018,Comedy|Drama
62421,209169,A Girl Thing 2001,(no genres listed)


In [357]:
df_movies["year"] = df_movies["title"].str.extract(r'(\d{4})$')

In [358]:
df_movies["title"] = df_movies["title"].str.replace(r'(\d{4})$', "").str.strip()

  df_movies["title"] = df_movies["title"].str.replace(r'(\d{4})$', "").str.strip()


In [359]:
df_movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
62418,209157,We,Drama,2018
62419,209159,Window of the Soul,Documentary,2001
62420,209163,Bad Poems,Comedy|Drama,2018
62421,209169,A Girl Thing,(no genres listed),2001


## TDIDF

- TD (Term Frequency)
- IDF (Inverse Document Frequency)

In [360]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [361]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

**ngram_range** controls the creation of n-grams from the input text. 

- ngram_range=(1, 1): This means only unigrams (single words) will be considered.
- ngram_range=(1, 2): This means both unigrams and bigrams will be considered.
- ngram_range=(2, 2): This means only bigrams (pairs of words) will be considered.

ngram_range=(1, 2) for "Machine learning is fascinating,"

("Machine learning", "learning is", "is fascinating")

In [362]:
tfidf = vectorizer.fit_transform(df_movies["title"])

In [363]:
from sklearn.metrics.pairwise import cosine_similarity

${Cosine Similarity} = \frac{\sum_{i=1}^{n} {A}_{i} {B}_{i}}{{\sqrt\sum_{i=1}^{n} {A}_{i}^2} + {\sqrt\sum_{i=1}^{n} {B}_{i}^2}} $

Cosine Similarity tell us how similar or different things are

In [364]:
import numpy as np

In [365]:
# Search Function
def search(title):    
    title = clean_title(title)
    
    # use the vectorizer to turn the search term enter into a set of numbers
    query_vec = vectorizer.transform([title])
    
    # find the simularity between the search terms and titles
    # this will compare "query_vec", to each of the titles we have in our dataset and it will return how silimiar the title is to each of those titles
    # .flatten() function is used to convert this matrix into a 1D array (vector).
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    
    # find the titles that have the greastest similarity to our search term
    # this it will find the 5 most similar titles to the search term
    #  The -5 argument specifies that you want to find the indices that correspond to the smallest 5 elements.
    indices = np.argpartition(similarity, -5)[-5:]
    results = df_movies.iloc[indices][::-1]
    return results

## Build Search Box

In [366]:
import ipywidgets as widgets
from IPython.display import display

In [367]:
movie_input = widgets.Text(
    value="",
    description = "Movie Title:",
    disabled = False
)

movie_list = widgets.Output()

In [368]:
def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

In [369]:
movie_input.observe(on_type, names="value")

In [370]:
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

## Finding users who liked the same movie

In [371]:
movie_id = 1

In [372]:
# Select people that had like that same movie as me
similar_users = df_raking[(df_raking["movieId"] == movie_id) & (df_raking["rating"] >= 4)]["userId"].unique()

**df_raking[(df_raking["movieId"] == movie_id) & (df_raking["rating"] >= 4)]** \
Select rows that corresponde to the same **movie_id** and Select the ones that have a **rating** >= 4 \
This take rows with movieId == movie_id and with the **rating** >= 4

**["userId"].unique()** \
This select the unique userID's that had rank the movieId >= 4 \
This it will display an array


In [373]:
# Select others movies that the same people had like
similar_user_recs = df_raking[(df_raking["userId"].isin(similar_users)) & (df_raking["rating"] >= 4)]["movieId"]

In [374]:
type(df_raking["userId"]) 

pandas.core.series.Series

Since we trying to compare to arrays we need to use **.isin()** \
The **.isin()** method is used to check if the values in one array (or Series) are present in another array (or Series). \
It's commonly used for filtering and selecting data in DataFrames.

In [375]:
similar_user_recs

254              1
255             29
256             32
257             50
258            111
             ...  
24999332    166643
24999342    171763
24999348    177593
24999351    177765
24999378    198609
Name: movieId, Length: 5101989, dtype: int64

The resulting variable **similar_user_recs** holds the "movieId" values for movies that were highly rated (rating >= 4) by users in the **similar_users** list.

## Find only the movies that greater than 10% of the users who are similar to us liked

In [376]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

The resulting variable **similar_user_recs** holds the proportional frequencies of each "movieId" among the \
highly-rated movies by similar users.

In [377]:
similar_user_recs

1         1.000000
318       0.549604
260       0.531518
356       0.517224
296       0.495744
            ...   
153913    0.000027
153917    0.000027
6501      0.000027
41704     0.000027
198609    0.000027
Name: movieId, Length: 30595, dtype: float64

In [378]:
# Select the movies that are greater than 10%
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [379]:
similar_user_recs

1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
235     0.101249
1242    0.100931
1907    0.100772
3527    0.100613
2761    0.100135
Name: movieId, Length: 273, dtype: float64

## Finding how much all users like movies

In [380]:
# Find how much all of the users in the dataset like these movies
all_users = df_raking[(df_raking["movieId"].isin(similar_user_recs.index)) & (df_raking["rating"] >= 4)]

In [381]:
# Find what % of all users recommend each of these movies
all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

## Creating a recomendation score

In [382]:
# Concat people that like the same as us and all users 
rec_precentage = pd.concat([similar_user_recs, all_users_rec], axis=1)
rec_precentage.columns = ["similar", "all"]

In [383]:
rec_precentage

Unnamed: 0,similar,all
1,1.000000,0.235415
318,0.549604,0.440215
260,0.531518,0.325251
356,0.517224,0.367553
296,0.495744,0.389659
...,...,...
235,0.101249,0.055281
1242,0.100931,0.050805
1907,0.100772,0.039805
3527,0.100613,0.056879


In [384]:
# Score column
rec_precentage["score"] = (rec_precentage["similar"] / rec_precentage["all"]) 

In [385]:
# Sort the scores
rec_precentage.sort_values("score", ascending=False)

Unnamed: 0,similar,all,score
1,1.000000,0.235415,4.247819
3114,0.328914,0.102241,3.217054
78499,0.161924,0.057710,2.805840
2355,0.191095,0.068978,2.770367
2081,0.120714,0.047128,2.561408
...,...,...,...
99114,0.112732,0.091209,1.235967
2959,0.351826,0.292519,1.202745
6016,0.118380,0.099007,1.195678
109487,0.117426,0.102603,1.144469


In [386]:
# take our top 10 recomendations and merge with the movies data
rec_precentage.iloc[:10].merge(df_movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,year
0,1.0,0.235415,4.247819,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
314,0.549604,0.440215,1.24849,318,Shawshank Redemption The,Crime|Drama,1994
257,0.531518,0.325251,1.634178,260,Star Wars Episode IV A New Hope,Action|Adventure|Sci-Fi,1977
351,0.517224,0.367553,1.407209,356,Forrest Gump,Comedy|Drama|Romance|War,1994
292,0.495744,0.389659,1.27225,296,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
585,0.468509,0.361897,1.294592,593,Silence of the Lambs The,Crime|Horror|Thriller,1991
2480,0.467793,0.347994,1.344256,2571,Matrix The,Action|Sci-Fi|Thriller,1999
1166,0.446339,0.276225,1.615853,1196,Star Wars Episode V The Empire Strikes Back,Action|Adventure|Sci-Fi,1980
1168,0.428545,0.259325,1.652537,1198,Raiders of the Lost Ark Indiana Jones and the ...,Action|Adventure,1981
1179,0.423798,0.243456,1.740759,1210,Star Wars Episode VI Return of the Jedi,Action|Adventure|Sci-Fi,1983


## Build recomendation function

In [398]:
def recomendation(movie_id):
    # Select people that had like that same movie as me
    similar_users = df_raking[(df_raking["movieId"] == movie_id) & (df_raking["rating"] >= 4)]["userId"].unique()
    # Select others movies that the same people had like
    similar_user_recs = df_raking[(df_raking["userId"].isin(similar_users)) & (df_raking["rating"] >= 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    # Find how much all of the users in the dataset like these movies
    all_users = df_raking[(df_raking["movieId"].isin(similar_user_recs.index)) & (df_raking["rating"] >= 4)]
    # Find what % of all users recommend each of these movies
    all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    # Concat people that like the same as us and all users 
    rec_precentage = pd.concat([similar_user_recs, all_users_rec], axis=1)
    rec_precentage.columns = ["similar", "all"]
    # Score column
    rec_precentage["score"] = (rec_precentage["similar"] / rec_precentage["all"]) 
    # Sort the scores
    rec_precentage.sort_values("score", ascending=False)
    # take our top 10 recomendations and merge with the movies data
    return rec_precentage.iloc[:10].merge(df_movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    

In [399]:
movie_input_name = widgets.Text(
    value="",
    description = "Movie Title:",
    disabled = False
)

In [400]:
recomendation_list = widgets.Output()

In [401]:
def on_type(data):
    with recomendation_list:
        recomendation_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(recomendation(movie_id))

In [402]:
movie_input_name.observe(on_type, names="value")

In [403]:
display(movie_input_name, recomendation_list)

Text(value='', description='Movie Title:')

Output()