In [7]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on 
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. 

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1809k      0 --:--:-- --:--:-- --:--:-- 1805k


In [8]:
import zipfile
# Unpack the downloaded archive into the local data/ directory.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [9]:
# Load the movie metadata (movies.csv from the extracted archive) into a DataFrame.
import pandas as pd
movies = pd.read_csv('data/ml-latest-small/movies.csv')

In [10]:
# Load the user ratings (ratings.csv from the extracted archive) into a DataFrame.
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')

In [11]:
# Report the (rows, columns) shape of both DataFrames.
print(
    'The dimensions of movies dataframe are:', movies.shape,
    '\nThe dimensions of ratings dataframe are:', ratings.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [12]:
# Preview the first five rows of the movies DataFrame.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Cleaning Movie Titles 



In [13]:
# build search engine
import re # re -> python regular expression

# define a function called clean title to remove the extra characters to make the search easier
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Characters such as parentheses, colons and dashes are dropped so that
    titles are easier to match against a typed search term.
    """
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)

In [14]:
# Add a "clean_title" column holding each title after it has been passed through clean_title().
movies["clean_title"] = movies["title"].apply(clean_title)

In [15]:
# Display the movies DataFrame to check the new clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


# Creating a TFIDF Matrix

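Term frequency (TF): how often each term appears in a given title.

Inverse document frequency (IDF): down-weights terms that appear in many titles, which helps the search engine focus on the terms that are distinctive. For example, the word 'the' appears in many titles, so it isn't distinctive.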

In [18]:
# we will import TfidfVectorizer from scikit-learn which will automatically do everything
# as turning titles into numbers because computers can't understand characters
from sklearn.feature_extraction.text import TfidfVectorizer
# Create the vectorizer with ngram_range=(1, 2): besides individual words it also
# considers bigrams (pairs of consecutive words), which makes the search a bit more accurate.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vectorizer on the cleaned titles and turn them into a TF-IDF matrix
# (a numeric representation of every title).
tfidf = vectorizer.fit_transform(movies["clean_title"])

## Creating a search function

In [19]:
# compute the similarity between a term we enter and all of the movies in our list
# we will use cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# define a fn called search which takes a search term in which is the title we want to search for
def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``, best match first.

    Args:
        title: Free-text search term; it is cleaned with ``clean_title`` before matching.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        Rows of ``movies`` ordered by descending cosine similarity to the search term.
    """
    # Normalize the query the same way the indexed titles were normalized.
    title = clean_title(title)
    # Project the query into the fitted TF-IDF space.
    query_vec = vectorizer.transform([title])
    # Similarity of the query against every title in the TF-IDF matrix.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more matches than there are titles (argpartition would raise).
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the top_n largest
    # values; their relative order is undefined, so rank the candidates explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked]

# Building an Interactive search box

In [20]:
# build the interactive widget where we can actually type the name of the movie
# to do this we need to import ipywidgets
import ipywidgets as widgets
from IPython.display import display

# Text box for typing a movie title; it starts out pre-filled with "Toy Story".
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area into which the search results are rendered.
movie_list = widgets.Output()

# this fn is going to be called whenever we type smth in the box
def on_type(data):
    """Refresh the search results in response to a change of the text box value.

    ``data`` is the change notification handed over by ``observe``; its "new"
    entry is read as the text currently in the box.
    """
    with movie_list:
        # Drop whatever was rendered for the previous value.
        movie_list.clear_output()
        query = data["new"]
        # Inputs of 5 characters or fewer are ignored.
        if len(query) <= 5:
            return
        display(search(query))

# Call on_type every time the text box's value changes.
movie_input.observe(on_type, names='value')   


# Render the text box and the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [25]:
# display the ratings data (the rendered output shows only the first and last rows)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [22]:
# Check the column dtypes of the ratings DataFrame.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [24]:
# movieId of Toy Story (1995) in this dataset
movie_id = 1

# Building a Recommendation Function

In [26]:
# Finding users who liked the same movie
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A user "likes" a movie when their rating is strictly greater than ``min_rating``.

    Args:
        movie_id: ``movieId`` of the movie to base the recommendations on.
        min_rating: Ratings must be strictly above this value to count as a like (default 4).
        min_share: A candidate movie is kept only if more than this fraction of
            the similar users liked it (default 0.10).
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with the columns ``score``, ``title`` and ``genres``, sorted by
        descending ``score``. ``score`` is the share of similar users who liked
        the movie divided by the share of the general audience (every user who
        liked at least one candidate movie) who liked it; the higher the score,
        the more specific the movie is to users with similar taste.
    """
    # Boolean mask of the ratings that count as a "like".
    liked = ratings["rating"] > min_rating

    # Users who liked the same movie as us (i.e. have similar taste).
    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    # Every movie those users liked.
    similar_user_recs = ratings[ratings["userId"].isin(similar_users) & liked]["movieId"]

    # value_counts() counts how often each movie appears; dividing by the number
    # of similar users turns that into the share of similar users who liked it.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    # Keep only the movies liked by more than min_share of the similar users.
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How much does the general audience like those same candidate movies?
    all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Put the two shares side by side so they can be compared per movie.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Score = ratio of the two shares, largest first.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Merge the best candidates with the movies data to attach titles and genres.
    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

# Creating an interactive recommendation widget

In [27]:
# creating an interactive recommendation widget
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title to base recommendations on, pre-filled with "Toy Story".
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area into which the recommendations are rendered.
recommendation_list = widgets.Output()

# this fn is going to be called whenever we type smth in the box
def on_type(data):
    """Refresh the recommendations in response to a change of the text box value.

    The typed text is looked up with ``search`` and the first returned row is
    used as the movie to recommend from.
    """
    with recommendation_list:
        # Drop whatever was rendered for the previous value.
        recommendation_list.clear_output()
        query = data["new"]
        # Inputs of 5 characters or fewer are ignored.
        if len(query) <= 5:
            return
        matches = search(query)
        # The first row of the search result supplies the movieId.
        display(find_similar_movies(matches.iloc[0]["movieId"]))

# Call on_type every time the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box and the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()