## Installing libraries

In [None]:
!pip install scikit-learn

## Logging configuration

In [57]:
import logging

# Configure the root logger to emit every record from DEBUG level upwards,
# so the logging.info / logging.error messages used in this notebook are shown.
logging.basicConfig(level=logging.DEBUG)

## Search similar movies by title

In [59]:
import pandas as pd
from IPython.display import display

def read_file(link):
    '''
    Load a CSV file into a Pandas dataframe.

    Parameters:
        link: location of the CSV file, passed straight to pd.read_csv.

    Returns:
        The loaded dataframe, or None when the file does not exist
        (an error is logged in that case).
    '''
    try:
        frame = pd.read_csv(link)
    except FileNotFoundError:
        # A missing file is reported but not raised, so callers get None
        logging.error('❌ File not found.')
        return None
    else:
        logging.info('✅ Data loaded successfully!')
        return frame

def show_file(data):
    '''
    Display the first 5 rows of a Pandas dataframe.

    Parameters:
        data: the dataframe to preview.

    If an AttributeError occurs (for example when `data` is None and has
    no .head()), an error is logged instead of raising.
    '''
    try:
        preview = data.head()
        display(preview)
    except AttributeError:
        logging.error('❌ It was not possible to show this file.')

def show_types(data):
    '''
    Display the dtype of every column of a Pandas dataframe.

    Parameters:
        data: the dataframe whose column types are shown.

    If an AttributeError occurs (for example when `data` is None and has
    no .dtypes), an error is logged instead of raising.
    '''
    try:
        column_types = data.dtypes
        display(column_types)
    except AttributeError:
        logging.error('❌ It was not possible to show the types of this file.')

# Load the movie catalogue (read_file returns None if the file is missing)
# and preview its first rows.
movies = read_file("files/movies.csv")
show_file(movies)

INFO:root:✅ Data loaded successfully!


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [61]:
import re

def clean_title(title):
    '''
    Normalise a movie title for text matching.

    Accented characters are first folded to their base letter
    (e.g. "é" -> "e"), then every character that is not an ASCII letter,
    a digit or a space is removed.

    Parameters:
        title (str): the raw title, e.g. "Toy Story (1995)".

    Returns:
        str: the cleaned title, e.g. "Toy Story 1995".
    '''
    # Local import keeps this cell runnable on its own
    import unicodedata

    # NFKD splits an accented letter into base letter + combining mark; the
    # mark is then dropped by the filter below while the base letter is kept.
    # Without this step "Amélie" would lose the whole letter and become "Amlie".
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

# Add a normalised copy of each title as a new "clean_title" column
# (this mutates `movies` in place) and preview the result.
movies["clean_title"] = movies["title"].apply(clean_title)
show_file(movies)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer that builds features from single words and
# from two-word sequences (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit the vectorizer on the cleaned titles and encode every title as a
# TF-IDF weighted vector; `tfidf` has one row per entry of movies["clean_title"].
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [63]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search_similar_movies_title(title, data, top_n=5):
    '''
    Return the movies whose titles are most similar to the given title.

    The title is cleaned with clean_title, vectorised with the module-level
    `vectorizer`, and compared by cosine similarity against the module-level
    `tfidf` matrix.

    Parameters:
        title (str): the title typed by the user.
        data: dataframe of movies; its rows must be in the same positional
            order as the rows of `tfidf`, because results are picked with iloc.
        top_n (int): how many movies to return (default 5). Fewer rows are
            returned when the catalogue is smaller than top_n.

    Returns:
        The selected rows of `data`, ordered from most to least similar,
        so that .iloc[0] is the best match.
    '''
    title = clean_title(title)
    query_vec = vectorizer.transform([title])  # Turn the title into a vector
    similarity = cosine_similarity(query_vec, tfidf).flatten()  # One score per movie

    # argpartition raises if asked for more elements than exist
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return data.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest scores; their relative order is undefined, so they are
    # sorted explicitly (descending) afterwards.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]

    return data.iloc[ranked]

In [64]:
import ipywidgets as widgets
from IPython.display import display

# Text box where the user types the movie title to look up
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

# Output area that will hold the table of the most similar movies
movie_list = widgets.Output()

def on_type_title(data):
    '''
    Callback run every time the text in movie_input changes.

    Parameters:
        data: the change notification; data["new"] holds the new text.

    The movie_list output area is cleared first. When the new text has
    more than 5 characters, the movies with the most similar titles are
    displayed inside it; otherwise a hint is logged.
    '''
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        # Guard clause: too short to search, just log the hint
        if len(typed) <= 5:
            logging.info('ℹ️ Enter at least 6 characters to search for similar movies!')
            return
        display(search_similar_movies_title(typed, movies))

# Call on_type_title every time the text in movie_input changes
movie_input.observe(on_type_title, names='value')
# Render the input box followed by the results area
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

## Search movie recommendations by ratings

In [65]:
class NoDataFiltered(Exception):
    '''
    Signals that applying a filter to a dataframe matched no rows.
    '''

def filter_movie_by_id(movie_id, data):
    '''
    Look up a movie by its ID.

    Parameters:
        movie_id: value compared against the "movieId" column.
        data: dataframe with a "movieId" column.

    Returns:
        The rows of `data` whose "movieId" equals movie_id, or None
        (after printing a message) when no row matches.
    '''
    matches = data[data["movieId"] == movie_id]
    if len(matches) == 0:
        # Printing the exception instance prints its message
        print(NoDataFiltered("❌ There isn't any movie with this ID."))
        return None
    return matches

In [66]:
# Load the user ratings, then preview the first rows and the column dtypes
ratings = read_file("files/ratings.csv")
show_file(ratings)
show_types(ratings)

INFO:root:✅ Data loaded successfully!


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [67]:
def search_similar_movies_ratings(movie_id, min_rating=4, min_share=.10, top_n=10):
    '''
    Recommend movies based on the ratings of users who liked a given movie.

    Reads the module-level `ratings` dataframe (columns "userId", "movieId",
    "rating") and the module-level `movies` dataframe (columns "movieId",
    "title", "genres").

    Parameters:
        movie_id: ID of the movie to find recommendations for.
        min_rating: a rating counts as a "like" only when it is strictly
            greater than this value (default 4).
        min_share: a movie is a candidate only when more than this fraction
            of the similar users liked it (default 0.10).
        top_n: maximum number of recommendations to return (default 10).

    Returns:
        A dataframe with the columns "score", "title" and "genres", sorted
        by score in descending order. The score is the share of similar
        users who liked the movie divided by the share of all users who
        liked it.
    '''
    # Every step below only looks at "liked" ratings, so filter once and reuse
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the movie with the specified ID
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Share of those similar users who liked each movie
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # Keep only the movies liked by more than `min_share` of the similar users
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Among everyone who liked at least one candidate movie, the share of
    # users who liked each candidate
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Put both shares side by side, aligned on the movie ID
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high score means the movie is liked far more often by similar users
    # than by users in general
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Attach title and genres to the best-scoring movies
    cols = ["score", "title", "genres"]
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[cols]

# Example: look up one movie by its ID, show it, then show the movies
# recommended from the ratings of the users who liked it.
movie_id = 89745
movie=filter_movie_by_id(movie_id, movies)
display(movie)
similar_movies = search_similar_movies_ratings(movie_id)
display(similar_movies)

Unnamed: 0,movieId,title,genres,clean_title
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012


Unnamed: 0,score,title,genres
17067,24.716368,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
20513,19.610199,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX
25058,19.49177,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi
19678,17.867419,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
16725,17.843074,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
16312,17.299824,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX
21348,17.183667,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX
25071,16.649399,Captain America: Civil War (2016),Action|Sci-Fi|Thriller
25061,15.865628,Ant-Man (2015),Action|Adventure|Sci-Fi
14628,15.651921,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX


In [68]:
import ipywidgets as widgets
from IPython.display import display

# Text box where the user types the movie title to get recommendations for
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

# Output area that will hold the table of recommended movies
recommendation_list = widgets.Output()

def on_type_rec(data):
    '''
    Callback run every time the text in movie_name_input changes.

    Parameters:
        data: the change notification; data["new"] holds the new text.

    The recommendation_list output area is cleared first. When the new
    text has more than 5 characters, the first row returned by the title
    search is taken as the selected movie and its rating-based
    recommendations are displayed; otherwise a hint is logged.
    '''
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        # Guard clause: too short to search, just log the hint
        if len(typed) <= 5:
            logging.info('ℹ️ Enter at least 6 characters to search for similar movies!')
            return
        title_matches = search_similar_movies_title(typed, movies)
        selected_id = title_matches.iloc[0]["movieId"]
        display(search_similar_movies_ratings(selected_id))

# Call on_type_rec every time the text in movie_name_input changes,
# then render the input box followed by the recommendations area
movie_name_input.observe(on_type_rec, names='value')
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

## Unit tests

Note: the test file `tests_project1.py` already exists in the `project1` directory.

In [None]:
!pip install pytest pytest-sugar

In [None]:
# Run the unit tests defined in tests_project1.py
!pytest tests_project1.py