# Building a Recommender Engine

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

## Load-in Data

In [2]:
df = pd.read_csv('datasets/dataframe_for_modeling.csv')
df.drop(columns='Unnamed: 0', inplace=True)
movies = pd.read_csv('datasets/movie_dataframe_no_ratings.csv')
movies.drop(columns='Unnamed: 0', inplace=True)

In [3]:
df.head(3)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year,decade,(no genres listed),action,adventure,...,musical,mystery,romance,sci-fi,thriller,war,western,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,1990s,0,0,1,...,0,0,0,0,0,0,0,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,1990s,0,0,1,...,0,0,0,0,0,0,0,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,1990s,0,0,1,...,0,0,0,0,0,0,0,7,4.5,1106635946


In [4]:
movies.head(3)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year,decade,(no genres listed),action,adventure,...,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,1990s,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,1995,1990s,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,1995,1990s,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Create Pivot Table

In [5]:
pivot = pd.pivot_table(df,
                       index='title',   #indexing on the title since this is an item-based engine
                       columns='userId',
                       values='rating')

In [6]:
pivot.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,


In order to reduce computational costs, I created a sparse matrix to deal with the missing values

In [7]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))

The recommender engine will use cosine similarity to compare films. A lower cosine similarity value indicates a stronger similarity between films.

In [8]:
recommender = pairwise_distances(sparse_pivot, metric="cosine")

A new dataframe will serve the same function as a correlation matrix. <br>
Values closest to 0 indicate that two films are similar according to the engine.

In [9]:
recommender_df = pd.DataFrame(recommender, columns=pivot.index, index=pivot.index)
recommender_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.858347,1.0,...,1.0,0.657945,0.456695,0.292893,1.0,1.0,0.860569,0.672673,1.0,1.0
'Hellboy': The Seeds of Creation (2004),1.0,0.0,0.292893,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Round Midnight (1986),1.0,0.292893,0.0,1.0,1.0,1.0,0.823223,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Salem's Lot (2004),1.0,1.0,1.0,0.0,0.142507,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Til There Was You (1997),1.0,1.0,1.0,0.142507,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


This function will search for titles that contain the search term:

In [10]:
genre = movies['genres'].loc[0]

In [11]:
movies.loc[movies['title'].str.contains("Alice in"), 'title']

789     Alice in Wonderland (1951)
7277    Alice in Wonderland (2010)
7425    Alice in Wonderland (1933)
Name: title, dtype: object

In [12]:
title = f'Alice in Wonderland (1951)'

In [13]:
genre_list = list(movies.loc[(movies['title'] == title), 'genres'])
genre_list[-1]

'Adventure|Animation|Children|Fantasy|Musical'

In [14]:
def get_recs(film):
    for title in movies.loc[movies['title'].str.contains(film), 'title']:
        print(title)
        genre_list = list(movies.loc[(movies['title'] == title), 'genres'])
        genre_list = genre_list[-1]
        print('Genre:', genre_list) 
        print('Average rating:', pivot.loc[title, :].mean())
        print('Number of ratings:', pivot.T[title].count())
        print('')
        print('10 closest films:')
        print(recommender_df[title].sort_values()[1:11])
        print('')
        print('*******************************************************************************************')
        print('')

## Strengths:

The recommender enigne clearly differentiates between remakes as seen in all the versions of Alice and Wonderland. <br>This is the best example I've found of how effective the engine can be.

In [15]:
get_recs('Alice in Wonderland')

Alice in Wonderland (1951)
Genre: Adventure|Animation|Children|Fantasy|Musical
Average rating: 3.375
Number of ratings: 40

10 closest films:
title
Peter Pan (1953)                  0.335316
Bambi (1942)                      0.383785
Robin Hood (1973)                 0.410830
Sword in the Stone, The (1963)    0.433815
Cinderella (1950)                 0.443176
Sleeping Beauty (1959)            0.452931
Pinocchio (1940)                  0.454511
Dumbo (1941)                      0.474701
Little Mermaid, The (1989)        0.488092
Jungle Book, The (1967)           0.493763
Name: Alice in Wonderland (1951), dtype: float64

*******************************************************************************************

Alice in Wonderland (2010)
Genre: Adventure|Fantasy|IMAX
Average rating: 2.875
Number of ratings: 28

10 closest films:
title
Charlie and the Chocolate Factory (2005)                 0.485144
Madagascar (2005)                                        0.523539
Hobbit: An Unexpected

## Weaknesses:

The recommender engine has diffuculty finding similarity for films with a limited number of ratings

In [16]:
get_recs('Savages, The')

Savages, The (2007)
Genre: Comedy|Drama
Average rating: 3.5
Number of ratings: 1

10 closest films:
title
Savages, The (2007)                                                                0.0
What Have I Done to Deserve This? (¿Qué he hecho yo para merecer esto!!) (1984)    0.0
Damage (Fatale) (1992)                                                             0.0
First Monday in October (1981)                                                     0.0
The Return of Don Camillo (1953)                                                   0.0
Faraway, So Close (In weiter Ferne, so nah!) (1993)                                0.0
Glenn Miller Story, The (1953)                                                     0.0
New Adventures of Pippi Longstocking, The (1988)                                   0.0
Wrong Arm of the Law, The (1963)                                                   0.0
San Francisco (1936)                                                               0.0
Name: Savages, The (2007

Not a single Coen Brothers film in the top ten most similar films to "The Big Lebowski"

In [17]:
get_recs('Lebowski')

Big Lebowski, The (1998)
Genre: Comedy|Crime
Average rating: 3.9245283018867925
Number of ratings: 106

10 closest films:
title
Reservoir Dogs (1992)                    0.396056
Clockwork Orange, A (1971)               0.421605
Snatch (2000)                            0.432700
Truman Show, The (1998)                  0.439860
Fear and Loathing in Las Vegas (1998)    0.443134
Being John Malkovich (1999)              0.446035
Full Metal Jacket (1987)                 0.451033
Kill Bill: Vol. 2 (2004)                 0.454125
Office Space (1999)                      0.455962
Fight Club (1999)                        0.460324
Name: Big Lebowski, The (1998), dtype: float64

*******************************************************************************************

