# Content Based Recommender Systems

In [1]:
from math import*

def square_root(x):
    return sqrt(sum([a*a for a in x]))

def cosine_similarity(x,y):
    numerator=sum([x*y for x,y in zip(x,y)])
    denominator=square_root(x)*square_root(y)
    return round(numerator/denominator,3)

In [2]:
print(cosine_similarity([0.5,0.5],[0,0.3]))

0.707


We want to base on our movie data to recommend similar movies based on similar plot, genre, director, etc. For example if we search for Titanic movie, similar movies should be recommended to us, which can be done based on the plots, directors,cast etc...

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity # to perform same work as the cosine similarity we created above
from sklearn.feature_extraction.text import CountVectorizer #perform same work as the Document Term Frequency


In [5]:
pd.set_option('display.max_columns',100)
df=pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

In [6]:
df.shape

(250, 38)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [8]:
#for col in df.columns:
    #print(col)
print(list(df.columns))

['Unnamed: 0', 'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Ratings.Source', 'Ratings.Value', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'tomatoMeter', 'tomatoImage', 'tomatoRating', 'tomatoReviews', 'tomatoFresh', 'tomatoRotten', 'tomatoConsensus', 'tomatoUserMeter', 'tomatoUserRating', 'tomatoUserReviews', 'tomatoURL', 'DVD', 'BoxOffice', 'Production', 'Website', 'Response']


In [9]:
print(len((list(df.columns))))

38


1. there are too many features some are relevant and sum are irrelevant
2. In this case we are going to use 'Title','genre','Director', 'Actors','Plot'
3. Title='movie name', genre='category(eg;comedy, romance,horror)'

we will base on the 'Title', 'Genre', 'Director','Actors','Plot'

In [11]:
df=df[['Title','Genre','Director','Actors','Plot']]

In [13]:
df.shape

(250, 5)

In [14]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [29]:
#discarding the commas between actors full names  and getting only the first three actors
df['Actors']=df['Actors'].map(lambda x: x.split(",")[:3])

In [30]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton]",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan]",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"[Al Pacino, Robert Duvall, Diane Keaton]",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart]",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"[Martin Balsam, John Fiedler, Lee J. Cobb]",A jury holdout attempts to prevent a miscarria...


In [31]:
df['Genre']=df['Genre'].map(lambda x: x.lower().split(','))

In [32]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"[crime, drama]",Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton]",Two imprisoned men bond over a number of years...
1,The Godfather,"[crime, drama]",Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan]",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"[crime, drama]",Francis Ford Coppola,"[Al Pacino, Robert Duvall, Diane Keaton]",The early life and career of Vito Corleone in ...
3,The Dark Knight,"[action, crime, drama]",Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart]",When the menace known as the Joker emerges fro...
4,12 Angry Men,"[crime, drama]",Sidney Lumet,"[Martin Balsam, John Fiedler, Lee J. Cobb]",A jury holdout attempts to prevent a miscarria...


In [33]:
df['Director']=df['Director'].map(lambda x:x.split(' '))

In [34]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"[crime, drama]","[Frank, Darabont]","[Tim Robbins, Morgan Freeman, Bob Gunton]",Two imprisoned men bond over a number of years...
1,The Godfather,"[crime, drama]","[Francis, Ford, Coppola]","[Marlon Brando, Al Pacino, James Caan]",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"[crime, drama]","[Francis, Ford, Coppola]","[Al Pacino, Robert Duvall, Diane Keaton]",The early life and career of Vito Corleone in ...
3,The Dark Knight,"[action, crime, drama]","[Christopher, Nolan]","[Christian Bale, Heath Ledger, Aaron Eckhart]",When the menace known as the Joker emerges fro...
4,12 Angry Men,"[crime, drama]","[Sidney, Lumet]","[Martin Balsam, John Fiedler, Lee J. Cobb]",A jury holdout attempts to prevent a miscarria...


In [35]:
#convert director and actors names to lower case to avoid duplicates
for index, row in df.iterrows():
    row['Actors']=[x.lower().replace(' ','') for x in row['Actors']]
    row['Director']=''.join(row['Director']).lower()

In [36]:
df['Actors']

0                 [timrobbins, morganfreeman, bobgunton]
1                    [marlonbrando, alpacino, jamescaan]
2                  [alpacino, robertduvall, dianekeaton]
3             [christianbale, heathledger, aaroneckhart]
4                 [martinbalsam, johnfiedler, leej.cobb]
                             ...                        
245                [raymilland, janewyman, phillipterry]
246     [brielarson, johngallagherjr., stephaniebeatriz]
247           [carygrant, rosalindrussell, ralphbellamy]
248    [sissyspacek, janegallowayheitz, josepha.carpe...
249                [devpatel, saurabhshukla, anilkapoor]
Name: Actors, Length: 250, dtype: object

In [37]:
df['Director']

0                  frankdarabont
1             francisfordcoppola
2             francisfordcoppola
3               christophernolan
4                    sidneylumet
                 ...            
245                  billywilder
246          destindanielcretton
247                  howardhawks
248                   davidlynch
249    dannyboyle,loveleentandan
Name: Director, Length: 250, dtype: object

In [38]:
pip install rake_nltk

Collecting rake_nltkNote: you may need to restart the kernel to use updated packages.

  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake_nltk
Successfully installed rake_nltk-1.0.6



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import rake_nltk
from rake_nltk import Rake #for extracting significant keywords in the entire text available
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Intel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Intel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [42]:
#initializing the new column
df['key_words']=""

for index, row in df.iterrows():
    plot=row['Plot']
    #instantiating Rake, by default is uses english stopwords from nltk
    #and discard all punctuation characters
    r=Rake()
    
    #extracting the words by passing th text
    r.extract_keywords_from_text(plot)
    
    #getting the dictionary with keywords and their scores
    key_words_dict_scores=r.get_word_degrees()
    
    #assigning the key words to the new column
    row['key_words']=list(key_words_dict_scores.keys())
    
#dropping the Plot column
df.drop(columns=['Plot'], inplace=True)

            

In [44]:
 key_words_dict_scores

defaultdict(<function rake_nltk.rake.Rake._build_word_co_occurance_graph.<locals>.<lambda>()>,
            {'mumbai': 3,
             'teen': 3,
             'reflects': 3,
             'upbringing': 1,
             'slums': 1,
             'accused': 1,
             'cheating': 1,
             'indian': 2,
             'version': 2,
             'wants': 1,
             'millionaire': 2,
             '?"': 2})

In [45]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,key_words
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[two, imprisoned, men, bond, number, years, fi..."
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[aging, patriarch, organized, crime, dynasty, ..."
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[early, life, career, vito, corleone, 1920s, n..."
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[menace, known, joker, emerges, mysterious, pa..."
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[jury, holdout, attempts, prevent, miscarriage..."


In [46]:
df.set_index('Title', inplace=True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[two, imprisoned, men, bond, number, years, fi..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[aging, patriarch, organized, crime, dynasty, ..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[early, life, career, vito, corleone, 1920s, n..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[menace, known, joker, emerges, mysterious, pa..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[jury, holdout, attempts, prevent, miscarriage..."


In [47]:
df['bag_of_words']=''
columns=df.columns
for index, row in df.iterrows():
    words=''
    for col in columns:
        if col != 'Director':
            words=words+ ' '.join(row[col]) +' '
        else:
            words=words +row[col]+ ' '
    row['bag_of_words'] =words

df.drop(columns =[col for col in df.columns if col!= 'bag_of_words'])

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfr...
The Godfather,crime drama francisfordcoppola marlonbrando a...
The Godfather: Part II,crime drama francisfordcoppola alpacino rober...
The Dark Knight,action crime drama christophernolan christia...
12 Angry Men,crime drama sidneylumet martinbalsam johnfied...
...,...
The Lost Weekend,drama film-noir billywilder raymilland janewy...
Short Term 12,drama destindanielcretton brielarson johngalla...
His Girl Friday,comedy drama romance howardhawks carygrant r...
The Straight Story,biography drama davidlynch sissyspacek janega...


In [48]:
#instantiating and generating the count matrix
count=CountVectorizer()
count_matrix=count.fit_transform(df['bag_of_words'])

In [49]:
count_matrix

<250x2961 sparse matrix of type '<class 'numpy.int64'>'
	with 5342 stored elements in Compressed Sparse Row format>

In [50]:
c=count_matrix.todense()
c

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [51]:
print(count_matrix[0,:])

  (0, 584)	1
  (0, 768)	1
  (0, 1011)	1
  (0, 2678)	1
  (0, 1810)	1
  (0, 306)	1
  (0, 2765)	1
  (0, 1269)	1
  (0, 1733)	1
  (0, 311)	1
  (0, 1899)	1
  (0, 2950)	1
  (0, 969)	1
  (0, 2481)	1
  (0, 888)	1
  (0, 2174)	1
  (0, 59)	1
  (0, 519)	1
  (0, 655)	1


In [52]:
#generating the cosine similarity matrix
cosine_sim=cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

In [53]:
indices=pd.Series(df.index)
indices[:20]

0                              The Shawshank Redemption
1                                         The Godfather
2                                The Godfather: Part II
3                                       The Dark Knight
4                                          12 Angry Men
5                                      Schindler's List
6         The Lord of the Rings: The Return of the King
7                                          Pulp Fiction
8                                            Fight Club
9     The Lord of the Rings: The Fellowship of the Ring
10                                         Forrest Gump
11       Star Wars: Episode V - The Empire Strikes Back
12                                            Inception
13                The Lord of the Rings: The Two Towers
14                      One Flew Over the Cuckoo's Nest
15                                           Goodfellas
16                                           The Matrix
17                   Star Wars: Episode IV - A N

In [54]:
def recommendations(title, cosine_sim=cosine_sim):
    recommended_movies=[]
    
    #getting index of the movie that matches the title
    idx=indices[indices==title].index[0]
    
    #creating Series with the similarity scores in descending order
    score_series=pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    
    #getting the indexes of the 10 most similar movies
    top_10_indexes=list(score_series.iloc[1:11].index)
    print(top_10_indexes)
    
    #population the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(df.index)[i])
    return recommended_movies

In [56]:
recommendations('The Godfather')

[2, 83, 128, 226, 100, 15, 123, 76, 110, 66]


['The Godfather: Part II',
 'Scarface',
 'Fargo',
 'Rope',
 'On the Waterfront',
 'Goodfellas',
 'Cool Hand Luke',
 'Baby Driver',
 'Casino',
 'A Clockwork Orange']

In [57]:
indices[2]

'The Godfather: Part II'