#Movie recommendation system based on content similarity

##Content based recommendation system that suggests movies/ items based on similarity between movies with regards to genre, director, actors and Movie description



In [0]:
import pandas as pd
import numpy as np

In [0]:
#Reading csv file
dt = pd.read_csv("IMDB-Movie-Data.csv")
dt.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


**Extracting Textual Data for Interpretation**

In this step, relevant textual data of the movie is collected. This data is crucial to draw similarity in the movies depending on features like Actors, Director, Genre, descriptive words etc,.

When we're using actor's or director's name for capturing the similarity, we have to take care of both the first names and the last names. If two actors have the same first name, the model may try to draw a correlation between the movies of these two actors, which may be very different movies. Hence, to avoid this, we use only the last names since last names are usually different. 

In [0]:
# creating 4 columns for the 4 actors
dt[['Actor1','Actor2','Actor3','Actor4']] = dt.Actors.str.split(",", expand = True) 

#removing missing data from the concerned attributes
data = dt.dropna(subset = ['Actor1','Actor2','Actor3','Actor4'])

In [0]:
# splitting first names and last names
data['Actor1'] = data['Actor1'].apply(lambda x: x.split())
data['Actor2'] = data['Actor2'].apply(lambda x: x.split())
data['Actor3'] = data['Actor3'].apply(lambda x: x.split())
data['Actor4'] = data['Actor4'].apply(lambda x: x.split())

In [0]:

# choosing only the last name
data['Last1'] = [x[-1] for x in data['Actor1']]
data['Last2'] = [x[-1] for x in data['Actor2']]
data['Last3'] = [x[-1] for x in data['Actor3']]
data['Last4'] = [x[-1] for x in data['Actor4']]


In [0]:
# similar steps for the directors
data.Director = data.Director.apply(lambda x: x.split())

In [0]:
data['Director_Last'] = [x[-1] for x in data.Director]

**Description Pre-processing**

The description of movies needs to be pre-processed for modeling. Below are the steps involved.

In [0]:
#Removing all characters other than aplpahbets
data['Description'] = data['Description'].str.replace("[^a-zA-Z]", " ") 

In [0]:
#removing all words that do not point to a certain sentiment like a, an ,the
data['Description'] = data['Description'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [0]:
#tokenization
data['Description'] = data['Description'].apply(lambda x: x.split())


In [0]:
#stemming: including a common word for words same in meaning like play, playing 
from nltk.stem.porter import *
stemmer = PorterStemmer()

data.Description = data.Description.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming



In [0]:
#converting the lists so created back to a string 
data.Description = [str(x) for x in data.Description]

In [0]:
#combining all text data to be used for analysis
data['Text'] = data[['Description','Director_Last','Genre','Last1','Last2','Last3','Last4']].apply(lambda x:','.join(x),axis =1) 

In [164]:
data['Text'].head()

0    ['group', 'intergalact', 'crimin', 'forc', 'wo...
1    ['follow', 'clue', 'origin', 'mankind', 'team'...
2    ['three', 'girl', 'kidnap', 'with', 'diagnos',...
3    ['citi', 'humanoid', 'anim', 'hustl', 'theater...
4    ['secret', 'govern', 'agenc', 'recruit', 'some...
Name: Text, dtype: object

In [0]:
#tokenizing the merged textual data and giving numbers to 
#words which is the frequency of their occurence
from keras.preprocessing.text import Tokenizer
tk = Tokenizer(lower = True)

tk.fit_on_texts(data.Text)
X_seq = tk.texts_to_sequences(data.Text)

In [0]:
#padding the sentences with zero till the length is zero 
from keras.preprocessing.sequence import pad_sequences
X_pad = pad_sequences(X_seq, maxlen=100, padding='post')

### Implementing cosine similarity 

In [0]:

from sklearn.metrics.pairwise import cosine_similarity

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(X_pad,X_pad)

Names = pd.Series(data.Title)

In [0]:
Names.index[10]

10

In [0]:
# Function to recommend movies

def Recommend(Title, Names = Names, cosine_sim = cosine_sim):
  
  recommendation = []
  
  idx = Names[Names == Title].index[0]
  
  score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
  
  for i in range(0,10):
    recommendation.append(Names[score_series.index[i]])
    
  return recommendation

In [0]:
#suggesting top 10 similar movies for a movie
Title = "Prometheus"

List = Recommend(Title)

In [163]:
print(List)

['Prometheus', 'Morgan', 'Black Swan', 'The Lost City of Z', 'Into the Woods', 'Pirates of the Caribbean: On Stranger Tides', 'The Imitation Game', 'Cars', 'Green Lantern', 'Chuck']
