**Import Libraries**

In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

**Load Dataset**

In [21]:
df = pd.read_csv("/content/movies.csv", on_bad_lines='skip', engine='python')

In [22]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [23]:
df.shape # show no. of rows and columns in dataset

(4803, 24)

In [24]:
df.columns # all colummns names

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

**Preprocessing**

In [25]:
df.isnull().sum() # checking for missing values

Unnamed: 0,0
index,0
budget,0
genres,28
homepage,3091
id,0
keywords,412
original_language,0
original_title,0
overview,3
popularity,0


In [26]:
# select columns that are useful for recommender system

df=df[['title','vote_average','vote_count','genres','cast','director','overview']]
df.dropna(subset=['title'], inplace=True)
df.drop_duplicates(subset=['title'],inplace=True) # drop duplicates values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['title'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=['title'],inplace=True) # drop duplicates values


In [27]:
# fill missing values with empty strings

df['genres']=df['genres'].fillna('')
df['cast']=df['cast'].fillna('')
df['director']=df['director'].fillna('')
df['overview']=df['overview'].fillna('')

**Popularity-Based Recommender (Baseline)**

In [28]:
# Weighted rating = (v/(v+m)) * R + (m/(v+m)) * C
#Weighted rating formula (IMDB-style)
C=df['vote_average'].mean()
m=df['vote_count'].quantile(0.85)

qualified = df[df['vote_count'] >= m].copy()
qualified['score']=qualified.apply(lambda x:
    (x['vote_count']/(x['vote_count']+m)*x['vote_average']) +
    (m/(m+x['vote_count']) * C), axis=1)

# Top 10 movies
top_movies=qualified.sort_values('score', ascending=False)[['title','score']]
print(top_movies.head(10))


                                              title     score
1881                       The Shawshank Redemption  8.170385
662                                      Fight Club  8.031839
3232                                   Pulp Fiction  8.004690
65                                  The Dark Knight  7.993808
3337                                  The Godfather  7.982546
96                                        Inception  7.926423
809                                    Forrest Gump  7.902759
95                                     Interstellar  7.885270
329   The Lord of the Rings: The Return of the King  7.821002
1990                        The Empire Strikes Back  7.817978


In [29]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ","")) for i in x]
    else:
        return []

df['tags']=df['genres'].fillna('')+" "+df['cast'].fillna('')+" "+df['director'].fillna('')
df['tags']=df['tags'].astype(str)


tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['tags'])

# Cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


**Movie Recommender Using Cosine Similarity**

In [30]:
indices=pd.Series(df.index, index=df['title'].str.lower())

def recommend_movie(title,cosine_sim=cosine_sim):
    title=title.lower()
    if title not in indices:
        print("Movie not found!")
        return
    idx=indices[title]
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores=sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
    movie_indices=[i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

recommend_movie("ted")

Unnamed: 0,title
616,Ted 2
1229,A Million Ways to Die in the West
1398,Max Payne
1853,Contraband
702,The Italian Job
1500,This Is the End
1138,The Interview
37,Oz: The Great and Powerful
3320,Extract
3738,Lost in Translation


**Matrix Factorization with SVD**

In [33]:
# movie-feature matrix(bag-of-words from 'overview')

count=CountVectorizer(max_features=5000, stop_words='english')
count_matrix=count.fit_transform(df['overview'].fillna(''))

# apply SVD (Truncated)
svd=TruncatedSVD(n_components=100)
latent_matrix=svd.fit_transform(count_matrix)

# find similar movies using SVD latent features
cosine_sim_svd=cosine_similarity(latent_matrix)

def recommend_svd(title, sim_matrix=cosine_sim_svd):
    title=title.lower()
    if title not in indices:
        print("Movie not found!")
        return
    idx=indices[title]
    sim_scores=list(enumerate(sim_matrix[idx]))
    sim_scores=sorted(sim_scores,key=lambda x: x[1],reverse=True)[1:11]
    movie_indices=[i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

recommend_svd("Inception")

Unnamed: 0,title
66,Up
1663,Once Upon a Time in America
2022,The Bridges of Madison County
4698,Butterfly Girl
2815,Star Trek II: The Wrath of Khan
1763,The Duchess
4534,Deceptive Practice: The Mysteries and Mentors ...
2331,Under the Tuscan Sun
3387,Somewhere
3962,Open Road
