In [40]:
import numpy as np
import pandas as pd

In [41]:
# Load the movie metadata from 'tmdb-movies.csv' (path is relative to the working directory).
df = pd.read_csv('tmdb-movies.csv')
# Preview the first two rows to check that the file parsed as expected.
df.head(2)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999900.0,348161300.0


In [42]:
# Inspect column dtypes and non-null counts to see which fields have missing values.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10866 non-null  int64  
 1   imdb_id               10856 non-null  str    
 2   popularity            10866 non-null  float64
 3   budget                10866 non-null  int64  
 4   revenue               10866 non-null  int64  
 5   original_title        10866 non-null  str    
 6   cast                  10790 non-null  str    
 7   homepage              2936 non-null   str    
 8   director              10822 non-null  str    
 9   tagline               8042 non-null   str    
 10  keywords              9373 non-null   str    
 11  overview              10862 non-null  str    
 12  runtime               10866 non-null  int64  
 13  genres                10843 non-null  str    
 14  production_companies  9836 non-null   str    
 15  release_date          10866 no

In [43]:
# Keep only the identifier, title and text columns that feed the tag string.
# .copy() gives `movies` its own data, so the in-place cleaning and column
# assignments in later cells operate on an independent frame rather than on a
# selection derived from `df`.
movies = df[['imdb_id', 'original_title', 'cast', 'director', 'keywords', 'overview', 'genres']].copy()

In [44]:
# Count missing values per selected column before cleaning.
movies.isnull().sum()

imdb_id             10
original_title       0
cast                76
director            44
keywords          1493
overview             4
genres              23
dtype: int64

In [45]:
# Drop every row that has a missing value in any selected column and renumber
# the index from 0. Rebinding (instead of inplace=True) produces a fresh frame,
# so this cell never mutates an object derived from `df` and is safe to re-run.
movies = movies.dropna(ignore_index=True)

In [46]:
# Count rows that are exact duplicates of an earlier row across all selected columns.
movies.duplicated().sum()

np.int64(1)

In [47]:
# Remove exact duplicate rows and renumber the index from 0, so index labels
# stay equal to row positions. Rebinding (instead of inplace=True) keeps the
# cell re-runnable and avoids mutating a frame in place.
movies = movies.drop_duplicates(ignore_index=True)

In [48]:
# Re-check the row count and per-column non-null counts after cleaning.
movies.info()

<class 'pandas.DataFrame'>
RangeIndex: 9302 entries, 0 to 9301
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   imdb_id         9302 non-null   str  
 1   original_title  9302 non-null   str  
 2   cast            9302 non-null   str  
 3   director        9302 non-null   str  
 4   keywords        9302 non-null   str  
 5   overview        9302 non-null   str  
 6   genres          9302 non-null   str  
dtypes: str(7)
memory usage: 508.8 KB


In [49]:
def convert(s, max_items=3):
    """Turn a pipe-delimited string into a short list of normalized tokens.

    Each ``|``-separated entry has its spaces removed and is lower-cased, so a
    multi-word name such as ``"Chris Pratt"`` becomes the single token
    ``"chrispratt"``.

    Args:
        s: Pipe-delimited string, e.g. ``"Action|Adventure"``.
        max_items: Maximum number of leading entries to keep. Defaults to 3,
            the limit this function previously hard-coded; ``None`` keeps
            every entry.

    Returns:
        list[str]: The normalized tokens, in their original order.
    """
    # Slice before normalizing so entries that would be discarded are never processed.
    return [c.replace(" ", "").lower() for c in s.split("|")[:max_items]]


In [50]:
# Replace the pipe-delimited cast string with a list of its first three normalized names.
# Not re-runnable: afterwards the values are lists, and convert() calls .split() on its input.
movies['cast'] = movies['cast'].apply(convert)

In [51]:
# Replace the pipe-delimited genres string with a list of its first three normalized genres.
# Not re-runnable: afterwards the values are lists, and convert() calls .split() on its input.
movies['genres'] = movies['genres'].apply(convert)

In [52]:
# Replace the pipe-delimited keywords string with a list of its first three normalized keywords.
# Not re-runnable: afterwards the values are lists, and convert() calls .split() on its input.
movies['keywords'] = movies['keywords'].apply(convert)

In [53]:
# Collapse the director field into a one-element list holding a single space-free,
# lower-cased token, so it can be concatenated with the other token lists.
# The whole field becomes one token; it is not split on "|".
movies['director'] = movies['director'].apply(lambda x: [x.replace(" ", "").lower()])

In [54]:
# Lower-case the overview and split it on single spaces into a list of words.
# Punctuation stays attached to the words at this stage.
movies['overview'] = movies['overview'].apply(lambda x: x.lower().split(" "))

In [55]:
# Concatenate the per-movie token lists into one list per row. Multiplying a
# column of lists by 2 repeats each list, so keywords, genres and director
# tokens appear twice (presumably to up-weight them) while overview and cast
# tokens appear once.
movies['tags'] = movies['overview'] + 2*movies['keywords'] + 2*movies['genres'] + movies['cast'] + 2*movies['director']

In [56]:
# Flatten each token list into one space-separated string, the text form the vectorizer consumes.
# Not re-runnable: a second run would join the characters of the already-joined string.
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [57]:
# Keep only the identifier, the title and the combined tag text for modelling and export.
movies_df = movies[['imdb_id', 'original_title', 'tags']]

In [58]:
# Preview one row to confirm the tag string was assembled.
movies_df.head(1)

Unnamed: 0,imdb_id,original_title,tags
0,tt0369610,Jurassic World,twenty-two years after the events of jurassic ...


In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer that turns each tag string into a feature vector.
tfidf = TfidfVectorizer(
    max_features=10000,  # cap the vocabulary at the 10,000 most frequent terms
    stop_words='english',  # drop scikit-learn's built-in English stop words
    ngram_range=(1,2)  # use both single words and two-word phrases as features
)

# Learn the vocabulary and encode every movie. Row i of `vectors` corresponds to
# row i of movies_df, which recommend() depends on when it indexes by position.
vectors = tfidf.fit_transform(movies_df['tags'])

In [60]:
from sklearn.metrics.pairwise import cosine_similarity
def recommend(movie, n=10):
    """Return the titles of the ``n`` movies most similar to ``movie``.

    Similarity is the cosine similarity between the query's row of the
    module-level ``vectors`` matrix and every row of that matrix.

    Args:
        movie: Exact ``original_title`` to look up in ``movies_df``. If several
            rows share the title, the first one is used as the query.
        n: Number of recommendations to return. Defaults to 10, the count this
            function previously hard-coded.

    Returns:
        list[str]: Up to ``n`` titles ordered from most to least similar. The
        query row itself is never included, although a different row that
        happens to share its title can be.

    Raises:
        IndexError: If no row of ``movies_df`` has that title.
    """
    # Work with row positions, not index labels, because `vectors` is positional.
    positions = np.flatnonzero((movies_df['original_title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"No movie titled {movie!r} found in movies_df")
    idx = int(positions[0])

    sim = cosine_similarity(vectors[idx], vectors)[0]

    # Exclude the query by position instead of dropping whatever sorts first:
    # a row with identical tags (or an all-zero query vector) ties with the
    # query, and the stable sort could then rank another row ahead of it.
    ranked = sorted(
        ((i, score) for i, score in enumerate(sim) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:n]
    return [movies_df['original_title'].iloc[i] for i, _ in ranked]


In [61]:
# Spot-check: list the titles ranked most similar to 'Se7en'.
recommend('Se7en')

['Mystery Road',
 'Insomnia',
 'Zodiac',
 'The Girl with the Dragon Tattoo',
 'Tightrope',
 'Don McKay',
 'Along Came a Spider',
 'Gone Girl',
 'Evil Under the Sun',
 'Felony']

In [62]:
# Spot-check on a franchise title. The output lists "Charlotte's Web" twice,
# which shows that original_title is not unique in movies_df.
recommend('Spider-Man')

['Spider-Man 3',
 'Spider-Man 2',
 "Cirque du Freak: The Vampire's Assistant",
 "Charlotte's Web",
 "Charlotte's Web",
 'The Amazing Spider-Man 2',
 'Spider',
 'The Amazing Spider-Man',
 'Army of Darkness',
 'Planes, Trains and Automobiles']

In [63]:
# Spot-check: list the titles ranked most similar to 'Batman'.
recommend('Batman')

['Batman & Robin',
 'Batman Returns',
 'Batman Unlimited: Monster Mayhem',
 'A History of Violence',
 'Batman Beyond: Return of the Joker',
 'The Dark Knight Rises',
 'Batman Forever',
 'Batman: Under the Red Hood',
 'Batman: Mystery of the Batwoman',
 'Batman Begins']

In [64]:
# Spot-check with a title that appears in its own results: the output starts
# with 'Titanic'. NOTE(review): presumably a second row shares this title
# (recommend() queries only the first match) — verify it is not the query row itself.
recommend('Titanic')

['Titanic',
 'Death Ship',
 'Ghosts of the Abyss',
 'Free Willy 3: The Rescue',
 'Captain Phillips',
 'Moby Dick',
 'The Blue Lagoon',
 'Ship of Fools',
 'Ghost Ship',
 'Aliens of the Deep']

In [65]:
# Spot-check: list the titles ranked most similar to 'Shutter Island'.
recommend('Shutter Island')

['Hours',
 'An Inspector Calls',
 'The Other Man',
 'The Wicker Man',
 'Afterwards',
 'La Moustache',
 'Death and the Maiden',
 'Original Sin',
 'Coma',
 'Chloe']

In [66]:
# Spot-check: list the titles ranked most similar to 'The Avengers'.
recommend('The Avengers')

['Avengers: Age of Ultron',
 "Marvel One-Shot: A Funny Thing Happened on the Way to Thor's Hammer",
 'Marvel One-Shot: Agent Carter',
 'The Incredible Hulk',
 'Serenity',
 'Captain America: The Winter Soldier',
 'Marvel One-Shot: The Consultant',
 'Marvel One-Shot: Item 47',
 'Ant-Man',
 'Captain America: The First Avenger']

In [67]:
# Spot-check on a long series title; the lookup requires the exact original_title string.
recommend("Harry Potter and the Philosopher's Stone")

['Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Prisoner of Azkaban',
 'Harry Potter and the Order of the Phoenix',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Deathly Hallows: Part 1',
 'Percy Jackson & the Olympians: The Lightning Thief',
 'Harry Potter and the Deathly Hallows: Part 2',
 'Halloweentown',
 'Return to Halloweentown']

In [68]:
# Spot-check: list the titles ranked most similar to 'Fight Club'.
recommend('Fight Club')

['The Curious Case of Benjamin Button',
 'Cake',
 'Gone Girl',
 'Se7en',
 'The Fault in Our Stars',
 'Zodiac',
 'Guyver: Dark Hero',
 'The One',
 'The New Guy',
 'Panic Room']

In [69]:
# Spot-check: list the titles ranked most similar to 'Interstellar'.
recommend('Interstellar')

['Terminator Genisys',
 'The Matrix',
 'About Time',
 'A.I. Artificial Intelligence',
 'The Terminator',
 'Terminator Salvation',
 'The Matrix Revolutions',
 'Terminator 3: Rise of the Machines',
 'Igby Goes Down',
 'Battlestar Galactica']

In [70]:
# Final look at the table that is about to be serialized.
movies_df.head()

Unnamed: 0,imdb_id,original_title,tags
0,tt0369610,Jurassic World,twenty-two years after the events of jurassic ...
1,tt1392190,Mad Max: Fury Road,an apocalyptic story set in the furthest reach...
2,tt2908446,Insurgent,beatrice prior must confront her inner demons ...
3,tt2488496,Star Wars: The Force Awakens,thirty years after defeating the galactic empi...
4,tt2820852,Furious 7,deckard shaw seeks revenge against dominic tor...


In [71]:
import pickle
# Persist the id/title/tags table to 'movies.pkl' in the working directory.
# The context manager guarantees the file handle is flushed and closed; the
# previous bare open() call left that to garbage collection.
with open('movies.pkl', 'wb') as f:
    pickle.dump(movies_df, f)