In [278]:
import pandas as pd
import numpy as np
import ast

In [279]:
pd.options.mode.chained_assignment = None




# Importing Data

In [280]:
df_movies = pd.read_csv('data/tmdb_5000_movies.csv')
df_credits = pd.read_csv('data/tmdb_5000_credits.csv') 

In [281]:
df_movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [282]:
df_movies.shape

(4803, 20)

In [283]:
df_credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [284]:
df_credits.shape

(4803, 4)

In [285]:
# Merging data from both the dfs into one giant dataframe
df_movies = df_movies.merge(df_credits,on='title')

In [286]:
df_movies.shape

(4809, 23)

In [287]:
movies = df_movies[['movie_id','title','overview','genres','keywords','cast','crew']]



In [288]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


Here we can see that our desired data is sitting at the value of the name key and we need to make some transformations to our data and the way in which it has been stored in order to make it more accessible for our model in the future. Thus, I'll make use of ast library and my own function to make the necessary transformation to our data!

In [289]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [290]:
movies.dropna(inplace=True)

In [291]:
# movies['genres'] = movies['genres'].apply(convert)

movies['genres'] = movies['genres'].apply(lambda text: [i['name'] for i in ast.literal_eval(text)])


In [292]:
movies.genres[0]

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [293]:
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)])

In [294]:
movies.keywords[0]

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [295]:
movies['cast'] = movies['cast'].apply(lambda text: [i['name'] for idx, i in enumerate(ast.literal_eval(text)) if idx < 3])

In [296]:
movies.cast[0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [297]:
movies['crew'] = movies['crew'].apply(lambda text: [i['name'] for i in ast.literal_eval(text) if i['job'] == 'Director'])

In [298]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [299]:
movies['cast'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['cast']]
movies['crew'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['crew']]
movies['genres'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['genres']]
movies['keywords'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['keywords']]


In [300]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [301]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [302]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
updated_movies = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [303]:
updated_movies['tags'] = updated_movies['tags'].apply(lambda x: " ".join(x))
updated_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# Model 

In [304]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [305]:
vector = cv.fit_transform(updated_movies['tags']).toarray()
vector.shape


(4806, 5000)

In [306]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity


array([[1.        , 0.08964215, 0.06071767, ..., 0.02519763, 0.0277885 ,
        0.        ],
       [0.08964215, 1.        , 0.06350006, ..., 0.02635231, 0.        ,
        0.        ],
       [0.06071767, 0.06350006, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.02519763, 0.02635231, 0.02677398, ..., 1.        , 0.07352146,
        0.04774099],
       [0.0277885 , 0.        , 0.        , ..., 0.07352146, 1.        ,
        0.05264981],
       [0.        , 0.        , 0.        , ..., 0.04774099, 0.05264981,
        1.        ]])

In [307]:
updated_movies[updated_movies['title'] == 'The Lego Movie'].index[0]

744

In [308]:
import heapq

def recommend(movie):
    index = updated_movies.loc[updated_movies['title'] == movie].index.item()
    distances = list(enumerate(similarity[index]))
    top_k = heapq.nlargest(6, distances, key=lambda x: x[1])[1:]
    return [updated_movies.iloc[i[0]].title for i in top_k]


In [309]:
recommend('Gandhi')

['Gandhi, My Father',
 'The Wind That Shakes the Barley',
 'A Passage to India',
 'Guiana 1838',
 'Ramanujan']

In [310]:
import pickle
pickle.dump(updated_movies,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [311]:
import gzip

with open('similarity.pkl', 'rb') as f_in:
    with gzip.open('similarity.pkl.gz', 'wb') as f_out:
        f_out.write(f_in.read())


'["Avatar", "Pirates of the Caribbean: At World\'s End", "Spectre", "The Dark Knight Rises", "John Carter", "Spider-Man 3", "Tangled", "Avengers: Age of Ultron", "Harry Potter and the Half-Blood Prince", "Batman v Superman: Dawn of Justice", "Superman Returns", "Quantum of Solace", "Pirates of the Caribbean: Dead Man\'s Chest", "The Lone Ranger", "Man of Steel", "The Chronicles of Narnia: Prince Caspian", "The Avengers", "Pirates of the Caribbean: On Stranger Tides", "Men in Black 3", "The Hobbit: The Battle of the Five Armies", "The Amazing Spider-Man", "Robin Hood", "The Hobbit: The Desolation of Smaug", "The Golden Compass", "King Kong", "Titanic", "Captain America: Civil War", "Battleship", "Jurassic World", "Skyfall", "Spider-Man 2", "Iron Man 3", "Alice in Wonderland", "X-Men: The Last Stand", "Monsters University", "Transformers: Revenge of the Fallen", "Transformers: Age of Extinction", "Oz: The Great and Powerful", "The Amazing Spider-Man 2", "TRON: Legacy", "Cars 2", "Green L

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',