In [241]:
import pandas as pd
import numpy as np
import ast

In [242]:
pd.options.mode.chained_assignment = None




# Importing Data

In [243]:
df_movies = pd.read_csv('data/tmdb_5000_movies.csv')
df_credits = pd.read_csv('data/tmdb_5000_credits.csv') 

In [244]:
df_movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [245]:
df_movies.shape

(4803, 20)

In [246]:
df_credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [247]:
df_credits.shape

(4803, 4)

In [248]:
# Merging data from both the dfs into one giant dataframe
df_movies = df_movies.merge(df_credits,on='title')

In [249]:
df_movies.shape

(4809, 23)

In [250]:
movies = df_movies[['movie_id','title','overview','genres','keywords','cast','crew']]



In [251]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


Here we can see that our desired data is sitting at the value of the name key and we need to make some transformations to our data and the way in which it has been stored in order to make it more accessible for our model in the future. Thus, I'll make use of ast library and my own function to make the necessary transformation to our data!

In [252]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [253]:
movies.dropna(inplace=True)

In [254]:
# movies['genres'] = movies['genres'].apply(convert)

movies['genres'] = movies['genres'].apply(lambda text: [i['name'] for i in ast.literal_eval(text)])


In [255]:
movies.genres[0]

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [256]:
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)])

In [257]:
movies.keywords[0]

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [258]:
movies['cast'] = movies['cast'].apply(lambda text: [i['name'] for idx, i in enumerate(ast.literal_eval(text)) if idx < 3])

In [259]:
movies.cast[0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [260]:
movies['crew'] = movies['crew'].apply(lambda text: [i['name'] for i in ast.literal_eval(text) if i['job'] == 'Director'])

In [261]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [262]:
movies['cast'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['cast']]
movies['crew'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['crew']]
movies['genres'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['genres']]
movies['keywords'] = [list(map(lambda y: y.replace(' ', ''), x)) for x in movies['keywords']]


In [263]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [264]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [265]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
updated_movies = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [266]:
updated_movies['tags'] = updated_movies['tags'].apply(lambda x: " ".join(x))
updated_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# Model 

In [267]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [268]:
vector = cv.fit_transform(updated_movies['tags']).toarray()
vector.shape


(4806, 5000)

In [269]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity


array([[1.        , 0.08964215, 0.06071767, ..., 0.02519763, 0.0277885 ,
        0.        ],
       [0.08964215, 1.        , 0.06350006, ..., 0.02635231, 0.        ,
        0.        ],
       [0.06071767, 0.06350006, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.02519763, 0.02635231, 0.02677398, ..., 1.        , 0.07352146,
        0.04774099],
       [0.0277885 , 0.        , 0.        , ..., 0.07352146, 1.        ,
        0.05264981],
       [0.        , 0.        , 0.        , ..., 0.04774099, 0.05264981,
        1.        ]])

In [270]:
updated_movies[updated_movies['title'] == 'The Lego Movie'].index[0]

744

In [273]:
import heapq

def recommend(movie):
    index = updated_movies.loc[updated_movies['title'] == movie].index.item()
    distances = list(enumerate(similarity[index]))
    top_k = heapq.nlargest(6, distances, key=lambda x: x[1])[1:]
    return [updated_movies.iloc[i[0]].title for i in top_k]


In [274]:
recommend('Gandhi')

Gandhi, My Father, The Wind That Shakes the Barley, A Passage to India, Guiana 1838, Ramanujan


In [275]:
import pickle
pickle.dump(updated_movies,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))