In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits,on='title')

In [4]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [5]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
movies.dropna(inplace=True)

In [7]:
movies.duplicated().sum()

0

In [8]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [9]:
import ast

# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]' -> ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
def convert_dict(obj):
    converted_list = []
    for i in ast.literal_eval(obj): # converts string to dictionaries
        converted_list.append(i['name'])
    return converted_list

In [10]:
movies['genres'] = movies['genres'].apply(convert_dict)

In [11]:
movies['keywords'] = movies['keywords'].apply(convert_dict)

In [12]:
# get list of top 3 actors for each movie
def convert_cast(obj):
    converted_list = []
    counter = 0
    for i in ast.literal_eval(obj):
        while counter < 3:
            converted_list.append(i['name'])
            counter += 1
    return converted_list

In [13]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [14]:
# get director from crew
def get_director(obj):
    converted_list = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            converted_list.append(i['name'])
            break
    return converted_list

In [15]:
movies['crew'] = movies['crew'].apply(get_director)

In [16]:
movies['overview'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [17]:
# convert overview to list
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [18]:
# remove any spaces so that phrases/names are together
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [19]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, SamWorthington, SamWorthington]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, JohnnyDepp, JohnnyDepp]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, DanielCraig, DanielCraig]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, ChristianBale, ChristianBale]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, TaylorKitsch, TaylorKitsch]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [21]:
new_df = movies[['movie_id', 'title', 'tags']]

In [23]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [25]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [26]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [34]:
!pip install nltk



In [35]:
import nltk

In [36]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [37]:
# stem words
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [38]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [43]:
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors

In [45]:
cv.get_feature_names()



['00',
 '000',
 '007',
 '07am',
 '10',
 '100',
 '1000',
 '101',
 '108',
 '10th',
 '11',
 '114',
 '117',
 '118',
 '119',
 '11th',
 '12',
 '1200',
 '1215',
 '1250',
 '125th',
 '12th',
 '13',
 '1300',
 '13th',
 '14',
 '140',
 '1408',
 '142',
 '1429',
 '148',
 '14pm',
 '14th',
 '15',
 '150',
 '150th',
 '1520',
 '1536',
 '15th',
 '15thcenturi',
 '16',
 '1600s',
 '161',
 '1630s',
 '1644',
 '1681',
 '1691',
 '16th',
 '16thcenturi',
 '17',
 '170',
 '1700s',
 '173rd',
 '1748',
 '1776',
 '17th',
 '17thcenturi',
 '18',
 '180',
 '1800',
 '1818',
 '1820',
 '1820s',
 '1824',
 '1831',
 '1834',
 '1836',
 '1838',
 '1839',
 '1841',
 '1845',
 '1850',
 '1856',
 '1857',
 '1860',
 '1862',
 '1863',
 '1870',
 '1875',
 '1876',
 '1879',
 '1880s',
 '1882',
 '1885',
 '1889',
 '1890',
 '18th',
 '18thcenturi',
 '19',
 '1900',
 '1900s',
 '1903',
 '1905',
 '191',
 '1910',
 '1911',
 '1912',
 '1914',
 '1915',
 '1917',
 '1918',
 '1919',
 '1920',
 '1920s',
 '1921',
 '1922',
 '1924',
 '1925',
 '1926',
 '1927',
 '1928',
 '

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
similarity = cosine_similarity(vectors)

In [48]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)
    

In [50]:
import pickle

In [53]:
pickle.dump(new_df.to_dict(), open('movies.pkl', 'wb'))

In [54]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))