In [101]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
m_dataset = pd.read_csv('tmdb_5000_movies.csv')
c_dataset = pd.read_csv('tmdb_5000_credits.csv')

In [6]:
c_dataset.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [8]:
# merging the datasets
final_data = m_dataset.merge(c_dataset,on = 'title')

In [9]:
#checking which column is important
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [10]:
final_data = final_data[['movie_id','title','overview','genres','keywords','cast','crew']]

In [14]:
#checking for Na values
final_data.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [11]:
final_data.dropna(inplace = True)

In [16]:
#checking for duplicated values
final_data.duplicated().sum()

0

In [38]:
def convert(json):
    Listt = []
    for i in ast.literal_eval(json): #converting json format
        Listt.append(i['name'])
    return Listt

In [30]:
final_data['genres'] = final_data['genres'].apply(convert)

In [31]:
final_data['keywords'] = final_data['keywords'].apply(convert)

In [32]:
#we will write a different function for cast as we only want top 3 cast members.
def convert_cast(json):
    Listt = []
    counter = 0
    for i in ast.literal_eval(json):
        if counter != 3:
            Listt.append(i['name'])
            counter = counter + 1
        else:
            break
    return Listt

In [34]:
final_data['cast'] = final_data['cast'].apply(convert_cast)

In [36]:
#we will write a different function for crew too as we only want the naem of director
def convert_crew(json):
    Listt = []
    for i in ast.literal_eval(json):
        if i['job'] == 'Director':
            Listt.append(i['name'])
            break
    return Listt

In [39]:
final_data['crew'] = final_data['crew'].apply(convert_crew)

In [40]:
final_data.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [43]:
#converting overview into a list so that it'll be easier to merge with other lists.
final_data['overview'] = final_data['overview'].apply(lambda x:x.split())

In [47]:
# removing the spcaes between the names
final_data['genres'] = final_data['genres'].apply(lambda x:[i.replace(" " ,"") for i in x])
final_data['keywords'] = final_data['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
final_data['cast'] = final_data['cast'].apply(lambda x:[i.replace(" ","") for i in x])
final_data['crew'] = final_data['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [48]:
final_data

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui...","[Action, Crime, Thriller]","[unitedstates–mexicobarrier, legs, arms, paper...","[CarlosGallardo, JaimedeHoyos, PeterMarquardt]",[RobertRodriguez]
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended...","[Comedy, Romance]",[],"[EdwardBurns, KerryBishé, MarshaDietlein]",[EdwardBurns]
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,...","[Comedy, Drama, Romance, TVMovie]","[date, loveatfirstsight, narration, investigat...","[EricMabius, KristinBooth, CrystalLowe]",[ScottSmith]
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is...",[],[],"[DanielHenney, ElizaCoupe, BillPaxton]",[DanielHsia]


In [49]:
final_data['tags'] = final_data['overview'] + final_data['genres'] + final_data['cast'] + final_data['crew'] + final_data['keywords']

In [91]:
final_df = final_data[['movie_id','title','tags']]

In [54]:
#converting list to string
final_df['tags'] = final_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x: " ".join(x))


In [55]:
#converting to lower case
final_df['tags'] = final_df['tags'].apply(lambda x : x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x : x.lower())


In [57]:
cv = CountVectorizer(max_features = 5000,stop_words = 'english')

In [60]:
word_vectors = cv.fit_transform(final_df['tags']).toarray()

In [64]:
#corpus
#cv.get_feature_names()

In [65]:
ps = PorterStemmer()

In [67]:
def stem(corpus):
    lisst = []
    for i in corpus.split():
        lisst.append(ps.stem(i))
        
    " ".join(lisst) #to convert list back to string

In [68]:
final_df = final_df['tags'].apply(stem)

In [72]:
cos_sim = cosine_similarity(word_vectors)

In [75]:
sorted(list(enumerate(cos_sim[0])),reverse = True,key = lambda x:x[1])[1:6]

[(539, 0.26089696604360174),
 (1194, 0.2581988897471611),
 (507, 0.25302403842552984),
 (260, 0.25110592822973776),
 (1216, 0.24944382578492943)]

In [99]:
def recommend(movie):
    movie_index = final_df[final_df['title'] == movie].index[0]
    distances = cos_sim[movie_index]
    top_movies = sorted(list(enumerate(distances)),reverse = True,key = lambda x:x[1])[1:10]
    for i in top_movies:
        print(final_df.iloc[i[0]].title)

In [100]:
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem
Battle: Los Angeles
Lifeforce
Falcon Rising
Krull


In [104]:
pickle.dump(final_df.to_dict(),open('movies_dict.pkl','wb'))

In [105]:
pickle.dump(cos_sim,open('cos_sim.pkl','wb'))