In [3]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier



import ast

In [4]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits =  pd.read_csv("tmdb_5000_credits.csv")

In [5]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [6]:
movies = movies.drop(['original_title','original_language','production_companies','production_countries','release_date',
                      'spoken_languages','status','homepage','tagline'],axis=1)

In [7]:
movies.head(1)

Unnamed: 0,budget,genres,id,keywords,overview,popularity,revenue,runtime,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",150.437577,2787965087,162.0,Avatar,7.2,11800


In [8]:
movies.isnull().sum()

budget          0
genres          0
id              0
keywords        0
overview        3
popularity      0
revenue         0
runtime         2
title           0
vote_average    0
vote_count      0
dtype: int64

In [9]:
movies = movies.dropna()

In [10]:
movies.isnull().sum()

budget          0
genres          0
id              0
keywords        0
overview        0
popularity      0
revenue         0
runtime         0
title           0
vote_average    0
vote_count      0
dtype: int64

In [11]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [12]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L


In [13]:
credits['crew'] = credits['crew'].apply(fetch_director)

In [14]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",[James Cameron]


In [15]:
def convert(text):
  l = []
  for i in ast.literal_eval(text):
    l.append(i['name'])
  return l

In [16]:
credits['cast'] = credits['cast'].apply(convert)

In [17]:
credits['cast'] = credits['cast'].apply(lambda x:x[0:3])

In [18]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [19]:
movies['genres'] = movies['genres'].apply(convert)

In [20]:
movies['keywords'] = movies['keywords'].apply(convert)

In [21]:
movies.head(1)

Unnamed: 0,budget,genres,id,keywords,overview,popularity,revenue,runtime,title,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",150.437577,2787965087,162.0,Avatar,7.2,11800


In [22]:
movies = movies.merge(credits,on="title")

In [23]:
movies.head(1)

Unnamed: 0,budget,genres,id,keywords,overview,popularity,revenue,runtime,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",150.437577,2787965087,162.0,Avatar,7.2,11800,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [24]:
movies['overview'] = movies['overview'].apply(lambda x:  x.split())

In [25]:
def collapse(L):
  L1 = []
  for i in L:
    L1.append(i.replace(" ",""))
  return L1

In [26]:
movies['cast'] = movies['cast'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [27]:
movies['tags'] = movies['cast'] + movies['crew'] + movies['genres'] + movies['keywords'] + movies['overview']

In [28]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [29]:
movies=movies[['popularity','revenue','runtime','vote_count','budget','title','vote_average','movie_id','tags']]

In [30]:
movies['vote_average']  = movies['vote_average'].astype("int")

In [31]:
movies.head(5)

Unnamed: 0,popularity,revenue,runtime,vote_count,budget,title,vote_average,movie_id,tags
0,150.437577,2787965087,162.0,11800,237000000,Avatar,7,19995,SamWorthington ZoeSaldana SigourneyWeaver Jame...
1,139.082615,961000000,169.0,4500,300000000,Pirates of the Caribbean: At World's End,6,285,JohnnyDepp OrlandoBloom KeiraKnightley GoreVer...
2,107.376788,880674609,148.0,4466,245000000,Spectre,6,206647,DanielCraig ChristophWaltz LéaSeydoux SamMende...
3,112.31295,1084939099,165.0,9106,250000000,The Dark Knight Rises,7,49026,ChristianBale MichaelCaine GaryOldman Christop...
4,43.926995,284139100,132.0,2124,260000000,John Carter,6,49529,TaylorKitsch LynnCollins SamanthaMorton Andrew...


In [32]:
features = ['popularity','revenue','runtime','vote_count','budget']

In [33]:
X_train  ,  X_test  , y_train,  y_test = train_test_split(movies[features],movies['vote_average'] , test_size=0.35, random_state=42)

In [34]:
dt = DecisionTreeClassifier()
gb = GradientBoostingClassifier()
rf = RandomForestClassifier()
xg =XGBClassifier(max_depth=3,
    gamma=4,
    eta=0.3,
    reg_alpha=0.5,
    reg_lambda=0.6)

In [35]:
model = VotingClassifier(estimators=[('dt',dt),('gb',gb),('rf',rf),('xg',xg)] , voting='hard')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.5038621509209744

In [36]:
model.score(X_train,y_train)

0.9068203650336215

In [37]:
dt.fit(X_train,y_train)
gb.fit(X_train,y_train)
rf.fit(X_train,y_train)
xg.fit(X_train,y_train)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
cv = CountVectorizer(max_features=4806,stop_words='english')

In [40]:
vector  = cv.fit_transform(movies['tags'].values.astype('U')).toarray()

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [42]:
similarity

array([[1.        , 0.09107651, 0.06071767, ..., 0.02548236, 0.0277885 ,
        0.        ],
       [0.09107651, 1.        , 0.06451613, ..., 0.02707652, 0.        ,
        0.        ],
       [0.06071767, 0.06451613, 1.        , ..., 0.02707652, 0.        ,
        0.        ],
       ...,
       [0.02548236, 0.02707652, 0.02707652, ..., 1.        , 0.07435224,
        0.0489116 ],
       [0.0277885 , 0.        , 0.        , ..., 0.07435224, 1.        ,
        0.05333807],
       [0.        , 0.        , 0.        , ..., 0.0489116 , 0.05333807,
        1.        ]])

In [43]:
def predict(name):
  x = movies.loc[movies.title ==name].iloc[:,0:5]
  fp = (rf.predict(x)+gb.predict(x)+xg.predict(x)+dt.predict(x))/4
  print(f"score prediction averaging {int(fp)}")
  print(f"score prediction max_voting {int(model.predict(x))}")
  print(f"actual score {int(movies.loc[movies.title == name]['vote_average'])}\n")
  print(f"Similar movies for ({name}):")

  index =  movies.loc[movies.title == name].index[0]
  distance = sorted(list(enumerate(similarity[index])), reverse=True,key=lambda vector:vector[1])
  for i in distance[1:5]:
    print(movies.iloc[i[0]]['title'])

In [44]:
predict("The Dark Knight Rises")

score prediction averaging 7
score prediction max_voting 7
actual score 7

Similar movies for (The Dark Knight Rises):
The Dark Knight
Batman Begins
Batman
Batman Returns


  print(f"score prediction averaging {int(fp)}")
  print(f"score prediction max_voting {int(model.predict(x))}")
  print(f"actual score {int(movies.loc[movies.title == name]['vote_average'])}\n")
