In [337]:
import pickle
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import pandas as pd
import seaborn as sea
import datetime
import matplotlib.pyplot as plt
import warnings
import random
from rankit.Merge import borda_count_merge
warnings.filterwarnings('ignore')

In [338]:
# Raw inputs for the modelling table, all read from the local ./data folder.
# The CSV files use their first column as the row index.
# NOTE: read_pickle deserialises arbitrary pickle data, so the .pkl files must
# come from a trusted source.
tmdb = pd.read_csv("./data/tmdb.csv", index_col=0)
box_office_mojo = pd.read_csv("./data/box_office_mojo.csv", index_col=0)
cast_crew = pd.read_pickle("./data/cast_crew.pkl")
imdb = pd.read_pickle("./data/imdb.pkl")

In [339]:
# Show the column labels of each loaded frame, one Index repr per frame.
for loaded_frame in (box_office_mojo, imdb, tmdb, cast_crew):
    print(loaded_frame.columns)

Index(['gross', 'movie_name', 'release_date', 'theaters', 'title', 'url',
       'tconst'],
      dtype='object')
Index(['awards', 'cast', 'cast_ids', 'critic_reviews', 'directors',
       'directors_ids', 'genres', 'imdb_popularity', 'meta_score',
       'mmpa_rating', 'number_user_ratings', 'tconst', 'user_rating',
       'user_reviews'],
      dtype='object')
Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'status_code', 'status_message',
       'tagline', 'title', 'video', 'vote_average', 'vote_count'],
      dtype='object')
Index(['adult', 'also_known_as', 'biography', 'birthday', 'deathday', 'gender',
       'homepage', 'id', 'imdb_id', 'known_for_department', 'name',
       'place_of_birth', 'populari

### Data cleaning and processing for Box Office Mojo data

In [340]:
# Parse the scraped string columns of the Box Office Mojo frame into numbers.

# "gross": drop the first character (presumably a currency symbol) and the
# thousands separators, then convert to int.
box_office_mojo["gross"] = box_office_mojo["gross"].apply(
    lambda x: int(x[1:].replace(",", "")))

# "theaters": "-" marks a missing count; map it to NaN so it can be imputed.
theaters = box_office_mojo["theaters"].apply(
    lambda x: int(x.replace(",", "")) if x != "-" else np.nan)

# Impute missing theater counts with the median. The filled Series is assigned
# back explicitly: calling fillna(..., inplace=True) on a column selection is
# chained assignment, which is deprecated and does not update the frame when
# pandas copy-on-write is enabled.
box_office_mojo["theaters"] = theaters.fillna(theaters.median())

In [341]:
# Number of non-null movie_name entries per title.
# Note: `df` is rebound to the merged TMDb/IMDb frame further down.
df = box_office_mojo["movie_name"].groupby(box_office_mojo["title"]).count()

In [342]:
# Preview the first five cleaned Box Office Mojo rows.
box_office_mojo.head(n=5)

Unnamed: 0,gross,movie_name,release_date,theaters,title,url,tconst
0,126737428,Smokey and the Bandit (1977),1977-05-27,2250.0,Smokey and the Bandit,/release/rl477136385/?ref_=bo_yld_table_2,tt0076729
1,116395460,Close Encounters of the Third Kind (1977),1977-11-16,650.0,Close Encounters of the Third Kind,/release/rl340428289/?ref_=bo_yld_table_3,tt0075860
2,94213184,Saturday Night Fever (1977),1977-12-16,726.0,Saturday Night Fever,/release/rl2926544385/?ref_=bo_yld_table_4,tt0076666
3,50750000,A Bridge Too Far (1977),1977-06-15,2250.0,A Bridge Too Far,/release/rl139036161/?ref_=bo_yld_table_5,tt0075784
4,47346365,The Deep (1977),1977-06-17,731.0,The Deep,/release/rl3561784833/?ref_=bo_yld_table_6,tt0075925


### Data cleaning and processing for IMDb data

In [343]:
# Cast the scraped IMDb columns to numeric types and derive combined features.

imdb["critic_reviews"] = imdb["critic_reviews"].astype(str).astype(int)
imdb["user_rating"] = imdb["user_rating"].astype(str).astype(float)

# Count columns are strings that may contain thousands separators.
for count_col in ("number_user_ratings", "user_reviews"):
    imdb[count_col] = imdb[count_col].apply(
        lambda x: int(x.replace(",", "")))

# Optional scores: convert to float but leave missing (None) entries missing
# so they can be imputed later. Identity comparison is used for the None check.
for score_col in ("meta_score", "imdb_popularity"):
    imdb[score_col] = imdb[score_col].apply(
        lambda x: float(x) if x is not None else x)

# Stored as plain strings; missing values therefore become their string repr.
imdb["mmpa_rating"] = imdb["mmpa_rating"].astype(str)

# Combined engagement features.
imdb["reviews"] = imdb["critic_reviews"] + imdb["user_reviews"]
imdb["ratings"] = imdb["number_user_ratings"] * imdb["user_rating"]

# Drops the first entry of any directors list with more than one element.
# NOTE(review): the reason is not visible here (possibly a scraped label);
# directors_ids is not trimmed the same way — confirm this is intended.
imdb["directors"] = imdb["directors"].apply(
    lambda x: x[1:] if len(x) > 1 else x)

# Per-row list concatenation: cast ids followed by director ids.
imdb["cast_crew_id"] = imdb["cast_ids"] + imdb["directors_ids"]

In [344]:
# Impute missing IMDb popularity and Metascore values with the column mean.
# The result is assigned back instead of using fillna(..., inplace=True) on a
# column selection, which is chained assignment and does not update the frame
# when pandas copy-on-write is enabled.
imdb["imdb_popularity"] = imdb["imdb_popularity"].fillna(imdb["imdb_popularity"].mean())
imdb["meta_score"] = imdb["meta_score"].fillna(imdb["meta_score"].mean())

In [345]:
# Preview the first five cleaned IMDb rows.
imdb.head(n=5)

Unnamed: 0,awards,cast,cast_ids,critic_reviews,directors,directors_ids,genres,imdb_popularity,meta_score,mmpa_rating,number_user_ratings,tconst,user_rating,user_reviews,reviews,ratings,cast_crew_id
0,2,"[Burt Reynolds, Sally Field, Jerry Reed]","[nm0000608, nm0000398, nm0715274]",78,[Hal Needham],[nm0624102],"[Action, Comedy]",3.0,50.0,PG,41965,tt0076729,7.0,191,269,293755.0,"[nm0000608, nm0000398, nm0715274, nm0624102]"
1,53,"[Richard Dreyfuss, François Truffaut, Teri Garr]","[nm0000377, nm0000076, nm0000414]",220,[Steven Spielberg],[nm0000229],"[Drama, Sci-Fi]",2.943485,90.0,PG,171550,tt0075860,7.6,411,631,1303780.0,"[nm0000377, nm0000076, nm0000414, nm0000229]"
2,16,"[John Travolta, Karen Lynn Gorney, Barry Miller]","[nm0000237, nm0331186, nm0587944]",97,[John Badham],[nm0000824],"[Drama, Music]",1.0,77.0,R,65781,tt0076666,6.8,240,337,447310.8,"[nm0000237, nm0331186, nm0587944, nm0000824]"
3,11,"[Sean Connery, Ryan O'Neal, Michael Caine]","[nm0000125, nm0641939, nm0000323]",55,[Richard Attenborough],[nm0000277],"[Drama, History, War]",2.0,54.707366,PG,48385,tt0075784,7.4,238,293,358049.0,"[nm0000125, nm0641939, nm0000323, nm0000277]"
4,3,"[Jacqueline Bisset, Nick Nolte, Dick Anthony W...","[nm0000302, nm0000560, nm0930454]",33,[Peter Yates],[nm0946811],"[Adventure, Mystery, Thriller]",2.943485,41.0,PG,10324,tt0075925,6.2,69,102,64008.8,"[nm0000302, nm0000560, nm0930454, nm0946811]"


### Data cleaning and processing for TMDb data

In [346]:
# Keep only released titles. The explicit copy makes the filtered frame
# independent of the original, so the column assignments below do not write
# into a slice (SettingWithCopyWarning).
tmdb = tmdb[tmdb.status == "Released"].copy()

# Vote volume weighted by the average vote.
tmdb["votes"] = tmdb["vote_average"] * tmdb["vote_count"]

# Impute missing runtimes with the median; assigned back rather than filled in
# place on a column selection, which is chained assignment.
tmdb["runtime"] = tmdb["runtime"].fillna(tmdb["runtime"].median())

# Disambiguate from the IMDb popularity column before the frames are merged.
tmdb = tmdb.rename(columns={"popularity": "tmdb_popularity"})

In [347]:
# Preview the first five cleaned TMDb rows.
tmdb.head(n=5)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,spoken_languages,status,status_code,status_message,tagline,title,video,vote_average,vote_count,votes
0,False,/hzjX8eA4u7tGz4d4kjTnsW4RNWK.jpg,"{'id': 87220, 'name': 'Smokey and the Bandit C...",4300000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,11006.0,tt0076729,en,Smokey and the Bandit,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,,What we have here is a total lack of respect f...,Smokey and the Bandit,False,6.8,347.0,2359.6
1,False,/xATZyEpnZ0Z0iO9z5K8RZsraGKI.jpg,,20000000.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,840.0,tt0075860,en,Close Encounters of the Third Kind,...,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}, {'iso_...",Released,,,We are not alone.,Close Encounters of the Third Kind,False,7.4,2206.0,16324.4
2,False,/gqTwlRkQ2FBCeeq6JxzCj4ctjHF.jpg,"{'id': 288280, 'name': 'Saturday Night Fever C...",3500000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,11009.0,tt0076666,en,Saturday Night Fever,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,,Catch it,Saturday Night Fever,False,6.7,901.0,6036.7
3,False,/qHryw7hcO1Gcd7xQmJfDYPTgNLX.jpg,,22000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,5902.0,tt0075784,en,A Bridge Too Far,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,,Out of the sky comes the screen's most incredi...,A Bridge Too Far,False,7.2,369.0,2656.8
4,False,/1Wg8AmasOfb3HvkaS33AzJ6puph.jpg,,0.0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",,20416.0,tt0075925,en,The Deep,...,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,,,Is anything worth the terror of ... The Deep.,The Deep,False,6.2,100.0,620.0


### Data cleaning and processing for actor and director data

In [348]:
# Normalise the popularity column to float by going through its string form.
cast_crew = cast_crew.assign(
    popularity=lambda frame: frame["popularity"].astype(str).astype(float)
)

In [349]:
# Preview the first five cast/crew rows.
cast_crew.head(n=5)

Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2,,1,nm0000184,Directing,George Lucas,"Modesto, California, USA",6.642,/8qxin8urtFE0NqaZNFWOuV537bH.jpg
1,False,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2,,2,nm0000434,Acting,Mark Hamill,"Concord, California, USA",7.589,/fk8OfdReNltKZqOk2TZgkofCUFq.jpg
2,False,"[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...",Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2,,3,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",10.845,/7CcoVFTogQgex2kJkXKMe8qHZrC.jpg
3,False,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",Carrie Frances Fisher (21 October 1956 - 27 De...,1956-10-21,2016-12-27,1,http://www.carriefisher.com/,4,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",3.878,/rfJtncHewKVnHjqpIZvjn24ESeC.jpg
4,False,[Peter Wilton Cushing],"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",1913-05-26,1994-08-11,2,,5,nm0001088,Acting,Peter Cushing,"Kenley, Surrey, England, UK",2.704,/l0grZXcjqctESg7h6Jdp33pA3QG.jpg


### Removing unwanted columns for modelling

In [350]:
# Remove the columns that are not used for modelling from each dataset.
# Each frame is rebound to the trimmed result instead of being mutated in
# place, and every label is listed exactly once (the TMDb list previously
# repeated "status" and "title").

# Box Office Mojo: "gross" is dropped as well; the merged table carries the
# TMDb "revenue" column instead.
box_office_mojo = box_office_mojo.drop(columns=["url", "title", "gross"])

# IMDb: the raw inputs of the derived reviews / ratings / cast_crew_id columns.
imdb = imdb.drop(columns=[
    "critic_reviews", "user_reviews", "number_user_ratings", "user_rating",
    "cast_ids", "directors_ids", "cast", "directors",
])

# TMDb: descriptive metadata plus the raw vote columns folded into "votes".
tmdb = tmdb.drop(columns=[
    "belongs_to_collection", "backdrop_path", "title", "id", "adult",
    "production_companies", "overview", "poster_path", "original_language",
    "production_countries", "video", "status_code", "status",
    "status_message", "release_date", "homepage", "spoken_languages",
    "genres", "original_title", "tagline", "vote_average", "vote_count",
])

# Cast/crew: everything not listed here is kept (per the frame's columns,
# that leaves imdb_id and popularity).
cast_crew = cast_crew.drop(columns=[
    "adult", "id", "also_known_as", "biography", "deathday", "birthday",
    "gender", "homepage", "known_for_department", "place_of_birth",
    "profile_path", "name",
])

### Merge the datasets

In [351]:
# Index the box-office rows by IMDb title id so all three sources share a key.
box_office_mojo.set_index("tconst", inplace=True)

# Collapse each metadata source to one row per IMDb title id.
# GroupBy.first() takes the first non-null entry of every column per group.
tmdb_mer = tmdb.groupby(tmdb["imdb_id"]).first()
imdb_mer = imdb.groupby(imdb["tconst"]).first()

# Inner joins on the index: only titles present in every source survive.
df = pd.merge(tmdb_mer, imdb_mer, how="inner", left_index=True, right_index=True)
data = pd.merge(box_office_mojo, df, how="inner", left_index=True, right_index=True)

In [352]:
# Lookup table: person IMDb id -> popularity score.
# Built from the named columns rather than from positional fields of
# cast_crew.values, so it does not depend on the frame's column order.
# If an id occurs more than once, the last row wins.
cast_popularity = dict(zip(cast_crew["imdb_id"], cast_crew["popularity"]))

In [353]:
def get_popularity(cast_crew):
    """Return the summed popularity of the given people.

    Parameters
    ----------
    cast_crew : iterable
        Person ids to look up in the module-level ``cast_popularity`` dict.
        (The parameter shadows the module-level ``cast_crew`` DataFrame
        inside this function.)

    Returns
    -------
    number
        Sum of ``cast_popularity[id]`` over the ids that are present in the
        dict; ids that are not present are skipped. ``0`` when nothing matches.
    """
    total = 0
    for member_id in cast_crew:
        # Unknown people contribute nothing to the total.
        if member_id not in cast_popularity:
            continue
        total = total + cast_popularity[member_id]
    return total

In [354]:
# Total popularity of each movie's cast and crew, computed row by row from
# its list of person ids.
data["actor_popularity"] = data["cast_crew_id"].apply(get_popularity)

In [355]:
# The id lists have been folded into actor_popularity; remove the raw column.
data.drop(columns=["cast_crew_id"], inplace=True)

In [356]:
# Preview the first five rows of the merged modelling table.
data.head(n=5)

Unnamed: 0,movie_name,release_date,theaters,budget,tmdb_popularity,revenue,runtime,votes,awards,genres,imdb_popularity,meta_score,mmpa_rating,reviews,ratings,actor_popularity
tt0029583,Snow White and the Seven Dwarfs1983 Re-release...,1983-07-15,1362.0,1488423.0,18.454,184925486.0,83.0,31717.0,17,"[Animation, Family, Fantasy]",2.943485,95.0,G,396,1285152.4,4.941
tt0029583,Snow White and the Seven Dwarfs1987 Re-release...,1987-07-17,1729.0,1488423.0,18.454,184925486.0,83.0,31717.0,17,"[Animation, Family, Fantasy]",2.943485,95.0,G,396,1285152.4,4.941
tt0029583,Snow White and the Seven Dwarfs1993 Re-release...,1993-07-02,1831.0,1488423.0,18.454,184925486.0,83.0,31717.0,17,"[Animation, Family, Fantasy]",2.943485,95.0,G,396,1285152.4,4.941
tt0032455,Fantasia1985 Re-release (1985),1985-02-08,225.0,2280000.0,15.459,83320000.0,124.0,12247.0,9,"[Animation, Family, Fantasy]",2.943485,96.0,G,433,650418.6,5.211
tt0032455,Fantasia1990 Re-release (1990),1990-10-05,481.0,2280000.0,15.459,83320000.0,124.0,12247.0,9,"[Animation, Family, Fantasy]",2.943485,96.0,G,433,650418.6,5.211


In [357]:
# List the modelling columns, one label per line (iterating a DataFrame
# yields its column labels).
for column_label in data:
    print(column_label)

movie_name
release_date
theaters
budget
tmdb_popularity
revenue
runtime
votes
awards
genres
imdb_popularity
meta_score
mmpa_rating
reviews
ratings
actor_popularity


In [365]:
# One-hot encode the rating column and append the indicator columns on the
# right; the original mmpa_rating column is kept.
rating_dummies = pd.get_dummies(data["mmpa_rating"])
data = pd.concat([data, rating_dummies], axis="columns")

In [366]:
# Persist the modelling table in two formats: CSV first, then pickle.
for save_frame, destination in (
    (data.to_csv, "./data/modelling_data.csv"),
    (data.to_pickle, "./data/modelling_data.pkl"),
):
    save_frame(destination)

In [359]:
# Spot check: popularity row for a single person id.
cast_crew.loc[cast_crew["imdb_id"] == "nm0000093"]

Unnamed: 0,imdb_id,popularity
127,nm0000093,20.336


In [360]:
# Row(s) with the highest popularity score.
# Series.max() is vectorised and skips NaN by default, whereas the builtin
# max() iterates in Python and gives an order-dependent result when NaN is
# present.
cast_crew[cast_crew["popularity"] == cast_crew["popularity"].max()]

Unnamed: 0,imdb_id,popularity
169,nm0000134,32.371


In [None]:
da