# Box Office Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

import ast

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


## Henter data

In [2]:
data = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

train = data.copy()
test = data_test.copy()

train = text_to_dict(train)
test = text_to_dict(test)

In [3]:
len(train)

3000

In [4]:
len(test)


4398

# Analysere data

In [5]:
train.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,{},3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,{},1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,{},0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,{},"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


Ser at de fleste kolonnene inneholder objekter av type string. For å hente ut noe nyttig informasjon fra disse må vi parse ut stringen og kategorisere verdiene i objektet

In [6]:
train.keys()

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  3000 non-null   object 
 2   budget                 3000 non-null   int64  
 3   genres                 3000 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   3000 non-null   object 
 12  production_countries   3000 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

In [8]:
train.corr()

Unnamed: 0,id,budget,popularity,runtime,revenue
id,1.0,0.019732,-0.00747,0.01075,0.00061
budget,0.019732,1.0,0.342356,0.238373,0.752965
popularity,-0.00747,0.342356,1.0,0.13369,0.46146
runtime,0.01075,0.238373,0.13369,1.0,0.21638
revenue,0.00061,0.752965,0.46146,0.21638,1.0


Ser at budget og popularity har høy korrelasjon med revenue 

Fokuserer på genres, spoken_languages, cast og crew

# Forberede dataen

## Genres

Vi ser at filmer kan ha flere sjangere, men for simplisitet regner vi bare med den første, og vi antar at disse er sortert etter primærsjanger

Fra https://www.statista.com/statistics/188658/movie-genres-in-north-america-by-box-office-revenue-since-1995/ ser vi at Adventure, Action, Drama, Comedy, thriller er de mest inntjenede sjangrene i filmindustrien.

Setter derfor opp en binærverdi om filmen er action eller adventure.

Action = 28
Adventure = 12

In [9]:
train_prepared = train
test_prepared = test

In [10]:
def setGenres (data):
    genres = []
    for i in range(len(data)):
        try:
            sjangre = pd.DataFrame(data['genres'].loc[i])
            sjanger_id = sjangre.iat[0,0]
            if sjanger_id == sjanger_id == 12 or sjanger_id == 28:
                genres.append(1)
            else:
                genres.append(0)
        except:
            genres.append(0)
    data['act_or_adv'] = genres

In [11]:
setGenres(train_prepared)

In [12]:
corr_matrix = train_prepared.corr()

In [13]:
corr_matrix["revenue"].sort_values(ascending=False)

revenue       1.000000
budget        0.752965
popularity    0.461460
runtime       0.216380
act_or_adv    0.209031
id            0.000610
Name: revenue, dtype: float64

## Language

Kategoriserer så original_language til tallverdier

In [14]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

In [15]:
lang_cat = train_prepared[["original_language"]]
lang_cat_test = test_prepared[["original_language"]]

In [16]:
lang_cat_encoded = ordinal_encoder.fit_transform(lang_cat)
lang_cat_encoded_test = ordinal_encoder.fit_transform(lang_cat_test)

In [17]:
train_prepared['lan_cat'] = lang_cat_encoded
test_prepared['lan_cat'] = lang_cat_encoded_test

In [18]:
corr_matrix = train_prepared.corr()
corr_matrix["revenue"].sort_values(ascending=False)

revenue       1.000000
budget        0.752965
popularity    0.461460
runtime       0.216380
act_or_adv    0.209031
id            0.000610
lan_cat      -0.092296
Name: revenue, dtype: float64

## Crew - Director

Fra samme kilde som sist ser vi på hvilke directors som får høyest inntjeninger med filmene sine https://www.statista.com/statistics/655728/all-time-top-grossing-directors-box-office/.

Vi ser at Steven Spielberg har dobbel så høy inntjening som nestemann på listen. Lager derfor en egen feature om han er regissør og en for resten av top 8.

Steven Spielberg = 488

Michael Bay = 865
Anthony Russo = 19271
Joe Russo = 19272
Peter Jackson = 108
Ron Howard = 6159
Robert Zemeckis = 24
Christopher Nolan = 525
James Cameron = 2710


In [19]:
def isSpielberg (data):
    spielberg = []
    for i in range (len(data)):
        try:
            crew = pd.DataFrame(data['crew'].iloc[i])
            directorId = crew [(crew.job == 'Director')].at[0, 'id']
            if directorId == 488:
                spielberg.append(1)
            else:
                spielberg.append(0)
        except:
            spielberg.append(0)
    data['isSpielberg'] = spielberg

In [20]:
def isTop8Dir (data):
    top8 = []
    for i in range (len(data)):
        try:
            crew = pd.DataFrame(data['crew'].iloc[i])
            directorId = crew [(crew.job == 'Director')].at[0, 'id']
            if (directorId == 865 or directorId == 19271 or directorId == 19272 or 
                directorId == 108 or directorId == 6159 or directorId == 24 or 
                directorId == 525 or directorId == 2710):
                top8.append(1)
            else:
                top8.append(0)
        except:
            top8.append(0)
    data['isTop8Dir'] = top8

In [21]:
isSpielberg(train_prepared)
isTop8Dir(train_prepared)

In [22]:
corr_matrix = train_prepared.corr()
corr_matrix["revenue"].sort_values(ascending=False)

revenue        1.000000
budget         0.752965
popularity     0.461460
runtime        0.216380
act_or_adv     0.209031
isSpielberg    0.108314
isTop8Dir      0.092721
id             0.000610
lan_cat       -0.092296
Name: revenue, dtype: float64

## Cast - Actors

Samme strategi som sist: https://www.statista.com/statistics/655480/all-time-top-grossing-actors-box-office/
Her tar vi en binærverdi om filmen har top 5 skuespillere som ranker øverst på grossing og spiller hovedrollen.

Samuel L. Jackson = 2231
Robert Downey Jr. = 3223
Scarlett Johannsson = 1245
Harrison Ford = 3
Tom Hanks = 31

In [23]:
def isTop5Act (data):
    top5actor = []
    for i in range(len(data)):
        try:
            cast = pd.DataFrame(data['cast'].iloc[i])
            actor = cast.at[0, 'id']
            if actor == 2231 or actor == 3223 or actor == 1245 or actor == 3 or actor == 31:
                top5actor.append(1)
            else:
                top5actor.append(0)
        except:
            top5actor.append(0)
    data['top5actor'] = top5actor

In [24]:
isTop5Act (train_prepared)

In [25]:
corr_matrix = train_prepared.corr()
corr_matrix["revenue"].sort_values(ascending=False)

revenue        1.000000
budget         0.752965
popularity     0.461460
runtime        0.216380
act_or_adv     0.209031
top5actor      0.126734
isSpielberg    0.108314
isTop8Dir      0.092721
id             0.000610
lan_cat       -0.092296
Name: revenue, dtype: float64

## Collections

Binært uttrykk for om filmen er i en collection eller ikke

In [26]:
def setCollection (data):
    collection_list = []
    for i in range(len(data)):
        if pd.DataFrame(data['belongs_to_collection'].iloc[i]).empty:
            collection_list.append(0)
        else:
            collection_list.append(1)
    data['isCollection'] = collection_list

In [27]:
setCollection(train_prepared)
setCollection(test_prepared)

In [28]:
corr_matrix = train_prepared.corr()
corr_matrix["revenue"].sort_values(ascending=False)

revenue         1.000000
budget          0.752965
popularity      0.461460
isCollection    0.339425
runtime         0.216380
act_or_adv      0.209031
top5actor       0.126734
isSpielberg     0.108314
isTop8Dir       0.092721
id              0.000610
lan_cat        -0.092296
Name: revenue, dtype: float64

## Fikser language til binær

In [29]:
fixedLang = []
for lang in train_prepared['lan_cat']:
    if lang == 7:
        fixedLang.append(1)
    else:
        fixedLang.append(0)
train_prepared['lan_cat'] = fixedLang

In [30]:
train_prepared.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,Keywords,cast,crew,revenue,act_or_adv,lan_cat,isSpielberg,isTop8Dir,top5actor,isCollection
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,0,1,0,0,0,1
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,0,1,0,0,0,1
2,3,{},3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,0,1,0,0,0,0
3,4,{},1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,0,0,0,0,0,0
4,5,{},0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,{},"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,1,0,0,0,0,0


## Rydde opp i unødvendig features

Fjerner budgets hvor verdi = 0

In [31]:
fixedBud = []
for bud in train_prepared['budget']:
    if bud == 0:
        fixedBud.append(None)
    else:
        fixedBud.append(bud)

train_prepared['budget'] = fixedBud

In [32]:
train_finished = train_prepared.drop(['belongs_to_collection', 'genres', 'homepage', 'imdb_id', 'original_language', 
                                      'original_title', 'overview','poster_path', 'production_companies',
                                     'production_countries', 'release_date','spoken_languages',
                                     'status', 'tagline', 'title', 'Keywords', 'cast', 'crew',
                                     ], axis = 1)
test_finished = test_prepared.drop(['belongs_to_collection', 'genres', 'homepage', 'imdb_id', 'original_language', 
                                      'original_title', 'overview','poster_path', 'production_companies',
                                     'production_countries', 'release_date','spoken_languages',
                                     'status', 'tagline', 'title', 'Keywords', 'cast', 'crew',
                                     ], axis = 1)

In [33]:
train_finished.isnull().sum()

id                0
budget          812
popularity        0
runtime           2
revenue           0
act_or_adv        0
lan_cat           0
isSpielberg       0
isTop8Dir         0
top5actor         0
isCollection      0
dtype: int64

Fjerner manglende verdier. Synd at budget har så mange manglende verdier ettersom denne featuren har det høyeste korrelasjonsnivået

In [34]:
train_finished = train_finished.dropna(axis = 0, how = 'any')
train_finished.isnull().sum()

id              0
budget          0
popularity      0
runtime         0
revenue         0
act_or_adv      0
lan_cat         0
isSpielberg     0
isTop8Dir       0
top5actor       0
isCollection    0
dtype: int64

In [35]:
data_labels = train_finished['revenue']
train_finished = train_finished.drop('revenue', axis = 1)

In [36]:
train_finished = train_finished.drop('id', axis = 1)

In [37]:
train_finished.head()

Unnamed: 0,budget,popularity,runtime,act_or_adv,lan_cat,isSpielberg,isTop8Dir,top5actor,isCollection
0,14000000.0,6.575393,93.0,0,1,0,0,0,1
1,40000000.0,8.248895,113.0,0,1,0,0,0,1
2,3300000.0,64.29999,105.0,0,1,0,0,0,0
3,1200000.0,3.174936,122.0,0,0,0,0,0,0
5,8000000.0,0.743274,83.0,0,1,0,0,0,0


# Finne riktig modell

### Linear Regression

In [38]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(train_finished, data_labels)

from sklearn.metrics import mean_squared_error
train_finished_lin = lin_reg.predict(train_finished)
lin_mse = mean_squared_error(data_labels, train_finished_lin)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

92989626.43753068

### Random Forrest

In [39]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(train_finished, data_labels)
train_finished_predictions_forr = forest_reg.predict(train_finished)
forest_mse = mean_squared_error(data_labels, train_finished_predictions_forr)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

34020917.81965044

Random Forrest gir oss det beste resultatet. Tar i bruk gridsearch for å finne de beste parametrene til modellen.

In [40]:
from sklearn.model_selection import GridSearchCV

param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, 
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]

In [41]:
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=30,scoring='neg_mean_squared_error')
grid_search.fit(train_finished, data_labels)
h = grid_search.best_params_
gridres = grid_search.cv_results_
for mean_score, params in zip(gridres["mean_test_score"], gridres["params"]):
    print(np.sqrt(-mean_score), params)

108397851.40777034 {'max_features': 2, 'n_estimators': 3}
95210346.9066658 {'max_features': 2, 'n_estimators': 10}
90222687.9403006 {'max_features': 2, 'n_estimators': 30}
97591964.67904128 {'max_features': 4, 'n_estimators': 3}
91783484.9706546 {'max_features': 4, 'n_estimators': 10}
89682992.63472028 {'max_features': 4, 'n_estimators': 30}
104230509.55196415 {'max_features': 6, 'n_estimators': 3}
93057746.61253598 {'max_features': 6, 'n_estimators': 10}
90517883.40148057 {'max_features': 6, 'n_estimators': 30}
103597778.80415303 {'max_features': 8, 'n_estimators': 3}
92959226.77725312 {'max_features': 8, 'n_estimators': 10}
92019304.15734518 {'max_features': 8, 'n_estimators': 30}
102562750.10989217 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
96698434.81589857 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
99697888.42886516 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
93448328.42839059 {'bootstrap': False, 'max_features': 3, 'n_estimators': 1

In [42]:
final_model = grid_search.best_estimator_
final_model

RandomForestRegressor(max_features=4, n_estimators=30)

In [43]:
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

final_predictions = final_model.predict(train_finished)
final_mse = mean_squared_log_error(data_labels, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

1.7354518111143593

Sier oss fornøyd med modellen. Dumper den så til en joblib-fil som brukes i webapplikasjonen

In [44]:
from joblib import dump

In [45]:
dump(final_model, 'box_office_pred.joblib', compress=6)

['box_office_pred.joblib']