# Predict Movie Box Office

To predict the boxoffice for a movie using the pre-processed datasets from the IMDb and WorldBoxOffice data sources.
refer: https://github.com/madsenmj/iot-ml-imdb-demo/tree/master/src

In [1]:
#Import libraries

import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

from sklearn import naive_bayes
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import cluster
from sklearn import tree
from sklearn import ensemble
from sklearn import preprocessing

# Compute confusion matrix
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')



Load the dataset, then use the LabelEncoder to create labels for the Director and Genre columns. Finally, encode the year as a label as well.

In [2]:
FIELDS = ['worldGross','originalTitle','startYear','runtimeMinutes', 
          'genres', 'directors', 'writers', 'principalCast', 
          'genres1', 'genres2', 'genres3', 
          'director1', 'director2', 'director3', 'director4',
          'writer1', 'writer2', 'writer3', 'writer4',
          'principalCast1', 'principalCast2', 'principalCast3', 'principalCast4',
          'principalCast5', 'principalCast6', 'principalCast7', 'principalCast8'
          ]

# df = pd.read_csv("./input/movie_title_list_simple.csv")
df = pd.read_csv("./input/movie_title_list_v1.00.csv")

# max_len = 0
# for d in df['genres']:
# for d in df['directors']:
# for d in df['writers']:
# for d in df['principalCast']:
#     l=len(str(d).split(','))
#     if l>4:
#         print(d)
#     max_len = l if l > max_len else max_len
# print(max_len)

temp_df = pd.DataFrame()
temp_df = df.reindex(columns = FIELDS)

for i in range(0, len(df.index)):
    slist=str(df.loc[i, 'genres']).split(',')
    l = len(slist)
    for j in range(0, 3 if l > 3 else l):
        colname = 'genres' + str(j+1)
        temp_df.loc[i, colname] = slist[j]
        
    slist=str(df.loc[i, 'directors']).split(',')
    l = len(slist)
    for j in range(0, 4 if l > 4 else l):
        colname = 'director' + str(j+1)
        temp_df.loc[i, colname] = slist[j]
        
    slist=str(df.loc[i, 'writers']).split(',')
    l = len(slist)
    for j in range(0, 4 if l > 4 else l):
        colname = 'writer' + str(j+1)
        temp_df.loc[i, colname] = slist[j]        
        
    slist=str(df.loc[i, 'principalCast']).split(',')
    l = len(slist)
    for j in range(0, 8 if l > 8 else l):
        colname = 'principalCast' + str(j+1)
        temp_df.loc[i, colname] = slist[j]
print(temp_df)

       worldGross                                  originalTitle  startYear  \
0          2788.0                                         Avatar       2009   
1          2187.4                                        Titanic       1997   
2          2187.4                                        Titanic       1953   
3          2066.0     Star Wars: Episode VII - The Force Awakens       2015   
4          1670.4                                 Jurassic World       2015   
5          1519.6                                   The Avengers       2012   
6          1516.0                                  Furious Seven       2015   
7          1405.4                        Avengers: Age of Ultron       2015   
8          1391.7                        The Fate of the Furious       2017   
9          1341.5   Harry Potter and the Deathly Hallows: Part 2       2011   
10         1276.5                                         Frozen       2010   
11         1276.5                                   

In [None]:
genres = pd.concat([temp_df['genres1'], temp_df['genres2'], temp_df['genres3']])
directors = pd.concat([temp_df['director1'],temp_df['director2'],temp_df['director3'],temp_df['director4']])
writers = pd.concat([temp_df['writer1'],temp_df['writer2'],temp_df['writer3'],temp_df['writer4']])
principalCast = pd.concat([temp_df['principalCast1'],temp_df['principalCast2'],
                           temp_df['principalCast3'],temp_df['principalCast4'],
                           temp_df['principalCast5'],temp_df['principalCast6'],
                           temp_df['principalCast7'],temp_df['principalCast8']])

#Try model fitting on just director, year, and genre and see what we get.
dfred = pd.DataFrame()
y=(temp_df["worldGross"].reset_index())['worldGross']

leD = preprocessing.LabelEncoder()
leD.fit(genres.astype(str))
for j in range(0, 1):
    colname = 'genres' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leD = preprocessing.LabelEncoder()
leD.fit(directors.astype(str))
for j in range(0, 4):
    colname = 'director' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leD.fit(writers.astype(str))
for j in range(0, 4):
    colname = 'writer' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leD.fit(principalCast.astype(str))
for j in range(0, 8):
    colname = 'principalCast' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leY = preprocessing.LabelEncoder()
leY.fit(temp_df['startYear'].astype(str))
dfred['startYear'] = leY.transform(temp_df['startYear'].astype(str))

print("y: %s"%(y))
print(dfred)

In [5]:
genres = pd.concat([temp_df['genres1'], temp_df['genres2'], temp_df['genres3']])
directors = pd.concat([temp_df['director1']])
writers = pd.concat([temp_df['writer1']])
principalCast = pd.concat([temp_df['principalCast1'],temp_df['principalCast2']])

#Try model fitting on just director, year, and genre and see what we get.
dfred = pd.DataFrame()

for i in range(0, len(df.index)):
    if df.loc[i, 'worldGross'] > 200:
        temp_df.loc[i, 'worldGross'] = 9
    elif df.loc[i, 'worldGross'] > 150:
        temp_df.loc[i, 'worldGross'] = 8
    elif df.loc[i, 'worldGross'] > 100:
        temp_df.loc[i, 'worldGross'] = 7
    elif df.loc[i, 'worldGross'] > 65:
        temp_df.loc[i, 'worldGross'] = 6
    elif df.loc[i, 'worldGross'] > 40:
        temp_df.loc[i, 'worldGross'] = 5
    elif df.loc[i, 'worldGross'] > 20:
        temp_df.loc[i, 'worldGross'] = 4
    elif df.loc[i, 'worldGross'] > 10:
        temp_df.loc[i, 'worldGross'] = 3
    elif df.loc[i, 'worldGross'] > 1:
        temp_df.loc[i, 'worldGross'] = 2
    else:
        temp_df.loc[i, 'worldGross'] = 1
        
y=(temp_df['worldGross'].reset_index())['worldGross']

leD = preprocessing.LabelEncoder()
leD.fit(genres.astype(str))
for j in range(0, 1):
    colname = 'genres' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leD = preprocessing.LabelEncoder()
leD.fit(directors.astype(str))
for j in range(0, 1):
    colname = 'director' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leD.fit(writers.astype(str))
for j in range(0, 1):
    colname = 'writer' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leD.fit(principalCast.astype(str))
for j in range(0, 2):
    colname = 'principalCast' + str(j+1)
    dfred[colname] = leD.transform(temp_df[colname].astype(str))

leY = preprocessing.LabelEncoder()
leY.fit(temp_df['startYear'].astype(str))
dfred['startYear'] = leY.transform(temp_df['startYear'].astype(str))

leR = preprocessing.LabelEncoder()
leR.fit(temp_df['runtimeMinutes'].astype(str))
dfred['runtimeMinutes'] = leY.transform(temp_df['runtimeMinutes'].astype(str))

print(y)
print(dfred)

ValueError: y contains new labels: ['100.0' '101.0' '102.0' '103.0' '104.0' '105.0' '106.0' '107.0' '108.0'
 '109.0' '110.0' '111.0' '112.0' '113.0' '114.0' '115.0' '116.0' '117.0'
 '118.0' '119.0' '120.0' '121.0' '122.0' '123.0' '124.0' '125.0' '126.0'
 '127.0' '128.0' '129.0' '130.0' '131.0' '132.0' '133.0' '134.0' '135.0'
 '136.0' '137.0' '138.0' '139.0' '140.0' '141.0' '142.0' '143.0' '144.0'
 '145.0' '146.0' '147.0' '148.0' '149.0' '150.0' '151.0' '152.0' '153.0'
 '154.0' '155.0' '156.0' '157.0' '158.0' '159.0' '160.0' '161.0' '162.0'
 '163.0' '164.0' '165.0' '166.0' '167.0' '168.0' '169.0' '170.0' '171.0'
 '172.0' '173.0' '174.0' '175.0' '176.0' '177.0' '178.0' '179.0' '180.0'
 '181.0' '182.0' '183.0' '184.0' '185.0' '186.0' '187.0' '188.0' '189.0'
 '191.0' '192.0' '193.0' '194.0' '195.0' '197.0' '201.0' '202.0' '205.0'
 '206.0' '207.0' '208.0' '210.0' '212.0' '213.0' '216.0' '219.0' '220.0'
 '224.0' '225.0' '229.0' '238.0' '242.0' '271.0' '298.0' '306.0' '310.0'
 '325.0' '357.0' '38.0' '40.0' '41.0' '427.0' '44.0' '45.0' '46.0' '47.0'
 '48.0' '50.0' '51.0' '52.0' '54.0' '55.0' '56.0' '57.0' '58.0' '59.0'
 '60.0' '61.0' '62.0' '63.0' '64.0' '65.0' '66.0' '67.0' '68.0' '69.0'
 '70.0' '71.0' '72.0' '73.0' '74.0' '75.0' '76.0' '77.0' '78.0' '79.0'
 '80.0' '81.0' '82.0' '83.0' '84.0' '85.0' '86.0' '87.0' '88.0' '89.0'
 '90.0' '91.0' '92.0' '93.0' '94.0' '95.0' '96.0' '97.0' '98.0' '99.0'
 'nan']

I try using a random forest classifier to see how well it works at predicting the number of stars for out-of-sample data.

In [None]:

#How well does a decision tree model work to predict these data?
X_train, X_test, y_train, y_test = train_test_split(dfred,y,test_size=0.20, random_state=1) #split the data for training

#dtc = tree.DecisionTreeClassifier(min_samples_leaf=2)
dtc = ensemble.RandomForestClassifier(n_estimators=200,min_samples_leaf=5)
#dtc = svm.SVC(kernel='rbf',gamma=10,C=100,probability=True)
dtc.fit(X_train.astype(str),y_train.astype(str))
print( "score = {}".format(dtc.score(X_test.astype(str),y_test.astype(str))))
print (dtc.feature_importances_)
pdata =dtc.predict(X_test.astype(str))

#dtc_probs = dtc.predict_proba(X_test)
#score = log_loss(y_test, dtc_probs)
#print( "logloss = {}".format(score))
cm = confusion_matrix(y_test.astype(str), pdata.astype(str))
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)

Although the model only has about 40% accuracy, it does take the inputs and make a prediction. This is good enough for now.