#Overview & motivation

All of our team members enjoy movies. In addition to enjoying movies, we also enjoy working with API’s and somewhat structured data sets. Therefore, determining what makes a movie successful using the data available in the Internet Movie Database (IMDB) and Wikipedia seemed like a natural choice.

In [26]:
from imdb import IMDb
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import cPickle as pickle
ia = IMDb(accessSystem='http')
from collections import defaultdict 
import io
from datetime import datetime
import time
from sklearn.linear_model import LogisticRegression, ElasticNet
import sklearn.svm
import math
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

###Related Work

#Initial Questions

#Data

In [2]:
# Load AAdict (dict of Oscar nominated movies)
AAdict = pickle.load(open('AAdict.p','rb'))
# Load movies (dict of all movies)
#movies = pickle.load(io.open('moviestemp.p','rb'))

In [3]:
# convert AAdict to pandas
AAdf = pd.DataFrame.from_dict(AAdict).transpose()
AAdf['movieid'] = AAdf.index

# Create new columns
# number of nominations
# winner
AAdf['winner'] = AAdf['won'].apply(lambda x: len(x)!=0) * 1
# convert years to ints
AAdf['year'] = AAdf['year'].apply(lambda x: int(x))

AAdf.head()
#AAdf[AAdf['Nominated Best Actor']==1].head()

Unnamed: 0,Nominated Best Actor,Nominated Best Actress,Nominated Best Animated Feature Film,Nominated Best Art Direction,Nominated Best Cinematography,Nominated Best Costume Design,Nominated Best Director,"Nominated Best Documentary, Feature","Nominated Best Documentary, Short Subject",Nominated Best Film Editing,...,keywords,mpaa,nominations,releasedate,runtime,title,won,year,movieid,winner
35423,False,False,False,False,False,False,False,False,False,False,...,"[time-travel, brooklyn-bridge, bridge, time-tr...",PG-13,"[Best Music, Song]",2001-12-25,118,Kate & Leopold,[],2001,35423,0
80388,Burt Lancaster,Susan Sarandon,False,False,False,False,Louis Malle,False,False,False,...,"[drugs, gangster, camera-shot-of-feet, female-...",,"[Best Picture, Best Actor, Best Actress, Best ...",1981-04-03,104,Atlantic City,[],1981,80388,0
80855,False,False,False,Tambi Larsen (Art Direction); Jim Berkey (Set ...,False,False,False,False,False,False,...,"[immigrant, sheriff, 1890s, johnson-county-war...",,[Best Art Direction],1980-11-18,149,Heaven's Gate,[],1981,80855,0
81974,Paul Newman,False,False,False,False,False,False,False,False,False,...,"[murder, newspaper, mafia, reporter, slander, ...",,"[Best Actor, Best Supporting Actress, Best Wri...",1981-11-19,116,Absence of Malice,[],1981,81974,0
81988,False,False,False,False,False,False,False,"Suzanne Bauman, Paul Neshamkin, Jim Burroughs ...",False,False,...,,,"[Best Documentary, Feature]",,60,Against Wind and Tide: A Cuban Odyssey,[],1981,81988,0


### Include Star Power info to AAdf

###Import and Split Data

First we split our data into a validation set (movies in 2006) and a training set (movies 1981-2005).  We will use k-fold cross validation to train our model.  In this preliminary analysis, we train our model to predict Oscar winners given the movie was nominated.

In [4]:
# Functions

# convert release dates into quarters
def get_quarter(monthint):
    if len(monthint) == 1:
        return 0
    else:
        if int(monthint[1]) <= 3:
            return 1
        if int(monthint[1]) >3 and int(monthint[1]) <=6:
            return 2
        if int(monthint[1]) >6 and int(monthint[1]) <=9:
            return 3
        if int(monthint[1]) >9:
            return 4
# convert release dates into month
def get_month(monthint):
    if len(monthint) == 1:
        return 0
    else:
        return(int(monthint[1]))

# convert the string of countries into countries
def get_countries(countrylist):
    countries =[]
    if countrylist == 0:
        return [u'USA']
    else:
        for country in countrylist.split('/'):
            country = country.replace(" ", "")
            countries.append(country)
        return countries

# Roc curve function, taken from HW3
from sklearn.metrics import roc_curve, auc
def make_roc(name, clf, ytest, xtest, ax=None, labe=5, proba=True, skip=0):
    initial=False
    if not ax:
        ax=plt.gca()
        initial=True
    if proba:#for stuff like logistic regression
        fpr, tpr, thresholds=roc_curve(ytest, clf.predict_proba(xtest)[:,1])
    else:#for stuff like SVM
        fpr, tpr, thresholds=roc_curve(ytest, clf.decision_function(xtest))
    roc_auc = auc(fpr, tpr)
    if skip:
        l=fpr.shape[0]
        ax.plot(fpr[0:l:skip], tpr[0:l:skip], '.-', alpha=0.3, label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))
    else:
        ax.plot(fpr, tpr, '.-', alpha=0.3, label='ROC curve for %s (area = %0.2f)' % (name, roc_auc))
    label_kwargs = {}
    label_kwargs['bbox'] = dict(
        boxstyle='round,pad=0.3', alpha=0.2,
    )
    if labe!=None:
        for k in xrange(0, fpr.shape[0],labe):
            #from https://gist.github.com/podshumok/c1d1c9394335d86255b8
            threshold = str(np.round(thresholds[k], 2))
            ax.annotate(threshold, (fpr[k], tpr[k]), **label_kwargs)
    if initial:
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC')
    ax.legend(loc="lower right")
    return ax

In [8]:
%%time
# Dealing with Categorical variables & creating other descriptive variables

# mpaa is ordinal, convert to ordinal 'mpaaint'
AAdf['mpaaint'] = AAdf['mpaa']
AAdf['mpaaint'].loc[AAdf['mpaaint']=='R'] = 3
AAdf['mpaaint'].loc[AAdf['mpaaint']=='PG-13'] = 2
AAdf['mpaaint'].loc[AAdf['mpaaint']=='PG'] = 1
AAdf['mpaaint'].loc[pd.isnull(AAdf['mpaaint'])] = 0

# count number of nominations
AAdf.loc[:,'numnominations'] = AAdf['nominations'].apply(lambda x: len(x))

# convert release dates to quarters
AAdf.loc[:,'quarter'] = AAdf['releasedate'].apply( lambda x: get_quarter(str(x).split('-')) )

# convert release dates into months
AAdf.loc[:,'month'] = AAdf['releasedate'].apply( lambda x: get_month(str(x).split('-')) )

# convert release dates to number of days since beginning of the year
AAdf.loc[:,'countrylist'] = AAdf['country'].apply( lambda x: get_countries(x) )
# get unique list of countries
allcountries = []
for countrylist in AAdf['countrylist']:
    for country in countrylist:
        allcountries.append(country)
uniquecountries = set(allcountries)
# create dummy variables for countries
for country in uniquecountries:
    AAdf[country] = 0
for movie in AAdf.iterrows():
    if type(movie[1]['countrylist']) == list:
        for country in uniquecountries:
            if country in set(movie[1]['countrylist']):
                AAdf.loc[movie[0],country] = 1
            else:
                AAdf.loc[movie[0],country] = 0

# make genres into dummy variables

AAdf.head()

Wall time: 39.7 s


In [9]:
AAdf.columns

Index([                     u'Nominated Best Actor',
                          u'Nominated Best Actress',
            u'Nominated Best Animated Feature Film',
                    u'Nominated Best Art Direction',
                   u'Nominated Best Cinematography',
                   u'Nominated Best Costume Design',
                         u'Nominated Best Director',
             u'Nominated Best Documentary, Feature',
       u'Nominated Best Documentary, Short Subject',
                     u'Nominated Best Film Editing', 
       ...
                                           u'India',
                 u'UnionofSovietSocialistRepublics',
                                         u'Austria',
                                         u'Vietnam',
                                      u'Yugoslavia',
                                              u'UK',
                                         u'Hungary',
                                          u'Taiwan',
                                  

In [10]:
## Split Data

# Test Set:  will be 2006 movies
testdf = AAdf[AAdf['year']>=2006].copy()

# Training Set:  1981-2005 movies
traindf = AAdf[AAdf['year']<2005].copy()

# Create k-fold training/test datasets
from sklearn.cross_validation import KFold
kfdf = KFold(n=len(traindf), n_folds=10, shuffle=True, random_state=None)

### Model Setup

In [11]:
# Baseline classifier:
1-np.mean(traindf['winner'])

0.7324281150159744

##### Random Forest Classifier

Because the RandomForest Classifier can't take categorical variables as inputs.  Therefore we have to deal with the categorical variables in some way.

For categorical variables that are considered ordinal (mpaa), we can just convert them to ints.
For categorical varaibles that are not ordinal (keywords), we will use sparse matrcies.

In [19]:
# identify desired variables to be used in the model
#movievars=[u'runtime',u'year',u'releasedate',u'nominations',u'mpaa',u'genres',u'country']
movievars=[u'runtime',u'year',u'mpaaint',u'numnominations',u'month']
movievars.extend(list(uniquecountries))
Xtrain=traindf[movievars].values
Ytrain=traindf['winner'].values
test=testdf[movievars].values

#movievarstest = movievars
#movievarstest.append(u'winner')
xtest=testdf[movievars].values
ytest=testdf['winner'].values

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

# Create the random forest object
forest = RandomForestClassifier(n_estimators=1000, max_depth=15, min_samples_split=15, random_state=0)
# Fit to the training data
forest = forest.fit(Xtrain,Ytrain)
# Predit on test data
predictionRF = forest.predict(test)
# Get score of prediction
print "Score of Random Forest classifier:", forest.score(xtest, ytest)

Score of Random Forest classifier: 0.827586206897


In [21]:
zip(predictionRF, ytest)

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0)]

#### Logistic Regression

In [None]:
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
clfLR = lr.fit(Xtrain,Ytrain)
predictionLR = lr.predict_proba(xtest)

In [23]:
#lr.predict_proba?
zip(predictionLR, ytest)

[(array([ 0.64172888,  0.35827112]), 0),
 (array([ 0.82709722,  0.17290278]), 0),
 (array([ 0.80466282,  0.19533718]), 0),
 (array([ 0.89524554,  0.10475446]), 0),
 (array([ 0.8907331,  0.1092669]), 0),
 (array([ 0.8688053,  0.1311947]), 1),
 (array([ 0.58910092,  0.41089908]), 1),
 (array([ 0.84974603,  0.15025397]), 0),
 (array([ 0.85724001,  0.14275999]), 0),
 (array([ 0.85311371,  0.14688629]), 0),
 (array([ 0.68059527,  0.31940473]), 0),
 (array([ 0.84420808,  0.15579192]), 1),
 (array([ 0.43056763,  0.56943237]), 1),
 (array([ 0.84386551,  0.15613449]), 0),
 (array([ 0.79058527,  0.20941473]), 0),
 (array([ 0.87020884,  0.12979116]), 1),
 (array([ 0.2480387,  0.7519613]), 1),
 (array([ 0.78266673,  0.21733327]), 1),
 (array([ 0.85724001,  0.14275999]), 0),
 (array([ 0.80369627,  0.19630373]), 0),
 (array([ 0.82033616,  0.17966384]), 0),
 (array([ 0.11532509,  0.88467491]), 1),
 (array([ 0.85405832,  0.14594168]), 0),
 (array([ 0.82688119,  0.17311881]), 0),
 (array([ 0.46894102, 

In [28]:
#with sns.hls_palette(8, l=.3, s=.8):
with sns.hls_palette(8, l=.3, s=.8):
    ax=make_roc("logistic-regression",clfLR, ytest, xtest, labe=200, skip=50)
#make_roc("logistic regression",clfLR, ytest, xtest, labe=3, skip=50)

TypeError: make_roc() got an unexpected keyword argument 'prob'

In [None]:
get_

In [47]:
#Define do_classify and cv_optimize, used from homework 3.
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best

def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.3f" % (training_accuracy)
    print "Accuracy on test data:     %0.3f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [188]:
# Import Logistic Regresion and grid_search from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC

clfsvm=LinearSVC(loss="hinge")
Cs=[0.000001,0.00001,0.0001,0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
Xmatrix=traindf[movievars].values
Yresp=traindf['winner'].values
xtest=testdf[movievars].values
ytest=testdf['winner'].values

# Initialize classifer as Logistic Regression
clf = LogisticRegression()

# Use GridSearchCV over the parameter grid of regularization coefficients in 
# the Cs array to get the best fit classifier using 5-fold cross validation
fitmodel = GridSearchCV(clf, param_grid=dict(C=Cs), cv=5, scoring="accuracy")

# Fit over training data
fitmodel.fit(Xmatrix,Yresp)

# Find best value of C
best = fitmodel.best_params_
best

{'C': 1e-06}

In [193]:
# Import accuracy_score from sklearn
from sklearn.metrics import accuracy_score

#calculate the accuracy here
clf=LogisticRegression(C=fitmodel.best_params_['C'])
clf.fit(Xmatrix,Yresp)
ypred=clf.predict(Xmatrix)
print "Accuracy on training data: ", clf.score(Xmatrix, Yresp)
#print "Accuracy on test data: ", accuracy_score(xtest, ytest)

Accuracy on training data:  0.732428115016


In [142]:
%%time
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
clfsvm, Xtrain, ytrain, Xtest, ytest = do_classify(LinearSVC(loss="hinge"), {"C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}, traindf, movievars, u'winner',1, mask=False)

using mask


TypeError: Singleton array 0 cannot be considered a valid collection.

In [38]:
for train_index, test_index in kfdf:
    print train_index
    print test_index

[   1    2    4 ..., 1303 1304 1305]
[   0    3   26   28   30   41   51   54   59   63   83   95  100  117  146
  154  158  163  182  190  198  202  205  211  222  269  270  271  275  304
  325  331  339  364  368  375  376  381  404  405  434  436  472  491  492
  495  521  534  535  540  568  575  578  584  586  600  606  607  622  624
  631  644  647  651  653  662  667  669  673  679  681  702  717  722  743
  755  771  774  778  780  793  798  810  816  846  848  860  900  901  909
  917  919  923  925  946  957  962  969  975  976  991 1000 1018 1035 1045
 1063 1064 1092 1110 1112 1115 1118 1122 1125 1134 1149 1181 1197 1205 1210
 1211 1218 1223 1227 1238 1259 1262 1276 1287 1292 1294]
[   0    1    2 ..., 1303 1304 1305]
[   7   12   60   77   81   92  114  122  126  131  153  161  173  174  196
  204  248  249  255  268  287  296  308  312  318  320  327  342  345  349
  357  383  393  400  401  406  408  413  418  423  439  440  443  449  458
  507  526  528  542  550  555  5

### Cross Validation

In [27]:
# Import Logistic Regresion and grid_search from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

1364

#Exploratory Data Analysis

#Final Analysis

#Presentation