In [1]:
import os, sys
# Locate the project root: use the working directory when it is already the
# 'moviescope' folder, otherwise fall back to its parent. The root is then
# appended to sys.path so project modules can be imported.
_cwd = os.getcwd()
base_path = _cwd if os.path.basename(_cwd) == 'moviescope' else os.path.join(_cwd, '..')
sys.path.append(base_path)

In [2]:
import numpy as np
from pandas import read_csv 

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import confusion_matrix

In [3]:
# Put a specific xgboost 0.6 egg at the front of sys.path so the following
# `import xgboost` resolves to it.
# NOTE(review): this is a hard-coded absolute path from one machine; the
# notebook will not find xgboost this way anywhere else.
sys.path.insert(0, '/net/if2/ks6cq/public_html/vqa/keras2/local/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg')

In [4]:
from sklearn.metrics import average_precision_score
import xgboost

In [5]:
from utils import get_labels, get_ids

In [6]:
# Load the movie metadata table from <project root>/data/movie_metadata.csv.
metadata_path = os.path.join(base_path, 'data', 'movie_metadata.csv')
# Use a context manager so the file handle is closed as soon as the CSV has
# been parsed (the handle was previously left open for the kernel's lifetime).
with open(metadata_path) as inFile:
    data = read_csv(inFile)

In [7]:
# Numeric metadata columns that form the feature vector of each movie.
attributes = [
        'num_critic_for_reviews',
        'duration',
        'actor_1_facebook_likes',
        'actor_2_facebook_likes',
        'actor_3_facebook_likes',
        'num_voted_users',
        'facenumber_in_poster',
        'num_user_for_reviews',
        'imdb_score',
        'title_year',
        'movie_facebook_likes'
        ]

# Select the feature columns, replace missing values with 0 and convert to a
# NumPy array. DataFrame.as_matrix is deprecated and was removed in pandas 1.0;
# column selection followed by .values gives the same array on old and new
# pandas, and a missing column now raises KeyError instead of being silently
# filled in.
matrix = data[attributes].fillna(0).values

In [8]:
# Load the label arrays and the id arrays for the train / validation / test
# splits via the project helpers in utils (the recorded output shows them
# reading pickles under data/labels and data/index).
# The ids are used below to index rows of `matrix`.
trainLabels, valLabels, testLabels = get_labels()
trainIds, valIds, testIds = get_ids()

Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/labels/trainLabels.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/labels/valLabels.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/labels/testLabels.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/index/trainIds.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/index/valIds.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/index/testIds.p


In [9]:
# Slice the feature matrix into one block of rows per split, in
# train / validation / test order.
xTrain, xVal, xTest = (matrix[split_ids] for split_ids in (trainIds, valIds, testIds))

In [38]:
# Random-forest baseline on the metadata features, fitted on all label
# columns at once.
# random_state is pinned so the fit, and every score derived from it, is
# reproducible after a kernel restart; without it each run yields different
# numbers.
rf_model = RF(n_jobs=4, n_estimators=500, random_state=0)
rf_model.fit(xTrain, trainLabels)
# NOTE: despite the "val" in the name, these are predictions on the TEST
# split (xTest); later cells score them against testLabels. The name is kept
# because later cells reference it.
rf_valPredictions = rf_model.predict_proba(xTest)

In [39]:
# For multi-output targets predict_proba returns a list with one probability
# array per label. Take column 1 of each (the score of the second class) and
# stack them into a single (n_samples, n_labels) matrix.
# The isinstance guard makes this cell idempotent: once the list has been
# converted, re-running the cell is a no-op instead of failing with an
# IndexError on the already-converted 2-D array.
if isinstance(rf_valPredictions, list):
    rf_valPredictions = np.array([col[:, 1] for col in rf_valPredictions]).T

In [40]:
# Sanity check: rows are samples, columns are labels.
rf_valPredictions.shape

(989, 13)

In [13]:
# Train one binary XGBoost classifier per label column. Each model is
# evaluated on the validation split while it trains; the fitted model goes
# into `models` and its recorded validation metrics into `losses`.
eval_metric = ['error', 'logloss', 'auc']
models = []
losses = []
num_labels = valLabels.shape[1]
for c in range(num_labels):
    model = xgboost.XGBClassifier(objective="binary:logistic", n_estimators=500)
    model.fit(
        xTrain,
        trainLabels[:, c],
        eval_metric=eval_metric,
        eval_set=[(xVal, valLabels[:, c])],
        verbose=True,
    )
    models.append(model)
    # 'validation_0' is the key of the first (and only) entry in eval_set.
    losses.append(model.evals_result()['validation_0'])

[0]	validation_0-error:0.232179	validation_0-logloss:0.657957	validation_0-auc:0.692134
[1]	validation_0-error:0.193483	validation_0-logloss:0.628711	validation_0-auc:0.692537
[2]	validation_0-error:0.197556	validation_0-logloss:0.603696	validation_0-auc:0.710747
[3]	validation_0-error:0.197556	validation_0-logloss:0.583703	validation_0-auc:0.706298
[4]	validation_0-error:0.197556	validation_0-logloss:0.566002	validation_0-auc:0.710583
[5]	validation_0-error:0.197556	validation_0-logloss:0.551862	validation_0-auc:0.708781
[6]	validation_0-error:0.197556	validation_0-logloss:0.540868	validation_0-auc:0.701825
[7]	validation_0-error:0.195519	validation_0-logloss:0.530054	validation_0-auc:0.703942
[8]	validation_0-error:0.199593	validation_0-logloss:0.521089	validation_0-auc:0.702203
[9]	validation_0-error:0.199593	validation_0-logloss:0.512666	validation_0-auc:0.709209
[10]	validation_0-error:0.199593	validation_0-logloss:0.506288	validation_0-auc:0.708201
[11]	validation_0-error:0.19959

In [14]:
# Column 1 of each per-label model's predict_proba on the validation split,
# placed side by side so that column j holds the scores for label j.
xgb_valPredictions = np.column_stack(
    [label_model.predict_proba(xVal)[:, 1] for label_model in models])

In [15]:
# Sanity check: rows are validation samples, columns are labels.
xgb_valPredictions.shape

(491, 13)

In [42]:
# Micro-averaged average precision of the random-forest scores on the test
# split. NOTE: the variable name says "AUC", but the value is average
# precision, not ROC AUC.
rf_meanAUC = average_precision_score(
    testLabels,
    rf_valPredictions,
    average='micro',
)
print(rf_meanAUC)

0.583047260572


In [17]:
# Micro-averaged average precision of the XGBoost scores on the validation
# split. NOTE: the variable name says "AUC", but the value is average
# precision, not ROC AUC.
xgb_meanAUC = average_precision_score(
    valLabels,
    xgb_valPredictions,
    average='micro',
)
print(xgb_meanAUC)

0.579344437778


In [18]:
from evaluations import find_precision_recall

In [43]:
# Evaluate the random-forest test predictions with the project helper.
# It returns three values; the third (`m`) is a dict with one entry per label
# index plus a 'micro' entry (see the display below).
# NOTE(review): p and r are presumably precision and recall -- confirm in
# evaluations.find_precision_recall.
p,r,m = find_precision_recall(testLabels, rf_valPredictions)

In [44]:
# Display the metrics dict: per-label entries plus the 'micro' entry.
m

{0: 0.53903777458663549,
 1: 0.45193132702691197,
 2: 0.28427590263045555,
 3: 0.72129897169065249,
 4: 0.34578511442537602,
 5: 0.83441375798468309,
 6: 0.40384298423193388,
 7: 0.46877417590461151,
 8: 0.52046933957325658,
 9: 0.25351136999799007,
 10: 0.32046453801361263,
 11: 0.3403935164179,
 12: 0.47075311209530735,
 'micro': 0.58304726057209388}

In [21]:
# Dimensions of the validation label matrix.
valLabels.shape

(491, 13)

In [22]:
# Print only the micro-averaged entry of the metrics dict.
# NOTE(review): the recorded output of this cell (0.593...) does not match
# m['micro'] in the dict displayed above (0.583...), so it comes from an
# earlier run -- re-run the cell to refresh it.
print m['micro']

0.593155306968


In [23]:
# Inspect the random-forest scores of the first sample (one value per label).
rf_valPredictions[0]

array([ 0.204,  0.036,  0.024,  0.73 ,  0.198,  0.37 ,  0.07 ,  0.134,
        0.014,  0.034,  0.43 ,  0.064,  0.204])

In [24]:
from utils import dump_pkl

In [45]:
# Persist the metadata-model (random forest) predictions; per the recorded
# output the helper writes meta_test_preds.p.
# NOTE(review): the name has no directory component, whereas the other
# modalities are loaded from ../predictions/ -- confirm the intended output
# location.
dump_pkl(rf_valPredictions,"meta_test_preds")

Dumping data to meta_test_preds.p


In [26]:
from utils import load_pkl

In [47]:
# Load the video model's predictions. NOTE: despite the "val" in the variable
# name, the file loaded is the TEST-split predictions (video_test_preds).
video_valPredictions = load_pkl("../predictions/video_test_preds")

Loading data from ../predictions/video_test_preds.p


In [48]:
# Sanity check: should match the shape of the other test-split predictions.
video_valPredictions.shape

(989, 13)

In [29]:
# Side-by-side look at the first sample's scores from the video model and the
# random forest.
video_valPredictions[0],rf_valPredictions[0]

(array([  1.01365015e-01,   1.45026206e-05,   1.05062440e-01,
          9.01553094e-01,   4.10986990e-01,   2.68695384e-01,
          1.16516873e-02,   1.90374535e-02,   2.64387368e-03,
          1.59891024e-02,   2.38523796e-01,   5.67750493e-03,
          8.71262103e-02], dtype=float32),
 array([ 0.204,  0.036,  0.024,  0.73 ,  0.198,  0.37 ,  0.07 ,  0.134,
         0.014,  0.034,  0.43 ,  0.064,  0.204]))

In [30]:
# Show floats with 3 digits of precision when NumPy arrays are printed from
# here on (display only; stored values are unchanged).
np.set_printoptions(precision=3)

In [46]:
# Load the text model's predictions. NOTE: despite the "val" in the variable
# name, the file loaded is the TEST-split predictions (text_test_preds).
# NOTE(review): this call includes the '.p' suffix while the video load omits
# it; the recorded outputs show both resolve to a '.p' file, so load_pkl
# appears to accept either form -- consider using one convention.
text_valPredictions = load_pkl("../predictions/text_test_preds.p")

Loading data from ../predictions/text_test_preds.p


In [32]:
movieID = 60
print "Predicted Order:", np.argsort(np.mean(np.array(zip(text_valPredictions[movieID], rf_valPredictions[movieID],video_valPredictions[movieID])), axis=1))[::-1]
print "Actual order:   ", np.argsort(valLabels[movieID])[::-1]

Predicted Order: [12  8  0  3 11  9  5  7  4 10  6  1  2]
Actual order:    [12  8 11 10  9  7  6  5  4  3  2  1  0]


In [50]:
# Video + text fusion: element-wise mean of the two models' scores, evaluated
# against the test labels (only the metrics dict is kept).
VT = (video_valPredictions + text_valPredictions) / 2

_, _, vt = find_precision_recall(testLabels, VT)
print(vt)

{0: 0.75346069298483909, 1: 0.97846109881929266, 2: 0.41044705708947976, 3: 0.89756844471553321, 4: 0.66836346531547708, 5: 0.854139597152207, 6: 0.82457291244816033, 7: 0.77069276410183718, 8: 0.74155948665475413, 9: 0.45851452873139248, 10: 0.53813988884840569, 11: 0.66482516560348937, 12: 0.77485294882601019, 'micro': 0.76214275878931259}


In [51]:
# Text + metadata fusion: element-wise mean of the two models' scores,
# evaluated against the test labels (only the metrics dict is kept).
TM = (text_valPredictions + rf_valPredictions) / 2
_, _, tm = find_precision_recall(testLabels, TM)
print(tm)

{0: 0.70984305153625604, 1: 0.65435821937387206, 2: 0.40349300715242709, 3: 0.84864145637779109, 4: 0.68031252986447388, 5: 0.88078357650851768, 6: 0.65345335592265419, 7: 0.6968511657600015, 8: 0.72742016496239381, 9: 0.43986341797044942, 10: 0.52111825808073775, 11: 0.56248091786341126, 12: 0.71985484991632376, 'micro': 0.71045631881811022}


In [52]:
# Video + metadata fusion: element-wise mean of the two models' scores,
# evaluated against the test labels (only the metrics dict is kept).
VM = (video_valPredictions + rf_valPredictions) / 2
_, _, vm = find_precision_recall(testLabels, VM)
print(vm)

{0: 0.74200640375201454, 1: 0.96070820597620177, 2: 0.45665872729626539, 3: 0.89808880302189731, 4: 0.56382030403714012, 5: 0.88106294564526855, 6: 0.79410758979259233, 7: 0.73353028277507226, 8: 0.68135230432386629, 9: 0.41135059684463343, 10: 0.48755930288027816, 11: 0.58722837186136223, 12: 0.724581052109455, 'micro': 0.74471618354287039}


In [53]:
# Video + text + metadata fusion: element-wise mean of all three models'
# scores, evaluated against the test labels (only the metrics dict is kept).
VTM = (video_valPredictions + text_valPredictions + rf_valPredictions) / 3
_, _, vtm = find_precision_recall(testLabels, VTM)
print(vtm)

{0: 0.7701924098816253, 1: 0.96877557662593639, 2: 0.50276642743628619, 3: 0.91339317281180143, 4: 0.68700285076428902, 5: 0.89183415110140951, 6: 0.82623456061786749, 7: 0.78006478258858736, 8: 0.77901225471541269, 9: 0.48643547408028109, 10: 0.54607032265828415, 11: 0.65550962032412008, 12: 0.77923161601972579, 'micro': 0.76847102770585274}


In [58]:
# Macro average of the fused model: unweighted mean of the per-label scores,
# leaving out the 'micro' entry.
# Taking the mean over the filtered entries removes the hard-coded label count
# (13) and avoids handing dict.values() to NumPy, which only works while it is
# a list (Python 2).
np.mean([score for label, score in vtm.items() if label != 'micro'])

0.73742486304812505