In [1]:
import os, sys
base_path = os.getcwd()
if not os.path.basename(base_path) == 'moviescope':
    base_path = os.path.join(base_path, '..')
sys.path.append(base_path)

In [2]:
import numpy as np
from pandas import read_csv 

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import confusion_matrix

In [4]:
sys.path.insert(0, '/net/if2/ks6cq/public_html/vqa/keras2/local/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg')

In [5]:
from sklearn.metrics import average_precision_score
import xgboost

In [6]:
from utils import get_labels, get_ids

In [7]:
metadata_path = os.path.join(base_path,'data','movie_metadata.csv')
inFile = open(metadata_path)
data = read_csv(inFile)

In [8]:
attributes = [
        'num_critic_for_reviews',
        'duration',
        'actor_1_facebook_likes',
        'actor_2_facebook_likes',
        'actor_3_facebook_likes',
        'num_voted_users',
        'facenumber_in_poster',
        'num_user_for_reviews',
        'imdb_score',
        'title_year',
        'movie_facebook_likes'
        ]

matrix = data.fillna(0).as_matrix(columns=attributes)

In [9]:
trainLabels, valLabels, testLabels = get_labels()
trainIds, valIds, testIds = get_ids()

Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/labels/trainLabels.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/labels/valLabels.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/labels/testLabels.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/index/trainIds.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/index/valIds.p
Loading data from /net/if2/ks6cq/public_html/moviescope/extra_codes/../data/index/testIds.p


In [10]:
xTrain = matrix[trainIds]
xVal = matrix[valIds]
xTest = matrix[testIds]

In [11]:
rf_model = RF(n_jobs=4, n_estimators=500) 
rf_model.fit(xTrain, trainLabels)
rf_valPredictions = rf_model.predict_proba(xVal)

In [12]:
rf_valPredictions = np.array([col[:,1] for col in rf_valPredictions]).T

In [13]:
rf_valPredictions.shape

(491, 13)

In [36]:
losses = []
eval_metric = ['error', 'logloss', 'auc']
models = []
for c in range(valLabels.shape[1]):

    model = xgboost.XGBClassifier(objective="binary:logistic", n_estimators=500)
    eval_set = [(xVal, valLabels[:,c])]
    model.fit(xTrain, trainLabels[:,c], eval_metric=eval_metric, eval_set=eval_set, verbose=True)
    models.append(model)
    res = model.evals_result()['validation_0']
    losses.append(res)

In [37]:
xgb_valPredictions = np.array([model.predict_proba(xVal)[:,1] for model in models]).T

In [38]:
xgb_valPredictions.shape

(491, 13)

In [24]:
rf_meanAUC = average_precision_score(valLabels, rf_valPredictions, average='micro') 
print rf_meanAUC

0.596733742201


In [39]:
xgb_meanAUC = average_precision_score(valLabels, xgb_valPredictions, average='micro')
print xgb_meanAUC

0.579344437778


In [14]:
from evaluations import find_precision_recall

In [18]:
p,r,m = find_precision_recall(valLabels, rf_valPredictions)

{0: array([ 0.20816327,  0.20654397,  0.20696721,  0.2073922 ,  0.20781893,
        0.20618557,  0.20661157,  0.20746888,  0.20790021,  0.20876827,
        0.21008403,  0.21097046,  0.21141649,  0.20974576,  0.2106383 ,
        0.21153846,  0.21199143,  0.2112069 ,  0.21212121,  0.21258134,
        0.21304348,  0.21350763,  0.21491228,  0.21412804,  0.21555556,
        0.21748879,  0.21846847,  0.21995465,  0.21967963,  0.22222222,
        0.22273782,  0.22429907,  0.22482436,  0.22695035,  0.2280285 ,
        0.22857143,  0.23300971,  0.23414634,  0.2345679 ,  0.23514851,
        0.23809524,  0.24111675,  0.24296675,  0.24675325,  0.24543081,
        0.24867725,  0.25066667,  0.25067385,  0.25135135,  0.25203252,
        0.25409836,  0.25549451,  0.25690608,  0.25842697,  0.25988701,
        0.26285714,  0.26666667,  0.26979472,  0.27218935,  0.27299703,
        0.27794562,  0.28134557,  0.28482972,  0.28526646,  0.28797468,
        0.29354839,  0.29545455,  0.29836066,  0.3013245 ,  

In [23]:
m

{0: 0.53745931538181546,
 1: 0.60495919160404255,
 2: 0.24019909648177978,
 3: 0.67824752545150846,
 4: 0.29877336178353864,
 5: 0.86302596978757318,
 6: 0.42166754995874733,
 7: 0.42005485811131749,
 8: 0.67482484369125539,
 9: 0.3110965162909814,
 10: 0.36018761917281072,
 11: 0.40830529520664904,
 12: 0.50092490571593318,
 'micro': 0.59673374220066799}

In [22]:
valLabels.shape

(491, 13)

In [25]:
print m['micro']

0.596733742201
