**Experiment 3**

Predicting the segment of audience based on "watch patterns"

1. Load dataset
2. Build a basic ensemble model
   * Genres OHE with watch time
3. Cross-validate
4. Bayesian Optimization

In [167]:
%matplotlib inline

import gc
import json
import time

import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib

from bayes_opt import BayesianOptimization

sns.set_style('dark')

SEED = 2123
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/models/cross_validation.py

In [10]:
def _load_json_frame(path):
    """Read a JSON object keyed by record ID into a DataFrame.

    Each top-level key becomes a row; the key itself is exposed as an
    explicit ``ID`` column instead of the index.
    """
    # The context manager closes the file on exit, so no explicit close()
    # call is needed.
    with open(path, 'r') as infile:
        records = json.load(infile)

    return (
        pd.DataFrame.from_dict(records, orient='index')
          .reset_index(level=0)
          .rename(columns={'index': 'ID'})
    )


# NOTE: the raw JSON dicts are no longer kept as globals; only the frames
# are retained, which avoids holding two copies of the data in memory.
train = _load_json_frame('../data/raw/5f828822-4--4-hotstar_dataset/train_data.json')
test  = _load_json_frame('../data/raw/5f828822-4--4-hotstar_dataset/test_data.json')

In [11]:
# Replace the raw segment labels with integer class ids; `lbl` is kept so
# the mapping can be inspected or inverted later.
lbl = LabelEncoder()
train['segment'] = lbl.fit_transform(train['segment'])

In [12]:
# Stack train and test into one frame. Rows with a non-null `segment` are
# treated as training rows (assumes the test rows carry no segment value).
data       = pd.concat([train, test])
train_mask = data['segment'].notnull()

# The separate frames are no longer needed; release them.
del train, test
gc.collect()

21

In [14]:
# Share of each encoded segment class among the training rows
# (normalize=True returns fractions rather than counts).
data.loc[train_mask, 'segment'].value_counts(normalize=True)

0.0    0.923725
1.0    0.076275
Name: segment, dtype: float64

**Huge class imbalance**: roughly 92% of the training rows belong to class 0 and only about 8% to class 1.

In [122]:
def parse_genre_watch_times(raw):
    """Parse a ``'genre:value,genre:value'`` string into ``{genre: int}``.

    Entries are separated by commas and each entry is a ``name:value`` pair;
    surrounding whitespace is stripped from both parts and the value is
    converted to ``int``. If a genre name repeats, the last value wins.

    Raises:
        ValueError: if an entry does not split into exactly two parts on
            ``':'`` or its value is not an integer literal.
    """
    parsed = {}
    for item in raw.split(','):
        genre, value = item.split(':')
        parsed[genre.strip()] = int(value.strip())
    return parsed


# One parser for both splits, so train and test are guaranteed to be
# processed identically.
genre_dict_train = data.loc[train_mask, 'genres'].map(parse_genre_watch_times)
genre_dict_test  = data.loc[~train_mask, 'genres'].map(parse_genre_watch_times)

# Fit the vectorizer on the training rows only; the test rows are projected
# onto the same feature columns.
dv    = DictVectorizer(sparse=False)
X     = dv.fit_transform(genre_dict_train)
Xtest = dv.transform(genre_dict_test)

y     = data.loc[train_mask, 'segment']

In [134]:
# Wrap the dense feature matrices in DataFrames and the target in a Series.
X, Xtest = pd.DataFrame(X), pd.DataFrame(Xtest)
y = pd.Series(y)

In [135]:
# Stratified 70/30 hold-out split; `get_train_test_split` is expected to be
# defined by the %run of ../src/models/cross_validation.py.
params = dict(stratify=y, test_size=.3, random_state=SEED)

X_train, X_test, y_train, y_test = get_train_test_split(X, y, **params)

In [132]:
# Baseline random forest used for the first cross-validation run.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=2,
    random_state=SEED,
)

In [140]:
# Cross-validate the baseline forest on the training split.
# NOTE(review): `cross_validation` is not defined in this notebook; it
# presumably comes from the %run of ../src/models/cross_validation.py and
# returns one AUC score per fold -- confirm against that helper.
auc_scores = cross_validation(X_train, y_train, rf, 'auc', SEED)





In [141]:
# Summarise the cross-validation scores by their mean and spread.
mean_auc, std_auc = np.mean(auc_scores), np.std(auc_scores)
print('Mean AUC score: {0} and std: {1}'.format(mean_auc, std_auc))

Mean AUC score: 0.726302474069734 and std: 0.004106034401338635


In [170]:
def rfccv(n_estimators, min_samples_split, max_depth):
    """Objective for Bayesian optimisation of a random forest.

    The optimiser proposes real-valued hyper-parameters, so each one is
    truncated to ``int`` before building the classifier.

    Args:
        n_estimators: proposed number of trees.
        min_samples_split: proposed minimum samples required to split a node.
        max_depth: proposed maximum tree depth.

    Returns:
        Mean ROC AUC over a 3-fold stratified cross-validation on the
        global ``X_train`` / ``y_train``.
    """
    # shuffle=True is required for random_state to have any effect; without
    # it the seed is ignored and scikit-learn >= 0.24 raises a ValueError.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    model = RandomForestClassifier(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_depth=int(max_depth),
        random_state=SEED,
    )

    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=skf)
    return scores.mean()

def logccv(C):
    """Objective for Bayesian optimisation of a logistic regression.

    Args:
        C: proposed inverse regularisation strength.

    Returns:
        Mean ROC AUC over a 3-fold stratified cross-validation on the
        global ``X_train`` / ``y_train``.
    """
    # shuffle=True is required for random_state to have any effect; without
    # it the seed is ignored and scikit-learn >= 0.24 raises a ValueError.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    model = LogisticRegression(
        C=C,
        n_jobs=2,
        class_weight='balanced',
        random_state=SEED,
    )

    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=skf)
    return scores.mean()

def parameter_search(rf=True):
    """Run Bayesian optimisation for one of the two candidate models.

    Args:
        rf: when truthy (the default) tune the random forest via ``rfccv``;
            otherwise tune the logistic regression via ``logccv``. The
            default keeps a bare ``parameter_search()`` call working.

    Prints the best objective value found; returns nothing.
    """
    gp_params = {
        'alpha': 1e-5
    }

    # Pick the objective, its search bounds and the label used in the report.
    if rf:
        label     = 'RFC'
        objective = rfccv
        bounds    = {
            'n_estimators': (10, 250),
            'min_samples_split': (2, 25),
            'max_depth': (5, 30)
        }
    else:
        label     = 'Logistic Regression'
        objective = logccv
        bounds    = {
            'C': (.01, 100)
        }

    optimizer = BayesianOptimization(objective, bounds)
    optimizer.maximize(n_iter=10, **gp_params)

    # `res['max']['max_val']` is the results layout the original code reads.
    print('%s: %f' % (label, optimizer.res['max']['max_val']))

In [151]:
# Time the Bayesian search over the random-forest hyper-parameters.
# `rf` is passed explicitly: `parameter_search(rf)` declares it as a
# required argument, so a bare call raises a TypeError.
start = time.time()
parameter_search(rf=True)
end   = time.time()

print('Took: {} seconds to do parameter tuning'.format(end - start))

[31mInitialization[0m
[94m----------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   min_samples_split |   n_estimators | 
    1 | 00m53s | [35m   0.80459[0m | [32m    11.0137[0m | [32m            19.4458[0m | [32m      219.2036[0m | 
    2 | 00m23s |    0.80423 |     11.7281 |             13.2054 |        99.5782 | 
    3 | 00m34s |    0.80259 |     21.9016 |             15.4192 |        89.8688 | 
    4 | 00m41s |    0.80390 |     18.5479 |             17.9585 |       112.5680 | 
    5 | 00m06s |    0.78984 |     27.4873 |             12.0620 |        15.9280 | 
[31mBayesian Optimization[0m
[94m----------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   min_samples_split |   n_estimators | 
    6 | 01m56s |    0.79740 |     29.5030 |              2.7283 |       249.8664 | 
    7 | 00m30s |    0.79951 |      5.0023 |          

In [171]:
# Time the Bayesian search over the logistic-regression `C` parameter.
start = time.time()
parameter_search(rf=False)
end   = time.time()

elapsed = end - start
print('Took: {} seconds to do parameter tuning'.format(elapsed))

[31mInitialization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    1 | 00m08s | [35m   0.78599[0m | [32m  89.5155[0m | 
    2 | 00m07s |    0.78599 |   78.9667 | 
    3 | 00m08s |    0.78598 |   13.5476 | 
    4 | 00m07s |    0.78597 |   66.5578 | 
    5 | 00m08s |    0.78598 |    4.1060 | 
[31mBayesian Optimization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    6 | 00m12s | [35m   0.78600[0m | [32m   0.0133[0m | 
    7 | 00m11s |    0.78597 |   99.9988 | 
    8 | 00m11s |    0.78599 |    0.0105 | 
    9 | 00m11s |    0.78596 |   99.9991 | 
   10 | 00m09s |    0.78599 |    0.0120 | 
   11 | 00m10s |    0.78598 |   99.9985 | 
   12 | 00m11s | [35m   0.78601[0m | [32m   0.0102[0m | 
   13 | 00m11s |    0.78599 |   99.9971 | 
   14 | 00m12s |    0.78600 |    0.0124 | 
   15 | 00m10s |    0.78599 |   99.9994 | 
Logistic Regression: 0.786013
Took: 152.08046293258667 sec

In [172]:
def test_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    print('Log Loss on test set: {}'.format(roc_auc_score(y_test, preds)))

In [153]:
# Random forest with hard-coded hyper-parameters; they match the best step of
# the recorded search log (11.0137 / 19.4458 / 219.2036) truncated to ints.
rf = RandomForestClassifier(
    n_estimators=219,
    max_depth=11,
    min_samples_split=19,
    random_state=SEED,
)

test_model(X_train, y_train, X_test, y_test, rf)

Log Loss on test set: 0.8094791959492946


In [173]:
# Logistic regression at the low end of the searched C range, where the
# recorded search log shows its best scores.
log = LogisticRegression(
    C=.01,
    class_weight='balanced',
    random_state=SEED,
)

test_model(X_train, y_train, X_test, y_test, log)

Log Loss on test set: 0.7919791187472494


In [174]:
def full_training(X, y, Xtest, model, model_name, save=True):
    """Fit ``model`` on all labelled data and score the unlabelled rows.

    Args:
        X, y: full training features and labels.
        Xtest: features to predict on.
        model: estimator exposing ``fit`` and ``predict_proba``.
        model_name: file name used under ``../models/`` when saving.
        save: when truthy, dump the fitted model with joblib.

    Returns:
        Column 1 of ``model.predict_proba(Xtest)``.
    """
    model.fit(X, y)
    positive_scores = model.predict_proba(Xtest)[:, 1]

    if not save:
        return positive_scores

    joblib.dump(model, '../models/%s' % model_name)
    return positive_scores

In [175]:
# Fresh, unfitted logistic regression for the final fit on all training data.
log = LogisticRegression(
    C=.01,
    class_weight='balanced',
    random_state=SEED,
)

In [176]:
# Fit on the full training set, save the model and score the test rows.
final_preds = full_training(X, y, Xtest, log, model_name='log_genre_wt.pkl')

In [178]:
# Load the sample submission file; its `segment` column is overwritten with
# the model predictions before it is written back out.
sub = pd.read_csv('../data/raw/5f828822-4--4-hotstar_dataset/sample_submission.csv')

In [179]:
# Fill in the predicted scores and write the submission file.
# NOTE(review): the assignment is positional, so this assumes the rows of the
# sample submission are in the same order as the test rows behind
# `final_preds` -- confirm against the ID column.
sub['segment'] = final_preds
sub.to_csv('../submissions/hotstar/log_genre_watch_times.csv', index=False)