** Experiment - 5 **

1. OHE of cities and genres
2. Total watch time
3. Random Forest Model
4. Hyperparameter Tuning

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import gc
import time

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from bayes_opt import BayesianOptimization

pd.set_option('max_columns', None)

SEED = 2131
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py

In [2]:
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
dataset.load_data('../data/processed/hotstar_processed.feather')

<__main__.Hotstar at 0x7f8f6051bf98>

In [3]:
data_processed = dataset.data
train_mask     = dataset.get_train_mask() 

In [4]:
data_processed

Unnamed: 0,ID,cities,dow,genres,segment,titles,tod
0,train-1,"gurgaon:55494,delhi:31892","1:3412,3:15878,2:1737,5:10975,4:20974,7:17820,...","Cricket:82379,Kabaddi:255,Reality:4751",0.0,"Top Raids: Haryana vs Services SCB:103,Day 4: ...","10:26,13:331,12:323,20:21864,21:16233,17:7953,..."
1,train-10,"delhi:5862,nagar:8916,mumbai:1593","1:5745,3:3025,2:3346,5:123,4:3007,7:1108,6:10","Cricket:15640,Wildlife:730",0.0,"Dhoni Quits Captaincy:148,Day 4: India Move in...","11:1661,10:384,20:401,21:798,22:221,16:525,19:..."
2,train-100,navi mumbai:4142,3:4142,"LiveTV:13,Football:4129",0.0,"Star Sports 4:13,Manchester United vs Everton:...","1:1207,0:2406,2:529"
3,train-1000,"new delhi:4131,chennai:2878,navi mumbai:1339","1:658,3:5867,5:413,4:1339,7:71","TalkShow:658,Cricket:7690",0.0,"SRH vs RCB:701,KKR vs KXIP:1042,MI vs SRH:2288...","11:71,20:2417,21:1042,23:2288,19:1872,8:658"
4,train-10000,"gurgaon:6077,chennai:4055","1:1641,2:480,4:1445,7:1663,6:4900","Drama:5503,Cricket:3283,Reality:1345",0.0,"MI vs KKR:304,Yeh Rishta Kya Kehlata Hai:5449,...","20:158,22:4139,17:67,23:1510,19:288,18:56,0:23..."
5,train-100000,"hyderabad:998,bangalore:2748,gulbarga:43317,be...","1:6707,3:1948,2:3574,5:8525,4:18938,7:8295,6:7344","Action:998,Drama:8795,Cricket:45541",0.0,"India vs Australia 2nd Test English:2836,SRH v...","11:3450,10:1243,13:4420,12:4210,20:7050,21:770..."
6,train-100001,navi mumbai:10155,"1:1575,3:5330,2:1242,4:2007","Action:963,TalkShow:18,Romance:1357,Mythology:...",0.0,"Jodi:7222,Maapillai:1357,Mahabharatham:594,Ban...","11:20,12:574,21:1357,22:1066,23:2290,0:4847"
7,train-100002,"delhi:1571,navi mumbai:12729","1:333,2:2233,5:739,4:268,7:10727","Drama:4344,Cricket:9956",0.0,"Chandra Nandni:4344,India vs England 2nd T20I ...","11:242,15:419,14:1877,22:309,19:3063,18:64,1:2..."
8,train-100003,delhi:1318,"2:34,5:1074,7:210","Cricket:1248,Comedy:70",0.0,"India vs Bangladesh Day 2 English:1066,Fielder...","10:844,20:65,17:69,23:78,19:40,9:222"
9,train-100004,"chandigarh:2214,delhi:3829,mumbai:9465","5:14,4:14292,7:1201","Action:86,Drama:4826,Cricket:10557,Kids:24,Tal...",0.0,"The Jungle Book:24,Jolly LLB:4826,Escape Plan:...","11:1,13:1610,12:2626,20:2325,21:2985,17:1252,1..."


In [4]:
data = load_file('../data/processed/ohe_genres.feather')

** Save processed file **

In [38]:
save_file(data_processed.loc[:, ['ID'] + data_processed.columns[7:].tolist()], \
          '../data/processed/ohe_genres.feather')

In [18]:
X = data.loc[train_mask, data.columns.drop('ID', 'Action')]
y = data_processed.loc[train_mask, 'segment']
Xtest = data.loc[~train_mask, data.columns.drop('ID', 'Action')]

** Train, Val, Test Split **

In [6]:
params = {
    'stratify': y,
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(X, y, **params)

In [7]:
X_train.shape, X_test.shape

((140000, 35), (60000, 35))

** Model Creation **

In [8]:
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED)
rf.fit(X_train, y_train)
preds = rf.predict_proba(X_test)[:, 1]

print('AUC: {}'.format(roc_auc_score(y_test, preds)))

AUC: 0.7745496076842832


** Hyper-parameter Tuning **

In [9]:
def rfccv(n_estimators, min_samples_split, max_depth, max_features):
    skf = StratifiedKFold(n_splits=3, random_state=SEED)
    val = cross_val_score(
        RandomForestClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               max_depth=int(max_depth),
                               max_features=min(max_features, 1),
                               random_state=SEED,
                               class_weight='balanced'
                              ),
        X_train, y_train, scoring='roc_auc', cv=skf
    ).mean()

    return val

def logccv(C):
    skf = StratifiedKFold(n_splits=3, random_state=SEED)

    val = cross_val_score(
        LogisticRegression(C=C,
        n_jobs=2,
        class_weight='balanced',
        random_state=SEED
                          ),
        X_train, y_train, scoring='roc_auc', cv=skf
    ).mean()

    return val

In [10]:
def parameter_search(params, rf):
    gp_params = {
        'alpha': 1e-5
    }

    if rf:
        rfcBO = BayesianOptimization(
            rfccv,
            params
        )
        rfcBO.maximize(n_iter=10, **gp_params)
        print('RFC: %f' % rfcBO.res['max']['max_val'])

    else:
        logcBO = BayesianOptimization(
            logccv,
            {
                'C': (.01, 100)
            }
        )

        logcBO.maximize(n_iter=10, **gp_params)
        print('Logistic Regression: %f' % logcBO.res['max']['max_val'])

In [11]:
rf_params = {
                'n_estimators': (10, 250),
                'min_samples_split': (2, 25),
                'max_depth': (5, 30),
                'max_features': (0.1, 0.99)
            }

parameter_search(rf_params, rf=True)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   max_features |   min_samples_split |   n_estimators | 
    1 | 01m17s | [35m   0.77672[0m | [32m    10.5134[0m | [32m        0.7362[0m | [32m             4.3129[0m | [32m      220.1397[0m | 
    2 | 01m18s | [35m   0.77911[0m | [32m     8.4456[0m | [32m        0.9248[0m | [32m             5.2475[0m | [32m      201.2883[0m | 
    3 | 01m19s |    0.73948 |     28.1288 |         0.5666 |              5.2100 |       189.1249 | 
    4 | 00m11s | [35m   0.77961[0m | [32m     7.1602[0m | [32m        0.7216[0m | [32m            18.0646[0m | [32m       41.0869[0m | 
    5 | 00m35s | [35m   0.77974[0m | [32m    13.9344[0m | [32m        0.1934[0m | [32m            13.0437[0m | [32m      195.9890[0m | 
[31mBayesian Optimization[0m
[94m---------------------------------------------

In [12]:
def test_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    print('AUC on test set: {}'.format(roc_auc_score(y_test, preds)))

In [14]:
# train this model
rf = RandomForestClassifier(n_estimators=248, 
                            class_weight='balanced', 
                            max_depth=5, 
                            min_samples_split=2,
                            max_features=.1703,
                            random_state=SEED,
                            n_jobs=3
                           )

test_model(X_train, y_train, X_test, y_test, rf)

AUC on test set: 0.7741882208888831


In [15]:
def full_training(X, y, Xtest, model, model_name, save=True):
    model.fit(X, y)
    final_preds = model.predict_proba(Xtest)[:, 1]
    
    if save:
        joblib.dump(model, '../models/%s'%(model_name))
        
    return final_preds

In [19]:
final_preds = full_training(X, y, Xtest, rf, 'rf_genres_ohe.pkl')

In [20]:
sub            = pd.read_csv('../data/raw/5f828822-4--4-hotstar_dataset/sample_submission.csv')
sub['segment'] = final_preds
sub.to_csv('../submissions/hotstar/rf_genres_ohe.csv', index=False)