In [14]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from bayes_opt import BayesianOptimization

import xgboost as xgb

sns.set_style('dark')

import warnings
warnings.filterwarnings('ignore')

SEED = 1231
np.random.seed(SEED)

%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py
%run ../src/models/feature_selection.py
%run ../src/features/build_features.py

In [2]:
# load dataset
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
dataset.load_data('../data/processed/hotstar_processed.feather')

data_processed = dataset.data
train_mask     = dataset.get_train_mask() 

```
1. Prepare a feature map that would be used to create new categorical feature for genres
2. Convert s1:i1, s2:i2 to s1, s2
3. Replace s1, s2, ... using feature map
```

In [3]:
start = time.time()
genres_low    = create_feature_with_low_card(data_processed.genres, 'genres')
tod_low       = create_feature_with_low_card(data_processed.tod, 'tod')
dow_low       = create_feature_with_low_card(data_processed.dow, 'dow')
end   = time.time()

print('Took: {} seconds'.format(end - start))

Took: 51.43310904502869 seconds


In [4]:
features = ['genres', 'tod', 'dow', 'titles', 'cities']
start = time.time()
count_features = calculate_count_features(data_processed, features)
end   = time.time()

print('Took: {} seconds'.format(end - start))

Took: 1.6184546947479248 seconds


In [5]:
start = time.time()
watch_time = calculate_watch_time(data_processed.genres)
end   = time.time()

print('Took: {} seconds'.format(end - start))

Took: 3.4961295127868652 seconds


In [6]:
features = ['genres_low_card', 
            'tod_low_card',
            'dow_low_card',
            'genres_count', 
            'tod_count', 
            'dow_count',
            'titles_count',
            'cities_count',
            'watch_time']

new_data = np.hstack((genres_low.values.reshape(-1, 1),
                      tod_low.values.reshape(-1, 1),
                      dow_low.values.reshape(-1, 1),
                      count_features, 
                      watch_time.values.reshape(-1, 1)))

new_data = pd.DataFrame(new_data, columns=features)

In [7]:
X = new_data.loc[train_mask, :]
y = data_processed.loc[train_mask, 'segment']

Xtest = new_data.loc[~train_mask, :]

In [8]:
global_mean = y.mean()
values = {}

def main_function(temp, i):
    return (temp[1][i] * (temp[0][i] + temp[1][i])/np.sum(temp) + 
                global_mean * 10.0) / (temp[0][i] + temp[1][i] + 10.)

for col in ['genres_low_card', 'tod_low_card', 'dow_low_card']:
    values[col] = {}
    temp = pd.DataFrame({'y': y, col: X[col]}).groupby(['y', col]).size()
    for i, item in enumerate(temp[1].index):
        values[col][item] = main_function(temp, i)
        
print('values ', values)

values  {'genres_low_card': {'01': 0.00069406939338235291, '012': 0.0006626733740567732, '0123': 0.0010590517241379309, '01234': 0.0012768927444794952, '012345': 0.002206752021563342, '0123456': 0.0012695182291666666, '01234567': 0.0017898529411764703, '012345678': 0.004664999999999999, '0123456789': 0.0051723648648648642, '012345679': 0.0027155610561056106, '01234568': 0.0042145604395604393, '012345689': 0.0086764772727272715, '01234569': 0.0027080412371134018, '0123457': 0.0031166292134831458, '01234578': 0.0093237804878048766, '012345789': 0.021796428571428571, '01234579': 0.010208999999999998, '0123458': 0.0087760919540229888, '0123459': 0.016597173913043477, '012346': 0.0051488815789473676, '0123467': 0.0015417107583774251, '01234678': 0.0021213315217391304, '012346789': 0.0060765873015873008, '01234679': 0.0079945833333333327, '0123468': 0.0043727142857142853, '01234689': 0.0048368987341772147, '0123469': 0.0075790099009900988, '012347': 0.0034698068669527893, '0123478': 0.002260

In [9]:
start = time.time()

X['genres_low_card_encoded']     = X.genres_low_card.replace(to_replace=values['genres_low_card'])
Xtest['genres_low_card_encoded'] = Xtest.genres_low_card.replace(to_replace=values['genres_low_card']) 


X['tod_low_card_encoded']     = X.tod_low_card.replace(to_replace=values['tod_low_card'])
Xtest['tod_low_card_encoded'] = Xtest.tod_low_card.replace(to_replace=values['tod_low_card']) 

X['dow_low_card_encoded']     = X.dow_low_card.replace(to_replace=values['dow_low_card'])
Xtest['dow_low_card_encoded'] = Xtest.dow_low_card.replace(to_replace=values['dow_low_card']) 

end = time.time()
print('Took: {} seconds'.format(end - start))

Took: 19.984405040740967 seconds


In [10]:
f = [
     'genres_low_card_encoded', 
     'tod_low_card_encoded',
     'dow_low_card_encoded',
     'genres_count',
     'tod_count', 
     'dow_count', 
     'watch_time'
    ]

params = {
    'stratify': y,
    'test_size': .2,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(X.loc[:, f], y, **params)

params = {
    'stratify': y_train,
    'test_size': .2,
    'random_state': SEED
}

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [16]:
# model
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('log', LogisticRegression(C=.1, class_weight='auto', n_jobs=3, random_state=SEED))
])

scores = cv_loop(Xtr, ytr, pipeline, 'auc', SEED)

print('Scores on 10 fold: {}'.format(scores))
print('Mean AUC: {}'.format(np.mean(scores)))
print('Std AUC: {}'.format(np.std(scores)))

Scores on 10 fold: [0.57483625414298833, 0.59440386674293599, 0.58326527902238634, 0.59188920423035118, 0.59007975061835882, 0.60405941860400636, 0.59894192695046478, 0.59125187360562836, 0.59137536518844214, 0.58856183695852615]
Mean AUC: 0.5908664776064089
Std AUC: 0.007575683168156838


In [18]:
# etr model
start = time.time()
model = ExtraTreesClassifier(n_estimators=100, max_depth=3, n_jobs=3, random_state=SEED)
scores = cv_loop(Xtr, ytr, model, 'auc', SEED)
end   = time.time()

print('Scores on 10 fold: {}'.format(scores))
print('Mean AUC: {}'.format(np.mean(scores)))
print('Std AUC: {}'.format(np.std(scores)))
print('Took: {} seconds'.format(end - start))

Scores on 10 fold: [0.57920115117250204, 0.59854088210159784, 0.58743129356803225, 0.59583349508640382, 0.59385714696699132, 0.59430427866911417, 0.59257723246966432, 0.59281276388201376, 0.58485834605513287, 0.57784523740195193]
Mean AUC: 0.5897261827373403
Std AUC: 0.006719918052660057
Took: 12.242074012756348 seconds


In [20]:
# xgboost model
start  = time.time()
model  = xgb.XGBClassifier(max_depth=6, n_estimators=150, learning_rate=.3, seed=SEED)
scores = cv_loop(Xtr.astype(np.float), ytr, model, 'auc', SEED)
end    = time.time()

print('Scores on 10 fold: {}'.format(scores))
print('Mean AUC: {}'.format(np.mean(scores)))
print('Std AUC: {}'.format(np.std(scores)))
print('Took: {} seconds'.format(end - start))

Scores on 10 fold: [0.74456646128894199, 0.75289515763784909, 0.73100072818256989, 0.74067799723818195, 0.75175611842543089, 0.75768711248585807, 0.75224354397280324, 0.75122308663441495, 0.75226206248448768, 0.73547058699145729]
Mean AUC: 0.7469782855341995
Std AUC: 0.008242545798794904
Took: 49.45935368537903 seconds


** Hyper-parameter tuning **

In [31]:
def rfccv(n_estimators, min_samples_split, max_depth):
    skf = StratifiedKFold(n_splits=5, random_state=SEED)
    val = cross_val_score(
        RandomForestClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               max_depth=int(max_depth),
                               random_state=SEED,
                               n_jobs=3
                              ),
        X_train, y_train, scoring='roc_auc', cv=skf
    ).mean()
    
    return val

def xgbccv(n_estimators, 
          learning_rate, 
          colsample_bytree, 
          min_child_weight,
          subsample,
          max_depth):
    
    skf = StratifiedKFold(n_splits=5, random_state=SEED)
    val = cross_val_score(
                    xgb.XGBClassifier(n_estimators=int(n_estimators),
                    max_depth=int(max_depth),
                    colsample_bytree=colsample_bytree,
                    subsample=subsample,
                    min_child_weight=int(min_child_weight),
                    learning_rate=learning_rate
                              ),
        X_train.astype(np.float), y_train, scoring='roc_auc', cv=skf
    ).mean()
    
    return val

def parameter_search(rf):
    gp_params = {
        'alpha': 1e-5
    }
    
    if rf:
        rfcBO = BayesianOptimization(
            rfccv,
            {
                'n_estimators': (10, 500),
                'min_samples_split': (2, 25),
                'max_depth': (5, 30)
            }
        )
        rfcBO.maximize(n_iter=5, **gp_params)
        print('RFC: %f' % rfcBO.res['max']['max_val'])
    else:
        xgbcBO = BayesianOptimization(
            xgbccv,
            {
                'n_estimators': (10, 500),
                'learning_rate': (.05, .3),
                'colsample_bytree': (.1, .99),
                'min_child_weight': (1, 20),
                'subsample': (.1, .99),
                'max_depth': (2, 12)
            }
        )
        xgbcBO.maximize(n_iter=5, **gp_params)
        print('RFC: %f' % xgbcBO.res['max']['max_val'])

In [32]:
start = time.time()
parameter_search(rf=False)
end   = time.time()

print('Took: {} seconds'.format(end - start))

[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   n_estimators |   subsample | 
    1 | 00m10s | [35m   0.74557[0m | [32m            0.6616[0m | [32m         0.2642[0m | [32m     5.9563[0m | [32m            1.7337[0m | [32m       65.9854[0m | [32m     0.5848[0m | 
    2 | 00m23s |    0.73770 |             0.2925 |          0.1024 |      5.8133 |            18.8369 |       161.3720 |      0.7238 | 
    3 | 00m27s |    0.72079 |             0.8259 |          0.2138 |     10.7134 |             5.4441 |        93.4216 |      0.1651 | 
    4 | 00m32s |    0.73325 |             0.1984 |          0.2822 |      2.7301 |             1.5623 |       453.0136 |      0.8058 | 
    5 | 00m10s |    0.68304 |             0.2083 |          0.2024 |      2.5298 |       