**Experiment - 7**

1. Merge Cricket, Football, Badminton, Hockey, and the other individual sport genres into a single Sport genre

In [21]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import time
import gc

import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Use seaborn's 'dark' style for any figures drawn in this notebook.
sns.set_style('dark')

# One seed shared by NumPy's global RNG, the train/validation splits
# (random_state) and XGBoost ('seed') further down.
SEED = 31314
np.random.seed(SEED)

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Run the project scripts so that their top-level names land in this
# kernel's namespace.
# NOTE(review): presumably these provide Hotstar, encode_ohe, count_feature,
# num_seconds_watched, save_file and get_train_test_split used below —
# none of them is imported explicitly, so confirm against the scripts.
%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py



In [42]:
# Build the dataset wrapper from the raw data directory and load the
# processed feather file into it.
# NOTE(review): Hotstar is not defined in this notebook; presumably it comes
# from the %run of HotstarDataset.py — confirm.
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
dataset.load_data('../data/processed/hotstar_processed.feather')

# data_processed: the loaded frame that all feature cells below read and extend.
# train_mask: row selector used later as features.loc[train_mask, ...] for the
# training rows and ~train_mask for the remaining (test) rows.
data_processed = dataset.data
train_mask     = dataset.get_train_mask()

In [43]:
# Collapse every individual sport genre (cricket, football, badminton,
# hockey, ...) into a single 'Sport' label.
# 'Table Tennis' is kept ahead of 'Tennis' so the longer name is tried first.
SPORT_GENRES = [
    'Cricket', 'Football', 'Badminton', 'Hockey', 'Volleyball', 'Swimming',
    'Table Tennis', 'Tennis', 'Athletics', 'Boxing', 'Formula1', 'FormulaE',
    'IndiaVsSa', 'Kabaddi',
]

# The pattern is a regex alternation, so regex=True must be passed explicitly:
# since pandas 2.0 `Series.str.replace` defaults to regex=False, which would
# search for the whole 'Cricket|Football|...' string literally and leave the
# genres untouched.
data_processed['genres'] = data_processed.genres.str.replace(
    '|'.join(SPORT_GENRES), 'Sport', regex=True
)

In [44]:
# One-hot encode the (sport-merged) genres column; the result is concatenated
# onto the feature frame further down.
# NOTE(review): encode_ohe is a project helper, presumably from
# categorical_features.py — its exact output schema is not visible here.
genres_ohe_encoded = encode_ohe(data_processed.genres)

In [45]:
# Count-based features: for each raw column, store the result of
# count_feature under a matching 'num_<column>' name.
for raw_column in ('cities', 'genres', 'titles', 'tod', 'dow'):
    data_processed['num_' + raw_column] = count_feature(data_processed[raw_column])

In [46]:
# Watch time derived from the genres column, stored as 'watch_time_sec'.
# NOTE(review): num_seconds_watched is a project helper not shown here;
# presumably it sums the per-genre seconds in each row — confirm.
data_processed['watch_time_sec'] = num_seconds_watched(data_processed.genres)

In [47]:
# Hand-crafted columns (count features, watch time and the 'segment' target)
# that sit alongside the one-hot genre columns.
base_columns = ['num_cities', 'num_genres', 'num_titles', 'num_tod',
                'num_dow', 'watch_time_sec', 'segment']

# Column-wise join: hand-crafted columns first, genre indicators after.
features = pd.concat([data_processed[base_columns], genres_ohe_encoded], axis=1)

# Hand the assembled frame to save_file together with the experiment-7 path.
save_file(features, '../data/processed/hotstar_processed_exp_7.feather')

In [48]:
# Show the columns of the assembled frame: count features, watch time,
# the 'segment' target and the one-hot genre columns.
features.columns

Index(['num_cities', 'num_genres', 'num_titles', 'num_tod', 'num_dow',
       'watch_time_sec', 'segment', 'Action', 'Awards', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Horror', 'Kids', 'LiveTV',
       'Mythology', 'NA', 'Reality', 'Romance', 'Science', 'Sport', 'TalkShow',
       'Teen', 'Thriller', 'Travel', 'Wildlife'],
      dtype='object')

In [41]:
# NOTE(review): the output recorded under this cell carries an earlier
# execution count (In [41]) and still lists Athletics, Boxing, Formula1,
# FormulaE, IndiaVsSa and Kabaddi as separate columns, so it looks like a
# stale result from before those genres were added to the 'Sport' merge —
# re-run or remove this cell.
features.columns

Index(['num_cities', 'num_genres', 'num_titles', 'num_tod', 'num_dow',
       'watch_time_sec', 'segment', 'Action', 'Athletics', 'Awards', 'Boxing',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Formula1',
       'FormulaE', 'Horror', 'IndiaVsSa', 'Kabaddi', 'Kids', 'LiveTV',
       'Mythology', 'NA', 'Reality', 'Romance', 'Science', 'Sport', 'TalkShow',
       'Teen', 'Thriller', 'Travel', 'Wildlife'],
      dtype='object')

**Train Test Split**

In [49]:
# Every column except the 'segment' target is a model input.
predictor_columns = features.columns.drop('segment')

# Rows selected by train_mask form the labelled training data ...
X = features.loc[train_mask, predictor_columns]
y = features.loc[train_mask, 'segment']

# ... and the remaining rows form the test inputs.
Xtest = features.loc[~train_mask, predictor_columns]

In [50]:
# First split: stratify on the target, keep 30% aside, and seed it so the
# partition is repeatable.
# NOTE(review): get_train_test_split is a project helper; presumably it
# forwards these options to scikit-learn's train_test_split — confirm.
params = dict(stratify=y, test_size=.3, random_state=SEED)

X_train, X_test, y_train, y_test = get_train_test_split(X, y, **params)

In [51]:
# Second split: carve a validation set (20%) out of the training portion,
# again stratified on the target and seeded with SEED.
params = dict(stratify=y_train, test_size=.2, random_state=SEED)

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [59]:
# Options shared by both DMatrix objects: NaN marks missing values and the
# feature names are every column except the 'segment' target.
dmatrix_options = dict(
    missing=np.nan,
    feature_names=features.columns.drop('segment'),
)

dtrain = xgb.DMatrix(Xtr, label=ytr, **dmatrix_options)
dval   = xgb.DMatrix(Xte, label=yte, **dmatrix_options)

# Binary classifier evaluated with AUC.
# NOTE(review): 'silent' was deprecated in favour of 'verbosity' in later
# XGBoost releases — confirm against the installed version before changing it.
xgb_params = dict(
    eta=0.1,
    max_depth=5,
    gamma=1,
    colsample_bytree=.7,
    min_child_weight=3.,
    subsample=1.,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=SEED,
    silent=1,
)

# Fixed number of boosting rounds; no early stopping is configured.
n_estimators = 500

# AUC is reported on both sets, in this order, during training.
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Train and print the evaluation metrics every 10 rounds.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=n_estimators,
    evals=watchlist,
    verbose_eval=10,
)

[0]	train-auc:0.771607	val-auc:0.760746
[10]	train-auc:0.799447	val-auc:0.788333
[20]	train-auc:0.803408	val-auc:0.791523
[30]	train-auc:0.807451	val-auc:0.794378
[40]	train-auc:0.809682	val-auc:0.795346
[50]	train-auc:0.812329	val-auc:0.796456
[60]	train-auc:0.814387	val-auc:0.797144
[70]	train-auc:0.816436	val-auc:0.79782
[80]	train-auc:0.818278	val-auc:0.798162
[90]	train-auc:0.819676	val-auc:0.798296
[100]	train-auc:0.820919	val-auc:0.798363
[110]	train-auc:0.822162	val-auc:0.79861
[120]	train-auc:0.82321	val-auc:0.798515
[130]	train-auc:0.824423	val-auc:0.798588
[140]	train-auc:0.825335	val-auc:0.798735
[150]	train-auc:0.826472	val-auc:0.798877
[160]	train-auc:0.827754	val-auc:0.798863
[170]	train-auc:0.828462	val-auc:0.798879
[180]	train-auc:0.82973	val-auc:0.799037
[190]	train-auc:0.830491	val-auc:0.799079
[200]	train-auc:0.831482	val-auc:0.799044
[210]	train-auc:0.832284	val-auc:0.798969
[220]	train-auc:0.832953	val-auc:0.798974
[230]	train-auc:0.833652	val-auc:0.79898
[240]	tr

**Not showing particularly good performance on the validation set: validation AUC plateaus at about 0.799 after roughly 180 rounds while training AUC keeps rising.**