In [15]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import os
import gc
import time
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
import xgbfir

from tqdm import tqdm_notebook

pd.set_option('max_columns', None)

SEED = 2131
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/HotstarDataset.py
%run ../src/features/util.py

In [16]:
def reload_data():
    dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
    dataset.load_data('../data/processed/hotstar_processed.feather')
    
    data_processed = dataset.data
    train_mask     = dataset.get_train_mask()
    
    return data_processed, train_mask

def summation(viewership):
    viewership = np.array(viewership)
    viewership  = viewership.astype(np.int)
    
    return viewership.sum()
    

def data_preparation(data_processed, **params):
    """
    Function to prepare dataset for modelling
    """
    
    st = time.time() # start time
    
    # viewership
    viewership     = data_processed.cities.str.replace(r'[^\d|^,]+', '').str.split(',').map(summation)
    data_processed = data_processed.assign(viewership=viewership)
    
    print('Prepared viewership')
    
    del viewership
    gc.collect()
    
    
    # number of cities involved
    num_cities     = data_processed.cities.str.split(',').map(len)
    data_processed = data_processed.assign(num_cities=num_cities)
    
    print('Prepared num cities')
    del num_cities
    gc.collect()
    
    # num genres watched
    num_genres     = data_processed.genres.str.split(',').map(len)
    data_processed = data_processed.assign(num_genres=num_genres)
    
    print('Prepared num genres')
    del num_genres
    gc.collect()
    
    # num titles watched
    num_titles     = data_processed.titles.str.split(',').map(len)
    data_processed = data_processed.assign(num_titles=num_titles)
    
    print('Prepared num titles')
    del num_titles
    gc.collect()
    
    # num tod
    num_tod        = data_processed.tod.str.split(',').map(len)
    data_processed = data_processed.assign(num_tod=num_tod)
    
    print('Prepared num tod')
    del num_tod
    gc.collect()
    
    # num DOW
    num_dow        = data_processed.dow.str.split(',').map(len)
    data_processed = data_processed.assign(num_dow=num_dow)
    
    print('Prepared num dow')
    del num_dow
    gc.collect()
    
    # dow OHE
    dow_dict_train = data_processed.loc[train_mask, 'dow'].map(lambda x: x.split(','))\
                     .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                          (item.split(':') for item in x)))

    dow_dict_test  = data_processed.loc[~train_mask, 'dow'].map(lambda x: x.split(','))\
                         .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                              (item.split(':') for item in x)))

    dv     = DictVectorizer(sparse=False)
    X1     = dv.fit_transform(dow_dict_train)
    Xtest1 = dv.transform(dow_dict_test)
    
    X1 = pd.DataFrame(np.vstack((X1, Xtest1)), columns=['dow' + c for c in dv.get_feature_names()])
    data_processed = pd.concat((data_processed, X1), axis='columns')
    print('Prepared DOW OHE')
    
    # genres OHE
    genres_dict_train = data_processed.loc[train_mask, 'genres'].map(lambda x: x.split(','))\
                     .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                          (item.split(':') for item in x)))

    genres_dict_test  = data_processed.loc[~train_mask, 'genres'].map(lambda x: x.split(','))\
                         .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                              (item.split(':') for item in x)))

    dv     = DictVectorizer(sparse=False)
    X1     = dv.fit_transform(genres_dict_train)
    Xtest1 = dv.transform(genres_dict_test)

    X1 = pd.DataFrame(np.vstack((X1, Xtest1)), columns=dv.get_feature_names())
    data_processed = pd.concat((data_processed, X1), axis='columns')
    print('Prepared genres OHE')
    
    # tod OHE
    tod_dict_train = data_processed.loc[train_mask, 'tod'].map(lambda x: x.split(','))\
                     .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                          (item.split(':') for item in x)))

    tod_dict_test  = data_processed.loc[~train_mask, 'tod'].map(lambda x: x.split(','))\
                         .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                              (item.split(':') for item in x)))

    dv     = DictVectorizer(sparse=False)
    X1     = dv.fit_transform(tod_dict_train)
    Xtest1 = dv.transform(tod_dict_test)

    X1 = pd.DataFrame(np.vstack((X1, Xtest1)), columns=['tod' + col for col in dv.get_feature_names()])
    data_processed = pd.concat((data_processed, X1), axis='columns')
    
    print('Prepared tod OHE')
    
    # convert watch time for three of the genres to hour.
    data_processed = data_processed.assign(cricket_view_hour=data_processed.Cricket / 3600)
    data_processed = data_processed.assign(romance_view_hour=data_processed.Romance / 3600)
    data_processed = data_processed.assign(ts_view_hour=data_processed.TalkShow / 3600)
    
    # mask for cricket view hour, romance view hour and talk show view hour
    data_processed = data_processed.assign(low_cricket_view=(data_processed.cricket_view_hour < data_processed.cricket_view_hour.quantile(q=.45)).astype('uint8'))
    data_processed = data_processed.assign(high_romance_view=(data_processed.romance_view_hour > data_processed.romance_view_hour.quantile(q=.99)).astype('uint8'))
    data_processed = data_processed.assign(high_ts_view=(data_processed.ts_view_hour > data_processed.ts_view_hour.quantile(q=.99)).astype('uint8'))
    
    if params['transform']:
        # TFIDF transformer
        vec = TfidfTransformer()
        features_to_transform = ['dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6', 'dow7',
           'Action', 'Athletics', 'Awards', 'Badminton', 'Boxing', 'Comedy',
           'Cricket', 'Crime', 'Documentary', 'Drama', 'Family', 'Football',
           'Formula1', 'FormulaE', 'Hockey', 'Horror', 'IndiaVsSa', 'Kabaddi',
           'Kids', 'LiveTV', 'Mythology', 'NA', 'Reality', 'Romance', 'Science',
           'Sport', 'Swimming', 'Table Tennis', 'TalkShow', 'Teen', 'Tennis',
           'Thriller', 'Travel', 'Volleyball', 'Wildlife', 'tod0', 'tod1', 'tod10',
           'tod11', 'tod12', 'tod13', 'tod14', 'tod15', 'tod16', 'tod17', 'tod18',
           'tod19', 'tod2', 'tod20', 'tod21', 'tod22', 'tod23', 'tod3', 'tod4',
           'tod5', 'tod6', 'tod7', 'tod8', 'tod9']

        transformed = vec.fit_transform(data_processed.loc[:, features_to_transform])
        transformed = pd.DataFrame(transformed.toarray(), columns=features_to_transform)

        data_processed.drop(features_to_transform, axis='columns', inplace=True)
        data_processed = pd.concat((data_processed, transformed), axis='columns')

        print('TFIDF transformed')
    
    print('Prepared flags for cricket, romance and talkshow genres')
    
    # feature interaction between features
    data_processed = data_processed.assign(cric_rom=data_processed.cricket_view_hour * data_processed.romance_view_hour)
    data_processed = data_processed.assign(cric_ts=data_processed.cricket_view_hour * data_processed.ts_view_hour)
    data_processed = data_processed.assign(rom_ts=data_processed.romance_view_hour * data_processed.ts_view_hour)
    
    print('Prepared Feature Interaction')
    
    # proportion of cricket watch time out of total viewership
    proportion_cric_wt = (data_processed.Cricket) / (data_processed.viewership)
    data_processed = data_processed.assign(proportion_cric_wt=proportion_cric_wt)
    
    print('Prepared cricket watch time proportion')
    
    # proportion of romance watch time out of total
    prop_romance_wt = (data_processed.Romance) / (data_processed.viewership)
    data_processed  = data_processed.assign(prop_romance_wt=prop_romance_wt)
    
    print('Prepared romance watch time proportion')
    
    # proportion of family watch time out of total
    prop_family_wt = (data_processed.Family) / (data_processed.viewership)
    data_processed  = data_processed.assign(prop_family_wt=prop_family_wt)
    
    print('Prepared family watch time proportion')
    
    # flag for those instances with very high viewership
    data_processed = data_processed.assign(high_viewership=(data_processed.viewership > data_processed.viewership.quantile(q=.99)).astype('uint8'))
    
    print('Prepared high viewership')
    
    et = time.time() # end time
    
    print('It took: {} seconds to prepare data'.format(et - st))
    
    return data_processed

** Dataset - 1 **

In [3]:
dataset1, train_mask = reload_data()

params = {
    'transform': False
}

dataset1 = data_preparation(dataset1, **params)

Prepared viewership
Prepared num cities
Prepared num genres
Prepared num titles
Prepared num tod
Prepared num dow
Prepared DOW OHE
Prepared genres OHE
Prepared tod OHE
Prepared flags for cricket, romance and talkshow genres
Prepared Feature Interaction
Prepared cricket watch time proportion
Prepared romance watch time proportion
Prepared family watch time proportion
Prepared high viewership
It took: 23.61421036720276 seconds to prepare data


In [None]:
save_file(dataset1, '../data/processed/hotstar_processed_experiment_20.feather')

** Dataset - 2 **

In [4]:
def prepare_titles(df):
    def cleanup(titles):
        cleaned_titles = []
        
        for title in titles:
            cleaned_titles.append(re.sub(r'[,"\']', '', title))
            
        return ' '.join(cleaned_titles)
    
    return df.titles.str.split(r':\d+').map(cleanup)

def prepare_cities(df):
    return df.cities.str.replace(r':\d+', '').str.replace(',', ' ')

def prepare_genres(df):
    return df.genres.str.replace(r':\d+', '').str.replace(',', ' ')

def prepare_tod(df):
    return df.tod.str.replace(r':\d+', '').str.replace(',', ' ')\
             .map(lambda x: ' '.join(['tod_' + z for z in x.split()]))

def prepare_dow(df):
    return df.dow.str.replace(r':\d+', '').str.replace(',', ' ')\
             .map(lambda x: ' '.join(['dow_' + z for z in x.split()]))


st = time.time()
cities_cleaned = prepare_cities(dataset1)
et = time.time()

print('Took: {} seconds'.format((et - st)))

st = time.time()
genres_cleaned = prepare_genres(dataset1)
et = time.time()

print('Took: {} seconds'.format((et - st)))

st = time.time()
titles_cleaned = prepare_titles(dataset1)
et = time.time()

print('Took: {} seconds'.format((et - st)))

st = time.time()
tod_cleaned = prepare_tod(dataset1)
et = time.time()

print('Took: {} seconds'.format((et - st)))

st = time.time()
dow_cleaned = prepare_dow(dataset1)
et = time.time()

print('Took: {} seconds '.format((et - st)))


dataset2 = pd.concat((titles_cleaned, 
                      cities_cleaned, 
                      genres_cleaned,
                      dow_cleaned,
                      tod_cleaned
                     ), axis='columns')

st = time.time()
dataset2 = dataset2\
           .apply(lambda x: x['titles'] + ' ' + x['cities'] + ' ' + x['genres'] + ' ' + x['dow'] + ' ' + x['tod'], axis='columns')
et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 0.5516326427459717 seconds
Took: 0.8438479900360107 seconds
Took: 7.986990690231323 seconds
Took: 1.664804220199585 seconds
Took: 1.2268295288085938 seconds 
Took: 15.788763046264648 seconds


In [5]:
save_file(dataset2.reset_index(), '../data/processed/hotstar_processed_2_experiment_20.feather')

** Split into training and test dataset **

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [6]:
features = [
           'viewership', 'num_cities', 'num_genres', 'num_titles', 'num_tod',
           'num_dow', 'low_cricket_view',
            'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6', 'dow7',
            'Action', 'Athletics', 'Awards', 'Badminton', 'Boxing', 'Comedy',
            'Cricket', 'Crime', 'Documentary', 'Drama', 'Family', 'Football',
            'Formula1', 'FormulaE', 'Hockey', 'Horror', 'IndiaVsSa', 'Kabaddi',
            'Kids', 'LiveTV', 'Mythology', 'NA', 'Reality', 'Romance', 'Science',
            'Sport', 'Swimming', 'Table Tennis', 'TalkShow', 'Teen', 'Tennis',
            'Thriller', 'Travel', 'Volleyball', 'Wildlife',
            'tod0', 'tod1', 'tod10',
            'tod11', 'tod12', 'tod13', 'tod14', 'tod15', 'tod16', 'tod17', 'tod18',
            'tod19', 'tod2', 'tod20', 'tod21', 'tod22', 'tod23', 'tod3', 'tod4',
            'tod5', 'tod6', 'tod7', 'tod8', 'tod9',
           'high_romance_view', 'high_ts_view', 'cric_rom', 'cric_ts', 'rom_ts',
           'proportion_cric_wt'
           ]

In [7]:
st = time.time()
vec = TfidfVectorizer(min_df=3)

X2      = vec.fit_transform(dataset2.loc[train_mask])
Xtest2  = vec.transform(dataset2.loc[~train_mask])

et = time.time()
print('It took: {} seconds to vectorize'.format((et - st)))

It took: 20.8608341217041 seconds to vectorize


In [8]:
X1     = dataset1.loc[train_mask, features]
Xtest1 = dataset1.loc[~train_mask, features]

y = dataset1.loc[train_mask, 'segment']

In [9]:
X_train1, X_test1, y_train, y_test = train_test_split(X1, y, stratify=y, test_size=.2, random_state=SEED)

In [10]:
X_train2 = X2[X_train1.index.values]
X_test2  = X2[X_test1.index.values]

## Cross-Validation

In [13]:
def cross_validation_multiple_dataset(X1, X2, y):
    skf = StratifiedKFold(n_splits=10, random_state=SEED)
    
    fold_scores = []
    for itr, ite in tqdm_notebook(skf.split(X1, y)):
        Xtr1 = X1.iloc[itr]
        Xtr2 = X2[itr]
        
        ytr  = y.iloc[itr]
        
        Xte1 = X1.iloc[ite]
        Xte2 = X2[ite]
        
        yte = y.iloc[ite]
        
        model1 = xgb.XGBClassifier(n_estimators=200, max_depth=4, colsample_bytree=1.)
        model2 = LogisticRegression(C=1., random_state=SEED)
        
        model1.fit(Xtr1, ytr)
        model2.fit(Xtr2, ytr)
        
        xgb_preds = model1.predict_proba(Xte1)[:, 1]
        log_preds = model2.predict_proba(Xte2)[:, 1]
        
        print('XGB AUC: {}'.format(roc_auc_score(yte, xgb_preds)))
        print('Log AUC: {}'.format(roc_auc_score(yte, log_preds)))
        
        xgb_rank  = sp.stats.rankdata(xgb_preds)
        log_rank  = sp.stats.rankdata(log_preds)
        
        ensemble_ranks = xgb_preds * .4 + .6 * log_rank
        fold_auc = roc_auc_score(yte, ensemble_ranks)
        
        print('Ensemble AUC: {}'.format(fold_auc))
        
        fold_scores.append(fold_auc)
        print('='*75)
        
    print('Mean AUC: {0} and std: {1}'.format(np.mean(fold_scores), np.std(fold_scores)))

In [14]:
cross_validation_multiple_dataset(X_train1, X_train2, y_train)

XGB AUC: 0.8062499792202091
Log AUC: 0.8200466243091413
Ensemble AUC: 0.8200466797219166
XGB AUC: 0.8144378540183681
Log AUC: 0.8308934534239
Ensemble AUC: 0.8308934811302876
XGB AUC: 0.8128042576960032
Log AUC: 0.8259704716402957
Ensemble AUC: 0.8259705270530711
XGB AUC: 0.8124701463673047
Log AUC: 0.8215171685401726
Ensemble AUC: 0.8215171962465602
XGB AUC: 0.8042658444064864
Log AUC: 0.8211801781317244
Ensemble AUC: 0.8211801504026265
XGB AUC: 0.8236700015528295
Log AUC: 0.8384297289203398
Ensemble AUC: 0.8384297289203398
XGB AUC: 0.8118836929670921
Log AUC: 0.823369501918429
Ensemble AUC: 0.823369501918429
XGB AUC: 0.8228634116419067
Log AUC: 0.834145065162243
Ensemble AUC: 0.8341450374312688
XGB AUC: 0.8216535647057911
Log AUC: 0.834915071118856
Ensemble AUC: 0.8349150156569078
XGB AUC: 0.8037291504671561
Log AUC: 0.8201252275326424
Ensemble AUC: 0.8201253107255645

Mean AUC: 0.8270592629206973 and std: 0.006582126882878879


In [None]:
# full training
