In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import gc
import time
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import xgboost as xgb
import xgbfir

from tqdm import tqdm_notebook

pd.set_option('max_columns', None)

SEED = 2131
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py



In [2]:
def reload_data():
    dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
    dataset.load_data('../data/processed/hotstar_processed.feather')
    
    data_processed = dataset.data
    train_mask     = dataset.get_train_mask()
    
    return data_processed, train_mask

In [3]:
def summation(viewership):
    viewership = np.array(viewership)
    viewership  = viewership.astype(np.int)
    
    return viewership.sum()
    

def data_preparation(data_processed, **params):
    """
    Function to prepare dataset for modelling
    """
    
    st = time.time() # start time
    
    # viewership
    viewership     = data_processed.cities.str.replace(r'[^\d|^,]+', '').str.split(',').map(summation)
    data_processed = data_processed.assign(viewership=viewership)
    
    print('Prepared viewership')
    
    del viewership
    gc.collect()
    
    
    # number of cities involved
    num_cities     = data_processed.cities.str.split(',').map(len)
    data_processed = data_processed.assign(num_cities=num_cities)
    
    print('Prepared num cities')
    del num_cities
    gc.collect()
    
    # num genres watched
    num_genres     = data_processed.genres.str.split(',').map(len)
    data_processed = data_processed.assign(num_genres=num_genres)
    
    print('Prepared num genres')
    del num_genres
    gc.collect()
    
    # num titles watched
    num_titles     = data_processed.titles.str.split(',').map(len)
    data_processed = data_processed.assign(num_titles=num_titles)
    
    print('Prepared num titles')
    del num_titles
    gc.collect()
    
    # num tod
    num_tod        = data_processed.tod.str.split(',').map(len)
    data_processed = data_processed.assign(num_tod=num_tod)
    
    print('Prepared num tod')
    del num_tod
    gc.collect()
    
    # num DOW
    num_dow        = data_processed.dow.str.split(',').map(len)
    data_processed = data_processed.assign(num_dow=num_dow)
    
    print('Prepared num dow')
    del num_dow
    gc.collect()
    
    # dow OHE
    dow_dict_train = data_processed.loc[train_mask, 'dow'].map(lambda x: x.split(','))\
                     .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                          (item.split(':') for item in x)))

    dow_dict_test  = data_processed.loc[~train_mask, 'dow'].map(lambda x: x.split(','))\
                         .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                              (item.split(':') for item in x)))

    dv     = DictVectorizer(sparse=False)
    X1     = dv.fit_transform(dow_dict_train)
    Xtest1 = dv.transform(dow_dict_test)
    
    X1 = pd.DataFrame(np.vstack((X1, Xtest1)), columns=['dow' + c for c in dv.get_feature_names()])
    data_processed = pd.concat((data_processed, X1), axis='columns')
    print('Prepared DOW OHE')
    
    # genres OHE
    genres_dict_train = data_processed.loc[train_mask, 'genres'].map(lambda x: x.split(','))\
                     .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                          (item.split(':') for item in x)))

    genres_dict_test  = data_processed.loc[~train_mask, 'genres'].map(lambda x: x.split(','))\
                         .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                              (item.split(':') for item in x)))

    dv     = DictVectorizer(sparse=False)
    X1     = dv.fit_transform(genres_dict_train)
    Xtest1 = dv.transform(genres_dict_test)

    X1 = pd.DataFrame(np.vstack((X1, Xtest1)), columns=dv.get_feature_names())
    data_processed = pd.concat((data_processed, X1), axis='columns')
    print('Prepared genres OHE')
    
    # tod OHE
    tod_dict_train = data_processed.loc[train_mask, 'tod'].map(lambda x: x.split(','))\
                     .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                          (item.split(':') for item in x)))

    tod_dict_test  = data_processed.loc[~train_mask, 'tod'].map(lambda x: x.split(','))\
                         .map(lambda x: dict((k.strip(), int(v.strip())) for k,v in 
                                              (item.split(':') for item in x)))

    dv     = DictVectorizer(sparse=False)
    X1     = dv.fit_transform(tod_dict_train)
    Xtest1 = dv.transform(tod_dict_test)

    X1 = pd.DataFrame(np.vstack((X1, Xtest1)), columns=['tod' + col for col in dv.get_feature_names()])
    data_processed = pd.concat((data_processed, X1), axis='columns')
    
    print('Prepared tod OHE')
    
    # convert watch time for three of the genres to hour.
    data_processed = data_processed.assign(cricket_view_hour=data_processed.Cricket / 3600)
    data_processed = data_processed.assign(romance_view_hour=data_processed.Romance / 3600)
    data_processed = data_processed.assign(ts_view_hour=data_processed.TalkShow / 3600)
    
    # mask for cricket view hour, romance view hour and talk show view hour
    data_processed = data_processed.assign(low_cricket_view=(data_processed.cricket_view_hour < data_processed.cricket_view_hour.quantile(q=.45)).astype('uint8'))
    data_processed = data_processed.assign(high_romance_view=(data_processed.romance_view_hour > data_processed.romance_view_hour.quantile(q=.99)).astype('uint8'))
    data_processed = data_processed.assign(high_ts_view=(data_processed.ts_view_hour > data_processed.ts_view_hour.quantile(q=.99)).astype('uint8'))
    
    if params['transform']:
        # TFIDF transformer
        vec = TfidfTransformer()
        features_to_transform = ['dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6', 'dow7',
           'Action', 'Athletics', 'Awards', 'Badminton', 'Boxing', 'Comedy',
           'Cricket', 'Crime', 'Documentary', 'Drama', 'Family', 'Football',
           'Formula1', 'FormulaE', 'Hockey', 'Horror', 'IndiaVsSa', 'Kabaddi',
           'Kids', 'LiveTV', 'Mythology', 'NA', 'Reality', 'Romance', 'Science',
           'Sport', 'Swimming', 'Table Tennis', 'TalkShow', 'Teen', 'Tennis',
           'Thriller', 'Travel', 'Volleyball', 'Wildlife', 'tod0', 'tod1', 'tod10',
           'tod11', 'tod12', 'tod13', 'tod14', 'tod15', 'tod16', 'tod17', 'tod18',
           'tod19', 'tod2', 'tod20', 'tod21', 'tod22', 'tod23', 'tod3', 'tod4',
           'tod5', 'tod6', 'tod7', 'tod8', 'tod9']

        transformed = vec.fit_transform(data_processed.loc[:, features_to_transform])
        transformed = pd.DataFrame(transformed.toarray(), columns=features_to_transform)

        data_processed.drop(features_to_transform, axis='columns', inplace=True)
        data_processed = pd.concat((data_processed, transformed), axis='columns')

        print('TFIDF transformed')
    
    print('Prepared flags for cricket, romance and talkshow genres')
    
    # feature interaction between features
    data_processed = data_processed.assign(cric_rom=data_processed.cricket_view_hour * data_processed.romance_view_hour)
    data_processed = data_processed.assign(cric_ts=data_processed.cricket_view_hour * data_processed.ts_view_hour)
    data_processed = data_processed.assign(rom_ts=data_processed.romance_view_hour * data_processed.ts_view_hour)
    
    print('Prepared Feature Interaction')
    
    # proportion of cricket watch time out of total viewership
    proportion_cric_wt = (data_processed.Cricket) / (data_processed.viewership)
    data_processed = data_processed.assign(proportion_cric_wt=proportion_cric_wt)
    
    print('Prepared cricket watch time proportion')
    
    # proportion of romance watch time out of total
    prop_romance_wt = (data_processed.Romance) / (data_processed.viewership)
    data_processed  = data_processed.assign(prop_romance_wt=prop_romance_wt)
    
    print('Prepared romance watch time proportion')
    
    # proportion of family watch time out of total
    prop_family_wt = (data_processed.Family) / (data_processed.viewership)
    data_processed  = data_processed.assign(prop_family_wt=prop_family_wt)
    
    print('Prepared family watch time proportion')
    
    # flag for those instances with very high viewership
    data_processed = data_processed.assign(high_viewership=(data_processed.viewership > data_processed.viewership.quantile(q=.99)).astype('uint8'))
    
    print('Prepared high viewership')
    
    et = time.time() # end time
    
    print('It took: {} seconds to prepare data'.format(et - st))
    
    return data_processed

In [4]:
data_processed, train_mask = reload_data()

params = {
    'transform': False
}

data_processed = data_preparation(data_processed, **params)

Prepared viewership
Prepared num cities
Prepared num genres
Prepared num titles
Prepared num tod
Prepared num dow
Prepared DOW OHE
Prepared genres OHE
Prepared tod OHE
Prepared flags for cricket, romance and talkshow genres
Prepared Feature Interaction
Prepared cricket watch time proportion
Prepared romance watch time proportion
Prepared family watch time proportion
Prepared high viewership
It took: 21.772369384765625 seconds to prepare data


In [5]:
def prepare_titles(df):
    def cleanup(titles):
        cleaned_titles = []
        
        for title in titles:
            title_replaced = re.sub(r'[,"\']', '', title)
            cleaned_titles.append(title_replaced)
            
        return ' '.join(cleaned_titles)
    
    return df.titles.str.split(r':\d+').map(cleanup)

def prepare_cities(df):
    return df.cities.str.replace(r':\d+', '').str.replace(',', ' ')

def prepare_genres(df):
    return df.genres.str.replace(r':\d+', '').str.replace(',', ' ')

def prepare_tod(df):
    return df.tod.str.replace(r':\d+', '').str.replace(',', ' ')

def prepare_dow(df):
    return df.dow.str.replace(r':\d+', '').str.replace(',', ' ')

In [6]:
st = time.time()
cities_cleaned = prepare_cities(data_processed)
et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 0.5889754295349121 seconds


In [7]:
st = time.time()
genres_cleaned = prepare_genres(data_processed)
et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 0.7167510986328125 seconds


In [8]:
st = time.time()
titles_cleaned = prepare_titles(data_processed)
et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 7.565122365951538 seconds


In [9]:
st = time.time()
dow_cleaned = prepare_dow(data_processed)
et = time.time()

print('Took: {} seconds '.format((et - st)))

Took: 0.7164156436920166 seconds 


In [10]:
st = time.time()
tod_cleaned = prepare_tod(data_processed)
et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 1.0518476963043213 seconds


In [11]:
text_data = pd.concat((titles_cleaned, cities_cleaned, 
                       genres_cleaned, dow_cleaned,
                       tod_cleaned
                      ), axis='columns')

In [12]:
st = time.time()
text_data = text_data.apply(lambda x: x['titles'] + ' ' + \
                                      x['cities'] + ' ' + \
                                      x['genres'] + ' ' + \
                                      x['dow'] + ' ' + \
                                      x['tod'], axis='columns')
et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 17.23492455482483 seconds


In [33]:
st = time.time()
vec = TfidfVectorizer(min_df=3)

train_transformed = vec.fit_transform(text_data.loc[train_mask])
test_transformed  = vec.transform(text_data.loc[~train_mask])

et = time.time()

print('Took: {} seconds'.format((et - st)))

Took: 19.110977172851562 seconds


In [20]:
y = data_processed.loc[train_mask, 'segment']

In [26]:
params = {
    'stratify': y,
    'test_size': .2,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(train_transformed, y, **params)

In [27]:
def cross_validate_single_model_sign(X, y, model, ret_fold_preds=False,
                   save_folds=False, plot_cv_scores=False):
    """
    Stratified K-Fold with 10 splits and then save each fold
    and analyze the performance of the model on each fold
    """
    
    skf = StratifiedKFold(n_splits=10, random_state=SEED)
    fold_counter = 0
    
    cv_scores = []
    preds     = []
    
    for (itr, ite) in tqdm_notebook(skf.split(X, y)):
        Xtr = X[itr]
        ytr = y.iloc[itr]
        
        Xte = X[ite]
        yte = y.iloc[ite]
        
        if save_folds:
            save_file(pd.concat((Xtr, ytr), axis='columns'), '../data/processed/train_fold%s.feather'%(fold_counter))
            save_file(pd.concat((Xte, yte), axis='columns'), '../data/processed/test_fold%s.feather'%(fold_counter))
        
        print('Training model')
        start_time = time.time()
        model.fit(Xtr, ytr)
        end_time   = time.time()
        
        print('Took: {} seconds to train model'.format(end_time - start_time))
        
        start_time  = time.time()
        fold_preds  = model.predict_proba(Xte)[:, 1]
        end_time    = time.time()
        
        if ret_fold_preds:
            preds.append(fold_preds)
        
        print('Took: {} seconds to generate predictions'.format(end_time - start_time))
        
        fold_score = roc_auc_score(yte, fold_preds)
        print('Fold log loss score: {}'.format(fold_score))
        
        cv_scores.append(fold_score)
        print('='*75)
        print('\n')
        
    if plot_cv_scores:
        plt.scatter(np.arange(0, len(cv_scores)), cv_scores)
    
    print('Mean cv score: {} \n Std cv score: {}'.format(np.mean(cv_scores), np.std(cv_scores)))
    
    return preds

In [31]:
model = LogisticRegression(C=.1, random_state=SEED)

In [32]:
params = {
    'ret_fold_preds': True,
    'save_folds': False,
    'plot_cv_scores': False
}

cv_scores = cross_validate_single_model_sign(X_train, y_train, model, **params)

Training model
Took: 1.4320430755615234 seconds to train model
Took: 0.0031549930572509766 seconds to generate predictions
Fold log loss score: 0.8185499529545539


Training model
Took: 1.3673627376556396 seconds to train model
Took: 0.002483844757080078 seconds to generate predictions
Fold log loss score: 0.8289553916076243


Training model
Took: 1.3045082092285156 seconds to train model
Took: 0.0024766921997070312 seconds to generate predictions
Fold log loss score: 0.825217827619722


Training model
Took: 1.4631121158599854 seconds to train model
Took: 0.003160238265991211 seconds to generate predictions
Fold log loss score: 0.8205757054877487


Training model
Took: 1.5363333225250244 seconds to train model
Took: 0.0024902820587158203 seconds to generate predictions
Fold log loss score: 0.8185754453293107


Training model
Took: 1.4928629398345947 seconds to train model
Took: 0.0033767223358154297 seconds to generate predictions
Fold log loss score: 0.8382807127487301


Training mode

In [75]:
model.fit(X_train, y_train)

preds = model.predict_proba(X_test)[:, 1]
print('ROC AUC score: {}'.format(roc_auc_score(y_test, preds)))

ROC AUC score: 0.8203640717702794


In [76]:
st = time.time()
model.fit(train_transformed, y)
et = time.time()

preds = model.predict_proba(test_transformed)[:, 1]
print('Took: {} seconds to train model'.format(et - st))

Took: 5.538108587265015 seconds to train model


In [53]:
sub            = pd.read_csv('../data/raw/5f828822-4--4-hotstar_dataset/sample_submission.csv')
sub['segment'] = preds
sub['ID']      = data_processed.loc[~train_mask, 'ID'].values
sub.to_csv('../submissions/hotstar/logistic_reg_experiment_19_tod_dow.csv', index=False)