** Experiment 9 Vowpal Wabbit **

1. OHE of genres, cities, tod, dow
2. Watch time in seconds.
3. Vowpal Wabbit with hinge loss

In [96]:
%matplotlib inline

import pandas as pd
import numpy as np
import gc
import time
import os

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('dark')

SEED = 2313
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py

In [3]:
# load dataset
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
dataset.load_data('../data/processed/hotstar_processed.feather')

data_processed = dataset.data
train_mask     = dataset.get_train_mask() 

In [67]:
def create_vw_file(features, label, outfile_path):
    with open(outfile_path, 'w') as outfile:
        for index in tqdm_notebook(range(len(features.values))):
            segment = label.iloc[index]
            genres  = features.iloc[index]

            if pd.isnull(segment):
                outfile.write('{}\n'.format(genres))
            else:
                outfile.write('{} | {}\n'.format(segment, genres))
        
        outfile.close()
    
    print('Number of lines written: {}'.format(len(features.values)))
    print('Created input file for vw at: {}'.format(outfile_path))

** Genres **

In [121]:
features = data_processed.genres.str.replace(r':\d+', '').str.replace(',', ' ')
segment  = data_processed.segment

In [122]:
# convert to 1 or -1
segment[(segment == 0).values] = -1

In [123]:
# train test split
params = {
    'stratify': segment.loc[train_mask],
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(features.loc[train_mask],
                                                        segment.loc[train_mask],
                                                        **params
                                                       )

In [124]:
# futher split train into train and validation
params = {
    'stratify': y_train,
    'test_size': .2,
    'random_state': SEED
}

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [125]:
# create training set
create_vw_file(Xtr, ytr, '../data/processed/vowpal_wabbit_input_Xtr.vw')


Number of lines written: 112000
Created input file for vw at: ../data/processed/vowpal_wabbit_input_Xtr.vw


In [126]:
# create test set
create_vw_file(Xte, yte, '../data/processed/vowpal_wabbit_input_Xte.vw')


Number of lines written: 28000
Created input file for vw at: ../data/processed/vowpal_wabbit_input_Xte.vw


In [72]:
!head -5 ../data/processed/vowpal_wabbit_input_Xtr.vw

-1.0 | Crime
-1.0 | Cricket,Reality
1.0 | Romance,TalkShow,Family,Reality
-1.0 | Action,Cricket
-1.0 | TalkShow,Cricket


In [73]:
!head -5 ../data/processed/vowpal_wabbit_input_Xte.vw

-1.0 | TalkShow,Cricket
-1.0 | Romance,Cricket,Family,Kabaddi,Drama,Action,Comedy,Thriller
1.0 | TalkShow,Family
-1.0 | Drama
-1.0 | Romance,Family


In [172]:
def train_vw_model(train_vw_file, model_filename,
                   ngram=1, bit_precision=28, passes=1,
                   seed=SEED, quiet=True):
    init_time = time.time()
    vw_call_string = ('vw {train_vw_file} ' + 
                       '-f {model_filename} -b {bit_precision} --random_seed {seed}').format(
                       train_vw_file=train_vw_file, 
                       model_filename=model_filename, bit_precision=bit_precision, seed=seed)
    if ngram > 1:
         vw_call_string += ' --ngram={}'.format(ngram)
            
    if passes > 1:
         vw_call_string += ' -k --passes={} --cache_file {}'.format(passes, 
                            model_filename.replace('.vw', '.cache'))
    if quiet:
        vw_call_string += ' --quiet'
    
    
    print(vw_call_string) 
    res = os.system(vw_call_string)
    print('Success. Elapsed: {} sec.'.format(round(time.time() - init_time, 2))
          if not res else 'Failed.')

In [242]:
def test_vw_model(model_filename, test_vw_file, prediction_filename,
                  true_labels, seed=SEED, quiet=True):
    init_time = time.time()
    vw_call_string = ('vw -t -i {model_filename} {test_vw_file} ' + 
                       '-p {prediction_filename} --random_seed {seed}').format(
                       model_filename=model_filename, test_vw_file=test_vw_file, 
                       prediction_filename=prediction_filename, seed=seed)
    if quiet:
        vw_call_string += ' --quiet'
        
    print(vw_call_string) 
    res = os.system(vw_call_string)
    
    if true_labels is not None:
        if not res: # the call resulted OK
            vw_pred = np.loadtxt(prediction_filename)
            print("AUC: {}. Elapsed: {} sec.".format(
                round(roc_auc_score(true_labels, vw_pred), 2), 
                round(time.time() - init_time, 2)))
        else:
            print('Failed.')

In [173]:
train_vw_model('../data/processed/vowpal_wabbit_input_Xtr.vw',
               '../models/vw_model.vw',
               ngram=1,
               passes=20
              )

vw ../data/processed/vowpal_wabbit_input_Xtr.vw -f ../models/vw_model.vw -b 28 --random_seed 2313 -k --passes=20 --cache_file ../models/vw_model.cache --quiet
Success. Elapsed: 7.34 sec.


In [174]:
test_vw_model('../models/vw_model.vw', '../data/processed/vowpal_wabbit_input_Xte.vw',
              '../vw_predictions/xtr_preds.txt', yte
             )

vw -t -i ../models/vw_model.vw ../data/processed/vowpal_wabbit_input_Xte.vw -p ../vw_predictions/xtr_preds.txt --random_seed 2313 --quiet
AUC: 0.78. Elapsed: 0.34 sec.


** Use titles instead of genres **

In [147]:
features = data_processed.titles.str.replace(r':\d+', '').str.replace(',', ' ')\
                     .str.replace(r':|\|', '')
segment  = data_processed.segment

In [148]:
# convert to 1 or -1
segment[(segment == 0).values] = -1

In [149]:
# train test split
params = {
    'stratify': segment.loc[train_mask],
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(features.loc[train_mask],
                                                        segment.loc[train_mask],
                                                        **params
                                                       )

In [150]:
# futher split train into train and validation
params = {
    'stratify': y_train,
    'test_size': .2,
    'random_state': SEED
}

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [151]:
# create training set
create_vw_file(Xtr, ytr, '../data/processed/vowpal_wabbit_input_Xtr_titles.vw')


Number of lines written: 112000
Created input file for vw at: ../data/processed/vowpal_wabbit_input_Xtr_titles.vw


In [152]:
# create test set
create_vw_file(Xte, yte, '../data/processed/vowpal_wabbit_input_Xte_titles.vw')


Number of lines written: 28000
Created input file for vw at: ../data/processed/vowpal_wabbit_input_Xte_titles.vw


In [156]:
train_vw_model('../data/processed/vowpal_wabbit_input_Xtr.vw',
               '../models/vw_model_titles.vw'
              )

vw ../data/processed/vowpal_wabbit_input_Xtr.vw -f ../models/vw_model_titles.vw -b 28 --random_seed 2313 --ngram=2 --quiet
Success. Elapsed: 0.45 sec.


In [157]:
test_vw_model('../models/vw_model_titles.vw', '../data/processed/vowpal_wabbit_input_Xte_titles.vw',
              '../vw_predictions/xtr_preds_titles.txt', yte
             )

vw -t -i ../models/vw_model_titles.vw ../data/processed/vowpal_wabbit_input_Xte_titles.vw -p ../vw_predictions/xtr_preds_titles.txt --random_seed 2313 --quiet
AUC: 0.49%. Elapsed: 0.36 sec.


** So since we created features based on whether certain feature occured in an instance or not, we can now add the actual value observed in the training set with this value **

In [159]:
features = data_processed.genres.str.replace(r',', ' ')
segment  = data_processed.segment

In [160]:
# convert to 1 or -1
segment[(segment == 0).values] = -1

# train test split
params = {
    'stratify': segment.loc[train_mask],
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(features.loc[train_mask],
                                                        segment.loc[train_mask],
                                                        **params
                                                       )

# futher split train into train and validation
params = {
    'stratify': y_train,
    'test_size': .2,
    'random_state': SEED
}

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [161]:
# create training set
create_vw_file(Xtr, ytr, '../data/processed/vowpal_wabbit_input_Xtr_with_count.vw')


Number of lines written: 112000
Created input file for vw at: ../data/processed/vowpal_wabbit_input_Xtr_with_count.vw


In [162]:
# create test set
create_vw_file(Xte, yte, '../data/processed/vowpal_wabbit_input_Xte_with_count.vw')


Number of lines written: 28000
Created input file for vw at: ../data/processed/vowpal_wabbit_input_Xte_with_count.vw


In [168]:
train_vw_model('../data/processed/vowpal_wabbit_input_Xtr.vw',
               '../models/vw_model_genres_with_count.vw',
               ngram=1
              )

vw ../data/processed/vowpal_wabbit_input_Xtr.vw -f ../models/vw_model_genres_with_count.vw -b 28 --random_seed 2313 --quiet
Success. Elapsed: 0.43 sec.


In [171]:
test_vw_model('../models/vw_model_genres_with_count.vw', 
              '../data/processed/vowpal_wabbit_input_Xte_with_count.vw',
              '../vw_predictions/xtr_preds_genres_with_count.txt', 
              yte
             )

vw -t -i ../models/vw_model_genres_with_count.vw ../data/processed/vowpal_wabbit_input_Xte_with_count.vw -p ../vw_predictions/xtr_preds_genres_with_count.txt --random_seed 2313 --quiet
AUC: 0.72. Elapsed: 0.33 sec.


** Input data format preparation **

* Need to first prepare ohe features using regex to replace numeric values.

In [196]:
' tod_'.join(X['tod'].iloc[0].split(' '))

'10 tod_13 tod_12 tod_20 tod_21 tod_17 tod_16 tod_19 tod_18 tod_22 tod_2 tod_14 tod_15 tod_23'

In [250]:
def create_vw_file(X, y, outfile_path):
    with open(outfile_path, 'w') as outfile:
        for index in tqdm_notebook(range(len(X.values))):
            segment = y.iloc[index]
            
            genres  = X['genres'].iloc[index]
            cities  = X['cities'].iloc[index]
            
            tod     = 'tod_' + ' tod_'.join(X['tod'].iloc[index].split(' '))
            dow     = 'dow_' + ' dow_'.join(X['dow'].iloc[index].split(' '))

            if pd.isnull(segment):
                outfile.write('{} | {} {} {} {}\n'.format(
                                                     '1',
                                                     genres,
                                                     cities,
                                                     tod,
                                                     dow
                                                    ))
            else:
                outfile.write('{} | {} {} {} {}\n'.format(segment, 
                                                          genres,
                                                          cities,
                                                          tod,
                                                          dow
                                                         ))
        
        outfile.close()
    
    print('Number of lines written: {}'.format(len(features.values)))
    print('Created input file for vw at: {}'.format(outfile_path))

In [233]:
X = data_processed.loc[train_mask, ['genres', 'cities', 'tod', 'dow']]

X['genres'] = X.genres.str.replace(r':\d+', '').str.replace(',', ' ')
X['cities'] = X.cities.str.replace(r':\d+', '').str.replace(',', ' ')
X['tod']    = X.tod.str.replace(r':\d+', '').str.replace(',', ' ')
X['dow']    = X.dow.str.replace(r':\d+', '').str.replace(',', ' ')

Xtest = data_processed.loc[~train_mask, ['genres', 'cities', 'tod', 'dow']]

Xtest['genres'] = Xtest.genres.str.replace(r':\d+', '').str.replace(',', ' ')
Xtest['cities'] = Xtest.cities.str.replace(r':\d+', '').str.replace(',', ' ')
Xtest['tod']    = Xtest.tod.str.replace(r':\d+', '').str.replace(',', ' ')
Xtest['dow']    = Xtest.dow.str.replace(r':\d+', '').str.replace(',', ' ')

y = data_processed.loc[train_mask, 'segment']

In [234]:
# train test split
params = {
    'stratify': y,
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(X,
                                                        y,
                                                        **params
                                                       )

# futher split train into train and validation
params = {
    'stratify': y_train,
    'test_size': .2,
    'random_state': SEED
}

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [235]:
create_vw_file(Xtr, ytr, '../data/processed/vowpal_wabbit_xtr_multiple_features.vw')


Number of lines written: 300000
Created input file for vw at: ../data/processed/vowpal_wabbit_xtr_multiple_features.vw


In [236]:
create_vw_file(Xte, yte, '../data/processed/vowpal_wabbit_xte_multiple_features.vw')


Number of lines written: 300000
Created input file for vw at: ../data/processed/vowpal_wabbit_xte_multiple_features.vw


In [240]:
train_vw_model('../data/processed/vowpal_wabbit_xtr_multiple_features.vw',
               '../models/vw_model_multiple_features.vw',
               ngram=1
              )

vw ../data/processed/vowpal_wabbit_xtr_multiple_features.vw -f ../models/vw_model_multiple_features.vw -b 28 --random_seed 2313 --quiet
Success. Elapsed: 0.48 sec.


In [243]:
test_vw_model('../models/vw_model_multiple_features.vw', 
              '../data/processed/vowpal_wabbit_xte_multiple_features.vw',
              '../vw_predictions/xtr_preds_multiple_features.txt', 
              yte
             )

vw -t -i ../models/vw_model_multiple_features.vw ../data/processed/vowpal_wabbit_xte_multiple_features.vw -p ../vw_predictions/xtr_preds_multiple_features.txt --random_seed 2313 --quiet
AUC: 0.78. Elapsed: 0.32 sec.


** Train on X_train, X_test **

In [216]:
# train on X_train and test on X_test
create_vw_file(X_train, y_train, '../data/processed/vowpal_wabbit_x_train_multiple_features.vw')
create_vw_file(X_test, y_test, '../data/processed/vowpal_wabbit_x_test_multiple_features.vw')


Number of lines written: 300000
Created input file for vw at: ../data/processed/vowpal_wabbit_x_train_multiple_features.vw

Number of lines written: 300000
Created input file for vw at: ../data/processed/vowpal_wabbit_x_test_multiple_features.vw


In [222]:
train_vw_model('../data/processed/vowpal_wabbit_x_train_multiple_features.vw',
               '../models/vw_model_multiple_features_x_train.vw',
               ngram=1
              )

test_vw_model('../models/vw_model_multiple_features_x_train.vw', 
              '../data/processed/vowpal_wabbit_x_test_multiple_features.vw',
              '../vw_predictions/xtr_preds_multiple_features_x_test.txt', 
              y_test
             )

vw ../data/processed/vowpal_wabbit_x_train_multiple_features.vw -f ../models/vw_model_multiple_features_x_train.vw -b 28 --random_seed 2313 --quiet
Success. Elapsed: 0.49 sec.
vw -t -i ../models/vw_model_multiple_features_x_train.vw ../data/processed/vowpal_wabbit_x_test_multiple_features.vw -p ../vw_predictions/xtr_preds_multiple_features_x_test.txt --random_seed 2313 --quiet
AUC: 0.78. Elapsed: 0.59 sec.


** Full Training **

In [251]:
# train on X_train and test on X_test
create_vw_file(X, y, '../data/processed/vowpal_wabbit_x_multiple_features.vw')
create_vw_file(Xtest, pd.Series([np.nan] * len(Xtest)), '../data/processed/vowpal_wabbit_xtest_multiple_features.vw')


Number of lines written: 300000
Created input file for vw at: ../data/processed/vowpal_wabbit_x_multiple_features.vw

Number of lines written: 300000
Created input file for vw at: ../data/processed/vowpal_wabbit_xtest_multiple_features.vw


In [252]:
!head -2 '../data/processed/vowpal_wabbit_x_multiple_features.vw'

-1.0 | Cricket Kabaddi Reality gurgaon delhi tod_10 tod_13 tod_12 tod_20 tod_21 tod_17 tod_16 tod_19 tod_18 tod_22 tod_2 tod_14 tod_15 tod_23 dow_1 dow_3 dow_2 dow_5 dow_4 dow_7 dow_6
-1.0 | Cricket Wildlife delhi nagar mumbai tod_11 tod_10 tod_20 tod_21 tod_22 tod_16 tod_19 tod_18 tod_23 tod_1 tod_3 tod_2 tod_5 tod_8 tod_9 tod_15 dow_1 dow_3 dow_2 dow_5 dow_4 dow_7 dow_6


In [253]:
!head -2 '../data/processed/vowpal_wabbit_xtest_multiple_features.vw'

1 | Romance Cricket delhi mumbai tod_13 tod_15 dow_2 dow_4
1 | Romance Action Mythology pune delhi navi mumbai tod_20 tod_21 tod_22 tod_23 tod_19 tod_1 tod_0 tod_15 dow_1 dow_3 dow_2 dow_5 dow_4 dow_7


In [254]:
train_vw_model('../data/processed/vowpal_wabbit_x_multiple_features.vw',
               '../models/vw_model_multiple_features_x.vw',
               ngram=1
              )

test_vw_model('../models/vw_model_multiple_features_x.vw', 
              '../data/processed/vowpal_wabbit_xtest_multiple_features.vw',
              '../vw_predictions/xtr_preds_multiple_features_xtest.txt', 
              None
             )

vw ../data/processed/vowpal_wabbit_x_multiple_features.vw -f ../models/vw_model_multiple_features_x.vw -b 28 --random_seed 2313 --quiet
Success. Elapsed: 0.67 sec.
vw -t -i ../models/vw_model_multiple_features_x.vw ../data/processed/vowpal_wabbit_xtest_multiple_features.vw -p ../vw_predictions/xtr_preds_multiple_features_xtest.txt --random_seed 2313 --quiet


In [255]:
!head -5 '../vw_predictions/xtr_preds_multiple_features_xtest.txt'

-1
-0.839207
-0.709526
-0.738760
-0.778406


In [257]:
# read predictions
preds = np.loadtxt('../vw_predictions/xtr_preds_multiple_features_xtest.txt')

# load predictions
sub = pd.read_csv('../data/raw/5f828822-4--4-hotstar_dataset/sample_submission.csv')
sub['segment'] = preds
sub['ID']      = data_processed.loc[~train_mask, 'ID'].values
sub.to_csv('../submissions/hotstar/vw.csv', index=False)