** Feature Interaction: Combine two or more features **

1. Combine two or more features into a single feature.

In [9]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

SEED = 1231
np.random.seed(SEED)

import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib

import xgboost as xgb

from itertools import combinations

sns.set_style('dark')

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py
%run ../src/features/util.py
%run ../src/models/cross_validation.py
%run ../src/models/feature_selection.py

In [34]:
# load dataset
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
dataset.load_data('../data/processed/hotstar_processed.feather')

data_processed = dataset.data
train_mask     = dataset.get_train_mask() 

In [3]:
# preprocessing: replacement map
genre_replacement_map = {
    'Thriller': 'Crime',
    'Horror': 'Crime',
    'Action': 'Action',
    'Hockey': 'Sport',
    'Kabaddi': 'Sport',
    'Formula1': 'Sport',
    'FormulaE': 'Sport',
    'Tennis': 'Sport',
    'Athletics': 'Sport',
    'Table Tennis': 'Sport',
    'Volleyball': 'Sport',
    'Boxing': 'Sport',
    'Football': 'Sport',
    'NA': 'Sport',
    'Swimming': 'Sport',
    'IndiavsSa': 'Sport',
    'Wildlife': 'Travel',
    'Science': 'Travel',
    'Documentary': 'Travel'
}

def cluster_genres(genres):
    for replacement_key in genre_replacement_map.keys():
        to_replace = genre_replacement_map[replacement_key]
        genres     = genres.str.replace(r'%s'%(replacement_key), to_replace)
    
    return genres
            

start = time.time()
data_processed['genres'] = cluster_genres(data_processed.genres)
end   = time.time()

print('Took: {} seconds'.format(end - start))

Took: 4.675248384475708 seconds


In [5]:
start = time.time()
ohe_genres = encode_ohe(data_processed.genres)
end   = time.time()

print('Took: {} seconds'.format(end - start))

Took: 4.896948575973511 seconds


In [7]:
def group_data(dd, degree=2):
    new_data = []
    columns  = []
    
    for indices in combinations(dd.columns, degree):
        key = '_'.join(list(indices))
        columns.append(key)
        
        new_data.append(np.product(dd.loc[:, list(indices)].values, axis=1))
    
    new_data = np.array(new_data)
    return pd.DataFrame(new_data.T, columns=columns)

In [8]:
start = time.time()
feature_interaction = group_data(ohe_genres)
end   = time.time()

print('Took: {} seconds'.format(end - start))

Took: 0.8307907581329346 seconds


In [9]:
# concat different data frames
# data = pd.concat((ohe_genres, feature_interaction, data_processed.segment), axis='columns')
data = np.hstack((ohe_genres.values, 
                  feature_interaction.values,
                  data_processed.segment.values.reshape(-1, 1)
                 ))

columns = ohe_genres.columns.tolist() + feature_interaction.columns.tolist() + ['segment']
data = pd.DataFrame(data, columns=columns)
save_file(data, '../data/processed/hotstar_processed_exp_10.feather')

del data_processed, ohe_genres, feature_interaction
gc.collect()

497

In [4]:
data = load_file('../data/processed/hotstar_processed_exp_10.feather')
train_mask = data.segment.notnull()

In [5]:
f = data.columns.drop('segment')

X = data.loc[train_mask, f]
y = data.loc[train_mask, 'segment']

Xtest  = data.loc[~train_mask, f]

In [6]:
params = {
    'stratify': y,
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(X, y, **params)

In [7]:
params = {
    'stratify': y_train,
    'test_size': .2,
    'random_state': SEED
}

Xtr, Xte, ytr, yte = get_train_test_split(X_train, y_train, **params)

In [14]:
# train a logistic regression model
model = LogisticRegression(C=.01, class_weight='balanced', random_state=SEED)
model.fit(Xtr, ytr)

preds = model.predict_proba(Xte)[:, 1]
print('AUC: {}'.format(roc_auc_score(yte, preds)))

AUC: 0.782496934049149


In [20]:
# train a random forest model
model = RandomForestClassifier(n_estimators=100, max_depth=7,
                               max_features=.3, n_jobs=2, random_state=SEED)
model.fit(Xtr, ytr)

preds = model.predict_proba(Xte)[:, 1]
print('AUC: {}'.format(roc_auc_score(yte, preds)))

AUC: 0.7775745515870396


In [23]:
# train a extreme gradient boosting model
model = xgb.XGBClassifier(colsample_bytree=.6, seed=SEED)

model.fit(Xtr, ytr)

preds = model.predict_proba(Xte)[:, 1]
print('AUC: {}'.format(roc_auc_score(yte, preds)))

AUC: 0.7814893950465182


In [8]:
start = time.time()
model = LogisticRegression(random_state=SEED)
greedy_feature_search(Xtr.iloc[:1000], ytr.iloc[:1000], model)
end = time.time()

print('Took: {} seconds'.format(end - start))

Feature: 0 Mean AUC: 0.523225
Feature: 1 Mean AUC: 0.519176
Feature: 2 Mean AUC: 0.516814
Feature: 3 Mean AUC: 0.465022
Feature: 4 Mean AUC: 0.752324
Feature: 5 Mean AUC: 0.479223
Feature: 6 Mean AUC: 0.627107
Feature: 7 Mean AUC: 0.627220
Feature: 8 Mean AUC: 0.500000
Feature: 9 Mean AUC: 0.488255
Feature: 10 Mean AUC: 0.544316
Feature: 11 Mean AUC: 0.518836
Feature: 12 Mean AUC: 0.483704
Feature: 13 Mean AUC: 0.629826
Feature: 14 Mean AUC: 0.526131
Feature: 15 Mean AUC: 0.500000
Feature: 16 Mean AUC: 0.554223
Feature: 17 Mean AUC: 0.493038
Feature: 18 Mean AUC: 0.492920
Feature: 19 Mean AUC: 0.504982
Feature: 20 Mean AUC: 0.507056
Feature: 21 Mean AUC: 0.483655
Feature: 22 Mean AUC: 0.544775
Feature: 23 Mean AUC: 0.511790
Feature: 24 Mean AUC: 0.470929
Feature: 25 Mean AUC: 0.488191
Feature: 26 Mean AUC: 0.500000
Feature: 27 Mean AUC: 0.504882
Feature: 28 Mean AUC: 0.491004
Feature: 29 Mean AUC: 0.485013
Feature: 30 Mean AUC: 0.475068
Feature: 31 Mean AUC: 0.466254
Feature: 32 Mean A

In [10]:
selected_features = [4, 6, 9, 12, 16, 27, 40, 48, 55, 57, 77, 80, 89, 99, 100, 105, 112, 116, 118, 121, 129, 146, 147, 155, 157, 168, 170, 172, 174, 175, 181]

In [11]:
joblib.dump(selected_features, '../data/interim/experiment_10_selected_features.pkl')

['../data/interim/experiment_10_selected_features.pkl']

In [13]:
model.fit(Xtr.iloc[:, selected_features], ytr)
preds = model.predict_proba(Xte.iloc[:, selected_features])[:, 1]

print('AUC: {}'.format(roc_auc_score(yte, preds)))

AUC: 0.7750579305059828


In [17]:
model.fit(X_train.iloc[:, selected_features], y_train)
preds = model.predict_proba(X_test.iloc[:, selected_features])[:, 1]

print('AUC: {}'.format(roc_auc_score(y_test, preds)))

AUC: 0.7715171313326461


In [18]:
start = time.time()
model = xgb.XGBClassifier(seed=SEED)
greedy_feature_search(Xtr.iloc[:1000], ytr.iloc[:1000], model)
end = time.time()

print('Took: {} seconds'.format(end - start))

Feature: 0 Mean AUC: 0.471108
Feature: 1 Mean AUC: 0.519176
Feature: 2 Mean AUC: 0.516814
Feature: 3 Mean AUC: 0.465022
Feature: 4 Mean AUC: 0.752324
Feature: 5 Mean AUC: 0.479223
Feature: 6 Mean AUC: 0.627107
Feature: 7 Mean AUC: 0.627220
Feature: 8 Mean AUC: 0.500000
Feature: 9 Mean AUC: 0.491418
Feature: 10 Mean AUC: 0.544316
Feature: 11 Mean AUC: 0.518836
Feature: 12 Mean AUC: 0.484205
Feature: 13 Mean AUC: 0.629826
Feature: 14 Mean AUC: 0.526131
Feature: 15 Mean AUC: 0.500000
Feature: 16 Mean AUC: 0.554223
Feature: 17 Mean AUC: 0.496743
Feature: 18 Mean AUC: 0.492920
Feature: 19 Mean AUC: 0.505420
Feature: 20 Mean AUC: 0.501627
Feature: 21 Mean AUC: 0.483655
Feature: 22 Mean AUC: 0.544775
Feature: 23 Mean AUC: 0.487903
Feature: 24 Mean AUC: 0.470929
Feature: 25 Mean AUC: 0.488191
Feature: 26 Mean AUC: 0.500000
Feature: 27 Mean AUC: 0.500000
Feature: 28 Mean AUC: 0.491004
Feature: 29 Mean AUC: 0.496205
Feature: 30 Mean AUC: 0.475068
Feature: 31 Mean AUC: 0.466254
Feature: 32 Mean A

In [19]:
selected_features = [4, 6, 14, 25, 48, 90, 107, 116, 129, 161, 163, 169, 177, 178]
joblib.dump(selected_features, '../data/interim/experiment_10_selected_features_xgboost.pkl')

['../data/interim/experiment_10_selected_features_xgboost.pkl']

In [27]:
model = xgb.XGBClassifier(n_estimators=150, max_depth=4, seed=SEED, learning_rate=.1)
model.fit(Xtr.iloc[:, selected_features], ytr)

preds = model.predict_proba(Xte.iloc[:, selected_features])[:, 1]
print('AUC: {}'.format(roc_auc_score(yte, preds)))

AUC: 0.7662074908394355


In [28]:
model.fit(X_train.iloc[:, selected_features], y_train)

preds = model.predict_proba(X_test.iloc[:, selected_features])[:, 1]
print('AUC: {}'.format(roc_auc_score(y_test, preds)))

AUC: 0.7625347432699567


In [31]:
# full training
model.fit(X.iloc[:, selected_features], y)
final_preds = model.predict_proba(Xtest.iloc[:, selected_features])[:, 1]

In [35]:
sub            = pd.read_csv('../data/raw/5f828822-4--4-hotstar_dataset/sample_submission.csv')
sub['segment'] = final_preds
sub['ID']      = data_processed.loc[~train_mask, 'ID'].values
sub.to_csv('../submissions/hotstar/xgb_experiment_10.csv', index=False)