In [1]:
import json
import pandas as pd
import os
import numpy as np
import logging

from sklearn import preprocessing

In [2]:
import sys
# Show which interpreter the kernel runs on. The parenthesised call prints the
# same text on Python 2 and is also valid syntax on Python 3.
print(sys.executable)

/Users/manrajsingh/virtualenvs/ds/bin/python


## Get Data from JSON to DataFrame

In [3]:
# Directory (relative to the notebook) that the JSON inputs are read from and
# the engineered CSVs are written to.
DATA_DIR = 'data'

In [None]:
# Parse the training JSON into a dict.
with open(os.path.join(DATA_DIR, 'train_data.json'), 'r') as data:
    data_train = json.load(data)

# One row per top-level JSON key; the key itself becomes the 'id' column.
df_train = (
    pd.DataFrame.from_dict(data_train, orient='index')
    .reset_index()
    .rename(columns={'index': 'id'})
)
df_train.shape

In [None]:
# Preview the first rows of the freshly loaded training frame.
df_train.head()

In [None]:
# Parse the test JSON into a dict.
with open(os.path.join(DATA_DIR, 'test_data.json'), 'r') as data:
    data_test = json.load(data)

# One row per top-level JSON key; the key itself becomes the 'id' column.
df_test = (
    pd.DataFrame.from_dict(data_test, orient='index')
    .reset_index()
    .rename(columns={'index': 'id'})
)
df_test.shape

In [None]:
# Preview the first rows of the freshly loaded test frame.
df_test.head()

## Feature Engineering

#### Total time spent watching on HotStar

In [None]:
def time_calc(row):
    """Store the summed watch time of all titles in ``row['time_spent']``.

    ``row['titles']`` is a comma-separated string whose entries end in
    ``:<integer>``; only the text after the last colon of each entry is
    parsed. A blank string yields 0.

    Returns the same ``row`` object with ``time_spent`` set.
    """
    watched = row['titles'].strip()
    total = 0
    if watched:
        # The last ':'-separated field of every entry is the time value.
        total = sum(int(entry.split(':')[-1]) for entry in watched.split(','))
    row['time_spent'] = total
    return row

# Create the target column first, then fill it row by row with time_calc.
df_train['time_spent'] = None
df_train = df_train.apply(time_calc, axis=1)

df_test['time_spent'] = None
df_test = df_test.apply(time_calc, axis=1)

df_train.head()

#### Get genres in separate columns

In [None]:
# Accumulator that extract_genres appends every genre label to.
unique_genres = []

def get_label(label):
    """Return the feature-column name for a genre label (label + '_genre')."""
    suffix = '_genre'
    return label + suffix

def get_genres(row):
    """Split ``row['genres']`` into a list of ``(name, value)`` string pairs.

    The field is a comma-separated list of ``name:value`` entries. An empty
    string produces an empty list. Values are returned as strings.
    """
    raw = row['genres']
    if len(raw) == 0:
        return []

    pairs = []
    for entry in raw.split(','):
        fields = entry.split(':')
        pairs.append((fields[0], fields[1]))
    return pairs

def extract_genres(row):
    """Append the genre names found in ``row`` to the global ``unique_genres``.

    The row is returned unchanged so the function can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    unique_genres.extend([name for name, _ in get_genres(row)])
    return row

# First pass over both frames: extract_genres records every genre label it sees.
df_train = df_train.apply(extract_genres, axis=1)
df_test = df_test.apply(extract_genres, axis=1)

# Collapse the collected labels to distinct values.
unique_genres = list(set(unique_genres))

# Pre-create one empty '<genre>_genre' column per distinct label in both frames.
for genre in unique_genres:
    df_train[get_label(genre)] = None
    df_test[get_label(genre)] = None


def add_genre_features(row):
    """Copy each genre value of ``row`` into its ``<genre>_genre`` column.

    Values are written as the strings produced by ``get_genres``. Returns the
    same row.
    """
    for name, value in get_genres(row):
        row[get_label(name)] = value
    return row

# Second pass: fill the per-genre columns for every row of both frames.
df_train = df_train.apply(add_genre_features, axis=1)
df_test = df_test.apply(add_genre_features, axis=1)

df_train.head()

In [None]:
def change_datatype(df):
    """Cast every column whose name contains '_genre' to float64.

    The frame is modified in place and also returned. This name is redefined
    by later cells of the notebook with a different column filter.
    """
    genre_columns = [name for name in df.columns if '_genre' in name]
    for name in genre_columns:
        df[name] = df[name].astype('float64')

    return df

# Convert the per-genre columns of both frames to float64.
df_train = change_datatype(df_train)
df_test = change_datatype(df_test)

df_train.head()

#### Extract counts

In [None]:
# Raw columns for which a '<column>_count' feature is derived.
labels = ['genres', 'titles', 'cities', 'dow', 'tod']

def get_list(row, key):
    """Split ``row[key]`` into a list of ``(name, value)`` string pairs.

    The field is a comma-separated list of ``name:value`` entries; an empty
    string gives an empty list.
    """
    raw = row[key]
    if len(raw) == 0:
        return []

    pairs = []
    for entry in raw.split(','):
        fields = entry.split(':')
        pairs.append((fields[0], fields[1]))
    return pairs

def extract_count(row, key):
    """Store the number of entries in ``row[key]`` under ``row[key + '_count']``.

    Returns the same row.
    """
    row[key + '_count'] = len(get_list(row, key))
    return row

# Add one '<label>_count' column per raw column, on both frames.
for label in labels:
    df_train = df_train.apply(extract_count, axis=1, args=(label,))
    df_test = df_test.apply(extract_count, axis=1, args=(label,))

df_train.head()

#### Genre watch flags (does the user watch a given genre?)

In [None]:
def add_watch_flags(df, genres=('cricket', 'drama', 'romance', 'family')):
    """Add a 0/1 ``watch_<genre>`` column for each genre keyword.

    A flag is 1 when the keyword occurs anywhere in the lower-cased
    ``genres`` string of the row (plain substring match, not a regex), which
    is the same test the original per-row lambdas performed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'genres'. Modified in place.
    genres : iterable of str
        Lower-case keywords to flag.

    Returns
    -------
    pandas.DataFrame
        The same frame, for convenient reassignment.
    """
    # Lower-case once instead of once per keyword and per row.
    lowered = df['genres'].str.lower()
    for genre in genres:
        df['watch_' + genre] = lowered.str.contains(genre, regex=False).astype('int64')
    return df

df_train = add_watch_flags(df_train)
df_test = add_watch_flags(df_test)

df_train.head()

#### Watch time by hour of day

In [None]:
def add_columns(df):
    """Pre-create empty ``watch_hour_sum_<h>`` columns for hours 0 through 23.

    Hour 0 is included: the engineered feature set contains
    ``watch_hour_sum_0``, so leaving it out here meant the column only
    appeared as a side effect of the row-wise fill. ``range`` is used so the
    function runs on both Python 2 and Python 3.

    The frame is modified in place and returned.
    """
    for hour in range(0, 24):
        df['watch_hour_sum_' + str(hour)] = None

    return df

def does_watch_hour(row):
    """Copy each ``hour:value`` entry of ``row['tod']`` into ``watch_hour_sum_<hour>``.

    ``row['tod']`` is a comma-separated list of ``hour:value`` entries. An
    empty string leaves the row untouched instead of raising IndexError,
    matching how ``get_list`` treats empty fields. Values are stored as
    strings.

    Returns the same row.
    """
    tod = row['tod']
    if len(tod) == 0:
        return row

    # Parse everything before writing so a malformed entry leaves the row unmodified.
    parsed = []
    for elem in tod.split(','):
        fields = elem.split(':')
        parsed.append((fields[0], fields[1]))

    for key, val in parsed:
        row['watch_hour_sum_' + key] = val

    return row

# Create the hourly columns, then fill them row by row.
df_train = add_columns(df_train)
df_test = add_columns(df_test)

df_train = df_train.apply(does_watch_hour, axis=1)
df_test = df_test.apply(does_watch_hour, axis=1)

df_train.head()

In [None]:
# Preview the test frame after the hourly columns were added.
df_test.head()

In [None]:
# List every column of the test frame, one per line. The parenthesised
# print works on both Python 2 and Python 3.
for column in df_test.columns:
    print(column)

In [None]:
def change_datatype(df):
    """Cast every column whose name contains 'watch_hour_sum_' to float64.

    The frame is modified in place and also returned. This replaces the
    earlier definition of the same name that targeted the genre columns.
    """
    hour_columns = [name for name in df.columns if 'watch_hour_sum_' in name]
    for name in hour_columns:
        df[name] = df[name].astype('float64')

    return df

# Convert the hourly columns of both frames to float64.
df_train = change_datatype(df_train)
df_test = change_datatype(df_test)

df_train.head()

#### Does watch day

In [None]:
def add_columns(df):
    """Pre-create empty ``watch_day_sum_<d>`` columns for days 1 through 7.

    ``range`` replaces the Python-2-only ``xrange`` so the function runs on
    both Python 2 and Python 3. This replaces the earlier definition of the
    same name that created the hourly columns.

    The frame is modified in place and returned.
    """
    for day in range(1, 8):
        df['watch_day_sum_' + str(day)] = None

    return df

def does_watch_day(row):
    """Copy each ``day:value`` entry of ``row['dow']`` into ``watch_day_sum_<day>``.

    ``row['dow']`` is a comma-separated list of ``day:value`` entries. An
    empty string leaves the row untouched instead of raising IndexError,
    matching how ``get_list`` treats empty fields. Values are stored as
    strings.

    Returns the same row.
    """
    dow = row['dow']
    if len(dow) == 0:
        return row

    # Parse everything before writing so a malformed entry leaves the row unmodified.
    parsed = []
    for elem in dow.split(','):
        fields = elem.split(':')
        parsed.append((fields[0], fields[1]))

    for key, val in parsed:
        row['watch_day_sum_' + key] = val

    return row

# Create the per-day columns, then fill them row by row.
df_train = add_columns(df_train)
df_test = add_columns(df_test)

df_train = df_train.apply(does_watch_day, axis=1)
df_test = df_test.apply(does_watch_day, axis=1)

df_train.head()

In [None]:
# Preview the test frame after the per-day columns were added.
df_test.head()

In [None]:
def change_datatype(df):
    """Cast every column whose name contains 'watch_day_sum' to float64.

    The frame is modified in place and also returned. This replaces the
    earlier definitions of the same name.
    """
    day_columns = [name for name in df.columns if 'watch_day_sum' in name]
    for name in day_columns:
        df[name] = df[name].astype('float64')

    return df

# Convert the per-day columns of both frames to float64.
df_train = change_datatype(df_train)
df_test = change_datatype(df_test)

df_train.head()

#### Label Encoder

In [None]:
def label_encoder(df, key):
    """Add ``<key>_encoded``: the values of ``df[key]`` mapped to integer codes.

    A fresh sklearn ``LabelEncoder`` is fitted on the column and applied to
    the same values. The frame is modified in place and returned.
    """
    values = list(df[key].values)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(values)
    df[key + '_encoded'] = encoder.transform(values)
    return df

# Encode the target column; only the training frame is encoded here.
df_train = label_encoder(df_train, 'segment')

df_train.head()

In [None]:
# Persist both engineered frames so the modelling cells can start from the CSVs.
df_train.to_csv(os.path.join(DATA_DIR, 'fe_train_data_with_sum.csv'), index=False)
df_test.to_csv(os.path.join(DATA_DIR, 'fe_test_data_with_sum.csv'), index=False)


#### Constants and display options

In [4]:
# Seed passed as random_state to train_test_split further down.
RANDOM_STATE = 1234
# Directory holding the engineered CSVs (same value as the earlier definition).
DATA_DIR = 'data'
# Display floats with 20 digits of precision in DataFrame output.
pd.options.display.precision = 20

#### Imports

In [5]:
from sklearn import preprocessing

#### Read feature engineered data

In [6]:
# Reload the engineered feature tables written by the feature-engineering cells.
df_train = pd.read_csv(os.path.join(DATA_DIR, 'fe_train_data_with_sum.csv'), sep=',')
df_test = pd.read_csv(os.path.join(DATA_DIR, 'fe_test_data_with_sum.csv'), sep=',')

#### Drop unwanted columns

In [7]:
# Drop the raw string columns that were expanded into features; columns that
# are absent are skipped silently (errors='ignore'). The training frame keeps
# 'segment_encoded' as the target, the test frame does not.
df_train_dropped = df_train.drop(
    ['genres', 'titles', 'cities', 'segment', 'dow', 'tod'],
    axis=1,
    errors='ignore'
)
df_test_dropped = df_test.drop(
    ['genres', 'titles', 'cities', 'segment', 'dow', 'tod', 'segment_encoded'],
    axis=1,
    errors='ignore'
)

df_train_dropped.head()

Unnamed: 0,Action_genre,Athletics_genre,Awards_genre,Badminton_genre,Boxing_genre,Comedy_genre,Cricket_genre,Crime_genre,Documentary_genre,Drama_genre,...,watch_hour_sum_8,watch_hour_sum_9,watch_day_sum_1,watch_day_sum_2,watch_day_sum_3,watch_day_sum_4,watch_day_sum_5,watch_day_sum_6,watch_day_sum_7,segment_encoded
0,,,,,,,82379.0,,,,...,,,3412.0,1737.0,15878.0,20974.0,10975.0,16580.0,17820.0,0
1,,,,,,,15640.0,,,,...,241.0,374.0,5745.0,3346.0,3025.0,3007.0,123.0,10.0,1108.0,0
2,,,,,,,,,,,...,,,,,4142.0,,,,,0
3,,,,,,,7690.0,,,,...,658.0,,658.0,,5867.0,1339.0,413.0,,71.0,0
4,,,,,,,3283.0,,,5503.0,...,,,1641.0,480.0,,1445.0,,4900.0,1663.0,0


In [8]:
# Preview the test frame after the raw columns were dropped.
df_test_dropped.head()

Unnamed: 0,Action_genre,Athletics_genre,Awards_genre,Badminton_genre,Boxing_genre,Comedy_genre,Cricket_genre,Crime_genre,Documentary_genre,Drama_genre,...,watch_hour_sum_7,watch_hour_sum_8,watch_hour_sum_9,watch_day_sum_1,watch_day_sum_2,watch_day_sum_3,watch_day_sum_4,watch_day_sum_5,watch_day_sum_6,watch_day_sum_7
0,,,,,,,702.0,,,,...,,,,,182.0,,701.0,,,
1,3501.0,,,,,,,,,,...,,,,4830.0,8359.0,3078.0,4185.0,2946.0,,451.0
2,,,,,,,,,,,...,,,508.0,5409.0,740.0,,1162.0,,3807.0,2755.0
3,,,,,,,,,,6459.0,...,,,,3227.0,,2043.0,,,,1464.0
4,,,,,,,,,,1204.0,...,,,,1204.0,,,,,,


### Train the model

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Target vector: the encoded segment label.
y = df_train_dropped['segment_encoded'].copy()

# Feature matrix: everything except the row identifier and the target.
X = df_train_dropped.drop(['id', 'segment_encoded'], axis=1)

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 72 columns):
Action_genre          35998 non-null float64
Athletics_genre       244 non-null float64
Awards_genre          11611 non-null float64
Badminton_genre       6570 non-null float64
Boxing_genre          26 non-null float64
Comedy_genre          46808 non-null float64
Cricket_genre         114205 non-null float64
Crime_genre           22829 non-null float64
Documentary_genre     872 non-null float64
Drama_genre           98337 non-null float64
Family_genre          43543 non-null float64
Football_genre        7466 non-null float64
Formula1_genre        373 non-null float64
FormulaE_genre        33 non-null float64
Hockey_genre          1534 non-null float64
Horror_genre          6585 non-null float64
IndiaVsSa_genre       1 non-null float64
Kabaddi_genre         936 non-null float64
Kids_genre            4906 non-null float64
LiveTV_genre          28353 non-null float64
Mythology_g

In [10]:
# Preview the test features once more (same frame as shown above).
df_test_dropped.head()

Unnamed: 0,Action_genre,Athletics_genre,Awards_genre,Badminton_genre,Boxing_genre,Comedy_genre,Cricket_genre,Crime_genre,Documentary_genre,Drama_genre,...,watch_hour_sum_7,watch_hour_sum_8,watch_hour_sum_9,watch_day_sum_1,watch_day_sum_2,watch_day_sum_3,watch_day_sum_4,watch_day_sum_5,watch_day_sum_6,watch_day_sum_7
0,,,,,,,702.0,,,,...,,,,,182.0,,701.0,,,
1,3501.0,,,,,,,,,,...,,,,4830.0,8359.0,3078.0,4185.0,2946.0,,451.0
2,,,,,,,,,,,...,,,508.0,5409.0,740.0,,1162.0,,3807.0,2755.0
3,,,,,,,,,,6459.0,...,,,,3227.0,,2043.0,,,,1464.0
4,,,,,,,,,,1204.0,...,,,,1204.0,,,,,,


In [11]:
import xgboost as xgb
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import log_loss, roc_auc_score, precision_recall_fscore_support, accuracy_score, f1_score

# Hold out 20% of the rows for validation; RANDOM_STATE makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE)

In [12]:
# List the feature columns fed to the model, one per line. The parenthesised
# print works on both Python 2 and Python 3.
for column in X_train.columns:
    print(column)

Action_genre
Athletics_genre
Awards_genre
Badminton_genre
Boxing_genre
Comedy_genre
Cricket_genre
Crime_genre
Documentary_genre
Drama_genre
Family_genre
Football_genre
Formula1_genre
FormulaE_genre
Hockey_genre
Horror_genre
IndiaVsSa_genre
Kabaddi_genre
Kids_genre
LiveTV_genre
Mythology_genre
NA_genre
Reality_genre
Romance_genre
Science_genre
Sport_genre
Swimming_genre
Table Tennis_genre
TalkShow_genre
Teen_genre
Tennis_genre
Thriller_genre
Travel_genre
Volleyball_genre
Wildlife_genre
cities_count
dow_count
genres_count
time_spent
titles_count
tod_count
watch_hour_sum_0
watch_hour_sum_1
watch_hour_sum_10
watch_hour_sum_11
watch_hour_sum_12
watch_hour_sum_13
watch_hour_sum_14
watch_hour_sum_15
watch_hour_sum_16
watch_hour_sum_17
watch_hour_sum_18
watch_hour_sum_19
watch_hour_sum_2
watch_hour_sum_20
watch_hour_sum_21
watch_hour_sum_22
watch_hour_sum_23
watch_hour_sum_3
watch_hour_sum_4
watch_hour_sum_5
watch_hour_sum_6
watch_hour_sum_7
watch_hour_sum_8
watch_hour_sum_9
watch_day_sum_1
wa

In [None]:
def get_metrics(test, pred):
    """Compute binary-classification metrics from true and predicted labels.

    Parameters
    ----------
    test : array-like
        True labels. The indexing below assumes they are encoded as 0/1.
    pred : array-like
        Predicted labels, encoded the same way.

    Returns
    -------
    tuple
        ``(confusion_matrix, sensitivity, specificity, accuracy, f1)``.
        Sensitivity or specificity is NaN when its denominator is zero, i.e.
        the corresponding class does not occur in ``test``.
    """
    # Imported here because the notebook's sklearn.metrics import line does not
    # include confusion_matrix; without it this function raised NameError.
    from sklearn.metrics import confusion_matrix

    # Pin the label order so the matrix is always 2x2, even if one class is
    # missing from this particular split.
    cm = confusion_matrix(test, pred, labels=[0, 1])

    # Rows are true classes, columns are predicted classes.
    TN, FP, FN, TP = cm.ravel()

    positives = float(TP + FN)
    negatives = float(TN + FP)
    sensitivity = float(TP) / positives if positives else float('nan')
    specificity = float(TN) / negatives if negatives else float('nan')

    accuracy = accuracy_score(test, pred)
    fone_score = f1_score(test, pred)

    return (cm, sensitivity, specificity, accuracy, fone_score)

def score(params):
    """Hyperopt objective: train an XGBoost model and return ``1 - AUC``.

    Trains on the global ``X_train``/``y_train`` split, evaluates on
    ``X_test``/``y_test``, prints the metrics, and returns the dict hyperopt
    expects (``loss`` and ``status``).

    Parameters
    ----------
    params : dict
        One sample from the search space. ``n_estimators`` is taken out and
        used as the number of boosting rounds, because ``xgb.train`` receives
        the round count as a separate argument. Previously the sampled value
        had no effect and every trial ran 4000 rounds. The caller's dict is
        not modified.
    """
    # Tolerate a call made before optimize() has initialised the counter.
    iteration = getattr(score, 'iteration', 1)
    print("Training with params {0}: ".format(iteration))
    print(params)

    score.iteration = iteration + 1

    booster_params = dict(params)
    # Fall back to the former fixed value when the sample has no n_estimators.
    num_round = int(booster_params.pop('n_estimators', 4000))

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    model = xgb.train(booster_params, dtrain, num_round, verbose_eval=1000)

    predictions_test = model.predict(dtest)

    auc_score_test = roc_auc_score(y_test, predictions_test)
    # Threshold the predicted probabilities at 0.5 for the label-based metrics.
    y_pred_test = [0 if pred <= 0.5 else 1 for pred in predictions_test]

    cm_test, sensitivity_test, specificity_test, accuracy_test, fone_score_test = get_metrics(y_test, y_pred_test)

    print("\t Testing: Confusion Matrix \n{0}\n\n".format(cm_test))
    print("\t Testing: Score on {0}\n\n".format(1 - auc_score_test))
    print("\t Testing: Sensitivity {0}, Specificity {1}, Accuracy Score {2}, ROC AUC Score {3}, F1 Score {4}\n\n".format(
        sensitivity_test, specificity_test, accuracy_test, auc_score_test, fone_score_test
    ))

    return {'loss': 1 - auc_score_test, 'status': STATUS_OK}

def optimize(trials):
    """Run a 200-evaluation TPE search over the XGBoost parameter space.

    Parameters
    ----------
    trials : hyperopt.Trials
        Receives the history of every evaluation.

    Returns
    -------
    dict
        The raw ``fmin`` result. For ``hp.choice`` parameters this holds the
        index of the chosen option rather than the option itself, so the
        decoded parameter values are printed as well.
    """
    # Imported here because the notebook's hyperopt import line does not
    # bring in space_eval.
    from hyperopt import space_eval

    space = {
        'n_estimators' : hp.choice('n_estimators', np.arange(100, 5000, dtype=int)),
        'eta' : hp.quniform('eta', 0.001, 0.2, 0.001),
        'max_depth' : hp.choice('max_depth', np.arange(3, 10, dtype=int)),
        'min_child_weight' : hp.choice('min_child_weight', np.arange(1, 8, dtype=int)),
        'subsample' : hp.quniform('subsample', 0.5, 1, 0.01),
        'gamma' : hp.quniform('gamma', 0.0, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.01),
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        # NOTE(review): the lower bound 0 allows a positive-class weight of zero - confirm this is intended.
        'scale_pos_weight': hp.quniform('scale_pos_weight', 0, 20, 0.05),
        'nthread' : 6,
        'silent' : 0
    }

    # Reset the trial counter that score() prints and increments.
    score.iteration = 1

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=200)

    print(best)
    # Decode hp.choice indices into the actual parameter values.
    print(space_eval(space, best))
    return best

# Run the search; `trials` keeps the per-evaluation history, `best` the fmin result.
trials = Trials()
best = optimize(trials)

In [13]:
# Upper bound on boosting rounds; early stopping below may end training sooner.
training_rounds = 4000

# NOTE(review): 'n_estimators' is passed inside params, but the round count
# handed to xgb.train is training_rounds - confirm which one is intended.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.019,
    'max_depth': 4,
    'min_child_weight': 7,
    'gamma': 0.25,
    'subsample': 0.58,
    'colsample_bytree': 0.92,
    'scale_pos_weight': 4.25,
    'n_estimators': 8646,
    'nthread': 6,
    'silent': 0
}

# Same 80/20 split and seed as used for the hyperparameter search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# The last entry of the watchlist ('valid') is the one early stopping monitors.
watchlist = [(d_train, 'train'), (d_test, 'valid')]

model = xgb.train(
    params,
    d_train,
    num_boost_round=training_rounds,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10
)

[0]	train-auc:0.781202	valid-auc:0.776699
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-auc:0.801492	valid-auc:0.794122
[20]	train-auc:0.802037	valid-auc:0.793955
[30]	train-auc:0.802813	valid-auc:0.794792
[40]	train-auc:0.804059	valid-auc:0.795349
[50]	train-auc:0.805695	valid-auc:0.796805
[60]	train-auc:0.806883	valid-auc:0.797987
[70]	train-auc:0.808252	valid-auc:0.799111
[80]	train-auc:0.809315	valid-auc:0.799819
[90]	train-auc:0.810214	valid-auc:0.800584
[100]	train-auc:0.810995	valid-auc:0.801176
[110]	train-auc:0.811702	valid-auc:0.801722
[120]	train-auc:0.812524	valid-auc:0.802238
[130]	train-auc:0.8133	valid-auc:0.80262
[140]	train-auc:0.813986	valid-auc:0.802999
[150]	train-auc:0.814736	valid-auc:0.803497
[160]	train-auc:0.815401	valid-auc:0.803875
[170]	train-auc:0.816083	valid-auc:0.804249
[180]	train-auc:0.816722	valid-auc:0.804632
[190]	train-auc:0.817406	valid-auc:

In [15]:
def importance_XGB(clf):
    """Return normalised feature importances of a trained booster.

    Parameters
    ----------
    clf : object
        Anything exposing ``get_fscore()`` that returns a
        ``{feature_name: score}`` dict.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by importance in
        descending order, with the importances scaled to sum to 1. An empty
        frame with both columns is returned when there are no scores.
    """
    # .items() works on Python 2 and 3; .iteritems() exists only on Python 2.
    rows = [
        {'feature': ft, 'importance': fscore}
        for ft, fscore in clf.get_fscore().items()
    ]
    impdf = pd.DataFrame(rows, columns=['feature', 'importance'])
    if impdf.empty:
        # Nothing to sort or normalise.
        return impdf

    impdf = impdf.sort_values(by='importance', ascending=False).reset_index(drop=True)
    impdf['importance'] = impdf['importance'] / impdf['importance'].sum()

    return impdf

# Normalised importance table for the trained model.
feature_importance = importance_XGB(model)
feature_importance

Unnamed: 0,feature,importance
0,Cricket_genre,0.06882453151618399267
1,TalkShow_genre,0.05195911413969335479
2,Drama_genre,0.04548551959114139481
3,Romance_genre,0.03517887563884156826
4,Family_genre,0.03194207836456558480
5,watch_day_sum_7,0.02546848381601362829
6,Action_genre,0.02487223168654173824
7,Comedy_genre,0.02487223168654173824
8,watch_day_sum_2,0.02453151618398637102
9,watch_hour_sum_23,0.02410562180579216285


In [16]:
# Predict on the held-out validation features with the trained booster.
# NOTE(review): despite its name, P_Ytest wraps the feature matrix X_test, not labels.
# NOTE(review): the model was trained with early stopping; whether predict() uses the
# best iteration or all trained rounds depends on the xgboost version - confirm.
P_Ytest = xgb.DMatrix(X_test)
P_Test = model.predict(P_Ytest)

P_Test

array([ 0.04110503,  0.04352903,  0.12215872, ...,  0.02468857,
        0.24739006,  0.03557092], dtype=float32)

In [17]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the validation split. NOTE(review): the same split was in the
# early-stopping watchlist, so this figure may be optimistic.
roc_auc_score(y_test, P_Test)

0.81075758924136654

### Test the model

In [18]:
# Preview the test features before building the prediction matrix.
df_test_dropped.head()

Unnamed: 0,Action_genre,Athletics_genre,Awards_genre,Badminton_genre,Boxing_genre,Comedy_genre,Cricket_genre,Crime_genre,Documentary_genre,Drama_genre,...,watch_hour_sum_7,watch_hour_sum_8,watch_hour_sum_9,watch_day_sum_1,watch_day_sum_2,watch_day_sum_3,watch_day_sum_4,watch_day_sum_5,watch_day_sum_6,watch_day_sum_7
0,,,,,,,702.0,,,,...,,,,,182.0,,701.0,,,
1,3501.0,,,,,,,,,,...,,,,4830.0,8359.0,3078.0,4185.0,2946.0,,451.0
2,,,,,,,,,,,...,,,508.0,5409.0,740.0,,1162.0,,3807.0,2755.0
3,,,,,,,,,,6459.0,...,,,,3227.0,,2043.0,,,,1464.0
4,,,,,,,,,,1204.0,...,,,,1204.0,,,,,,


In [19]:
# Feature matrix for the test set: every column except the row identifier.
P_X = df_test_dropped.drop(['id'], axis=1)

P_X

Unnamed: 0,Action_genre,Athletics_genre,Awards_genre,Badminton_genre,Boxing_genre,Comedy_genre,Cricket_genre,Crime_genre,Documentary_genre,Drama_genre,...,watch_hour_sum_7,watch_hour_sum_8,watch_hour_sum_9,watch_day_sum_1,watch_day_sum_2,watch_day_sum_3,watch_day_sum_4,watch_day_sum_5,watch_day_sum_6,watch_day_sum_7
0,,,,,,,702.0,,,,...,,,,,182.0,,701.0,,,
1,3501.0,,,,,,,,,,...,,,,4830.0,8359.0,3078.0,4185.0,2946.0,,451.0
2,,,,,,,,,,,...,,,508.0,5409.0,740.0,,1162.0,,3807.0,2755.0
3,,,,,,,,,,6459.0,...,,,,3227.0,,2043.0,,,,1464.0
4,,,,,,,,,,1204.0,...,,,,1204.0,,,,,,
5,,,,,,,,,,,...,,,93.0,,362.0,,,,,
6,37.0,,,,,4226.0,6511.0,29078.0,,838.0,...,,,1393.0,615.0,5356.0,4603.0,8426.0,13665.0,16141.0,12040.0
7,22.0,,,,,,,,,308.0,...,,,,,52.0,,,,351.0,256.0
8,,,,,,,28686.0,,,67.0,...,,,,4432.0,5568.0,284.0,8747.0,3355.0,3055.0,3359.0
9,5740.0,,532.0,,,4788.0,8961.0,,193.0,5507.0,...,,297.0,,8324.0,7631.0,1423.0,5556.0,963.0,9241.0,4925.0


In [20]:
# Predict on the test-set features; this overwrites the validation predictions in P_Test.
# NOTE(review): the model was trained with early stopping; whether predict() uses the
# best iteration or all trained rounds depends on the xgboost version - confirm.
P_Ytest = xgb.DMatrix(P_X)
P_Test = model.predict(P_Ytest)

P_Test

array([ 0.0871945 ,  0.26494449,  0.50121301, ...,  0.02579476,
        0.04531092,  0.22932629], dtype=float32)

In [21]:
# Assemble the submission table: one row per test id with the model's prediction.
results = pd.DataFrame()
results['ID'] = df_test_dropped['id']
results['segment'] = P_Test

results.info()

# Write the predictions next to the other data files.
results.to_csv(os.path.join(DATA_DIR, 'predict_segment_30_06_2017.csv'), index=False)

results

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
ID         100000 non-null object
segment    100000 non-null float32
dtypes: float32(1), object(1)
memory usage: 1.1+ MB


Unnamed: 0,ID,segment
0,test-1,0.08719450235366821289
1,test-10,0.26494449377059936523
2,test-100,0.50121301412582397461
3,test-1000,0.47043484449386596680
4,test-10000,0.39269906282424926758
5,test-100000,0.20152796804904937744
6,test-10001,0.08996385335922241211
7,test-10002,0.24716423451900482178
8,test-10003,0.02320878580212593079
9,test-10004,0.07841690629720687866
