In [1]:
from pprint import pprint
from collections import Counter

import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder

from natsort import natsorted

import xgboost as xgb

In [2]:
df_questions = pd.read_excel('../data/voting outcomes/Questions.xlsx')
df_questions['Question ID'] = 'Q' + df_questions['Question ID'].astype(str)
df_questions.set_index('Question ID', inplace=1)
df_questions.head()

Unnamed: 0_level_0,Question Text,Possible Answers
Question ID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q96024,Are you good at math?,"Yes,No"
Q98059,Do/did you have any siblings?,"Yes,Only-child"
Q98078,"Do you have a ""go-to"" creative outlet?","Yes,No"
Q98197,Do you pray or meditate on a regular basis?,"Yes,No"
Q98578,Do you exercise 3 or more times per week?,"Yes,No"


In [3]:
df_train = pd.read_csv('../data/voting outcomes/train2016.csv')
df_test = pd.read_csv('../data/voting outcomes/test2016.csv')

df_train.shape, df_test.shape

((5568, 108), (1392, 107))

In [4]:
party_encoder = LabelEncoder()
df_train.Party = party_encoder.fit_transform(df_train.Party)

In [5]:
TRAIN = 0
TEST = 1

df_train['source'] = TRAIN
df_test['source'] = TEST

df_all = pd.concat([df_train, df_test])

In [6]:
describe = df_all.YOB.describe(percentiles=[0.01, 0.99])
low = describe['1%']
high = describe['99%']

df_all.loc[(df_all.YOB < low) | (df_all.YOB > high), 'YOB'] = np.nan 
df_all.loc[df_all.YOB.isnull(), 'YOB'] = df_all.YOB.median()

In [7]:
all_columns = sorted(set(df_all.columns) - {'USER_ID', 'Party', 'source'})

categorical = [col for col in all_columns if df_all[col].dtype == 'O']
questions   = natsorted([q for q in categorical if q.startswith('Q')])
numerical   = [col for col in all_columns if df_all[col].dtype != 'O']

In [8]:
df_all.loc[:, categorical] = df_all.loc[:, categorical].fillna('na')

In [9]:
def clean_level_names(s):
    s = s.replace('$', '')
    s = s.replace(',', '')
    s = s.replace(' - ', '_')
    s = s.replace(' ', '_')
    return s

df_all.Income = df_all.Income.apply(clean_level_names)
df_all.HouseholdStatus = df_all.HouseholdStatus.str.replace(' ', '_')
df_all.EducationLevel = df_all.EducationLevel.str.replace(' ', '_')
df_all.Q117193 = df_all.Q117193.str.replace(' ', '_')
df_all.Q120194 = df_all.Q120194.str.replace(' ', '_')

df_all.Q98059  = df_all.Q98059.map({'yes': 'yes', 'only-child': 'no'})
df_all.Q99982  = df_all.Q99982.map({'check!': 'yes', 'nope': 'no'})
df_all.Q106997 = df_all.Q106997.map({'yay people!': 'yes', 'grrr people': 'no'})
df_all.Q108855 = df_all.Q108855.map({'yes!': 'yes', 'umm...': 'no'})
df_all.Q117186 = df_all.Q117186.map({'hot headed': 'yes', 'cool headed': 'no'})

In [10]:
records = df_all[categorical].to_dict(orient='records')
oh_vectorizer = DictVectorizer(separator='_')
oh = oh_vectorizer.fit_transform(records)

## Other features

In [11]:
yes_no = set(df_questions[df_questions['Possible Answers'] == 'Yes,No'].index) 
yes_no = natsorted(yes_no | {'Q98059', 'Q99982', 'Q106997', 'Q108855', 'Q117186'})

In [12]:
answers_cnt = df_all[yes_no].apply(Counter, axis=1).apply(pd.Series).fillna(0)
answers_cnt.columns = ['%s_cnt' % c for c in answers_cnt.columns]

## Models

In [13]:
train_idx = (df_all.source == TRAIN).values
test_idx = (df_all.source == TEST).values

df_train = df_all[df_all.source == TRAIN].reset_index(drop=1)
df_test = df_all[df_all.source == TEST].reset_index(drop=1)

X = np.hstack([oh[train_idx].toarray(),
               df_train[numerical], 
               answers_cnt[df_all.source == TRAIN]])

features = oh_vectorizer.get_feature_names() + \
            numerical + list(answers_cnt.columns)

y = df_train.Party

In [14]:
dtrain = xgb.DMatrix(X, label=y, feature_names=features, missing=np.nan)

In [15]:
results = []

In [16]:
early_stopping_rounds = 100
n_estimators = 1500

xgb_pars = {
    'eta': 0.01,
    'gamma': 6,
    'max_depth': 4,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.83,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'nthread': 4,
    'seed': 42
}

In [17]:
history = xgb.cv(xgb_pars, dtrain, num_boost_round=n_estimators, nfold=5, metrics=('error'), as_pandas=True,
                 show_progress=25, early_stopping_rounds=early_stopping_rounds)

score = 1 - history['test-error-mean'].iloc[-1]
results.append((score, len(history), dict(xgb_pars)))
score

Will train until cv error hasn't decreased in 100 rounds.
[0]	cv-test-error:0.3969452+0.0163055711142	cv-train-error:0.3810874+0.00228461293002
[25]	cv-test-error:0.3769992+0.0115983503551	cv-train-error:0.3698114+0.00540871092221
[50]	cv-test-error:0.3762802+0.0118298687972	cv-train-error:0.364241+0.00427504306411
[75]	cv-test-error:0.375921+0.0122744694712	cv-train-error:0.3616802+0.00519134088266
[100]	cv-test-error:0.373585+0.0122720345664	cv-train-error:0.3573224+0.0038104469869
[125]	cv-test-error:0.3716082+0.0127645078464	cv-train-error:0.3515274+0.00337665133527
[150]	cv-test-error:0.3710694+0.0123585177202	cv-train-error:0.3442496+0.00207632363566
[175]	cv-test-error:0.367655+0.0130127309355	cv-train-error:0.3398472+0.00180803488904
[200]	cv-test-error:0.368194+0.0113137476196	cv-train-error:0.3336478+0.00243397102694
[225]	cv-test-error:0.3685534+0.0122062000082	cv-train-error:0.3300092+0.00314269326534
[250]	cv-test-error:0.3708894+0.0121262004206	cv-train-error:0.3256964+0.

0.63755600000000001

In [18]:
for score, ntrees, pars in reversed(results[-5:]):
    print score, ntrees, pars

0.637556 374 {'colsample_bytree': 0.9, 'eval_metric': 'error', 'colsample_bylevel': 1, 'eta': 0.01, 'max_delta_step': 0, 'nthread': 4, 'min_child_weight': 1, 'subsample': 0.83, 'seed': 42, 'objective': 'binary:logistic', 'alpha': 0, 'max_depth': 4, 'gamma': 6, 'lambda': 1}


In [19]:
score, ntrees, best_pars = max(results, key=lambda x: x[0])
print score, ntrees
pprint(pars)

0.637556 374
{'alpha': 0,
 'colsample_bylevel': 1,
 'colsample_bytree': 0.9,
 'eta': 0.01,
 'eval_metric': 'error',
 'gamma': 6,
 'lambda': 1,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 42,
 'subsample': 0.83}


In [20]:
xgb_model = xgb.train(best_pars, dtrain, num_boost_round=ntrees, 
                      evals=[(dtrain, 'train')], verbose_eval=25)

[0]	train-error:0.384698
[25]	train-error:0.374641
[50]	train-error:0.371947
[75]	train-error:0.369253
[100]	train-error:0.364045
[125]	train-error:0.358118
[150]	train-error:0.351473
[175]	train-error:0.348060
[200]	train-error:0.341954
[225]	train-error:0.338721
[250]	train-error:0.336566
[275]	train-error:0.332974
[300]	train-error:0.328484
[325]	train-error:0.323455
[350]	train-error:0.320762
[373]	train-error:0.316810


Now dump the model for xgbfi

In [21]:
def create_feature_map(fmap_filename, features):
    outfile = open(fmap_filename, 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

create_feature_map('xgb.fmap', features) 

In [22]:
!head xgb.fmap

0	EducationLevel_Associate's_Degree	q
1	EducationLevel_Bachelor's_Degree	q
2	EducationLevel_Current_K-12	q
3	EducationLevel_Current_Undergraduate	q
4	EducationLevel_Doctoral_Degree	q
5	EducationLevel_High_School_Diploma	q
6	EducationLevel_Master's_Degree	q
7	EducationLevel_na	q
8	Gender_Female	q
9	Gender_Male	q


In [23]:
xgb_model.dump_model('xgb.dump', fmap='xgb.fmap', with_stats=True)
!head -n 30 xgb.dump

booster[0]:
0:[Q109244_Yes<0.5] yes=1,no=2,missing=1,gain=258.432,cover=1165.25
	1:[Q115611_Yes<0.5] yes=3,no=4,missing=3,gain=86.9162,cover=974
		3:[Q98197_Yes<0.5] yes=7,no=8,missing=7,gain=36.4256,cover=732.5
			7:[Gender_Male<0.5] yes=15,no=16,missing=15,gain=16.659,cover=596.25
				15:leaf=-0.00404933,cover=242.25
				16:leaf=-0.000647887,cover=354
			8:[Q108617_No<0.5] yes=17,no=18,missing=17,gain=10.3826,cover=136.25
				17:leaf=-0.000613497,cover=39.75
				18:leaf=0.0054359,cover=96.5
		4:[Q113181_Yes<0.5] yes=9,no=10,missing=9,gain=11.9666,cover=241.5
			9:[Q123464_No<0.5] yes=19,no=20,missing=19,gain=11.1044,cover=133
				19:leaf=-0.000481928,cover=40.5
				20:leaf=0.0057754,cover=92.5
			10:[YOB<1979.5] yes=21,no=22,missing=21,gain=16.1934,cover=108.5
				21:leaf=0.00470588,cover=58.5
				22:leaf=0.012549,cover=50
	2:[Q118232_Idealist<0.5] yes=5,no=6,missing=5,gain=6.0877,cover=191.25
		5:leaf=-0.0104687,cover=127
		6:[YOB<1995.5] yes=13,no=14,missing=13,g

## Submission

In [24]:
X_test = np.hstack([oh[test_idx].toarray(),
               df_test[numerical], 
               answers_cnt[df_all.source == TEST]])

dtest = xgb.DMatrix(X_test, feature_names=features)

y_pred = xgb_model.predict(dtest)
y_pred = (y_pred > 0.5).astype(int)
y_pred = party_encoder.inverse_transform(y_pred)

In [25]:
user_ids = df_test.USER_ID.values

result = pd.DataFrame({'USER_ID': user_ids, 'Predictions': y_pred})
result.to_csv('xgb3.csv', index=False)

- Public leaderboard score: 0.62787
- Need better cv (now: 0.62787 lb vs 0.63755 cv)

## XGB fi

Let's have a look at xgbfi's output

In [26]:
!mono ~/tmp/soft/xgbfi/bin/XgbFeatureInteractions.exe -m xgb.dump

[?1h=[6n[H[2J[32m********************************
* XGBOOST Feature Interactions *
********************************
[m
Settings:
XgbModelFile: xgb.dump
OutputXlsxFile: XgbFeatureInteractions.xlsx
MaxInteractionDepth: 2
MaxDeepening: -1
MaxTrees: 100
TopK: 100
SortBy: Gain
MaxHistograms: 10

[mParsing xgb.dump
[32mConstructing tree #1 => depth: 4 (23 nodes)
[m[32mConstructing tree #2 => depth: 4 (27 nodes)
[m[32mConstructing tree #3 => depth: 4 (23 nodes)
[m[32mConstructing tree #4 => depth: 4 (23 nodes)
[m[32mConstructing tree #5 => depth: 4 (23 nodes)
[m[32mConstructing tree #6 => depth: 4 (23 nodes)
[m[32mConstructing tree #7 => depth: 4 (25 nodes)
[m[32mConstructing tree #8 => depth: 4 (17 nodes)
[m[32mConstructing tree #9 => depth: 4 (21 nodes)
[m[32mConstructing tree #10 => depth: 4 (23 nodes)
[m[32mConstructing tree #11 => depth: 4 (23 nodes)
[m[32mConstructing tree #12 => depth: 4 (17 nodes)
[m[32mConstructing tree #13 => depth: 4 (21 nodes)
[m

### Most important features:

In [27]:
interactions1 = pd.read_excel('XgbFeatureInteractions.xlsx', sheetname=0)
var_name = interactions1.Interaction.str.split('_').apply(lambda x: x[0])
interactions1['Question'] = df_questions.loc[var_name.head(15)]['Question Text'].reset_index(drop=1)
interactions1[['Question', 'Interaction', 'Gain']].head(n=15)

Unnamed: 0,Question,Interaction,Gain
0,Are you a feminist?,Q109244_Yes,11213.1165
1,Do you personally own a gun?,Q115611_Yes,2920.23189
2,Do you meditate or pray on a regular basis?,Q113181_Yes,1895.89385
3,Are you a feminist?,Q109244_No,1241.84704
4,Do you pray or meditate on a regular basis?,Q98197_Yes,747.61359
5,Do you pray or meditate on a regular basis?,Q98197_No,614.38254
6,Are you a feminist?,Q109244_na,529.4882
7,,YOB,406.38478
8,,Gender_Male,333.88193
9,Do you personally own a gun?,Q115611_No,307.14925


### Most important 2nd order interactions:

In [28]:
interactions2 = pd.read_excel('XgbFeatureInteractions.xlsx', sheetname=1)
features2 = interactions2.Interaction.str.split('|', expand=True).applymap(lambda x: x.split('_')[0])
interactions2['Q1'] = df_questions['Question Text'].loc[features2[0]].reset_index(drop=1)
interactions2['Q2'] = df_questions['Question Text'].loc[features2[1]].reset_index(drop=1)
interactions2[['Q1', 'Q2', 'Interaction', 'Gain']].head(n=25)

Unnamed: 0,Q1,Q2,Interaction,Gain
0,Are you a feminist?,Do you personally own a gun?,Q109244_Yes|Q115611_Yes,8569.4402
1,Are you a feminist?,Do you meditate or pray on a regular basis?,Q109244_Yes|Q113181_Yes,5667.5207
2,Do you meditate or pray on a regular basis?,Do you personally own a gun?,Q113181_Yes|Q115611_Yes,3830.48367
3,Are you a feminist?,,Q109244_Yes|Yes_cnt,3473.48018
4,Are you a feminist?,Are you more of an idealist or a pragmatist?,Q109244_Yes|Q118232_Idealist,2220.05294
5,Do you personally own a gun?,Do you pray or meditate on a regular basis?,Q115611_Yes|Q98197_Yes,1523.20387
6,Are you a feminist?,Do you pray or meditate on a regular basis?,Q109244_No|Q98197_No,1144.33162
7,Are you a feminist?,Are you a feminist?,Q109244_na|Q109244_No,1123.5151
8,Do you personally own a gun?,Do you pray or meditate on a regular basis?,Q115611_Yes|Q98197_No,866.81459
9,Are you a feminist?,Your generally preferred approach to starting ...,Q109244_Yes|Q120194_Study_first,849.00979


### 3rd level interaction

In [29]:
interactions3 = pd.read_excel('XgbFeatureInteractions.xlsx', sheetname=2)
features3 = interactions3.Interaction.str.split('|', expand=True).applymap(lambda x: x.split('_')[0])
interactions3['Q1'] = df_questions['Question Text'].loc[features3[0]].reset_index(drop=1)
interactions3['Q2'] = df_questions['Question Text'].loc[features3[1]].reset_index(drop=1)
interactions3['Q3'] = df_questions['Question Text'].loc[features3[2]].reset_index(drop=1)
interactions3[['Q1', 'Q2', 'Q3', 'Interaction', 'Gain']].head(n=15)

Unnamed: 0,Q1,Q2,Q3,Interaction,Gain
0,Are you a feminist?,Do you meditate or pray on a regular basis?,Do you personally own a gun?,Q109244_Yes|Q113181_Yes|Q115611_Yes,12448.91936
1,Are you a feminist?,Do you personally own a gun?,Do you pray or meditate on a regular basis?,Q109244_Yes|Q115611_Yes|Q98197_Yes,4480.82507
2,Are you a feminist?,Do you pray or meditate on a regular basis?,,Q109244_Yes|Q98197_Yes|Yes_cnt,2270.3946
3,Are you a feminist?,Do you meditate or pray on a regular basis?,Do you personally own a gun?,Q109244_Yes|Q113181_Yes|Q115611_No,2114.83758
4,Are you a feminist?,Do you personally own a gun?,Do you pray or meditate on a regular basis?,Q109244_Yes|Q115611_Yes|Q98197_No,1641.8282
5,Do you meditate or pray on a regular basis?,Do you personally own a gun?,,Q113181_Yes|Q115611_Yes|YOB,1567.31121
6,Are you a feminist?,Do you personally own a gun?,Do you pray or meditate on a regular basis?,Q109244_No|Q115611_Yes|Q98197_No,1483.30854
7,"Which parent ""wore the pants"" in your household?",Are you a feminist?,Do you personally own a gun?,Q101163_Dad|Q109244_Yes|Q115611_Yes,1229.2838
8,Are you a feminist?,Do you meditate or pray on a regular basis?,Have you ever had your life genuinely threaten...,Q109244_Yes|Q113181_Yes|Q118233_Yes,1117.80319
9,Do you meditate or pray on a regular basis?,Do you personally own a gun?,Would you say most of the hardship in your lif...,Q113181_Yes|Q115611_Yes|Q115899_Circumstances,1030.10118
