In [272]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import KFold, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import xgboost as xgb

sns.set_context('poster')
sns.set_style('whitegrid')

np.random.seed(2016)

In [2]:
# load files
train = pd.read_csv('../data/train2016.csv')
test  = pd.read_csv('../data/test2016.csv')
sub   = pd.read_csv('../data/sampleSubmission2016.csv')

In [3]:
train.head()

Unnamed: 0,USER_ID,YOB,Gender,Income,HouseholdStatus,EducationLevel,Party,Q124742,Q124122,Q123464,...,Q100010,Q99716,Q99581,Q99480,Q98869,Q98578,Q98059,Q98078,Q98197,Q96024
0,1,1938.0,Male,,Married (w/kids),,Democrat,No,,No,...,Yes,No,No,,No,,Only-child,No,No,Yes
1,4,1970.0,Female,"over $150,000",Domestic Partners (w/kids),Bachelor's Degree,Democrat,,Yes,No,...,,,,No,No,No,Only-child,Yes,No,No
2,5,1997.0,Male,"$75,000 - $100,000",Single (no kids),High School Diploma,Republican,,Yes,Yes,...,Yes,No,No,No,Yes,No,Yes,No,Yes,No
3,8,1983.0,Male,"$100,001 - $150,000",Married (w/kids),Bachelor's Degree,Democrat,No,Yes,No,...,No,No,No,Yes,Yes,No,Yes,No,No,Yes
4,9,1984.0,Female,"$50,000 - $74,999",Married (w/kids),High School Diploma,Republican,No,Yes,No,...,Yes,No,No,Yes,No,No,Yes,No,No,Yes


In [4]:
test.head()

Unnamed: 0,USER_ID,YOB,Gender,Income,HouseholdStatus,EducationLevel,Q124742,Q124122,Q123464,Q123621,...,Q100010,Q99716,Q99581,Q99480,Q98869,Q98578,Q98059,Q98078,Q98197,Q96024
0,2,1985.0,Female,"$25,001 - $50,000",Single (no kids),Master's Degree,,Yes,No,Yes,...,,,,,Yes,,,,,
1,3,1983.0,Male,"$50,000 - $74,999",Married (w/kids),Current Undergraduate,,,No,,...,,,,,Yes,,Yes,Yes,No,Yes
2,6,1995.0,Male,"$75,000 - $100,000",Single (no kids),Current K-12,,,,,...,No,No,No,Yes,Yes,No,Yes,No,Yes,Yes
3,7,1980.0,Female,"$50,000 - $74,999",Single (no kids),Master's Degree,Yes,Yes,No,Yes,...,Yes,No,No,No,Yes,No,Yes,No,No,Yes
4,14,1980.0,Female,,Married (no kids),Current Undergraduate,,Yes,No,Yes,...,Yes,No,No,Yes,No,No,Yes,No,No,No


In [11]:
print('Shape of training set: ',(train.shape))
print('Shape of test set: ', (test.shape))

Shape of training set:  (5568, 108)
Shape of test set:  (1392, 107)


In [5]:
# list of all the features
print('List of all the features :\n%s'%(train.columns.tolist()))

List of all the features :
['USER_ID', 'YOB', 'Gender', 'Income', 'HouseholdStatus', 'EducationLevel', 'Party', 'Q124742', 'Q124122', 'Q123464', 'Q123621', 'Q122769', 'Q122770', 'Q122771', 'Q122120', 'Q121699', 'Q121700', 'Q120978', 'Q121011', 'Q120379', 'Q120650', 'Q120472', 'Q120194', 'Q120012', 'Q120014', 'Q119334', 'Q119851', 'Q119650', 'Q118892', 'Q118117', 'Q118232', 'Q118233', 'Q118237', 'Q117186', 'Q117193', 'Q116797', 'Q116881', 'Q116953', 'Q116601', 'Q116441', 'Q116448', 'Q116197', 'Q115602', 'Q115777', 'Q115610', 'Q115611', 'Q115899', 'Q115390', 'Q114961', 'Q114748', 'Q115195', 'Q114517', 'Q114386', 'Q113992', 'Q114152', 'Q113583', 'Q113584', 'Q113181', 'Q112478', 'Q112512', 'Q112270', 'Q111848', 'Q111580', 'Q111220', 'Q110740', 'Q109367', 'Q108950', 'Q109244', 'Q108855', 'Q108617', 'Q108856', 'Q108754', 'Q108342', 'Q108343', 'Q107869', 'Q107491', 'Q106993', 'Q106997', 'Q106272', 'Q106388', 'Q106389', 'Q106042', 'Q105840', 'Q105655', 'Q104996', 'Q103293', 'Q102906', 'Q102674

In [13]:
# class balance
print('Party Balance\n ', train.Party.value_counts())

Party Balance
  Democrat      2951
Republican    2617
Name: Party, dtype: int64


In [15]:
# missing values
print('Missing values in training set :\n', train.isnull().any().sum())

Missing values in training set :
 106


In [16]:
print('Missing values in test set: \n', test.isnull().any().sum())

Missing values in test set: 
 106


** Since there are lot of missing values in the data, we can carry out an analysis to see what are the different kind of missing values. **

In [17]:
print('Datatypes of various features:\n%s'%(train.dtypes))

Datatypes of various features:
USER_ID              int64
YOB                float64
Gender              object
Income              object
HouseholdStatus     object
EducationLevel      object
Party               object
Q124742             object
Q124122             object
Q123464             object
Q123621             object
Q122769             object
Q122770             object
Q122771             object
Q122120             object
Q121699             object
Q121700             object
Q120978             object
Q121011             object
Q120379             object
Q120650             object
Q120472             object
Q120194             object
Q120012             object
Q120014             object
Q119334             object
Q119851             object
Q119650             object
Q118892             object
Q118117             object
                    ...   
Q106272             object
Q106388             object
Q106389             object
Q106042             object
Q105840             obje

** Most of the features captured are categorical in nature **

In [19]:
# concat train and test
data = pd.concat((train, test))

In [25]:
def print_unique_values(data):
    categorical_features = data.select_dtypes(include=['object']).columns
    
    for cat in categorical_features:
        print('Number of unique values for feature: %s are: %d'%(cat, data[cat].nunique()))
        
        if data[cat].nunique() < 10:
            print(data[cat].unique())
        else:
            print(data[cat].unique()[:10] + '...')
        
        print('-'*75)
        print('\n')

In [26]:
print_unique_values(data)

Number of unique values for feature: EducationLevel are: 7
[nan "Bachelor's Degree" 'High School Diploma' 'Current K-12'
 'Current Undergraduate' "Master's Degree" "Associate's Degree"
 'Doctoral Degree']
---------------------------------------------------------------------------


Number of unique values for feature: Gender are: 2
['Male' 'Female' nan]
---------------------------------------------------------------------------


Number of unique values for feature: HouseholdStatus are: 6
['Married (w/kids)' 'Domestic Partners (w/kids)' 'Single (no kids)'
 'Married (no kids)' 'Domestic Partners (no kids)' nan 'Single (w/kids)']
---------------------------------------------------------------------------


Number of unique values for feature: Income are: 6
[nan 'over $150,000' '$75,000 - $100,000' '$100,001 - $150,000'
 '$50,000 - $74,999' '$25,001 - $50,000' 'under $25,000']
---------------------------------------------------------------------------


Number of unique values for feature

** As we can see most of the questions have _yes_ / _no_ answers, we can fill missing values appropriately. **

In [27]:
# Methods to get training and test dataset from concatenated data

def get_training_dataset(data):
    mask = data.Party.notnull()
    return data[mask]

def get_test_dataset(data):
    mask = data.Party.isnull()
    return data[mask]

** Relationship between different variables **

```
Checking if two categorical variables are independent can be done with Chi-Squared test of independence.

This is a typical Chi-Square test: 

If we assume that two variables are independent, then the values of the contingency table for these variables should be distributed uniformly. And then we check how far away from uniform the actual values are.
```

```
Let's apply this rule to find out if gender and party are independent or not.

Null Hypothesis: They are independent
Under this hypothesis, we assume their distribution to be uniform.

```

In [29]:
pd.crosstab(train.Party, train.Gender, margins=True)

Gender,Female,Male,All
Party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Democrat,1275,1613,2888
Republican,855,1712,2567
All,2130,3325,5455


In [30]:
pd.crosstab(train.Party, train.Gender, margins=True, normalize=True)

Gender,Female,Male,All
Party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Democrat,0.233731,0.295692,0.529423
Republican,0.156737,0.313841,0.470577
All,0.390467,0.609533,1.0


In [57]:
cont_table = pd.crosstab(train.Party, train.Gender) 
chi_2, p, dof, expected = sp.stats.chi2_contingency(cont_table.values, correction=False)

In [58]:
print('P-value ', p)

P-value  2.57076096692e-16


`P-value is very close to zero which is enough to discard the hypothesis of independence.`

** Compute Cramer's V**

In [60]:
np.sqrt(chi_2 / (len(cont_table) * min(cont_table.shape[0], cont_table.shape[1])))

4.0959636065625222

` Lower this value, lower the correlation `

** We can say that gender and party are not independent variables. **

** We can perform this type of for all of the categorical variables with the target variable to see whether we can any pair of features that are independent and should not be included in the modelling process as it would add noise to the model. **

In [76]:
def chi_square_test(df, target, candidate_features):
    test_summary = {}
    
    for feat in candidate_features:
        test_summary[(target, feat)] = []
        
        cont_table = pd.crosstab(df[target], df[feat])
        chi2_statistic, p_val, dof, expected = sp.stats.chi2_contingency(cont_table.values, correction=False)
        print('P-value for the pair (%s, %s): %f'%(target, feat, p_val))
        
        test_summary[(target, feat)].append(p_val)
        
        cramer_v = np.sqrt(chi2_statistic / (len(cont_table) * min(cont_table.shape[0], cont_table.shape[1])))
        print('Cramer-V value for pair (%s, %s): %f'%(target, feat, cramer_v))
        
        test_summary[(target, feat)].append(cramer_v)
        print('-'*75)
        print('\n')
        
    return test_summary

In [64]:
def get_candidate_features(df):
    columns = df.select_dtypes(include=['object']).columns
    return columns.drop('Party')

In [65]:
candidate_features = get_candidate_features(train)

In [77]:
test_summary = chi_square_test(train, 'Party', candidate_features)

P-value for the pair (Party, Gender): 0.000000
Cramer-V value for pair (Party, Gender): 4.095964
---------------------------------------------------------------------------


P-value for the pair (Party, Income): 0.072121
Cramer-V value for pair (Party, Income): 1.589976
---------------------------------------------------------------------------


P-value for the pair (Party, HouseholdStatus): 0.000000
Cramer-V value for pair (Party, HouseholdStatus): 3.814418
---------------------------------------------------------------------------


P-value for the pair (Party, EducationLevel): 0.014332
Cramer-V value for pair (Party, EducationLevel): 1.993395
---------------------------------------------------------------------------


P-value for the pair (Party, Q124742): 0.350376
Cramer-V value for pair (Party, Q124742): 0.466930
---------------------------------------------------------------------------


P-value for the pair (Party, Q124122): 0.013485
Cramer-V value for pair (Party, Q124122):

In [165]:
def select_features_chi2(test_summary, n=10):
    return [k[1] for k, v in sorted(test_summary.items(), key=lambda x: x[1])[:n]]

In [411]:
selected_features = select_features_chi2(test_summary, n=20)

In [412]:
print('Selected Features are: \n%s'%(selected_features))

Selected Features are: 
['Q109244', 'Q115611', 'Q98197', 'Q113181', 'Gender', 'Q98869', 'HouseholdStatus', 'Q101163', 'Q99480', 'Q105840', 'Q116881', 'Q120379', 'Q120472', 'Q106272', 'Q115899', 'Q121699', 'Q119851', 'Q110740', 'Q102089', 'Q106042']


** Preprocessing **

In [396]:
print('Max YOB in the training set', train.YOB.max())
print('Min YOB in the training set ', train.YOB.min())

Max YOB in the training set 2039.0
Min YOB in the training set  1880.0


In [421]:
# remove entries with YOB < 1930 and YOB greater than 2000

def remove_entries(data):
    mask = (data.YOB < 1930) & (data.YOB > 2000)
    return data[~mask]

In [356]:
def fill_missing_values(data, selected_features):
    for col in selected_features:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].value_counts().argmax())
    
    return data

def fill_missing_values_with_unique_label(data, selected_features):
    for col in selected_features:
        if data[col].isnull().any():
            data[col] = data[col].fillna('-99999')
            
    return data

def fill_missing_values_with_flag(data, selected_features):
    for col in selected_features:
        if data[col].isnull().any():
            data[col + '_missing'] = data[col].isnull().astype(int)
            data[col] = data[col].fillna(data[col].value_counts().argmax())
    
    return data

In [362]:
def missing_value_features(columns):
    return [col for col in columns if 'missing' in col]

In [357]:
def encode_features(data, selected_features):
    for cat in selected_features:
        lbl = LabelEncoder()
        lbl.fit(data[cat])
        
        data[cat] = lbl.transform(data[cat])
    
    return data

In [422]:
# concat train and test data-frames
data = pd.concat((train, test))

In [423]:
data = remove_entries(data)
# fill missing values for only the selected features
data = fill_missing_values_with_flag(data, selected_features)
data = encode_features(data, selected_features)

** Datasets **

In [424]:
train_ = get_training_dataset(data)
test_  = get_test_dataset(data)

In [364]:
selected_features = selected_features + missing_value_features(train_.columns)

In [425]:
X = train_[selected_features]
y = (train_.Party == 'Democrat').astype(int)

Xtest = test_[selected_features]

In [426]:
Xtr, Xte, ytr, yte = train_test_split(X, y, stratify=y, test_size=0.33, random_state=12128)

** Logistic Regression **

In [427]:
log = LogisticRegression()
log.fit(Xtr, ytr)

yhat = log.predict(Xte)
print('Accuracy score on unseen examples ', accuracy_score(yte, yhat))

Accuracy score on unseen examples  0.603373231774


** Random Forest Classifier **

In [428]:
rf = RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=5, n_jobs=-1, random_state=1231)
rf.fit(Xtr, ytr)

yhat = rf.predict(Xte)
print('Accuracy score on unseen examples ', accuracy_score(yte, yhat))

Accuracy score on unseen examples  0.600652883569


** Extra Trees Classifier **

In [429]:
etr = ExtraTreesClassifier(class_weight='balanced', n_estimators=150, max_depth=5, n_jobs=-1, random_state=1232)
etr.fit(Xtr, ytr)

yhat = etr.predict(Xte)
print('Accuracy score on unseen examples ', accuracy_score(yte, yhat))

Accuracy score on unseen examples  0.600108813928


** Extreme Gradient Boosting **

In [430]:
xgb_est = xgb.XGBClassifier(n_estimators=40, seed=21)
xgb_est.fit(Xtr, ytr)

yhat = xgb_est.predict(Xte)
print('Accuracy score on unseen examples ', accuracy_score(yte, yhat))

Accuracy score on unseen examples  0.598476605005


#### Estimating error 

In [431]:
def estimating_error(X, y, est, n_folds=10):
    skf = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=2318)
    cv_score = cross_val_score(est, X, y, scoring='accuracy', cv=skf, n_jobs=-1)
    return cv_score

In [432]:
cv_score_log_10 = estimating_error(Xtr, ytr, log, 10)
print('Mean cross-validation score for Logistic Regression: %f and std: %f '%(np.mean(cv_score_log_10), np.std(cv_score_log_10)))

Mean cross-validation score for Logistic Regression: 0.625472 and std: 0.020944 


In [433]:
cv_score_rf_10 = estimating_error(Xtr, ytr, rf, 10)
print('Mean cross-validation score for Random Forest: %f and std: %f '%(np.mean(cv_score_rf_10), np.std(cv_score_rf_10)))

Mean cross-validation score for Random Forest: 0.632438 and std: 0.020364 


In [434]:
cv_score_etr_10 = estimating_error(Xtr, ytr, etr, 10)
print('Mean cross-validation score for Extra Trees: %f and std: %f '%(np.mean(cv_score_etr_10), np.std(cv_score_etr_10)))

Mean cross-validation score for Extra Trees: 0.632161 and std: 0.024201 


In [435]:
cv_score_xgb_10 = estimating_error(Xtr, ytr, xgb_est, 10)
print('Mean cross-validation score for Extra Trees: %f and std: %f '%(np.mean(cv_score_xgb_10), np.std(cv_score_xgb_10)))

Mean cross-validation score for Extra Trees: 0.633785 and std: 0.015283 


In [436]:
def get_predictions(model, test):
    return model.predict(test)

In [437]:
log_preds = get_predictions(log, Xte)
rf_preds  = get_predictions(rf, Xte)
etr_preds = get_predictions(etr, Xte)
xgb_preds = get_predictions(xgb_est, Xte)

** Analyze our errors **

In [438]:
def get_error_summary(test_df, true, preds, features=['Gender', 'HouseholdStatus']):
    mask = preds != true
    error_summary = {}
    
    gr = test_df[mask].groupby(features).size()
    
    for idx in gr.index.values:
        error_summary[idx] = gr.ix[idx]
    
    return error_summary

In [439]:
log_summary = get_error_summary(Xte, yte, log_preds)
rf_summary = get_error_summary(Xte, yte, rf_preds)
etr_summary = get_error_summary(Xte, yte, etr_preds)
xgb_summary = get_error_summary(Xte, yte, xgb_preds)

In [440]:
log_summary

{(0, 0): 10,
 (0, 1): 7,
 (0, 2): 34,
 (0, 3): 62,
 (0, 4): 138,
 (0, 5): 14,
 (1, 0): 15,
 (1, 1): 6,
 (1, 2): 57,
 (1, 3): 144,
 (1, 4): 231,
 (1, 5): 11}

In [441]:
rf_summary

{(0, 0): 9,
 (0, 1): 7,
 (0, 2): 34,
 (0, 3): 60,
 (0, 4): 134,
 (0, 5): 14,
 (1, 0): 15,
 (1, 1): 6,
 (1, 2): 58,
 (1, 3): 145,
 (1, 4): 241,
 (1, 5): 11}

In [442]:
etr_summary

{(0, 0): 10,
 (0, 1): 7,
 (0, 2): 34,
 (0, 3): 60,
 (0, 4): 137,
 (0, 5): 13,
 (1, 0): 14,
 (1, 1): 5,
 (1, 2): 57,
 (1, 3): 143,
 (1, 4): 239,
 (1, 5): 16}

In [443]:
xgb_summary

{(0, 0): 8,
 (0, 1): 7,
 (0, 2): 33,
 (0, 3): 60,
 (0, 4): 135,
 (0, 5): 13,
 (1, 0): 17,
 (1, 1): 4,
 (1, 2): 59,
 (1, 3): 145,
 (1, 4): 244,
 (1, 5): 13}

** So it is proving to be difficult to get gender type 2 and household type 5 correct in general **

In [444]:
Xte.groupby(['Gender', 'HouseholdStatus']).size()

Gender  HouseholdStatus
0       0                   27
        1                   12
        2                   93
        3                  184
        4                  354
        5                   36
1       0                   42
        1                   10
        2                  132
        3                  339
        4                  577
        5                   32
dtype: int64

** Reason might be that this is the most common pair **

** Lets see whether predictions of these models are correlated or not **

In [445]:
pd.crosstab(log_preds, rf_preds, margins=True, normalize=True)

col_0,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.400979,0.01741,0.41839
1,0.107182,0.474429,0.58161
All,0.508161,0.491839,1.0


In [446]:
pd.crosstab(etr_preds, rf_preds, margins=True, normalize=True)

col_0,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.498368,0.028836,0.527203
1,0.009793,0.463003,0.472797
All,0.508161,0.491839,1.0


In [447]:
pd.crosstab(log_preds, etr_preds, margins=True, normalize=True)

col_0,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.404244,0.014146,0.41839
1,0.12296,0.458651,0.58161
All,0.527203,0.472797,1.0


In [448]:
pd.crosstab(log_preds, xgb_preds, margins=True, normalize=True)

col_0,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.336235,0.082155,0.41839
1,0.019587,0.562024,0.58161
All,0.355822,0.644178,1.0


In [449]:
pd.crosstab(rf_preds, xgb_preds, margins=True, normalize=True)

col_0,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.350381,0.15778,0.508161
1,0.005441,0.486398,0.491839
All,0.355822,0.644178,1.0


In [450]:
pd.crosstab(etr_preds, xgb_preds, margins=True, normalize=True)

col_0,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.352013,0.17519,0.527203
1,0.003808,0.468988,0.472797
All,0.355822,0.644178,1.0


In [451]:
def majority_voting(preds, n_examples, ytest=None):
    final_preds = []
    for i in range(n_examples):
        model_preds = []
        for j in range(len(preds)):
            model_preds.append(preds[j][i])
        
        bin_count = np.bincount(model_preds)
        final_preds.append(np.argmax(bin_count))
    
    if ytest is not None:
        print('Accuracy of majority voting ensemble: %f'%(accuracy_score(ytest, final_preds)))
    return final_preds

In [452]:
_ = majority_voting([log_preds, rf_preds, etr_preds, xgb_preds], len(yte), yte)

Accuracy of majority voting ensemble: 0.601197


#### Full Training

In [282]:
log.fit(X, y)
log_preds = log.predict(Xtest)

rf.fit(X, y)
rf_preds  = rf.predict(Xtest)

etr.fit(X, y)
etr_preds = etr.predict(Xtest)

xgb_est.fit(X, y)
xgb_preds = xgb_est.predict(Xtest)

predictions = majority_voting([log_preds, rf_preds, etr_preds, xgb_preds], len(Xtest))

In [283]:
prediction_labels = map(lambda x: 'Democrat' if x == 1 else 'Republican', predictions)

In [284]:
sub['Predictions'] = list(prediction_labels)

* Logistic Regression - Public : 0.61638	Private: 0.60632
* Ensemble Model ( Logistic, Random Forest, Extra Trees ) - Public : 0.62356	Private : 0.62213
* Ensemble with -999 filled in for missing values - Public: 0.62069 Private: 0.63218

In [285]:
sub.to_csv('../submissions/ensemble_with_xgb.csv', index=False)