In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pickle

%matplotlib inline

In [21]:
### Read in inputs dataframe as well as weights and target series from job 02_clean_data.ipynb
### NOTE: pickle.load can execute arbitrary code -- only load files produced by our own pipeline.

def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

X = _load_pickle("inputs.p")
wt = _load_pickle("weights.p")
y = _load_pickle("target.p")
#y_ = pickle.load(open("target_.p",'rb')) --> numeric target; unexpected results

In [22]:
### Standardize the inputs so continuous variables (with larger raw values) don't monopolize the model
### The scaled array is wrapped in a dataframe for easier interaction with feature importance later
### (columns come back as integer positions here; the original names are restored in the next step)
### NOTE(review): the scaler is fit on every row before the train/test split, so test-set statistics
### leak into the scaling -- consider fitting on the training rows only.

scaler = StandardScaler()
Xscaled = pd.DataFrame(scaler.fit_transform(X))

In [23]:
### Restore the original column names in place of the positional integer labels
old_names = Xscaled.columns
new_names = X.columns
name_map = {old: new for old, new in zip(old_names, new_names)}
Xscaled.rename(columns=name_map, inplace=True)

In [24]:
### Split inputs, target, and weights into equal-sized test and train samples (test_size=.5),
### stratifying on target to ensure class distribution in test and train are comparable
### NOTE(review): an earlier version of this comment said 30% test / 70% train -- confirm which is intended.

split_parts = train_test_split(
    Xscaled, y, wt,
    stratify=y,
    test_size=.5,
    random_state=4444,
)
X_train, X_test, y_train, y_test, wt_train, wt_test = split_parts

In [25]:
### Sanity-check the shapes of the split dataframes/series (inputs, then weights, then target)
for split_part in (X_train, X_test, wt_train, wt_test, y_train, y_test):
    print(split_part.shape)

(1577, 493)
(1577, 493)
(1577,)
(1577,)
(1577,)
(1577,)


In [26]:
### The sample_weight argument of fit() is passed as an array,
### so keep array copies of the train and test weight series

wt_train_, wt_test_ = (np.array(weights) for weights in (wt_train, wt_test))

### First pass at training a model: random forest
#### Chosen for its ability to handle a large number of inputs
#### and for ranking features by importance

In [27]:
from sklearn.ensemble import RandomForestClassifier

### Baseline random forest with 1000 trees, fitted with wt_train_ as per-row sample weights.
### random_state is pinned so the accuracies below (and any feature importances read from
### this model) are reproducible after a kernel restart.
rfc = RandomForestClassifier(n_estimators=1000, random_state=4444)
rfc.fit(X_train, y_train, sample_weight=wt_train_)

### Compare accuracy on the rows the model was trained on against the held-out rows.
### NOTE(review): accuracy is computed unweighted although the fit is weighted -- confirm this is intended.
y_pred = rfc.predict(X_train)
print('train accuracy:', metrics.accuracy_score(y_train, y_pred))
y_pred2 = rfc.predict(X_test)
print('test accuracy:', metrics.accuracy_score(y_test, y_pred2))

train accuracy: 0.981610653139
test accuracy: 0.204185161699


In [None]:
### 98% accuracy on train but only 20% on test indicates overfitting
### Will do a grid search over the max_depth parameter for RFC, as well as PCA
### But first, let's get rid of the least important inputs

In [29]:
### Save feature importance from the random forest fitted above
### (rfc is already fitted on X_train with the same weights; refitting here would only repeat
### the 1000-tree training and, for an unseeded forest, report importances from a different
### model than the one whose accuracy was printed)
features = X_train.columns
feature_importances = rfc.feature_importances_

### Sort by importance in descending order.
### sort_values replaces DataFrame.sort, which no longer exists in current pandas; it returns a
### new frame and keeps the original row labels, i.e. each feature's column position in X_train.
important_features = pd.DataFrame(
    {'Features': features, 'Importance_Score': feature_importances}
).sort_values('Importance_Score', ascending=False)

important_features.Importance_Score.describe()



count    493.000000
mean       0.002028
std        0.003211
min        0.000000
25%        0.000165
50%        0.001065
75%        0.002297
max        0.021580
Name: Importance_Score, dtype: float64

In [30]:
### Median importance score: the cut-off between the more and less important half of the features
important_features['Importance_Score'].median()

0.001065144369470696

In [31]:
### Create a subset dataframe of more important half of the features (with importance score above median)
### The median is computed here instead of being pasted in as a literal, so the cut-off
### follows the current importance scores whenever the model is refitted.
importance_median = important_features['Importance_Score'].median()
imp_f = important_features[important_features['Importance_Score'] > importance_median]
## most important is nib7616_1 - Age (2-year increments)

In [32]:
### Pull out just the names of the more important features as a series,
### and report how many rows/columns the subset dataframe has
imp_f_ = imp_f.loc[:, 'Features']
print(imp_f.shape)

(246, 2)


In [33]:
### Display the selected feature names, most important first
### (the index labels are the row labels carried over from important_features)
imp_f_


474     nib7616_1
479       nib8642
490       nib9616
473     nib7607_1
471       nib7110
485       nib9350
486       nib9351
488       nib9358
462       nib3588
463       nib3589
452       nib2778
482       nib8727
481       nib8718
460       nib3448
480       nib8717
453       nib2779
458       nib3446
489       nib9607
459       nib3447
487       nib9356
461       nib3587
484       nib8729
464       nib3590
457       nib2897
454       nib2780
468       nib3594
470       nib4000
483       nib8728
465       nib3591
469       nib3595
          ...    
76     nib7600_16
210    nib8600_16
100    nib8167_29
430     nib9528_w
219     nib8604_4
195     nib8573_v
103     nib8443_c
308     nib8666_2
165     nib8496_r
316     nib8843_f
422     nib9518_f
402     nib9510_3
434     nib9533_h
376    nib9181_c4
182    nib8560_10
243     nib8605_7
85      nib7609_1
299     nib8648_b
28      nib3101_a
167     nib8505_d
189     nib8570_f
298     nib8648_a
277     nib8637_3
324     nib8844_b
408     ni

In [None]:
# Top 12 most important features:

### nib7616_1 - age (100%)
### nib8642 - Home Market Value - Estimated
### nib9616 - age (1st individual)
### nib7607_1 - Home Length of Residence - 100% 
### nib7110 - Economic Stability Indicator Financial
### nib9350 - Economic Stability Indicator
### nib9351 - UnderBanked Indicator
### nib9358 - HeavyTransactors
### nib3588 - Media Channel Usage - Cell Phone
### nib3589 - Media Channel Usage - Primetime TV
### nib2778 - Brand Name Medicine Propensity Score
### nib8727 - TeleTrends - International Long Distance User


In [34]:
### Restrict the train and test inputs to the features kept in imp_f_
### (the top half by importance score from rfc), then confirm the resulting shapes
selected_features = list(imp_f_)
new_X_train = X_train[selected_features]
new_X_test = X_test[selected_features]
print(new_X_train.shape)
print(new_X_test.shape)

(1577, 246)
(1577, 246)


In [35]:
### Pickle test and train targets, weights, and new inputs (subset on importance) for modeling
### Each file is opened in a `with` block so it is flushed and closed as soon as it is written.
pickle_targets = [
    ("new_train_inputs.p", new_X_train),
    ("new_test_inputs.p", new_X_test),
    ("y_train.p", y_train),
    ("y_test.p", y_test),
    ("wt_train_.p", wt_train_),
    ("wt_test_.p", wt_test_),
]
for out_path, payload in pickle_targets:
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file)