## Feature Selection

In [1]:
# Standard library
import time

# Third-party
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Limit how many rows/columns pandas renders when displaying a frame.
pd.options.display.max_rows = 25
pd.options.display.max_columns = 25

  from numpy.core.umath_tests import inner1d


In [2]:
# Set to False to skip the Random Forest feature-importance section:
# that cell only runs its body when this flag is True.
enable_rf_features = True

In [3]:
# Load the dataset from CSV into a DataFrame (parsed with pandas' Python engine).
datafile = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Transformed.csv'
df = pd.read_csv(datafile, engine = 'python')

In [4]:
# Convert every column to a categorical dtype first, then refine the exceptions.
df = df.astype('category')

# These columns are re-cast as *ordered* categoricals so that comparisons
# between their values are meaningful.
ordered_columns = ['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'V_YEAR']
for column in ordered_columns:
    df[column] = df[column].astype(CategoricalDtype(ordered=True))

# P_AGE is kept as an integer rather than a category.
df['P_AGE'] = df['P_AGE'].astype('int')

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3655334 entries, 0 to 3655333
Data columns (total 17 columns):
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_YEAR    category
P_SEX     category
P_AGE     int32
P_PSN     category
P_USER    category
P_ISEV    category
dtypes: category(16), int32(1)
memory usage: 69.7 MB
None


In [5]:
# Separate the predictors (all columns but the last) from the class label
# (the last column).
# NOTE: the feature selection below is fitted on the full dataset, which leaks
# information from what should be the held-out test portion. TBD: split into
# train and test first and fit the selectors on the training part only.
label_column = df.columns[-1]
feature_columns = df.columns[0:df.columns.size - 1]

Y_ = df[label_column]
X_ = df[feature_columns]

# TODO: split into train and test 70/30, e.g.
# X_train, X_test, Y_train, Y_test = train_test_split(X_, Y_, test_size=0.3, random_state=0)

### Feature Selection by Chi-Square

In [6]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
# Ref: https://machinelearningmastery.com/feature-selection-machine-learning-python/
np.set_printoptions(edgeitems=12)

# Work on the raw values: every column but the last is a feature, the last
# column is the class label (cast to int for the scorer).
tcd_as_array = df.values
X = tcd_as_array[:, :-1]
Y = tcd_as_array[:, -1].astype('int')

# feature extraction
test = SelectKBest(score_func=chi2, k=12)
fit = test.fit(X, Y)

# summarize scores (one chi-squared score per feature column, in column order)
print(fit.scores_)

# Rank the feature names from highest to lowest score.
# argsort is used instead of looking each sorted score up with list.index():
# index() always returns the first match, so two features with the same score
# would yield the same name twice and silently drop the other one.
# A stable sort keeps tied features in their original column order.
chi_scores = np.asarray(fit.scores_)
chi_order = np.argsort(-chi_scores, kind='mergesort')
chi_feature = [df.columns[column_idx] for column_idx in chi_order]

# Reduce X to the k best-scoring feature columns.
features = fit.transform(X)

# summarize selected features
print(chi_feature[0:12])

#[C_HOUR, C_VEHS, C_CONF, C_RCFG, C_WTHR, C_RSUR, C_RALN, C_TRAF, P_SEX, P_AGE]

[  1954.24907601    322.28755297    459.50289494   5433.78874556
  23709.83745052 173018.38553879   5373.5707024    4428.12342496
  11645.96187121  12554.83844127  62895.38506861   1159.95293739
  40816.98463318 105952.64089518   2213.69538156    543.39109637]
['C_CONF', 'P_AGE', 'C_TRAF', 'P_SEX', 'C_VEHS', 'C_RALN', 'C_RSUR', 'C_HOUR', 'C_RCFG', 'C_WTHR', 'P_PSN', 'C_YEAR']


### Feature Extraction: Recursive Feature Elimination 

In [7]:
# Feature Extraction with RFE (Recursive Feature Elimination)

# feature extraction
model = LogisticRegression()
# Pass the number of features by keyword: newer scikit-learn releases only
# accept `n_features_to_select` as a keyword argument, and older releases
# accept the keyword form as well.
rfe = RFE(model, n_features_to_select=12)
fit = rfe.fit(X_, Y_)

print("Num Features: {}".format(fit.n_features_))
# Boolean mask over the columns of X_ (True = feature kept) ...
print("Selected Features: {}".format(fit.support_))
# ... and the names of the columns that mask selects.
print("Selected Features: {}".format(X_.columns[fit.support_]))
# Rank 1 marks the selected features; larger ranks were eliminated.
print("Feature Ranking: {}".format(fit.ranking_))


Num Features: 12
Selected Features: [False False  True  True  True False  True  True  True  True  True  True
  True False  True  True]
Selected Features: Index(['C_WDAY', 'C_HOUR', 'C_VEHS', 'C_RCFG', 'C_WTHR', 'C_RSUR', 'C_RALN',
       'C_TRAF', 'V_YEAR', 'P_SEX', 'P_PSN', 'P_USER'],
      dtype='object')
Feature Ranking: [5 4 1 1 1 2 1 1 1 1 1 1 1 3 1 1]


### Feature Extraction: Extra Trees

In [8]:
# Feature Importance with Extra Trees Classifier

# feature extraction
# Seed the estimator so the importance ranking is reproducible across runs,
# matching the seeded RandomForestClassifier used elsewhere in this notebook.
model = ExtraTreesClassifier(random_state=0)
model.fit(X_, Y_)

# Read the importances once and reuse them instead of going back to the
# model attribute on every loop iteration.
importances = model.feature_importances_
print(importances)

# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
print(indices)
featureLabel = X_.columns
print(featureLabel)

# Print a ranked table and collect the feature names in ranked order.
rankedFeature = []
for rank, column_idx in enumerate(indices, start=1):
    label = featureLabel[column_idx]
    rankedFeature.append(label)
    print("%2d) %-*s %f" % (rank, 30, label, importances[column_idx]))
print(rankedFeature[0:12])

[0.15093099 0.12915253 0.09491015 0.04921887 0.02688118 0.07549055
 0.02297781 0.02534752 0.02230674 0.02775081 0.03268589 0.02954307
 0.02928058 0.2546498  0.00921362 0.01965989]
[13  0  1  2  5  3 10 11 12  9  4  7  6  8 15 14]
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_YEAR', 'P_SEX', 'P_AGE',
       'P_PSN', 'P_USER'],
      dtype='object')
 1) P_AGE                          0.254650
 2) C_YEAR                         0.150931
 3) C_MNTH                         0.129153
 4) C_WDAY                         0.094910
 5) C_CONF                         0.075491
 6) C_HOUR                         0.049219
 7) C_TRAF                         0.032686
 8) V_YEAR                         0.029543
 9) P_SEX                          0.029281
10) C_RALN                         0.027751
11) C_VEHS                         0.026881
12) C_WTHR                         0.025348
13) C_RCFG                         0.0229

### Feature Extraction: RandomForest

In [9]:
# Verbosity passed through to the RandomForestClassifier.
verbose_level = 0

# The whole section is skipped unless the flag from the configuration cell is set.
if enable_rf_features:
    print("Random Forest Feature Selection: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))

    forest = RandomForestClassifier(
        n_estimators=50,
        random_state=0,
        n_jobs=-1,
        verbose=verbose_level,
    )
    print("Random Forest Feature Selection: Fit Start")
    forest.fit(X_, Y_)
    print("Random Forest Feature Selection: Fit")

    importFeatures = forest.feature_importances_
    print("Random Forest Feature Selection: Feature Importance")
    print(importFeatures)

    # Column positions ordered from most to least important.
    indices = np.argsort(importFeatures)[::-1]
    print(indices)
    featureLabel = X_.columns
    print(featureLabel)

    # Print a ranked table and collect the feature names in ranked order.
    rankedFeature = []
    for position, column_idx in enumerate(indices, start=1):
        label = featureLabel[column_idx]
        rankedFeature.append(label)
        print("%2d) %-*s %f" % (position, 30, label, importFeatures[column_idx]))
    print(rankedFeature[0:12])

Random Forest Feature Selection: Start
Thu Nov 29 18:53:13 2018
Random Forest Feature Selection: Fit Start
Random Forest Feature Selection: Fit
Random Forest Feature Selection: Feature Importance
[0.14991917 0.12922359 0.09493915 0.0603878  0.02668629 0.06213154
 0.0228237  0.02552461 0.02380172 0.02883489 0.03181076 0.03314014
 0.02821974 0.2481038  0.00936773 0.02508539]
[13  0  1  2  5  3 11 10  9 12  4  7 15  8  6 14]
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_YEAR', 'P_SEX', 'P_AGE',
       'P_PSN', 'P_USER'],
      dtype='object')
 1) P_AGE                          0.248104
 2) C_YEAR                         0.149919
 3) C_MNTH                         0.129224
 4) C_WDAY                         0.094939
 5) C_CONF                         0.062132
 6) C_HOUR                         0.060388
 7) V_YEAR                         0.033140
 8) C_TRAF                         0.031811
 9) C_RALN           

In [10]:
# Write the frame back out as UTF-8 CSV without the index column.
# NOTE(review): `df` is exported with all of its columns; none of the feature
# rankings computed earlier in this notebook are applied to it before writing.
# Confirm this is intended given the "Feature_Selected" file name.
output_file = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Feature_Selected.csv'
df.to_csv(output_file, index=False, encoding='utf-8')