## Feature Selection

In [1]:
import time
import numpy as np
import pandas as pd
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 25

from pandas.api.types import CategoricalDtype
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

inputfile = 'CKME136X10_2018_Data_Cleaned_Transformed.csv'
outputfile = 'CKME136X10_2018_Data_CTF.csv'

  from numpy.core.umath_tests import inner1d


In [2]:
#enable/disable feature selection
enable_rf_features = True

In [3]:
#load data

df = pd.read_csv(inputfile, engine = 'python')

In [4]:
# convert to the correct type
df = df.astype('category')
df['C_YEAR'] = df['C_YEAR'].astype(CategoricalDtype(ordered=True))
df['C_MNTH'] = df['C_MNTH'].astype(CategoricalDtype(ordered=True))
df['C_WDAY'] = df['C_WDAY'].astype(CategoricalDtype(ordered=True))
df['C_HOUR'] = df['C_HOUR'].astype(CategoricalDtype(ordered=True))
df['C_VEHS'] = df['C_VEHS'].astype('int')
df['V_YEAR'] = df['V_YEAR'].astype(CategoricalDtype(ordered=True))
df['P_PSN'] = df['P_PSN'].astype(CategoricalDtype(ordered=True))
df['P_AGE'] = df['P_AGE'].astype('int')
df['P_ISEV'] = df['P_ISEV'].astype('int')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4336558 entries, 0 to 4336557
Data columns (total 19 columns):
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    int32
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_TYPE    category
V_YEAR    category
P_SEX     category
P_AGE     int32
P_PSN     category
P_SAFE    category
P_USER    category
P_ISEV    int32
dtypes: category(16), int32(3)
memory usage: 115.8 MB
None


In [5]:
#Split between data and class
# leaking data here, need to split between test and train.  TBD
Y_ = df[df.columns[-1]]
X_ = df[df.columns[0:df.columns.size -1]]

#sprint into train and test 70/30
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

### Feature Selection by Chi Squre

In [6]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
# Ref: https://machinelearningmastery.com/feature-selection-machine-learning-python/
np.set_printoptions(edgeitems=12)
tcd_as_array = df.values
X = tcd_as_array[:,0:tcd_as_array.shape[1] -1]
Y = tcd_as_array[:,tcd_as_array.shape[1] -1]
Y=Y.astype('int')

# feature extraction
test = SelectKBest(score_func=chi2, k=12)
fit = test.fit(X, Y)

# summarize scores
#np.set_printoptions(precision=3)
print(fit.scores_)
t1 = np.array(fit.scores_).tolist()
t2 = t1.copy()
#t2 = [ '%.3f' % elem for elem in t2 ]
t2.sort(reverse=True)

chi_feature = []
for i in range(0, X.shape[1]):
    chi_feature.append(df.columns[t1.index(t2[i])])
    #print("{}".format(df.columns[t1.index(t2[i])]))

#np.set_printoptions(precision=3)
features = fit.transform(X)
# summarize selected features
#print(features[0:11,:])
print(chi_feature[0:12])


[4.35494924e+03 2.39992031e+02 4.65987366e+02 4.89185225e+03
 4.24128149e+04 2.97977521e+05 3.58932621e+03 3.75529013e+03
 1.00332083e+04 1.29696899e+04 6.70731479e+04 1.07965440e+06
 1.75882834e+04 4.40951940e+04 9.10405781e+04 4.13985582e+03
 3.09568526e+03 2.13322551e+04]
['V_TYPE', 'C_CONF', 'P_AGE', 'C_TRAF', 'P_SEX', 'C_VEHS', 'P_USER', 'V_YEAR', 'C_RALN', 'C_RSUR', 'C_HOUR', 'C_YEAR']


### Feature Extraction: Recursive Feature Elimination 

In [7]:
# Feature Extraction with RFE

# feature extraction
#model = LogisticRegression()
model = ExtraTreesClassifier()
rfe = RFE(model, 12)
fit = rfe.fit(X_, Y_)
print("Num Features: {}".format(fit.n_features_))
print("Selected Features: {}".format(fit.support_))
print("Selected Features: {}".format(X_.columns[fit.support_]))
print("Feature Ranking: {}".format(fit.ranking_))


Num Features: 12
Selected Features: [ True  True  True  True False  True False False False False  True  True
  True  True  True False  True  True]
Selected Features: Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_CONF', 'C_TRAF', 'V_TYPE',
       'V_YEAR', 'P_SEX', 'P_AGE', 'P_SAFE', 'P_USER'],
      dtype='object')
Feature Ranking: [1 1 1 1 3 1 5 2 6 4 1 1 1 1 1 7 1 1]


### Feature Extraction: Extra Trees

In [8]:
# Feature Importance with Extra Trees Classifier

# feature extraction
model = ExtraTreesClassifier()
model.fit(X_, Y_)
print(model.feature_importances_)

indices = np.argsort(model.feature_importances_)[::-1]
print(indices)
featureLabel = X_.columns[0:]
print(featureLabel)
rankedFeature = []
for f in range(X_.shape[1]):
    rankedFeature.append(featureLabel[indices[f]])
    print("%2d) %-*s %f" % (f+1, 30,  featureLabel[indices[f]], model.feature_importances_[indices[f]]))
print(rankedFeature[0:12])

[0.13978251 0.11909301 0.08241831 0.0461788  0.02599212 0.07809988
 0.02111473 0.02444418 0.02749249 0.02397392 0.03007561 0.02645503
 0.0357194  0.02589592 0.22057505 0.01131882 0.03264949 0.02872073]
[14  0  1  2  5  3 12 16 10 17  8 11  4 13  7  9  6 15]
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'V_YEAR', 'P_SEX',
       'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER'],
      dtype='object')
 1) P_AGE                          0.220575
 2) C_YEAR                         0.139783
 3) C_MNTH                         0.119093
 4) C_WDAY                         0.082418
 5) C_CONF                         0.078100
 6) C_HOUR                         0.046179
 7) V_YEAR                         0.035719
 8) P_SAFE                         0.032649
 9) C_TRAF                         0.030076
10) P_USER                         0.028721
11) C_RSUR                         0.027492
12) V_TYPE                         0.

### Feature Extraction: RandomForest

In [9]:
verbose_level = 0
if enable_rf_features:
    print("Random Forest Feature Selection: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    forest = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1, verbose=verbose_level)
    print("Random Forest Feature Selection: Fit Start")
    forest.fit(X_, Y_)
    print("Random Forest Feature Selection: Fit")

    importFeatures = forest.feature_importances_
    print("Random Forest Feature Selection: Feature Importance")
    print(importFeatures)
    
    indices = np.argsort(importFeatures)[::-1]
    print(indices)
    featureLabel = X_.columns[0:]
    print(featureLabel)
    rankedFeature = []
    for f in range(X_.shape[1]):
        rankedFeature.append(featureLabel[indices[f]])
        print("%2d) %-*s %f" % (f+1, 30,  featureLabel[indices[f]], importFeatures[indices[f]]))
    print(rankedFeature[0:12])

Random Forest Feature Selection: Start
Sun Dec  2 22:05:03 2018
Random Forest Feature Selection: Fit Start
Random Forest Feature Selection: Fit
Random Forest Feature Selection: Feature Importance
[0.13810023 0.1144342  0.08289854 0.05295084 0.02895296 0.06445717
 0.02420928 0.02465523 0.0249129  0.02693571 0.03392114 0.02734204
 0.03666382 0.02682164 0.21881604 0.01004577 0.03515624 0.02872623]
[14  0  1  2  5  3 12 16 10  4 17 11  9 13  8  7  6 15]
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'V_YEAR', 'P_SEX',
       'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER'],
      dtype='object')
 1) P_AGE                          0.218816
 2) C_YEAR                         0.138100
 3) C_MNTH                         0.114434
 4) C_WDAY                         0.082899
 5) C_CONF                         0.064457
 6) C_HOUR                         0.052951
 7) V_YEAR                         0.036664
 8) P_SAFE       

In [10]:
### Keep all features, in genearl, they are important indicator

In [11]:
df.to_csv(outputfile, encoding='utf-8', index=False)