## Feature Selection

In [1]:
import time
import numpy as np
import pandas as pd
# Cap how many rows/columns pandas renders when a frame is displayed.
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 25

from pandas.api.types import CategoricalDtype
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

# CSV that the load cell reads into `df`.
inputfile = 'CKME136X10_2018_Data_Cleaned_Transformed.csv'
# CSV that the final cell writes `df` to.
outputfile = 'CKME136X10_2018_Data_CTF.csv'

  from numpy.core.umath_tests import inner1d


In [2]:
#enable/disable feature selection
# Gates the Random Forest feature-importance cell: that cell only runs its
# body when this flag is True.
enable_rf_features = True

In [3]:
# Cast the frame's columns to the dtypes used by the rest of the notebook.
def convert_type(data):
    """Return a copy of `data` with notebook-appropriate dtypes.

    Every column is first cast to an (unordered) categorical. The columns
    listed in `ordered_columns` are then re-cast as ordered categoricals,
    and the `P_ISEV` column is cast to integer.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    ordered_columns = ('C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'P_AGE', 'P_PSN')

    converted = data.astype('category')
    for column in ordered_columns:
        # categories are left unspecified, so only the "ordered" flag changes
        converted[column] = converted[column].astype(CategoricalDtype(ordered=True))
    converted['P_ISEV'] = converted['P_ISEV'].astype('int')
    return converted

In [4]:
#load data
# Read the CSV named by `inputfile` into `df`, using pandas' Python parsing engine.

df = pd.read_csv(inputfile, engine = 'python')

In [5]:
# convert to the correct type
df = convert_type(df)
# DataFrame.info() prints its own report and returns None, so call it directly;
# wrapping it in print() emits a stray "None" after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4816153 entries, 0 to 4816152
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(14), int32(1)
memory usage: 82.7 MB
None


In [6]:
# Separate the target (last column) from the predictors (all other columns).
# TODO: this uses the full data set, so information leaks into feature
# selection; split into train and test sets first.
Y = df.loc[:, df.columns[-1]]
X = df.loc[:, df.columns[:-1]]

### Feature Selection by Chi-Square

In [7]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
# Ref: https://machinelearningmastery.com/feature-selection-machine-learning-python/
np.set_printoptions(edgeitems=12)
k_best = 12  # number of top-scoring features to select and report

# Every column except the last is a predictor; the target comes from Y.
tcd_as_array = df.values
Xs = tcd_as_array[:, :-1]
Ys = Y.astype('int')

# feature extraction
test = SelectKBest(score_func=chi2, k=k_best)
fit = test.fit(Xs, Ys)

# summarize scores
print(fit.scores_)

# Rank columns by descending score. Sorting positions (rather than looking each
# sorted score back up with list.index) keeps tied scores mapped to distinct
# columns; the stable sort keeps ties in their original column order.
ranked_positions = np.argsort(-np.asarray(fit.scores_), kind='stable')
chi_feature = [df.columns[position] for position in ranked_positions]

features = fit.transform(Xs)
# summarize selected features
print(chi_feature[0:k_best])


[2.63839127e+02 4.93720089e+02 3.71416521e+03 4.80838471e+04
 3.74398899e+05 3.76883025e+03 3.37096898e+03 7.04938725e+03
 1.49681271e+04 7.32742309e+04 1.62208503e+04 3.70025675e+03
 7.94718880e+03 4.10623311e+04]
['C_CONF', 'C_TRAF', 'C_VEHS', 'P_USER', 'P_SEX', 'C_RALN', 'P_PSN', 'C_RSUR', 'C_RCFG', 'C_HOUR', 'P_AGE', 'C_WTHR']


### Feature Extraction: Recursive Feature Elimination 

In [8]:
# Feature Extraction with RFE
# Recursive Feature Elimination wraps an estimator and keeps
# `n_features_to_select` features; `support_` marks the kept columns and
# `ranking_` gives 1 for kept features and larger values for eliminated ones.

# feature extraction
# random_state is fixed so the selection is reproducible across runs.
model = ExtraTreesClassifier(random_state=0)
# n_features_to_select is passed by keyword: newer scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=12)
fit = rfe.fit(X, Y)
print("Num Features: {}".format(fit.n_features_))
print("Selected Features: {}".format(fit.support_))
print("Selected Features: {}".format(X.columns[fit.support_]))
print("Feature Ranking: {}".format(fit.ranking_))

Num Features: 12
Selected Features: [ True  True  True  True  True False  True  True  True  True  True  True
 False  True]
Selected Features: Index(['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_WTHR', 'C_RSUR',
       'C_RALN', 'C_TRAF', 'P_SEX', 'P_AGE', 'P_USER'],
      dtype='object')
Feature Ranking: [1 1 1 1 1 2 1 1 1 1 1 1 3 1]


### Feature Extraction: Extra Trees

In [9]:
# Feature Importance with Extra Trees Classifier

# feature extraction
# random_state is fixed so the importance ranking is reproducible across runs
# (the Random Forest cell already does the same).
model = ExtraTreesClassifier(random_state=0)
model.fit(X, Y)
print(model.feature_importances_)

# Column positions ordered from most to least important.
indices = np.argsort(model.feature_importances_)[::-1]
print(indices)
featureLabel = X.columns[0:]
print(featureLabel)

rankedFeature = [featureLabel[position] for position in indices]
for rank, position in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, featureLabel[position], model.feature_importances_[position]))
print(rankedFeature[0:12])

[0.19846378 0.12381465 0.06723965 0.04479497 0.1414596  0.0360919
 0.03726477 0.03558932 0.0343261  0.04642132 0.04389446 0.08824912
 0.02349232 0.07889806]
[ 0  4  1 11 13  2  9  3 10  6  5  7  8 12]
Index(['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG', 'C_WTHR',
       'C_RSUR', 'C_RALN', 'C_TRAF', 'P_SEX', 'P_AGE', 'P_PSN', 'P_USER'],
      dtype='object')
 1) C_MNTH                         0.198464
 2) C_CONF                         0.141460
 3) C_WDAY                         0.123815
 4) P_AGE                          0.088249
 5) P_USER                         0.078898
 6) C_HOUR                         0.067240
 7) C_TRAF                         0.046421
 8) C_VEHS                         0.044795
 9) P_SEX                          0.043894
10) C_WTHR                         0.037265
11) C_RCFG                         0.036092
12) C_RSUR                         0.035589
13) C_RALN                         0.034326
14) P_PSN                          0.023492
['C_MNTH'

### Feature Extraction: RandomForest

In [10]:
verbose_level = 0
if enable_rf_features:
    # Announce the run and print the wall-clock start time.
    print("Random Forest Feature Selection: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))

    forest = RandomForestClassifier(
        n_estimators=100,
        random_state=0,
        n_jobs=-1,
        verbose=verbose_level,
    )
    print("Random Forest Feature Selection: Fit Start")
    forest.fit(X, Y)
    print("Random Forest Feature Selection: Fit")

    importFeatures = forest.feature_importances_
    print("Random Forest Feature Selection: Feature Importance")
    print(importFeatures)

    # Column positions ordered from most to least important.
    indices = np.argsort(importFeatures)[::-1]
    print(indices)
    featureLabel = X.columns[0:]
    print(featureLabel)

    rankedFeature = [featureLabel[position] for position in indices]
    for rank, position in enumerate(indices, start=1):
        print("%2d) %-*s %f" % (rank, 30, featureLabel[position], importFeatures[position]))
    print(rankedFeature[0:12])

Random Forest Feature Selection: Start
Tue Dec 11 15:58:42 2018
Random Forest Feature Selection: Fit Start
Random Forest Feature Selection: Fit
Random Forest Feature Selection: Feature Importance
[0.20304632 0.13859933 0.07658904 0.04535783 0.10820298 0.03493019
 0.03707072 0.04158763 0.03829693 0.04798196 0.04340615 0.08839632
 0.02475288 0.07178173]
[ 0  1  4 11  2 13  9  3 10  7  8  6  5 12]
Index(['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG', 'C_WTHR',
       'C_RSUR', 'C_RALN', 'C_TRAF', 'P_SEX', 'P_AGE', 'P_PSN', 'P_USER'],
      dtype='object')
 1) C_MNTH                         0.203046
 2) C_WDAY                         0.138599
 3) C_CONF                         0.108203
 4) P_AGE                          0.088396
 5) C_HOUR                         0.076589
 6) P_USER                         0.071782
 7) C_TRAF                         0.047982
 8) C_VEHS                         0.045358
 9) P_SEX                          0.043406
10) C_RSUR                      

In [11]:
### Keep all features; in general, they are all important indicators

In [12]:
# Write `df` to `outputfile` as UTF-8 CSV, omitting the index column.
df.to_csv(outputfile, encoding='utf-8', index=False)