# Feature Selection

In this notebook, features are ranked by importance, and two feature sets are determined: the 20 and the 40 most important features.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the gzipped, tab-separated training data and discard the
# 'Unnamed: 0' column in the same expression.
df = (
    pd.read_csv("data/train_data.tsv.gz", sep="\t", compression="gzip")
    .drop(columns=['Unnamed: 0'])
)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,AGE2,SERVICE,MILSTAT,ACTDEVER,ACTD2001,ACTD9001,ACTD7590,ACTDVIET,ACTDPRIV,COMBATPY,...,TRQANYFLAG,TRQANYYR,STMANYFLAG,STMANYYR,SEDANYFLAG,SEDANYYR,PSYANYFLAG,PSYANYYR,HERRFD,TOTRFD
0,15.0,2.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,14.0,2.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15.0,2.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15.0,2.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,16.0,2.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## XGBoost Feature Importance

Use the aggregate score (`TOTRFD`, the last column of the data frame) as the target `y`.

In [3]:
from xgboost import XGBRegressor

In [4]:
# Feature matrix: the first 399 columns of df, selected by position.
# NOTE(review): presumably 399 is chosen so the trailing HERRFD/TOTRFD score
# columns are excluded — confirm against df.shape.
X = df.iloc[:, :399]
# Target vector: the last column of df.
y = df.iloc[:, -1]

In [5]:
# Fit an XGBoost regressor of y on X; its feature_importances_ are read in the
# following cell to rank the features.
# NOTE(review): the 'reg:logistic' objective presumably expects y within
# [0, 1] — confirm the target column is scaled that way.
model = XGBRegressor(objective='reg:logistic')
model.fit(X,y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
             nthread=None, objective='reg:logistic', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [6]:
# One importance score per feature column, labelled by column name and
# stored as a single-column frame called "Importance".
importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
    name="Importance",
).to_frame()

## Export 
`feature_importance_list.xlsx` contains a reviewed version of the results.    
The 20 and the 40 most important features are picked.

In [8]:
# Rank the features from most to least important and write the ranking as a
# tab-separated file (feature names go out as the unnamed index column).
# NOTE(review): the frame written here has only an 'Importance' column, while
# a later cell reads a 'Categorical' column from this same path — re-running
# this cell presumably overwrites a hand-annotated file; confirm before re-running.
ranked_importance = importance.sort_values(by='Importance', ascending=False)
ranked_importance.to_csv('data/feature_importance_results5.csv', sep='\t')

## One Hot Encoding 

1. Drop all other columns so that the train and test datasets contain only the top-20 or top-40 most important features (plus the `HERRFD` and `TOTRFD` score columns).

2. Apply one-hot encoding to the categorical features of both the 20- and the 40-feature sets.

In [8]:
# Both raw files are gzipped, tab-separated tables; share the reader options.
read_opts = dict(sep="\t", compression="gzip")
df_train_imp0 = pd.read_csv("data/train_data.tsv.gz", **read_opts)
df_test_imp0 = pd.read_csv("data/test_data.tsv.gz", **read_opts)

In [17]:
# Read the first 40 rows of the importance table and collect, as a list, the
# feature names (stored in the 'Unnamed: 0' column) whose 'Categorical' entry
# is anything other than 0.
df_features40 = pd.read_csv('data/feature_importance_results5.csv', sep='\t').head(40)
is_categorical = df_features40['Categorical'] != 0
categorical_features = df_features40.loc[is_categorical, 'Unnamed: 0'].tolist()
len(categorical_features)

35

In [13]:
# Feature-name lists: all rows of the importance table (top 40) and its
# first 20 rows (top 20), as two independent Python lists.
features40 = df_features40['Unnamed: 0'].tolist()
features20 = features40[:20]

In [15]:
# Keep only the selected feature columns (plus the two RFD score columns) in
# the raw train and test frames, once for the top-40 and once for the top-20 set.

# The score columns must survive the column removal. The membership guard
# keeps this cell idempotent: re-running it no longer appends duplicate
# entries to the feature lists defined in the previous cell.
for score_col in ('HERRFD', 'TOTRFD'):
    if score_col not in features40:
        features40.append(score_col)
    if score_col not in features20:
        features20.append(score_col)

# Columns of the raw train frame that are not part of the top-40 set; the
# same list is dropped from the test frame.
rm = [col for col in df_train_imp0.columns if col not in features40]
df_train40 = df_train_imp0.drop(columns=rm)
df_test40  = df_test_imp0.drop(columns=rm)

# Same procedure for the top-20 set.
rm = [col for col in df_train_imp0.columns if col not in features20]
df_train20 = df_train_imp0.drop(columns=rm)
df_test20  = df_test_imp0.drop(columns=rm)

In [19]:
# Preview the first rows of the top-20 test frame.
df_test20.head()

Unnamed: 0,IRMARITSTAT,WRKDRGHLP,WRKOKPREH,IRWRKSTAT,IRPINC3,RSKCOCWK,DIFGETHER,APPDRGMON,RSKYFQTES,CIGAVOID,...,K6SCMON,K6SCMAX,SMIPP_U,CIGYR,SMKLSSYR,OXYCNANYYR,TRQANYYR,PSYANYYR,HERRFD,TOTRFD
0,1.0,2.0,1.0,1.0,2.0,3.0,1.0,2.0,1.0,91.0,...,0.0,0.0,0.008998,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,99.0,99.0,4.0,1.0,4.0,2.0,2.0,1.0,91.0,...,5.0,10.0,0.023636,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,2.0,1.0,1.0,2.0,4.0,1.0,2.0,1.0,91.0,...,0.0,0.0,0.008998,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,99.0,99.0,4.0,5.0,4.0,1.0,2.0,2.0,93.0,...,1.0,1.0,0.008998,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,2.0,4.0,1.0,2.0,1.0,93.0,...,4.0,4.0,0.008998,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# One-hot encode the categorical features of both feature sets and make sure
# each test frame ends up with exactly the same columns as its train frame.

def _match_train_columns(train_encoded, test_encoded):
    """Return ``test_encoded`` with exactly the columns of ``train_encoded``.

    Train and test are encoded independently, so a category level that occurs
    in only one of them yields a dummy column the other lacks. Columns present
    only in the train frame are added to the test frame filled with 0 (cast to
    the train column's dtype); columns present only in the test frame are
    dropped; the column order follows the train frame.
    """
    absent = [c for c in train_encoded.columns if c not in test_encoded.columns]
    aligned = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)
    if absent:
        aligned = aligned.astype({c: train_encoded[c].dtype for c in absent})
    return aligned


# Categorical features that are actually present in the top-20 frame. This
# replaces a hard-coded slice of the first 17 entries, which was only correct
# while exactly 17 categorical features ranked inside the top 20.
categorical_features20 = [c for c in categorical_features if c in df_train20.columns]

# get dummies
df_train40_encoded = pd.get_dummies(df_train40, columns=categorical_features)
df_test40_encoded  = _match_train_columns(
    df_train40_encoded,
    pd.get_dummies(df_test40, columns=categorical_features),
)
df_train20_encoded = pd.get_dummies(df_train20, columns=categorical_features20)
df_test20_encoded  = _match_train_columns(
    df_train20_encoded,
    pd.get_dummies(df_test20, columns=categorical_features20),
)

In [24]:
# Reorder every encoded frame so the RFD score columns come last.
cols_to_end = ['HERRFD','TOTRFD']


def _scores_last(frame):
    """Return ``frame`` with the ``cols_to_end`` columns moved to the end.

    All other columns keep their original relative order.
    """
    leading = [c for c in frame if c not in cols_to_end]
    return frame[leading + cols_to_end]


df_train40 = _scores_last(df_train40_encoded)
df_test40  = _scores_last(df_test40_encoded)

df_train20 = _scores_last(df_train20_encoded)
df_test20  = _scores_last(df_test20_encoded)

In [29]:
# Preview the first rows of the top-40 train frame.
df_train40.head()

Unnamed: 0,WRKSKIPMO,HIGHBPAGE,K6SCMON,K6SCMAX,SMIPP_U,APPDRGMON_1.0,APPDRGMON_2.0,CIGYR_0.0,CIGYR_1.0,IRPINC3_1.0,...,RKFQPBLT_1.0,RKFQPBLT_2.0,RKFQPBLT_3.0,RKFQPBLT_4.0,WRKDPSTWK_1.0,WRKDPSTWK_2.0,WRKDPSTWK_4.0,WRKDPSTWK_99.0,HERRFD,TOTRFD
0,0.0,999.0,0.0,0.0,0.008998,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0.0,0.0
1,0.0,999.0,0.0,0.0,0.008998,0,1,1,0,0,...,0,0,1,0,1,0,0,0,0.0,0.0
2,0.0,999.0,0.0,0.0,0.008998,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0.0,0.0
3,0.0,999.0,0.0,0.0,0.008998,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0.0,0.0
4,0.0,999.0,1.0,1.0,0.008998,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0.0,0.0


## Export

In [30]:
# Write the four prepared frames as tab-separated files without the index.
exports = [
    (df_test40, 'data/test40.tsv'),
    (df_train40, 'data/train40.tsv'),
    (df_test20, 'data/test20.tsv'),
    (df_train20, 'data/train20.tsv'),
]
for frame, path in exports:
    frame.to_csv(path, sep='\t', index=False)