This notebook performs feature selection and saves the selected features to a pickle file.

- Removes features that have only one unique value
- Removes features that are highly correlated with each other
- Removes features that are only weakly correlated with the target

The thresholds that worked best were:

- remove highly correlated features: correlation < -0.85 or > 0.85
- remove features weakly correlated with the target: correlation > -0.05 and < 0.05

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import RFECV
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pickle

In [54]:
# Base names (without the ".pkl" extension) of the pickled train and test frames.
train_file = 'train_data_2906'
test_file = 'test_data_2906'

# Name of the file the selected feature list is written to.
features_file = 'features_2906'

df = pd.read_pickle(f"{train_file}.pkl")
df_test = pd.read_pickle(f"{test_file}.pkl")

# Split the training frame into predictors (X) and the SalePrice target (y).
X = df.drop(columns='SalePrice')
y = df['SalePrice']

# The test frame is used as loaded (same object, no copy).
X_test = df_test

In [62]:
# Count the distinct values in every column of X. `Series.unique()` reports
# NaN as a value of its own, so a column holding one value plus NaN counts as 2.
uv = [len(X[col].unique()) for col in X.columns]
uvdf = pd.DataFrame(uv, index=X.columns)

# Keep only the columns that have more than one distinct value.
varying = uvdf[0] > 1
features = uvdf.index[varying].tolist()

In [63]:
# Restrict X to the columns kept in `features` (column order follows `features`).
X = X.loc[:, features]

In [64]:
# Plotting imports are local to this cell. `plt` must be imported here:
# nothing else in the notebook imports matplotlib.pyplot, so without it the
# cell raises NameError on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# Put the target next to the predictors so that one matrix holds both the
# feature-to-feature and the feature-to-target correlations.
data = pd.concat([X, y], axis=1)

#get correlations of each features in dataset
corrmat = data.corr()
top_corr_features = corrmat.index

#plot heat map
plt.figure(figsize=(30, 40))
# `corrmat` is already the correlation matrix of data[top_corr_features],
# so it is plotted directly instead of being computed a second time.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [65]:
#exclude highly correlated features and features low correlated with target

target = 'SalePrice'

# NOTE(review): the notebook introduction quotes +/-0.85 for the
# high-correlation threshold, but this cell uses 0.8 -- confirm which one is
# intended.
threshold_high = 0.8
threshold_low = 0.05

# Collinearity is only assessed between predictors. The target row/column is
# dropped first; otherwise, when several features correlate strongly with the
# target, all but the first of them would be discarded for being "highly
# correlated" even though they are not collinear with each other.
feature_corr = corrmat.drop(index=target, columns=target).abs()

rejected = set()
for col in feature_corr.columns:
    # Labels whose |correlation| with `col` exceeds the threshold, in corrmat
    # order. `col` itself normally belongs to this group (self-correlation 1).
    # NaN correlations compare as False and are therefore never selected.
    correlated = feature_corr.index[feature_corr[col] > threshold_high]
    # Keep the first member of each correlated group and reject the others.
    rejected.update(correlated[1:])

# Reject features whose correlation with the target lies strictly inside
# (-threshold_low, threshold_low).
target_corr = corrmat[target].drop(target)
rejected.update(target_corr.index[target_corr.abs() < threshold_low])

# Emit the result as a duplicate-free list in corrmat column order, so the
# output is deterministic from run to run. The target is never included.
features_to_remove = [c for c in feature_corr.columns if c in rejected]

In [66]:
# Drop the rejected columns while preserving the existing feature order.
features = [x for x in features if x not in features_to_remove]
# The original line was missing the comma between the message and the count,
# which is a SyntaxError; the message is now formatted as a single string.
print(f'New number of features: {len(features)}')

# Write the list of selected feature names to `features_file`.
with open(features_file, "wb") as fp:   #Pickling
    pickle.dump(features, fp)

58