# Feature Selection

In [1]:
# utils.models re-exports every third-party name used below (pd, tqdm, xgb, the classifiers, ...)
from utils.models import *
import os

In [2]:
# Notebook parameters: all locations are relative paths, resolved against the
# directory the kernel is started in.

models_path = "../models"        # root folder for the per-model feature artifacts
data_path = "../data"            # folder that holds the input CSV
output_path = "../outputs"       # not referenced by the cells visible in this notebook
filename_data = "clean-dataset"  # CSV base name; ".csv" is appended when loading

In [3]:
# Create the folders that hold the per-model artifacts.
# features_scores_models receives the score CSVs written by the selection loop;
# features_by_models is presumably the target of save_features_name (confirm in
# utils.models).
#
# exist_ok=True keeps the cell safe to re-run when the folders already exist,
# but still raises FileExistsError if the path exists and is NOT a directory,
# a case the previous try/except silently ignored.
for artifact_dir in ("features_by_models", "features_scores_models"):
    os.makedirs(f"{models_path}/{artifact_dir}", exist_ok=True)

In [4]:
# Load the cleaned dataset, report its dimensions and preview the first rows.
# NOTE(review): the recorded preview starts with an 'Unnamed: 0' column, which
# usually means the CSV was written together with its index. Confirm that
# preprocess() drops it so the row number is not handed to the models as a feature.
csv_file = f"{data_path}/{filename_data}.csv"
data = pd.read_csv(csv_file)
print('This dataset has shape: ', data.shape)
data.head()

This dataset has shape:  (30000, 42)


Unnamed: 0,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,PAY_TO_BILL_5,PAY_TO_BILL_6,PAY_AMOUNT_STD_LAST_6M,BILL_AMOUNT_STD_LAST_6M,AGE_<=40,AGE_<=60,AGE_<=80,RISKY_GROUP1,RISKY_GROUP2,RISKY_GROUP3
0,20000,2,2,-1,-1,-2,-2,3913,3102,689,...,0.0,0.0,281.283072,1761.633219,1,0,0,1,0,1
1,120000,-1,2,0,0,0,2,2682,1725,2682,...,0.0,0.613121,752.772653,637.967841,1,0,0,1,0,1
2,90000,0,0,0,0,0,0,29239,14027,13559,...,0.066894,0.321543,1569.815488,6064.518593,1,0,0,1,0,1
3,50000,0,0,0,0,0,0,46990,48233,49291,...,0.036913,0.033843,478.058155,10565.793518,1,0,0,1,0,1
4,50000,-1,0,-1,0,0,0,8617,5670,35835,...,0.035985,0.03549,13786.230736,10668.590074,0,1,0,0,1,0


***
### Selecting features


In [5]:
# Split the loaded frame into the model inputs X and the target y.
# 'dpnm' is presumably the name of the target column; confirm against
# utils.models.preprocess, which is not visible here.
X, y = preprocess(data, 'dpnm')

# Size of X along its second axis, i.e. the number of candidate features
# (assumes X is 2-D, rows x features; TODO confirm). The misspelled name is
# kept because the selection loop refers to it.
n_colums = X.shape[1]
# NOTE(review): the recorded run reports final selections with fewer than 30
# features (e.g. 15 and 17), so this value is evidently not a lower bound on
# the selected feature count; check how select_k_variables uses it.
min_cols= 30 # minimum number of features to try with SelectKBest (per the original note)

In [6]:
# init models
#
# One instance per candidate classifier, all with library defaults except where
# noted. The names are assumed to be the scikit-learn / xgboost estimators
# re-exported by utils.models.
#
# A fixed seed is passed to every estimator whose training uses randomness under
# these settings, so that re-running the notebook does not change the selected
# features because of estimator-internal randomness. Any randomness inside
# preprocess() or select_k_variables() (e.g. data splits) is NOT controlled from
# here and must be checked in utils.models.
RANDOM_STATE = 42

tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
# NOTE(review): loss='log' is the legacy spelling of logistic loss. scikit-learn
# 1.1 renamed it to 'log_loss' and 1.3 removed the old name; switch the literal
# when the environment is upgraded. It is kept here to match the scikit-learn
# version this notebook was run with.
sgd = SGDClassifier(loss='log', random_state=RANDOM_STATE)
# lbfgs does not consume random_state, so none is passed.
lr = LogisticRegression(solver='lbfgs')
# SVC only uses random_state when probability=True, which is not enabled here.
svc = SVC()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xgboost = xgb.XGBClassifier(random_state=RANDOM_STATE)
# GaussianNB and KNeighborsClassifier take no random_state.
naive = GaussianNB()
knn = KNeighborsClassifier()
ada = AdaBoostClassifier(random_state=RANDOM_STATE)

# Order matters only for the order in which the selection loop processes and
# reports the models.
classifiers = [tree, sgd, lr, svc, rf, xgboost, naive, knn, ada]

In [7]:
# Run feature selection once per classifier and persist both artifacts.
for clf in tqdm(classifiers):
    # select_k_variables returns the chosen feature names, a table of feature
    # scores and the artifact name used for the saved files.
    features_names, features_scores, artifact_name = select_k_variables(
        clf, X, y, n_colums, min_cols
    )

    # Persist the list of features chosen for this model.
    save_features_name(features_names, models_path, artifact_name)

    # Persist the feature scores as CSV, without the frame's index column.
    scores_csv = f"{models_path}/features_scores_models/{artifact_name}.csv"
    features_scores.to_csv(scores_csv, index=False)

100%|██████████| 9/9 [1:12:29<00:00, 483.33s/it]  


Model DecisionTreeClassifier works with 25 features
---------------------------------------
DecisionTreeClassifier_0.73 saved successfully!
Model SGDClassifier works with 24 features
---------------------------------------
SGDClassifier_0.78 saved successfully!
Model LogisticRegression works with 17 features
---------------------------------------
LogisticRegression_0.78 saved successfully!
Model SVC works with 41 features
---------------------------------------
SVC_0.78 saved successfully!
Model RandomForestClassifier works with 27 features
---------------------------------------
RandomForestClassifier_0.81 saved successfully!
Model XGBClassifier works with 17 features
---------------------------------------
XGBClassifier_0.82 saved successfully!
Model GaussianNB works with 15 features
---------------------------------------
GaussianNB_0.34 saved successfully!
Model KNeighborsClassifier works with 28 features
---------------------------------------
KNeighborsClassifier_0.76 saved succ

In [17]:
# Final cell: print a completion message.
print('All done :) ')

All done :) 


***