# Feature Selection & CPA

In [1]:
import pandas as pd

In [3]:
bank_df = pd.read_csv("UniversalBank.csv")

In [4]:
bank_df.head()

Unnamed: 0,UserID,Personal Loan,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,1,0,25,1,49,4,1.6,1,0,1,0,0,0
1,2,0,45,19,34,3,1.5,1,0,1,0,0,0
2,3,0,39,15,11,1,1.0,1,0,0,0,0,0
3,4,0,35,9,100,1,2.7,2,0,0,0,0,0
4,5,0,35,8,45,4,1.0,2,0,0,0,0,1


Setting Variables

In [5]:
X = bank_df.iloc[:,2:13]
y = bank_df["Personal Loan"]

## Recursive feature selection

In [6]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [7]:
lr = LogisticRegression(max_iter=5000)
rfe = RFE(lr, n_features_to_select=3)
model = rfe.fit(X, y)
model.support_

array([False, False, False, False, False, False, False,  True,  True,
       False,  True])

In [11]:
pd.DataFrame(list(zip(X.columns,model.ranking_)), columns = ['predictor','ranking'])

Unnamed: 0,predictor,ranking
0,Age,8
1,Experience,7
2,Income,6
3,Family,5
4,CCAvg,3
5,Education,2
6,Mortgage,9
7,Securities Account,1
8,CD Account,1
9,Online,4


## Lasso

Predictors Standardization

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

Running model

In [13]:
from sklearn.linear_model import Lasso
ls = Lasso(alpha=0.01) # you can control the number of predictors through alpha
model = ls.fit(X_std,y)
model.coef_

array([ 0.        ,  0.        ,  0.13299623,  0.02631129,  0.01431789,
        0.05454379,  0.        , -0.00234322,  0.05970924, -0.        ,
       -0.00513873])

In [14]:
pd.DataFrame(list(zip(X.columns,model.coef_)), columns = ['predictor','coefficient'])


Unnamed: 0,predictor,coefficient
0,Age,0.0
1,Experience,0.0
2,Income,0.132996
3,Family,0.026311
4,CCAvg,0.014318
5,Education,0.054544
6,Mortgage,0.0
7,Securities Account,-0.002343
8,CD Account,0.059709
9,Online,-0.0


## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=0)

model = randomforest.fit(X, y)

In [16]:
model.feature_importances_

array([0.03622587, 0.03631667, 0.33341256, 0.11905741, 0.16432808,
       0.21642604, 0.02860299, 0.00362799, 0.04759828, 0.00677493,
       0.00762918])

In [17]:
pd.DataFrame(list(zip(X.columns,model.feature_importances_)), columns = ['predictor','feature importance'])

Unnamed: 0,predictor,feature importance
0,Age,0.036226
1,Experience,0.036317
2,Income,0.333413
3,Family,0.119057
4,CCAvg,0.164328
5,Education,0.216426
6,Mortgage,0.028603
7,Securities Account,0.003628
8,CD Account,0.047598
9,Online,0.006775


## PCA

In [19]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
pca.fit(X)

pca.explained_variance_ratio_


array([8.21325056e-01, 1.57747806e-01, 2.05275482e-02, 1.39734364e-04,
       1.01152134e-04, 7.01739449e-05, 4.17376965e-05, 1.91024279e-05,
       1.66944409e-05, 8.05191858e-06])

In [20]:
pca.components_

array([[-1.75106316e-03, -1.48430525e-03,  1.15367832e-01,
        -3.18360921e-04,  2.42992635e-03, -3.50541511e-04,
         9.93317075e-01, -1.63891462e-05,  2.26106317e-04,
        -2.48981873e-05, -3.23953616e-05],
       [-1.54663514e-02, -1.33334994e-02,  9.92787935e-01,
        -3.97162302e-03,  2.46167496e-02, -3.42748375e-03,
        -1.15416443e-01, -8.39527604e-06,  7.96097959e-04,
         1.74633262e-04, -6.14803688e-06],
       [-7.06621550e-01, -7.07279337e-01, -2.05205421e-02,
         4.19500645e-03,  1.47275876e-03, -8.52646037e-04,
         7.82794178e-05,  1.88155161e-05, -2.81172367e-04,
        -4.45504413e-04, -2.31554927e-04],
       [ 3.47877174e-02, -3.21161265e-02, -2.47494270e-02,
        -2.94613692e-02,  9.97997058e-01, -1.22946635e-02,
         4.31016950e-04,  5.30389339e-03,  6.50014892e-03,
        -7.11290622e-03, -2.72296092e-03],
       [ 8.43257161e-02, -7.85621844e-02,  3.88428329e-03,
         9.88786091e-01,  2.48951625e-02,  9.04223224e-02,
  

## Comparison model with CPA & without 

Without CPA

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 5)

In [24]:
lr1 = LogisticRegression(max_iter=5000)
model1 = lr1.fit(X_train,y_train)

In [25]:
from sklearn import metrics
y_test_pred = model1.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred)

0.9509090909090909

With CPA

In [26]:
pca = PCA(n_components=3)
pca.fit(X)
X_new = pca.transform(X)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.33, random_state = 5)

In [29]:
lr2 = LogisticRegression(max_iter=1000)
model2 = lr2.fit(X_train,y_train)

In [30]:
y_test_pred = model2.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred)

0.9115151515151515