## Feature selection - practising

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
from sklearn.feature_selection import VarianceThreshold
import statsmodels.api as sm

In [2]:
customer_data = pd.read_csv("customer_case_study_data.csv")

In [3]:
customer_data.drop(columns = ["Unnamed: 0", "Effective To Date Month"], inplace = True)

In [4]:
customer_data

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Vehicle Type
0,DK49336,Arizona,4809.216960,No,Basic,College,2/18/11,Employed,M,48029,...,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.800000,Four-Door Car,Medsize,
1,KX64629,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,0,...,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,LZ68649,Washington,14947.917300,No,Basic,Bachelor,2/10/11,Employed,M,22139,...,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.000000,SUV,Medsize,A
3,XL78013,Oregon,22332.439460,Yes,Extended,College,1/11/11,Employed,M,49078,...,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,23675,...,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10905,FE99816,Nevada,15563.369440,No,Premium,Bachelor,1/19/11,Unemployed,F,0,...,,7,Personal Auto,Personal L1,Offer3,Web,1214.400000,Luxury Car,Medsize,A
10906,KX53892,Oregon,5259.444853,No,Basic,College,1/6/11,Employed,F,61146,...,0.0,6,Personal Auto,Personal L3,Offer2,Branch,273.018929,Four-Door Car,Medsize,A
10907,TL39050,Arizona,23893.304100,No,Extended,Bachelor,2/6/11,Employed,F,39837,...,0.0,2,Corporate Auto,Corporate L3,Offer1,Web,381.306996,Luxury SUV,Medsize,
10908,WA60547,California,11971.977650,No,Premium,College,2/13/11,Employed,F,64195,...,4.0,6,Personal Auto,Personal L1,Offer1,Branch,618.288849,SUV,Medsize,A


#### Splitting and cleaning the data

In [5]:
X = customer_data.drop("Total Claim Amount", axis = 1)
y = customer_data["Total Claim Amount"]

In [6]:
X = X._get_numeric_data()

In [7]:
X.isna().sum()

Customer Lifetime Value            0
Income                             0
Monthly Premium Auto               0
Months Since Last Claim          633
Months Since Policy Inception      0
Number of Open Complaints        633
Number of Policies                 0
dtype: int64

In [8]:
months_last_claim_median = X["Months Since Last Claim"].median()
months_last_claim_median
X["Months Since Last Claim"] = X["Months Since Last Claim"].fillna(months_last_claim_median)

In [9]:
nr_of_open_complaints_median = X["Number of Open Complaints"].median()
nr_of_open_complaints_median
X["Number of Open Complaints"] = X["Number of Open Complaints"].fillna(nr_of_open_complaints_median)

In [10]:
X.isna().sum()

Customer Lifetime Value          0
Income                           0
Monthly Premium Auto             0
Months Since Last Claim          0
Months Since Policy Inception    0
Number of Open Complaints        0
Number of Policies               0
dtype: int64

In [11]:
y.isna().sum()

0

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [13]:
X_train.describe()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
count,8728.0,8728.0,8728.0,8728.0,8728.0,8728.0,8728.0
mean,8025.739678,37593.503093,93.291247,15.038497,48.059808,0.357012,2.970784
std,6973.335781,30343.602668,34.710942,9.739232,27.969144,0.878525,2.387027
min,1898.007675,0.0,61.0,0.0,0.0,0.0,1.0
25%,4016.439689,0.0,68.0,7.0,24.0,0.0,1.0
50%,5764.823237,33889.5,83.0,14.0,48.0,0.0,2.0
75%,8956.200142,62198.75,109.0,23.0,71.0,0.0,4.0
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0


### Ridge

In [91]:
model=Ridge(alpha=1000)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.5236017131914261, Test -> 0.5069325760159102


### Lasso

In [93]:
model=Lasso(alpha=0.05)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.40536804877773536, Test -> 0.373271768541234


### ElasticNet

In [94]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.4053677833135618, Test -> 0.373273769220575


### OLS

In [95]:
model=LinearRegression()
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.40536809194635415, Test -> 0.3732694163492163


### RFE

In [92]:
lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 5, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 7 features.
Fitting estimator with 6 features.
Final selected features: 


Unnamed: 0,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
0,62.0,26.0,62.0,0.0,3.0
1,127.0,19.0,12.0,0.0,3.0
2,126.0,4.0,62.0,0.0,1.0
3,68.0,24.0,31.0,0.0,2.0
4,62.0,26.0,81.0,0.0,1.0
...,...,...,...,...,...
8723,108.0,7.0,57.0,0.0,1.0
8724,98.0,12.0,83.0,0.0,1.0
8725,69.0,5.0,78.0,0.0,2.0
8726,70.0,18.0,74.0,0.0,3.0


### Variance threshold method

In [110]:
selector = VarianceThreshold(threshold=100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Final number of numerical columns:  (8728, 4)



Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Policy Inception
0,4665.129599,0.0,62.0,62.0
1,10288.924950,96337.0,127.0,12.0
2,4873.436612,18866.0,126.0,62.0
3,6944.739992,0.0,68.0,31.0
4,2472.469209,63860.0,62.0,81.0
...,...,...,...,...
8723,3810.238281,0.0,108.0,57.0
8724,3815.851163,38651.0,98.0,83.0
8725,7850.590399,0.0,69.0,78.0
8726,4974.235309,0.0,70.0,74.0


### P-value method

In [16]:
X_added_constant = sm.add_constant(X)
X_added_constant

Unnamed: 0,const,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
0,1.0,4809.216960,48029,61,7.0,52,0.0,9
1,1.0,2228.525238,0,64,3.0,26,0.0,1
2,1.0,14947.917300,22139,100,34.0,31,0.0,2
3,1.0,22332.439460,49078,97,10.0,3,0.0,2
4,1.0,9025.067525,23675,117,14.0,31,0.0,7
...,...,...,...,...,...,...,...,...
10905,1.0,15563.369440,0,253,14.0,40,0.0,7
10906,1.0,5259.444853,61146,65,7.0,68,0.0,6
10907,1.0,23893.304100,39837,201,11.0,63,0.0,2
10908,1.0,11971.977650,64195,158,0.0,27,4.0,6


In [17]:
model = sm.OLS(y,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,Total Claim Amount,R-squared:,0.521
Model:,OLS,Adj. R-squared:,0.52
Method:,Least Squares,F-statistic:,1691.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,16:33:06,Log-Likelihood:,-73410.0
No. Observations:,10910,AIC:,146800.0
Df Residuals:,10902,BIC:,146900.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,69.8543,8.042,8.686,0.000,54.090,85.618
Customer Lifetime Value,-0.0005,0.000,-1.779,0.075,-0.001,5.57e-05
Income,-0.0033,6.39e-05,-52.362,0.000,-0.003,-0.003
Monthly Premium Auto,5.3766,0.062,87.392,0.000,5.256,5.497
Months Since Last Claim,-0.0337,0.198,-0.170,0.865,-0.422,0.355
Months Since Policy Inception,-0.1126,0.069,-1.623,0.105,-0.249,0.023
Number of Open Complaints,-1.1232,2.178,-0.516,0.606,-5.392,3.146
Number of Policies,0.0891,0.808,0.110,0.912,-1.495,1.673

0,1,2,3
Omnibus:,1203.053,Durbin-Watson:,1.971
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7450.921
Skew:,0.343,Prob(JB):,0.0
Kurtosis:,6.99,Cond. No.,202000.0


In [18]:
X_added_constant = X_added_constant.drop(["Months Since Last Claim","Number of Open Complaints", "Number of Policies", "Months Since Policy Inception", "Customer Lifetime Value"], axis=1)
model = sm.OLS(y,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,Total Claim Amount,R-squared:,0.52
Model:,OLS,Adj. R-squared:,0.52
Method:,Least Squares,F-statistic:,5914.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,16:35:10,Log-Likelihood:,-73414.0
No. Observations:,10910,AIC:,146800.0
Df Residuals:,10907,BIC:,146900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,63.7976,6.102,10.455,0.000,51.836,75.759
Income,-0.0033,6.38e-05,-52.468,0.000,-0.003,-0.003
Monthly Premium Auto,5.3307,0.056,94.745,0.000,5.220,5.441

0,1,2,3
Omnibus:,1194.055,Durbin-Watson:,1.971
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7327.815
Skew:,0.341,Prob(JB):,0.0
Kurtosis:,6.957,Cond. No.,152000.0
