In [1]:
from sklearn.feature_selection import SequentialFeatureSelector
from datasets import load_diabetes
from tools import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [2]:
# `load_diabetes` here is the helper imported from the local `datasets` module.
# Its three return values are unpacked as the full frame plus train and test frames
# (roles per the variable names; how the split is made lives in the helper).
df_original, df_train, df_test = load_diabetes()

In [3]:
# Split each frame into predictors (every column except 'target') and the response.
train_X, train_y = df_train.drop(columns=['target']), df_train['target']
test_X, test_y = df_test.drop(columns=['target']), df_test['target']
original_X, original_y = df_original.drop(columns=['target']), df_original['target']

# Predictor column labels; their count bounds the feature-count sweeps further down.
features = train_X.columns

In [4]:
# NOTE(review): this rebinds `LinearRegression`, shadowing the scikit-learn class
# imported at the top of the notebook, to a single SMWrapper instance built around
# `sm.OLS`. Every later cell shares this one object.
# `SMWrapper` and `sm` are not imported by name; presumably they arrive through
# `from tools import *` - confirm.
LinearRegression = SMWrapper(sm.OLS)

## Forward selection

In [5]:
# Forward selection: greedily adds one feature per step, judged by cross-validation,
# until 5 features have been picked.
selection_forward = SequentialFeatureSelector(
    LinearRegression,
    direction='forward',
    n_features_to_select=5,
)
selection_forward = selection_forward.fit(train_X, train_y)

In [6]:
# Column labels retained by the forward selector (boolean support mask -> names).
forward_mask = selection_forward.get_support()
selected_features = train_X.columns[forward_mask]
selected_features

Index(['sex', 'bmi', 'bp', 's3', 's5'], dtype='object')

In [7]:
# Fit the shared OLS wrapper on the forward-selected columns only;
# `model` holds whatever `fit` returns.
model = LinearRegression.fit(train_X[selected_features], train_y)

In [9]:
# Show the regression summary for the forward-selected model
# (fit statistics, per-coefficient estimates, residual diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.5
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,60.69
Date:,"Mon, 14 Jun 2021",Prob (F-statistic):,1.1300000000000001e-43
Time:,10:57:53,Log-Likelihood:,-1670.7
No. Observations:,309,AIC:,3353.0
Df Residuals:,303,BIC:,3376.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,150.7556,3.104,48.566,0.000,144.647,156.864
sex,-269.7500,72.884,-3.701,0.000,-413.173,-126.327
bmi,474.8454,77.015,6.166,0.000,323.294,626.397
bp,324.7581,76.986,4.218,0.000,173.263,476.254
s3,-413.0448,87.842,-4.702,0.000,-585.903,-240.186
s5,468.6017,80.378,5.830,0.000,310.431,626.772

0,1,2,3
Omnibus:,4.18,Durbin-Watson:,1.954
Prob(Omnibus):,0.124,Jarque-Bera (JB):,3.011
Skew:,0.084,Prob(JB):,0.222
Kurtosis:,2.547,Cond. No.,34.3


In [10]:
# Report R-squared and MSE on both the train and test frames,
# restricted to the forward-selected columns.
model_performance(model, train_X[selected_features], train_y, test_X[selected_features], test_y)

R-squared:
Train score
0.5003648131085888
Test score
0.5118888487575164
MSE:
Train score
2909.224291996577
Test score
3015.9682455413413


In [14]:
# Sweep the number of forward-selected features and record the mean
# cross-validated score for each count.
#
# SequentialFeatureSelector accepts an integer n_features_to_select from 1 up to
# n_features - 1. range() excludes its stop value, so the stop must be
# len(features) for the sweep to reach that last valid count.
feature_counts = range(1, len(features))
scores = []

for n_selected in feature_counts:
    # Loop-local names: the 5-feature `selection_forward` and its
    # `selected_features` from the cells above must not be overwritten here.
    sweep_selector = SequentialFeatureSelector(LinearRegression, n_features_to_select=n_selected,
                                               direction='forward').fit(train_X, train_y)
    sweep_features = train_X.columns[sweep_selector.get_support()]
    # NOTE(review): features are chosen on the train frame but scored on the
    # full (original) frame - confirm this is intended.
    scores.append(cross_val_score(LinearRegression, original_X[sweep_features], original_y).mean())

# One row per feature count; the counts come from the same range the loop used.
result = pd.DataFrame({'n of features': list(feature_counts), 'R^2 (mean)': scores})
px.line(result, x='n of features', y='R^2 (mean)')

## Backward selection

In [15]:
# Backward selection: starts from all features and greedily removes one per step,
# judged by cross-validation, until 6 features remain.
selection_backward = SequentialFeatureSelector(
    LinearRegression,
    direction='backward',
    n_features_to_select=6,
)
selection_backward = selection_backward.fit(train_X, train_y)

In [16]:
# Column labels retained by the backward selector (boolean support mask -> names).
backward_mask = selection_backward.get_support()
selected_features = train_X.columns[backward_mask]
selected_features

Index(['sex', 'bmi', 'bp', 's1', 's3', 's5'], dtype='object')

In [17]:
# Refit the shared OLS wrapper, this time on the backward-selected columns;
# `model` holds whatever `fit` returns.
model = LinearRegression.fit(train_X[selected_features], train_y)

In [18]:
# Show the regression summary for the backward-selected model
# (fit statistics, per-coefficient estimates, residual diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.505
Model:,OLS,Adj. R-squared:,0.496
Method:,Least Squares,F-statistic:,51.42
Date:,"Mon, 14 Jun 2021",Prob (F-statistic):,2.07e-43
Time:,10:58:35,Log-Likelihood:,-1669.1
No. Observations:,309,AIC:,3352.0
Df Residuals:,302,BIC:,3378.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,150.7275,3.094,48.720,0.000,144.639,156.816
sex,-264.3858,72.704,-3.636,0.000,-407.457,-121.315
bmi,491.1312,77.321,6.352,0.000,338.975,643.287
bp,335.1725,76.959,4.355,0.000,183.728,486.617
s1,-141.9383,81.322,-1.745,0.082,-301.968,18.091
s3,-361.7775,92.343,-3.918,0.000,-543.495,-180.060
s5,549.1090,92.439,5.940,0.000,367.203,731.015

0,1,2,3
Omnibus:,3.449,Durbin-Watson:,1.963
Prob(Omnibus):,0.178,Jarque-Bera (JB):,2.624
Skew:,0.08,Prob(JB):,0.269
Kurtosis:,2.578,Cond. No.,39.8


In [19]:
# Report R-squared and MSE on both the train and test frames,
# restricted to the backward-selected columns.
model_performance(model, train_X[selected_features], train_y, test_X[selected_features], test_y)

R-squared:
Train score
0.5053544710155173
Test score
0.5160856731926371
MSE:
Train score
2880.1710259888246
Test score
2990.0366740207637


In [20]:
# Sweep the number of backward-selected features and record the mean
# cross-validated score for each count.
#
# SequentialFeatureSelector accepts an integer n_features_to_select from 1 up to
# n_features - 1. range() excludes its stop value, so the stop must be
# len(features) for the sweep to reach that last valid count.
feature_counts = range(1, len(features))
scores = []

for n_selected in feature_counts:
    # Loop-local names: this is a *backward* selector, so it must not be stored
    # under `selection_forward`, and the `selected_features` chosen in the
    # cells above must not be overwritten.
    sweep_selector = SequentialFeatureSelector(LinearRegression, n_features_to_select=n_selected,
                                               direction='backward').fit(train_X, train_y)
    sweep_features = train_X.columns[sweep_selector.get_support()]
    # NOTE(review): features are chosen on the train frame but scored on the
    # full (original) frame - confirm this is intended.
    scores.append(cross_val_score(LinearRegression, original_X[sweep_features], original_y).mean())

# One row per feature count; the counts come from the same range the loop used.
result = pd.DataFrame({'n of features': list(feature_counts), 'R^2 (mean)': scores})
px.line(result, x='n of features', y='R^2 (mean)')

## Dle coef

In [25]:
# Hand-picked "best" features: per the original note they were chosen by their
# coefficients in an ordinary regression fit (that fit is not part of this section).
highest_coef = ['s5','s2','bmi','bp','s4']
# Fit the shared OLS wrapper on just those columns.
model = LinearRegression.fit(train_X[highest_coef], train_y)

In [26]:
# Mean cross-validated score of the wrapper on the train frame, using only the
# hand-picked columns. Unlike the sweeps above, this scores on train, not the original frame.
cross_val_score(LinearRegression, train_X[highest_coef], train_y).mean()

0.4230792865959362

In [27]:
# Show the regression summary for the model fitted on the hand-picked columns
# (fit statistics, per-coefficient estimates, residual diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.473
Model:,OLS,Adj. R-squared:,0.465
Method:,Least Squares,F-statistic:,54.49
Date:,"Mon, 14 Jun 2021",Prob (F-statistic):,2.93e-40
Time:,10:59:02,Log-Likelihood:,-1678.8
No. Observations:,309,AIC:,3370.0
Df Residuals:,303,BIC:,3392.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,151.1203,3.187,47.411,0.000,144.848,157.393
s5,453.1239,95.260,4.757,0.000,265.668,640.580
s2,-258.5661,90.576,-2.855,0.005,-436.804,-80.328
bmi,557.6871,78.233,7.129,0.000,403.739,711.635
bp,282.4371,77.692,3.635,0.000,129.553,435.321
s4,304.3446,114.395,2.660,0.008,79.235,529.454

0,1,2,3
Omnibus:,11.536,Durbin-Watson:,1.993
Prob(Omnibus):,0.003,Jarque-Bera (JB):,6.741
Skew:,0.185,Prob(JB):,0.0344
Kurtosis:,2.379,Cond. No.,45.4


In [28]:
# Report R-squared and MSE on both the train and test frames,
# restricted to the hand-picked `highest_coef` columns.
model_performance(model, train_X[highest_coef], train_y, test_X[highest_coef], test_y)

R-squared:
Train score
0.473456994987314
Test score
0.5335312130632615
MSE:
Train score
3065.9003632118174
Test score
2882.2432049672566
