In [1]:
from sklearn.feature_selection import SequentialFeatureSelector
from datasets import load_diabetes
from tools import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the diabetes data through `load_diabetes` imported from the `datasets`
# module above. The call returns three objects, unpacked here as the original
# frame plus a train and a test frame.
# NOTE(review): presumably train/test are a split of the original frame —
# confirm against the helper, which is not visible in this notebook.
df_original, df_train, df_test = load_diabetes()

In [3]:
# Separate the predictors from the 'target' column for each of the three frames.
train_X, train_y = df_train.drop(columns=['target']), df_train['target']
test_X, test_y = df_test.drop(columns=['target']), df_test['target']
original_X, original_y = df_original.drop(columns=['target']), df_original['target']

# Predictor column labels, taken from the training frame.
features = train_X.columns

## Forward selection

In [4]:
# Forward selection: greedy, each step adds one feature; candidates are
# scored with cross-validation. Keep 5 features, fitted on the training split.
selection_forward = SequentialFeatureSelector(
    estimator=LinearRegression(),
    n_features_to_select=5,
    direction='forward',
)
selection_forward = selection_forward.fit(train_X, train_y)

In [5]:
# Use the selector's boolean support mask to pick the chosen column labels,
# then display them as the cell output.
selected_features = train_X.columns[selection_forward.get_support()]
selected_features

Index(['sex', 'bmi', 'bp', 's3', 's5'], dtype='object')

In [6]:
# Fit a plain linear regression on the training split, restricted to the
# currently selected feature subset.
model = LinearRegression().fit(train_X[selected_features], train_y)

In [7]:
# Tabulate the fitted coefficients against their feature names; the recorded
# output lists feature/coef rows ordered from largest to smallest coefficient.
# NOTE(review): `coef_table` is not defined in this notebook — presumably it
# comes from the star import of `tools`; confirm.
coef_table(model.coef_, selected_features)

Unnamed: 0,feature,coef
1,bmi,474.845438
4,s5,468.60172
2,bp,324.758076
0,sex,-269.749955
3,s3,-413.044757


In [8]:
# Report the model's score on the train and test splits using the selected
# features only; the recorded output prints a "Train score" and a "Test score".
# NOTE(review): `model_performance` is not defined in this notebook — presumably
# from `tools`, and presumably the score is R^2; confirm.
model_performance(model, train_X[selected_features], train_y, test_X[selected_features], test_y)

Train score
0.5003648131085888
Test score
0.5118888487575156


In [9]:
# Sweep the number of forward-selected features and record, for each subset
# size, the mean cross-validated score of a plain linear regression.
#
# SequentialFeatureSelector accepts an integer n_features_to_select from 1 up
# to n_features - 1, so the sweep covers that whole range (the previous upper
# bound of len(features) - 1 stopped one subset size short).
feature_counts = range(1, len(features))
scores = []

for n_selected in feature_counts:
    # Loop-local names are used on purpose so that the notebook-level
    # `selection_forward` / `selected_features` are not overwritten here.
    forward_sweep_selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_selected, direction='forward'
    ).fit(train_X, train_y)
    forward_sweep_features = train_X.columns[forward_sweep_selector.get_support()]
    # NOTE(review): features are selected on the training split but scored by
    # cross-validation on the original frame — confirm this is intended.
    scores.append(
        cross_val_score(LinearRegression(), original_X[forward_sweep_features], original_y).mean()
    )

result = pd.DataFrame({'n of features': list(feature_counts), 'R^2 (mean)': scores})
px.line(result, x='n of features', y='R^2 (mean)',
        title='Forward selection: mean cross-validated R^2 by number of features')

## Backward selection

In [10]:
# Backward selection: greedy, each step removes one feature; candidates are
# scored with cross-validation. Keep 6 features, fitted on the training split.
selection_backward = SequentialFeatureSelector(
    estimator=LinearRegression(),
    n_features_to_select=6,
    direction='backward',
)
selection_backward = selection_backward.fit(train_X, train_y)

In [11]:
# Replace `selected_features` with the columns kept by the backward selector
# (via its boolean support mask) and display them as the cell output.
selected_features = train_X.columns[selection_backward.get_support()]
selected_features

Index(['sex', 'bmi', 'bp', 's1', 's3', 's5'], dtype='object')

In [12]:
# Refit a plain linear regression on the training split using the feature
# subset currently held in `selected_features`.
model = LinearRegression().fit(train_X[selected_features], train_y)

In [13]:
# Tabulate the refitted coefficients against their feature names; the recorded
# output lists feature/coef rows ordered from largest to smallest coefficient.
# NOTE(review): `coef_table` is presumably provided by `tools`; confirm.
coef_table(model.coef_, selected_features)

Unnamed: 0,feature,coef
5,s5,549.108988
1,bmi,491.131235
2,bp,335.172502
3,s1,-141.93835
0,sex,-264.385753
4,s3,-361.777537


In [14]:
# Report train and test scores for the model fitted on the current
# `selected_features`; the recorded output prints both values.
# NOTE(review): `model_performance` is presumably provided by `tools`, and the
# score is presumably R^2; confirm.
model_performance(model, train_X[selected_features], train_y, test_X[selected_features], test_y)

Train score
0.5053544710155173
Test score
0.5160856731926349


In [15]:
# Sweep the number of features kept by backward elimination and record, for
# each subset size, the mean cross-validated score of a plain linear regression.
#
# SequentialFeatureSelector accepts an integer n_features_to_select from 1 up
# to n_features - 1, so the sweep covers that whole range (the previous upper
# bound of len(features) - 1 stopped one subset size short).
feature_counts = range(1, len(features))
scores = []

for n_selected in feature_counts:
    # The selector gets a backward-specific, loop-local name: it was previously
    # stored in `selection_forward`, which mislabelled it and overwrote the
    # forward selector. `selected_features` is likewise left untouched.
    backward_sweep_selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_selected, direction='backward'
    ).fit(train_X, train_y)
    backward_sweep_features = train_X.columns[backward_sweep_selector.get_support()]
    # NOTE(review): features are selected on the training split but scored by
    # cross-validation on the original frame — confirm this is intended.
    scores.append(
        cross_val_score(LinearRegression(), original_X[backward_sweep_features], original_y).mean()
    )

result = pd.DataFrame({'n of features': list(feature_counts), 'R^2 (mean)': scores})
px.line(result, x='n of features', y='R^2 (mean)',
        title='Backward selection: mean cross-validated R^2 by number of features')

## Selection by coefficient

In [16]:
# "Best" features chosen by their coefficients in an ordinary regression.
# NOTE(review): the list is hard-coded; the regression it was read from is not
# shown in this notebook — confirm it is still current.
highest_coef = ['s5', 's2', 'bmi', 'bp', 's4']

# Fit on the training split using only those columns; the fitted estimator is
# the cell's displayed value.
model = LinearRegression()
model.fit(train_X[highest_coef], train_y)

LinearRegression()

In [17]:
# Mean cross-validated score of a fresh linear regression on the training
# split, restricted to the hand-picked `highest_coef` columns.
cross_val_score(LinearRegression(), train_X[highest_coef], train_y).mean()

0.42307928659594174

In [18]:
# Report train and test scores for the model fitted on the `highest_coef`
# columns; the recorded output prints both values.
# NOTE(review): `model_performance` is presumably provided by `tools`, and the
# score is presumably R^2; confirm.
model_performance(model, train_X[highest_coef], train_y, test_X[highest_coef], test_y)

Train score
0.47345699498731386
Test score
0.5335312130632605


In [19]:
# Tabulate the fitted coefficients against the `highest_coef` feature names;
# the recorded output lists feature/coef rows from largest to smallest.
# NOTE(review): `coef_table` is presumably provided by `tools`; confirm.
coef_table(model.coef_, highest_coef)

Unnamed: 0,feature,coef
2,bmi,557.687106
0,s5,453.123924
4,s4,304.344595
3,bp,282.437105
1,s2,-258.566063
