In [1]:
from sklearn.feature_selection import SequentialFeatureSelector
from datasets import load_wine
from tools import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [2]:
# load_wine() is imported from the `datasets` module and returns three frames,
# unpacked here as the full, training and test sets (roles inferred from the
# variable names — TODO confirm against the loader).
df_original, df_train, df_test = load_wine()
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
779,7.1,0.52,0.03,2.6,0.076,21.0,92.0,0.99745,3.5,0.6,9.8,5
185,8.9,0.31,0.57,2.0,0.111,26.0,85.0,0.9971,3.26,0.53,9.7,5
309,7.4,0.29,0.38,1.7,0.062,9.0,30.0,0.9968,3.41,0.53,9.5,6
1513,6.4,0.56,0.15,1.8,0.078,17.0,65.0,0.99294,3.33,0.6,10.5,6
1185,7.0,0.43,0.3,2.0,0.085,6.0,39.0,0.99346,3.33,0.46,11.9,6


In [3]:
# Split every frame into its predictors (all columns except 'target')
# and its response (the 'target' column).
train_X, train_y = df_train.drop(columns=['target']), df_train['target']
test_X, test_y = df_test.drop(columns=['target']), df_test['target']
original_X, original_y = df_original.drop(columns=['target']), df_original['target']
# Predictor column labels; used below to size the feature-count sweeps.
features = train_X.columns

In [33]:
# Optional standardization step, currently disabled (every line is commented
# out), so the cells below operate on the unscaled predictors.
# When enabled, it fits StandardScaler on the training predictors only and
# applies that same transform to both the training and the test predictors.
# scaler = StandardScaler().fit(train_X)
# train_X = scaler.transform(train_X)
# test_X = scaler.transform(test_X)

# Re-wrap the transformed values as DataFrames with the original column labels.
# train_X = pd.DataFrame(train_X, columns=features)
# test_X = pd.DataFrame(test_X, columns=features)

## Forward selection

In [13]:
# Forward selection: greedy, adds one feature per step and compares the
# candidates by cross-validated score until 6 features have been chosen.
selection_forward = SequentialFeatureSelector(
    estimator=LinearRegression(),
    n_features_to_select=6,
    direction='forward',
)
selection_forward = selection_forward.fit(train_X, train_y)

In [14]:
# Column labels of the features kept by the forward selector
# (positions of the kept columns are used instead of the boolean mask).
selected_features = train_X.columns[selection_forward.get_support(indices=True)]
selected_features

Index(['volatile acidity', 'chlorides', 'total sulfur dioxide', 'pH',
       'sulphates', 'alcohol'],
      dtype='object')

In [15]:
# Ordinary least squares restricted to the selected columns.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(train_X[selected_features], train_y)

In [16]:
# Mean cross-validated score of a fresh linear model on the selected columns
# (cv and scoring are left at the scikit-learn defaults).
cross_val_score(
    estimator=LinearRegression(),
    X=train_X[selected_features],
    y=train_y,
).mean()

0.34441401952575557

In [17]:
# Tabulate the fitted coefficients next to their feature names; the rendered
# table lists them from the largest to the smallest coefficient.
# coef_table is not defined in this notebook — presumably it comes from the
# star-imported `tools` module.
coef_table(model.coef_, selected_features)

Unnamed: 0,feature,coef
4,sulphates,0.978604
5,alcohol,0.273421
2,total sulfur dioxide,-0.002623
3,pH,-0.415286
0,volatile acidity,-1.178353
1,chlorides,-2.180536


In [18]:
# Report the fitted model's score on the training and on the test data (the
# output prints a "Train score" and a "Test score").
# model_performance is not defined in this notebook — presumably it comes from
# the star-imported `tools` module.
model_performance(model, train_X[selected_features], train_y, test_X[selected_features], test_y)

Train score
0.36082293132011867
Test score
0.3297192218645292


In [12]:
# Sweep the number of forward-selected features and record the mean 10-fold
# cross-validated R^2 of a linear model on each selected subset.
#
# SequentialFeatureSelector accepts integer targets from 1 to n_features - 1,
# so the sweep runs over range(1, len(features)); the previous upper bound of
# len(features) - 1 silently skipped the largest valid subset size.
feature_counts = list(range(1, len(features)))
scores = []

for n_selected in feature_counts:
    # Loop-local names: the 6-feature `selection_forward` / `selected_features`
    # produced by the cells above must not be overwritten by this sweep.
    sweep_selector = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_selected,
                                               direction='forward').fit(train_X, train_y)
    sweep_features = train_X.columns[sweep_selector.get_support()]
    # The selector ranks candidates with its default CV; the reported score
    # below is computed separately with 10 folds.
    scores.append(cross_val_score(LinearRegression(), train_X[sweep_features], train_y, cv=10).mean())

result = pd.DataFrame({'n of features': feature_counts, 'R^2 (mean)': scores})
px.line(result, x='n of features', y='R^2 (mean)',
        title='Forward selection: mean CV R^2 by number of features')

## Backward selection

In [19]:
# Backward selection: greedy, starts from all features and removes one per
# step, comparing the candidates by cross-validated score until 6 remain.
selection_backward = SequentialFeatureSelector(
    estimator=LinearRegression(),
    n_features_to_select=6,
    direction='backward',
)
selection_backward = selection_backward.fit(train_X, train_y)

In [20]:
# Column labels of the features kept by the backward selector
# (positions of the kept columns are used instead of the boolean mask).
selected_features = train_X.columns[selection_backward.get_support(indices=True)]
selected_features

Index(['volatile acidity', 'chlorides', 'total sulfur dioxide', 'pH',
       'sulphates', 'alcohol'],
      dtype='object')

In [25]:
# Sweep the number of backward-selected features and record the mean 10-fold
# cross-validated R^2 of a linear model on each selected subset.
#
# SequentialFeatureSelector accepts integer targets from 1 to n_features - 1,
# so the sweep runs over range(1, len(features)); the previous upper bound of
# len(features) - 1 silently skipped the largest valid subset size.
feature_counts = list(range(1, len(features)))
scores = []

for n_selected in feature_counts:
    # Loop-local names: the backward selector used to be stored in
    # `selection_forward`, which both mislabeled it and replaced the forward
    # selector; `selected_features` from the cells above is left intact too.
    sweep_selector = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_selected,
                                               direction='backward').fit(train_X, train_y)
    sweep_features = train_X.columns[sweep_selector.get_support()]
    # The selector ranks candidates with its default CV; the reported score
    # below is computed separately with 10 folds.
    scores.append(cross_val_score(LinearRegression(), train_X[sweep_features], train_y, cv=10).mean())

result = pd.DataFrame({'n of features': feature_counts, 'R^2 (mean)': scores})
px.line(result, x='n of features', y='R^2 (mean)',
        title='Backward selection: mean CV R^2 by number of features')

## Dle coef

In [26]:
# "Best" features according to the coefficients of the plain linear regression.
# NOTE(review): the fit these six names were read from is not shown here, and
# the predictors are unscaled at this point — confirm how the list was derived.
highest_coef = [
    'alcohol',
    'sulphates',
    'residual sugar',
    'free sulfur dioxide',
    'fixed acidity',
    'density',
]
model = LinearRegression()
model.fit(train_X[highest_coef], train_y)

LinearRegression()

In [27]:
# Mean cross-validated score of a fresh linear model on the hand-picked
# columns (cv and scoring are left at the scikit-learn defaults).
cross_val_score(
    estimator=LinearRegression(),
    X=train_X[highest_coef],
    y=train_y,
).mean()

0.2644664033834167

In [28]:
# Note the difference here: the cross-validated score may not reflect how well
# the model generalizes to data from the following year (effects of time and
# other influences).
# model_performance is not defined in this notebook — presumably it comes from
# the star-imported `tools` module; its output prints a train and a test score.
model_performance(model, train_X[highest_coef], train_y, test_X[highest_coef], test_y)

Train score
0.28108829389799705
Test score
0.3354792282356741


In [29]:
# Tabulate the coefficients of the model fitted on the hand-picked features;
# the rendered table lists them from the largest to the smallest coefficient.
# coef_table is not defined in this notebook — presumably it comes from the
# star-imported `tools` module.
coef_table(model.coef_, highest_coef)

Unnamed: 0,feature,coef
1,sulphates,1.077099
0,alcohol,0.27704
4,fixed acidity,0.094391
2,residual sugar,0.026216
3,free sulfur dioxide,-0.00259
5,density,-72.159067
