In [30]:
from sklearn.feature_selection import SequentialFeatureSelector
from datasets import load_happines
from tools import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [31]:
# Load the happiness data as two frames (training and test) via the project's
# `datasets.load_happines` helper, then list the available columns.
df_train, df_test = load_happines()
df_train.columns

Index(['region', 'economy', 'family', 'health', 'freedom', 'trust',
       'generosity', 'target'],
      dtype='object')

In [32]:
# Predictors are every column except 'region' and the response 'target';
# the response itself is kept as a separate series for each split.
train_X, train_y = df_train.drop(columns=['region', 'target']), df_train['target']
test_X, test_y = df_test.drop(columns=['region', 'target']), df_test['target']
# Keep the predictor names around for later cells.
features = train_X.columns

In [33]:
# Standardise the predictors. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics enter the fit.
scaler = StandardScaler().fit(train_X)

# transform() hands back an unlabelled array, so rebuild the DataFrames with
# the original column names AND the original row index; without the index the
# scaled predictors would silently lose label alignment with train_y / test_y.
train_X = pd.DataFrame(scaler.transform(train_X), columns=features, index=train_X.index)
test_X = pd.DataFrame(scaler.transform(test_X), columns=features, index=test_X.index)

## Forward selection

In [34]:
# Forward selection: greedily add one feature per step, scoring the candidates
# with cross-validation, until 3 features have been selected.
selection_forward = SequentialFeatureSelector(
    estimator=LinearRegression(),
    n_features_to_select=3,
    direction='forward',
)
selection_forward = selection_forward.fit(train_X, train_y)

In [35]:
# Column names kept by the forward selector (looked up by position).
selected_features = train_X.columns[selection_forward.get_support(indices=True)]
selected_features

Index(['economy', 'family', 'freedom'], dtype='object')

In [36]:
# Linear regression fitted on the forward-selected predictors only.
model = LinearRegression().fit(train_X[selected_features], train_y)

LinearRegression()

In [37]:
# Mean score over 10 cross-validation folds of the training data for a fresh
# linear regression on the selected predictors (R^2 for a regressor).
cross_val_score(
    estimator=LinearRegression(),
    X=train_X[selected_features],
    y=train_y,
    cv=10,
).mean()

0.7074187013237277

In [38]:
# Show the fitted coefficients next to their feature names.
# NOTE(review): coef_table is not defined in this notebook; presumably it comes
# from the wildcard import of the project's `tools` module — confirm.
coef_table(model.coef_, selected_features)

Unnamed: 0,feature,coef
0,economy,0.551227
1,family,0.364425
2,freedom,0.283526


In [39]:
# Report the fitted model's score on the training split and on the test split.
# NOTE(review): model_performance is not defined in this notebook; presumably
# it comes from the wildcard import of the project's `tools` module — confirm.
model_performance(model, train_X[selected_features], train_y, test_X[selected_features], test_y)

Train score
0.7547754126367489
Test score
0.7137697841546065


In [40]:
# How does the cross-validated R^2 change with the number of features kept by
# forward selection?
# SequentialFeatureSelector requires n_features_to_select < n_features, so the
# sweep covers 1 .. n_features - 1 (1..5 for the six predictors used here).
feature_counts = range(1, train_X.shape[1])
scores = []

for n_selected in feature_counts:
    # Loop-local names on purpose: the notebook-level `selection_forward` and
    # `selected_features` (the 3-feature result) must not be rebound here,
    # otherwise earlier cells would no longer match their displayed outputs.
    selector = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_selected,
                                         direction='forward').fit(train_X, train_y)
    subset = train_X.columns[selector.get_support()]
    scores.append(cross_val_score(LinearRegression(), train_X[subset], train_y, cv=10).mean())

result = pd.DataFrame({'n of features': list(feature_counts), 'R^2 (mean)': scores})
# Title the figure so it cannot be confused with the backward-selection plot.
px.line(result, x='n of features', y='R^2 (mean)',
        title='Forward selection: mean 10-fold CV R^2 vs. number of features')

## Backward selection

In [41]:
# Backward selection: greedily remove one feature per step, scoring the
# candidates with cross-validation, until 3 features remain.
selection_backward = SequentialFeatureSelector(LinearRegression(), n_features_to_select=3,
                                        direction='backward').fit(train_X, train_y)

In [42]:
# Column names kept by the backward selector (looked up by position).
selected_features = train_X.columns[selection_backward.get_support(indices=True)]
selected_features

Index(['economy', 'family', 'freedom'], dtype='object')

## Selection by regression coefficients

In [46]:
# Alternative to sequential selection: keep the "best" features as ranked by
# the coefficients of an ordinary regression.
# NOTE(review): the list below is hard-coded; the regression it was read from
# is not visible in this part of the notebook — confirm it is still current.
model = LinearRegression()
highest_coef = ['family', 'economy', 'health']
model.fit(train_X[highest_coef], train_y)

LinearRegression()

In [47]:
# Mean score over 10 cross-validation folds of the training data for a fresh
# linear regression on the coefficient-ranked predictors (R^2 for a regressor).
cross_val_score(
    estimator=LinearRegression(),
    X=train_X[highest_coef],
    y=train_y,
    cv=10,
).mean()

0.6817865365518576

In [48]:
# Here the scores differ: cross-validation on the training data need not
# generalise well to data from the following year (effect of time and of
# other factors).
model_performance(model, train_X[highest_coef], train_y, test_X[highest_coef], test_y)

Train score
0.7274643883039997
Test score
0.6467712916862787


In [49]:
# Show the fitted coefficients next to the hand-picked feature names.
# NOTE(review): coef_table is not defined in this notebook; presumably it comes
# from the wildcard import of the project's `tools` module — confirm.
coef_table(model.coef_, highest_coef)

Unnamed: 0,feature,coef
0,family,0.46073
1,economy,0.356872
2,health,0.290514


In [50]:
# How does the cross-validated R^2 change with the number of features kept by
# backward selection?
# SequentialFeatureSelector requires n_features_to_select < n_features, so the
# sweep covers 1 .. n_features - 1 (1..5 for the six predictors used here).
feature_counts = range(1, train_X.shape[1])
scores = []

for n_selected in feature_counts:
    # Loop-local names on purpose: a backward selector must not be stored under
    # `selection_forward`, and the notebook-level `selection_backward` /
    # `selected_features` results are left untouched as well.
    selector = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_selected,
                                         direction='backward').fit(train_X, train_y)
    subset = train_X.columns[selector.get_support()]
    scores.append(cross_val_score(LinearRegression(), train_X[subset], train_y, cv=10).mean())

result = pd.DataFrame({'n of features': list(feature_counts), 'R^2 (mean)': scores})
# Title the figure so it cannot be confused with the forward-selection plot.
px.line(result, x='n of features', y='R^2 (mean)',
        title='Backward selection: mean 10-fold CV R^2 vs. number of features')