In [None]:
import dalex as dx
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the apartments dataset bundled with dalex and preview its first rows.
dalex_df = dx.datasets.load_apartments()
dalex_df.head()

In [None]:
# Print the column dtypes and non-null counts of the apartments frame.
dalex_df.info()

Jako drugi zbiór danych wziąłem zbiór dotyczący mieszkań w Bostonie z Lab1.

In [None]:
# Build a single DataFrame from the Boston housing bunch: the feature matrix
# with its column names, plus the target stored as the `MEDV` column.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
boston_dict = load_boston()
boston_df = pd.DataFrame(
    data=boston_dict.data,
    columns=boston_dict.feature_names,
).assign(MEDV=boston_dict.target)

boston_df.head()

In [None]:
# Print the column dtypes and non-null counts of the Boston frame.
boston_df.info()

In [None]:
# Number of distinct values in `district` (a missing value, if present,
# counts as its own category).
dalex_df['district'].nunique(dropna=False)

Kolumna `district` ma tylko 10 kategorii, więc zakodujemy ją metodą one-hot encoding.

In [None]:
# One-hot encode `district`: the dummy columns (prefixed with "District") are
# placed in front of the remaining, non-district columns.
district_dummies = pd.get_dummies(dalex_df['district'], prefix='District')
dalex_df_enc = pd.concat(
    [district_dummies, dalex_df.drop(columns='district')],
    axis=1,
)

# Reorder the columns more intuitively: rotate the last four columns to the front.
cols = dalex_df_enc.columns.tolist()
cols = [*cols[-4:], *cols[:-4]]

dalex_df_enc = dalex_df_enc.loc[:, cols]
dalex_df_enc.head()

In [None]:
# Features / target for the apartments data (target column: m2_price).
Y_dalex = dalex_df_enc['m2_price']
X_dalex = dalex_df_enc.drop(columns='m2_price')

# Features / target for the Boston data (target column: MEDV).
Y_boston = boston_df['MEDV']
X_boston = boston_df.drop(columns=['MEDV'])

# Both datasets use the same hold-out configuration: 33% test rows, fixed seed.
split_kwargs = dict(test_size=0.33, random_state=34)

X_train_dalex, X_test_dalex, y_train_dalex, y_test_dalex = train_test_split(
    X_dalex, Y_dalex, **split_kwargs)

X_train_boston, X_test_boston, y_train_boston, y_test_boston = train_test_split(
    X_boston, Y_boston, **split_kwargs)

# SVM

In [None]:
# SVR with default hyperparameters, fitted on the current apartments split
# and evaluated on its hold-out part.
svm = SVR().fit(X_train_dalex, y_train_dalex)
y_hat_dalex = svm.predict(X_test_dalex)

r2_dalex = r2_score(y_test_dalex, y_hat_dalex)
rmse_dalex = mean_squared_error(y_test_dalex, y_hat_dalex, squared = False)

print("Dalex")
print("Wynik R2: " + str(r2_dalex))
print("Miara RMSE: " + str(rmse_dalex))

In [None]:
# Rescale the numeric apartment features and rebuild the train/test split.
#
# The scaler is fitted on the training rows only and then applied to the test
# rows. Fitting it on the full frame before splitting (as was done previously)
# lets the test set's min/max influence preprocessing, i.e. leaks test data
# into training and makes the reported scores optimistic.
numeric_cols = ['construction_year', 'surface', 'floor', 'no_rooms']

# dalex_df_enc / X_dalex are left unscaled on purpose, so re-running this cell
# always starts from the same raw values.
X_dalex = dalex_df_enc.drop('m2_price', axis=1)
Y_dalex = dalex_df_enc.m2_price

# Same test_size and random_state as the earlier split -> same row assignment.
X_train_dalex, X_test_dalex, y_train_dalex, y_test_dalex = train_test_split(
    X_dalex, Y_dalex, test_size = 0.33, random_state = 34)

# Work on explicit copies so the column assignment below never writes into
# (or warns about) a slice of X_dalex.
X_train_dalex = X_train_dalex.copy()
X_test_dalex = X_test_dalex.copy()

scaler = MinMaxScaler()
X_train_dalex[numeric_cols] = scaler.fit_transform(X_train_dalex[numeric_cols])
# Test rows reuse the training min/max, so values slightly outside [0, 1] are possible.
X_test_dalex[numeric_cols] = scaler.transform(X_test_dalex[numeric_cols])

In [None]:
# Refit a default SVR on the current (rescaled) apartments split and report
# the same two hold-out metrics as before.
svm = SVR().fit(X_train_dalex, y_train_dalex)
y_hat_dalex = svm.predict(X_test_dalex)

r2_dalex = r2_score(y_test_dalex, y_hat_dalex)
rmse_dalex = mean_squared_error(y_test_dalex, y_hat_dalex, squared = False)

print("Dalex po przeskalowaniu")
print("Wynik R2: " + str(r2_dalex))
print("Miara RMSE: " + str(rmse_dalex))

Widzimy, że po przeskalowaniu wyniki modelu się poprawiły. Ten sam eksperyment przeprowadźmy dla zbioru bostońskiego.

In [None]:
# SVR with default hyperparameters, fitted on the current Boston split and
# evaluated on its hold-out part.
svm_boston = SVR().fit(X_train_boston, y_train_boston)
y_hat_boston = svm_boston.predict(X_test_boston)

r2_boston = r2_score(y_test_boston, y_hat_boston)
rmse_boston = mean_squared_error(y_test_boston, y_hat_boston, squared = False)

print("Boston")
print("Wynik R2: " + str(r2_boston))
print("Miara RMSE: " + str(rmse_boston))

In [None]:
# Repeat the scaling experiment on the Boston data.
#
# As with the apartments data, the scaler is fitted on the training rows only
# and then applied to the test rows; fitting on the whole frame before the
# split would leak test-set min/max values into preprocessing.
# boston_df itself is no longer modified, so re-running this cell is safe.
X_boston = boston_df.drop('MEDV', axis=1)
Y_boston = boston_df.MEDV

# Same test_size and random_state as the earlier split -> same row assignment.
X_train_boston, X_test_boston, y_train_boston, y_test_boston = train_test_split(
    X_boston, Y_boston, test_size = 0.33, random_state = 34)

# Every feature column is scaled (everything except the MEDV target), so the
# whole feature frame is transformed instead of repeating the column list.
scaler = MinMaxScaler()
X_train_boston = pd.DataFrame(
    scaler.fit_transform(X_train_boston),
    columns=X_train_boston.columns,
    index=X_train_boston.index)
X_test_boston = pd.DataFrame(
    scaler.transform(X_test_boston),
    columns=X_test_boston.columns,
    index=X_test_boston.index)

svm = SVR()
svm.fit(X_train_boston, y_train_boston)
y_hat_boston = svm.predict(X_test_boston)
print("Boston po przeskalowaniu")
print("Wynik R2: " + str(r2_score(y_test_boston, y_hat_boston)))
print("Miara RMSE: " + str(mean_squared_error(y_test_boston, y_hat_boston, squared = False)))


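Wniosek: skalowanie cech poprawia wyniki SVR — model opiera się na odległościach między obserwacjami, więc bez skalowania cechy o dużych wartościach liczbowych dominowałyby nad pozostałymi.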

# Random Search

In [None]:
# Search space shared by the random searches in this notebook.
# `degree` is intentionally not part of it: SVR only uses `degree` with
# kernel='poly', and the estimator below keeps the default RBF kernel, so
# sampling it added an inert dimension to the search.
parameters = dict(
    C = np.arange(start = 0.1, stop = 10000, step = 0.05),
    gamma = ['scale', 'auto'])

# A fresh SVR is used as the base estimator (the search clones it anyway), and
# random_state pins the sampled candidates so results are reproducible.
svm_rand_dalex = RandomizedSearchCV(
    SVR(), parameters, cv=3, n_iter=200, random_state=34)

svm_rand_dalex.fit(X_train_dalex, y_train_dalex)
print("Najlepsze parametry: " + str(svm_rand_dalex.best_params_))

# Evaluate the refitted best model on the hold-out set; predict only once.
best_estimator = svm_rand_dalex.best_estimator_
y_hat_dalex_rand = best_estimator.predict(X_test_dalex)
print("Wynik R2: " + str(r2_score(y_test_dalex, y_hat_dalex_rand)))
print(f'RMSE: {mean_squared_error(y_test_dalex, y_hat_dalex_rand, squared=False)}')

In [None]:
# Random search over the shared `parameters` space for the Boston data.
# A fresh SVR is used as the base estimator (the search clones it anyway), and
# random_state pins the sampled candidates so results are reproducible.
svm_rand_boston = RandomizedSearchCV(
    SVR(), parameters, cv=3, n_iter=200, random_state=34)

svm_rand_boston.fit(X_train_boston, y_train_boston)
print("Najlepsze parametry: " + str(svm_rand_boston.best_params_))

# Evaluate the refitted best model on the hold-out set; predict only once.
best_estimator = svm_rand_boston.best_estimator_
y_hat_boston_rand = best_estimator.predict(X_test_boston)
print("Wynik R2: " + str(r2_score(y_test_boston, y_hat_boston_rand)))
print(f'RMSE: {mean_squared_error(y_test_boston, y_hat_boston_rand, squared=False)}')