# Mikołaj Spytek - praca domowa 4

In [None]:
import dalex as dx
import pandas as pd
from scipy.stats import uniform
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the datasets.
apartments = dx.datasets.load_apartments()
apartments_test = dx.datasets.load_apartments_test()

# Features are the first four columns (positions 0-3); the target is the
# column at position 5.
# NOTE(review): the column at position 4 is used neither as a feature nor as
# the target -- confirm that leaving it out is intentional.
x_apartments_train = apartments.iloc[:, 0:4]
y_apartments_train = apartments.iloc[:, 5]
x_apartments_test = apartments_test.iloc[:, 0:4]
y_apartments_test = apartments_test.iloc[:, 5]


data = load_breast_cancer()

x_cancer = pd.DataFrame(data=data.data, columns=data.feature_names)
# Keep the target one-dimensional (a Series rather than a one-column
# DataFrame): scikit-learn estimators expect y of shape (n_samples,), and a
# column-vector target is only accepted with a conversion warning, which the
# global warnings filter in this notebook would hide.
y_cancer = pd.Series(data.target)

x_cancer_train, x_cancer_test, y_cancer_train, y_cancer_test = train_test_split(
    x_cancer, y_cancer, random_state=42
)

In [None]:
# Scale the features.
# Each scaler is fitted on the training split only. The test split is then
# transformed with the statistics learned from the training data, so both
# splits live in the same feature space and no test-set information leaks
# into the preprocessing step. (Calling fit_transform on the test split would
# re-fit the scaler on the test data.)
apartments_scaler = StandardScaler()
x_apartments_train_scaled = apartments_scaler.fit_transform(x_apartments_train)
x_apartments_test_scaled = apartments_scaler.transform(x_apartments_test)

cancer_scaler = StandardScaler()
x_cancer_train_scaled = cancer_scaler.fit_transform(x_cancer_train)
x_cancer_test_scaled = cancer_scaler.transform(x_cancer_test)

In [None]:
# Train the models on data with and without scaling.
# sv_a1 / sv_c1 are trained on the raw features, sv_a2 / sv_c2 on the scaled
# features. Each model must predict on test data prepared the same way as its
# training data; scoring scaled inputs with a model fitted on raw inputs
# would make the "scaled" results meaningless.
sv_a1 = SVC(kernel="rbf", random_state=42)
sv_a2 = SVC(kernel="rbf", random_state=42)
sv_c1 = SVC(kernel="rbf", random_state=42)
sv_c2 = SVC(kernel="rbf", random_state=42)

sv_a1.fit(x_apartments_train, y_apartments_train)
unscaled_apartments_pred = sv_a1.predict(x_apartments_test)

sv_a2.fit(x_apartments_train_scaled, y_apartments_train)
scaled_apartments_pred = sv_a2.predict(x_apartments_test_scaled)


sv_c1.fit(x_cancer_train, y_cancer_train)
unscaled_cancer_pred = sv_c1.predict(x_cancer_test)

sv_c2.fit(x_cancer_train_scaled, y_cancer_train)
scaled_cancer_pred = sv_c2.predict(x_cancer_test_scaled)

unscaled_apartments_acc = accuracy_score(y_apartments_test, unscaled_apartments_pred)
scaled_apartments_acc = accuracy_score(y_apartments_test, scaled_apartments_pred)
unscaled_cancer_acc = accuracy_score(y_cancer_test, unscaled_cancer_pred)
scaled_cancer_acc = accuracy_score(y_cancer_test, scaled_cancer_pred)

# One row per dataset, one column per preprocessing variant.
results = [[unscaled_apartments_acc, scaled_apartments_acc],
           [unscaled_cancer_acc, scaled_cancer_acc]]


pd_results = pd.DataFrame(data=results, columns=["unscaled", "scaled"], index=["apartments", "cancer"])

# Last expression of the cell: displays the accuracy table in the notebook.
pd_results

Jak widać, skalowanie nie dało pozytywnego efektu. Zbiór danych dotyczący raka piersi był już przeskalowany. Dodatkowo, według dokumentacji scikit-learn, SVM również przeprowadza skalowanie. Takie potrójne skalowanie przynosi więc negatywne efekty. W przypadku zbioru apartments skalowanie również nie przynosi pożądanych efektów.

In [None]:
# Hyperparameters to optimise.
# C and gamma are sampled from continuous uniform distributions
# (C from [0, 4], gamma from [0, 1]); degree is picked from the integers 1..14.
distributions = {
    "C": uniform(loc=0, scale=4),
    "degree": list(range(1, 15)),
    "gamma": uniform(loc=0, scale=1),
}

In [None]:
# Randomized hyperparameter search (1000 sampled candidates each) on the
# unscaled training data, one search per dataset. The value returned by fit
# is kept under search_a / search_c.
clf_a = RandomizedSearchCV(
    estimator=sv_a1,
    param_distributions=distributions,
    n_iter=1000,
    verbose=True,
    random_state=42,
)
search_a = clf_a.fit(x_apartments_train, y_apartments_train)

clf_c = RandomizedSearchCV(
    estimator=sv_c1,
    param_distributions=distributions,
    n_iter=1000,
    verbose=True,
    random_state=42,
)
search_c = clf_c.fit(x_cancer_train, y_cancer_train)

In [None]:
# Display the best parameter combination found by the apartments search.
search_a.best_params_

In [None]:
# Display the best parameter combination found by the breast-cancer search.
search_c.best_params_

In [None]:
# Check whether the tuned parameters improve the apartments model.
# The values are hard-coded here rather than read from search_a.best_params_.
# NOTE(review): per the scikit-learn SVC documentation, `degree` is only used
# by the polynomial kernel, so it should have no effect with kernel="rbf".
sv_a_new = SVC(
    kernel="rbf",
    C=3.5589922261307083,
    gamma=0.0013385008146062916,
    degree=6,
    random_state=42,
)
sv_a_new.fit(x_apartments_train, y_apartments_train)
unscaled_apartments_pred_new = sv_a_new.predict(x_apartments_test)
# Last expression of the cell: displays the test accuracy in the notebook.
accuracy_score(y_apartments_test, unscaled_apartments_pred_new)

Jak widać, w tym przypadku mamy niewielki zysk.

In [None]:
# Check whether the tuned parameters improve the breast-cancer model.
# The values are hard-coded here rather than read from search_c.best_params_.
# NOTE(review): per the scikit-learn SVC documentation, `degree` is only used
# by the polynomial kernel, so it should have no effect with kernel="rbf".
sv_c_new = SVC(
    kernel="rbf",
    C=1.8800995309030366,
    gamma=0.0010431294303261396,
    degree=5,
    random_state=42,
)
sv_c_new.fit(x_cancer_train, y_cancer_train)
unscaled_cancer_pred_new = sv_c_new.predict(x_cancer_test)
# Last expression of the cell: displays the test accuracy in the notebook.
accuracy_score(y_cancer_test, unscaled_cancer_pred_new)

W tym przypadku zmiana hiperparametrów pogorszyła wynik. Być może dlatego, że już z domyślnymi parametrami model osiągnął tak dobry wynik, albo dlatego, że random search działał zbyt krótko i niewystarczająco przeszukał przestrzeń parametrów.