In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dalex as dx
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Dataset 1 - apartments

In [2]:
dat1 = dx.datasets.load_apartments()
dat1.head()

Unnamed: 0,m2_price,construction_year,surface,floor,no_rooms,district
1,5897,1953,25,3,1,Srodmiescie
2,1818,1992,143,9,5,Bielany
3,3643,1937,56,1,2,Praga
4,3517,1995,93,7,3,Ochota
5,3013,1992,144,6,5,Mokotow


In [3]:
dat1.describe()

Unnamed: 0,m2_price,construction_year,surface,floor,no_rooms
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3487.019,1964.823,85.59,5.623,3.36
std,906.691651,25.831511,37.874799,2.899876,1.381415
min,1607.0,1920.0,20.0,1.0,1.0
25%,2857.0,1943.0,53.0,3.0,2.0
50%,3386.0,1965.0,85.5,6.0,3.0
75%,4018.25,1988.0,118.0,8.0,4.0
max,6595.0,2010.0,150.0,10.0,6.0


## Dataset 2 - voice gender 

Jako drugi dataset wykorzystam dane z pierwszego projektu, dotyczące rozpoznawania na podstawie głosu płeć rozmówcy.

In [4]:
dat2 = pd.read_csv("https://lovespreadsheet-tutorials.s3.amazonaws.com/APIDatasets/gender_voice_dataset.csv")
dat2.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402905,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [5]:
dat2.describe()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
count,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0
mean,0.180907,0.057126,0.185621,0.140456,0.224765,0.084309,3.140168,36.568461,0.895127,0.408216,0.165282,0.180907,0.142807,0.036802,0.258842,0.829211,0.052647,5.047277,4.99463,0.173752
std,0.029918,0.016652,0.03636,0.04868,0.023639,0.042783,4.240529,134.928661,0.04498,0.177521,0.077203,0.029918,0.032304,0.01922,0.030077,0.525205,0.063299,3.521157,3.520039,0.119454
min,0.039363,0.018363,0.010975,0.000229,0.042946,0.014558,0.141735,2.068455,0.738651,0.036876,0.0,0.039363,0.055565,0.009775,0.103093,0.007812,0.004883,0.007812,0.0,0.0
25%,0.163662,0.041954,0.169593,0.111087,0.208747,0.04256,1.649569,5.669547,0.861811,0.258041,0.118016,0.163662,0.116998,0.018223,0.253968,0.419828,0.007812,2.070312,2.044922,0.099766
50%,0.184838,0.059155,0.190032,0.140286,0.225684,0.09428,2.197101,8.318463,0.901767,0.396335,0.186599,0.184838,0.140519,0.04611,0.271186,0.765795,0.023438,4.992188,4.945312,0.139357
75%,0.199146,0.06702,0.210618,0.175939,0.24366,0.114175,2.931694,13.648905,0.928713,0.533676,0.221104,0.199146,0.169581,0.047904,0.277457,1.177166,0.070312,7.007812,6.992188,0.209183
max,0.251124,0.115273,0.261224,0.247347,0.273469,0.252225,34.725453,1309.612887,0.981997,0.842936,0.28,0.251124,0.237636,0.204082,0.279114,2.957682,0.458984,21.867188,21.84375,0.932374


## Przygotowanie danych

In [6]:
dataset1 = train_test_split(
    pd.get_dummies(dat1.iloc[:,1:],['district']).drop("district_Bemowo", axis=1), dat1['m2_price'], test_size=0.2, random_state=7
)

dataset2 = train_test_split(
    dat2.iloc[:,:-1], dat2['label'], stratify=dat2['label'], test_size=0.2, random_state=7
)

## Modele SVM

In [7]:
results = pd.DataFrame({'Accuracy': [], 'Recall': [], 'Precision': []})
resultsRMSE = pd.DataFrame({'RMSE': []})
def runSVM(dataset, label, metric):
    x_train, x_test, y_train, y_test = dataset
    
    # Not scaled
    svm = SVC(gamma='auto')
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    noteResults(y_pred, y_test, label+" (no scaling)", metric)
    
    # Scaled
    svm_scaled = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    svm_scaled.fit(x_train, y_train)
    y_pred_scaled = svm_scaled.predict(x_test)
    noteResults(y_pred_scaled, y_test, label+" (scaling)", metric)
    

def noteResults(y_pred, y_target, label, metric):
    if metric == 'acc':
        acc = accuracy_score(y_target,y_pred)
        rec = recall_score(y_target,y_pred,average='weighted')
        prec = precision_score(y_target,y_pred,average='weighted')
        results.loc[label] = [acc, rec, prec]
    else:
        rmse = mean_squared_error(y_target,y_pred,squared=False)
        resultsRMSE.loc[label] = [rmse]
    

runSVM(dataset1,"Apartments",'rmse')
runSVM(dataset2,"Voice gender",'acc')
display(resultsRMSE.T)
display(results.T)

Unnamed: 0,Apartments (no scaling),Apartments (scaling)
RMSE,976.159083,625.118457


Unnamed: 0,Voice gender (no scaling),Voice gender (scaling)
Accuracy,0.706625,0.976341
Recall,0.706625,0.976341
Precision,0.709637,0.976573


W obu przypadakch widać bardzo dużą poprawę jakości wyników przy użyciu skalowania. W przypadku zbioru danych apartamentów wartość RMSE spadła o około 36%, a dla zbioru głosów mamy wzrost accuracy z marnych 71% aż do 98%. Od razu zatem widać jak ważne skalowanie danych jest przy użyciu modeli SVM.

## Optymalizacja hiperparametrów

In [8]:
def optimizeSVM(dataset, label, metric):
    x_train, x_test, y_train, y_test = dataset
    
    # Scalling data
    scaler = StandardScaler().fit(x_train)
    x_train_scalled = scaler.transform(x_train)
    x_test_scalled = scaler.transform(x_test)
    
    # Optimizing width random search
    opt = RandomizedSearchCV(
        SVC(),
        {
            'C': np.logspace(-6, 6, 39),
            'gamma': np.logspace(-8, 3, 36),
            'kernel': ['rbf']
        }, 
        scoring = ('accuracy' if metric == "acc" else "neg_root_mean_squared_error"),
        cv = 3, 
        n_iter = 200, 
        n_jobs = -1, 
        random_state = 7
    )
    params = opt.fit(x_train_scalled, y_train).best_params_
    
    print('Best parameters for "'+label+'":')
    print(params)
    
    # Testing SVM with optimal parameters
    optimal_svm = SVC(**params)
    optimal_svm.fit(x_train_scalled, y_train)
    y_pred = optimal_svm.predict(x_test_scalled)
    noteResults(y_pred, y_test, label+" (optimal + scaling)", metric)
    
optimizeSVM(dataset1, "Apartments", 'rmse')
optimizeSVM(dataset2, "Voice gender", 'acc')
display(resultsRMSE.sort_values("RMSE", ascending=False).T)
display(results.sort_values("Accuracy").T)



Best parameters for "Apartments":
{'kernel': 'rbf', 'gamma': 0.1692666615037876, 'C': 18.32980710832434}
Best parameters for "Voice gender":
{'kernel': 'rbf', 'gamma': 0.019306977288832496, 'C': 162.37767391887175}


Unnamed: 0,Apartments (no scaling),Apartments (scaling),Apartments (optimal + scaling)
RMSE,976.159083,625.118457,308.905714


Unnamed: 0,Voice gender (no scaling),Voice gender (optimal + scaling),Voice gender (scaling)
Accuracy,0.706625,0.973186,0.976341
Recall,0.706625,0.973186,0.976341
Precision,0.709637,0.973568,0.976573


Strojenie hiperparametrów dało pozwoliło nam zmniejszyć RMSE dla zbioru danych apartamentów o ponad połowę, co jest znaczącym polepszeniem dla ogólnej jakości sieci.   
W przypadku drugiego zbioru danych nie widać żadnej różnicy. Najprawdopodobniej jest to związane z samym zbiorem danych - dane są łatwo separowalne więc dobór hiperparametrów nie robi większej różnicy. Należy także uwzględnić że już w tym momencie accuracy tego modelu wynosi 97.6%, co jest bardzo dobrym wynikiem.