# Praca domowa 4

In [128]:
import pandas as pd
import rdata
import matplotlib.pyplot as plt
from scipy.stats import expon

from sklearn.svm import SVR, SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error

## Apartments dataset
Za pomocą SVM zajmiemy się zadaniem regresji, a mianowicie będziemy przywidywać cenę metru kwadratowego mieszkania na podstawie reszty danych z ramki `apartments_df`.
### Exploration

In [30]:
parsed = rdata.parser.parse_file("apartments.rda")
converted = rdata.conversion.convert(parsed)
apartments_df = converted["apartments"]
apartments_df.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897.0,1953.0,25.0,3,1.0,Srodmiescie
1,1818.0,1992.0,143.0,9,5.0,Bielany
2,3643.0,1937.0,56.0,1,2.0,Praga
3,3517.0,1995.0,93.0,7,3.0,Ochota
4,3013.0,1992.0,144.0,6,5.0,Mokotow


In [3]:
apartments_df.info()
# nie ma braków
# kolumna district jest kategoryczna, do niej zastosujemy one-hot encoding

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   m2.price           1000 non-null   float64 
 1   construction.year  1000 non-null   float64 
 2   surface            1000 non-null   float64 
 3   floor              1000 non-null   int64   
 4   no.rooms           1000 non-null   float64 
 5   district           1000 non-null   category
dtypes: category(1), float64(4), int64(1)
memory usage: 40.6 KB


In [4]:
apartments_df.describe()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3487.019,1964.823,85.59,5.623,3.36
std,906.691651,25.831511,37.874799,2.899876,1.381415
min,1607.0,1920.0,20.0,1.0,1.0
25%,2857.0,1943.0,53.0,3.0,2.0
50%,3386.0,1965.0,85.5,6.0,3.0
75%,4018.25,1988.0,118.0,8.0,4.0
max,6595.0,2010.0,150.0,10.0,6.0


### Train-test split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(apartments_df.drop("m2.price", axis=1), 
                                                    apartments_df["m2.price"], 
                                                    test_size=0.3, shuffle=True, random_state=42)

### Modeling

In [119]:
def get_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    return (r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred, squared=False))

def modeling(X_train, y_train, scale=True):
    
    params = dict({'clf__C': expon(scale = 100),
              'clf__gamma': expon(scale = .1),
              'clf__epsilon': expon(scale = 10)})
    
    ohe = OneHotEncoder(sparse=False)
    columns_to_encode = ["district"]
    
    if scale:
        scaler = StandardScaler()
        columns_to_scale = list(filter(lambda x: x != "district", X_train.columns.to_list()))

        pipeline = Pipeline(
            [("coltransformer", ColumnTransformer(
                transformers=[
                    ("other", Pipeline([("scale", scaler)]), columns_to_scale),
                    ("district", Pipeline([("encode", ohe)]), columns_to_encode),
                ]),
            ),
            ("clf", SVR(kernel="rbf"))]
        )
        
    else:
        pipeline = Pipeline(
            [("coltransformer", ColumnTransformer(
                transformers=[
                    ("district", Pipeline([("encode", ohe)]), columns_to_encode),
                ]),
            ),
            ("clf", SVR(kernel="rbf"))]
        )
    
    random_search = RandomizedSearchCV(pipeline, params, random_state=0, n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    return random_search.best_estimator_
        

In [130]:
clf = modeling(X_train, y_train)
scores = get_scores(clf, X_test, y_test)
print(f"SVM ze skalowaniem: r2: {scores[0]:.2f}, rmse: {scores[1]:.2f}")

SVM ze skalowaniem: r2: 0.95, rmse: 211.82


In [129]:
clf = modeling(X_train, y_train, scale=False)
get_scores(clf, X_test, y_test)
scores = get_scores(clf, X_test, y_test)
print(f"SVM bez skalowania: r2: {scores[0]:.2f}, rmse: {scores[1]:.2f}")

SVM bez skalowania: r2: 0.60, rmse: 574.11


## Australia dataset
Tutaj będzie zadanie klasyfikacji: będzie deszcz następnego dnia czy nie. Jest to zbiór danych z poprzedniej pracy domowej, więc eskploracja już za nami. 

### Modeling