# Model Selection

## 1. Setup from previous notebook and cross-val

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

PATH = '../data/train.csv'

def encoding_values(df):
    '''
    encode categorical data to float
    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''
    df_objects = (df.dtypes=='object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_values(df):
    '''
    Impute missing values in features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    imputer = SimpleImputer()
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df

def scaling_values(df):
    '''
    Scaling features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    scaler = MinMaxScaler()
    x_train = df.drop(['Id', 'SalePrice'], axis=1)
    scaler.fit(x_train)
    scaled_data = scaler.transform(x_train)
    scaled_data = pd.DataFrame(scaled_data, columns=x_train.columns)
    scaled_data.insert(loc=0, column='Id', value=df['Id'])
    scaled_data['SalePrice'] = df['SalePrice']
    return scaled_data

def apply_preprocessing(df):
    '''
    Applying data cleaning functions to data sets
    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''
    # since some columns have to much NaN we will drop them here
    df = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
    df = encoding_values(df)
    df = impute_values(df)
    df = scaling_values(df)
    return df

def get_df():
    '''
    Sharing dataframe after aplying preprocessing
    Returns:
        pandas.DataFrame
    '''
    df = pd.read_csv(PATH)
    df = apply_preprocessing(df)
    return df


In [2]:
df = get_df()

In [3]:
X = df.drop(['SalePrice', 'Id'], axis=1)

In [4]:
LABEL = 'SalePrice'
y = df[LABEL]

In [5]:
from sklearn.model_selection import KFold
k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

## 2. Linear Regression

In [6]:
scores = []

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = LinearRegression()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))

        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.sum(np.std(scores))/5, 3))


34076.22457697364
35265.18075045233
35031877991714.348
30261.18472230593
22659.364193354035

Average: 7006375679182.119
Std: 2101912676808.219


## 3. Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = LogisticRegression(random_state=42, max_iter=1000)

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))

        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.sum(np.std(scores))/5, 3))


61201.729016587946
58418.65722394957
53109.62855399984
57484.478223607555
51719.80490565923

Average: 56386.85958476083
Std: 698.785


##### NOTE: After a little submission in kaggle its clearly overfitted :(

## 4. XGBoost

In [15]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = xgb.XGBRegressor()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))


26233.66709675354
32411.402992601532
45860.008614988874
29863.427410321547
26115.84526842815

Average: 32096.870276618727
Std: 7275.452


## 5. Bayesian Ridge Regression

In [14]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = Ridge(alpha=1.0)

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))

34221.67759441894
34607.83101903152
53813.9059277378
30227.692297131387
23064.947118213116

Average: 35187.21079130655
Std: 10195.762


## 6. Support Vector Regression

In [19]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = SVR()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))

88631.00798226075
85648.12819018435
76975.25229768059
80619.0594806844
74298.09728340106

Average: 81234.30904684223
Std: 5307.016


## 7. Decision Tree Regressor

In [20]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = DecisionTreeRegressor(random_state=42)

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))

40097.67377399363
46569.11974641006
51299.22124776079
40490.6007822245
40679.31526969995

Average: 43827.18616401779
Std: 4433.975


## 8. Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = RandomForestRegressor(random_state=42)

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))

29165.486625763384
25419.434977919722
46126.898580330446
28383.461312751453
23405.919044013288

Average: 30500.240108155656
Std: 8082.063


## 9. Cat Boost Regressor

In [22]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = CatBoostRegressor()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))

Learning rate set to 0.04196
0:	learn: 75091.8478869	total: 50.4ms	remaining: 50.3s
1:	learn: 73101.9301659	total: 51.6ms	remaining: 25.8s
2:	learn: 71164.4042442	total: 53ms	remaining: 17.6s
3:	learn: 69354.3058700	total: 54.1ms	remaining: 13.5s
4:	learn: 67620.5245943	total: 55.3ms	remaining: 11s
5:	learn: 65920.8016650	total: 56.6ms	remaining: 9.37s
6:	learn: 64241.8867976	total: 57.7ms	remaining: 8.19s
7:	learn: 62720.8784882	total: 59.3ms	remaining: 7.35s
8:	learn: 61171.8243504	total: 60.7ms	remaining: 6.68s
9:	learn: 59833.2847519	total: 62.6ms	remaining: 6.2s
10:	learn: 58431.5644484	total: 65.2ms	remaining: 5.86s
11:	learn: 57188.0118822	total: 66.5ms	remaining: 5.48s
12:	learn: 55908.1244790	total: 67.8ms	remaining: 5.15s
13:	learn: 54712.7678281	total: 69.1ms	remaining: 4.86s
14:	learn: 53613.0396129	total: 70.5ms	remaining: 4.63s
15:	learn: 52525.4523718	total: 71.7ms	remaining: 4.41s
16:	learn: 51356.4201385	total: 73ms	remaining: 4.22s
17:	learn: 50407.6597420	total: 74.2

##### Cat boost seems op 

## Future Engineering