# Model Selection

## 1. Setup from previous notebook and cross-val

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

PATH = '../data/train.csv'

def encoding_values(df):
    '''
    encode categorical data to float
    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''
    df_objects = (df.dtypes=='object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_values(df):
    '''
    Impute missing values in features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    imputer = SimpleImputer()
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df

def scaling_values(df):
    '''
    Scaling features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    scaler = MinMaxScaler()
    x_train = df.drop(['Id', 'SalePrice'], axis=1)
    scaler.fit(x_train)
    scaled_data = scaler.transform(x_train)
    scaled_data = pd.DataFrame(scaled_data, columns=x_train.columns)
    scaled_data.insert(loc=0, column='Id', value=df['Id'])
    scaled_data['SalePrice'] = df['SalePrice']
    return scaled_data

def apply_preprocessing(df):
    '''
    Applying data cleaning functions to data sets
    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''
    # since some columns have to much NaN we will drop them here
    df = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
    df = encoding_values(df)
    df = impute_values(df)
    df = scaling_values(df)
    return df

def get_df():
    '''
    Sharing dataframe after aplying preprocessing
    Returns:
        pandas.DataFrame
    '''
    df = pd.read_csv(PATH)
    df = apply_preprocessing(df)
    return df


In [2]:
df = get_df()

In [3]:
X = df.drop(['SalePrice', 'Id'], axis=1)

In [4]:
LABEL = 'SalePrice'
y = df[LABEL]

In [5]:
from sklearn.model_selection import KFold
k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

## 2. Linear Regression

In [6]:
scores = []

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = LinearRegression()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))

        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.sum(np.std(scores))/5, 3), "%")


34076.22457697364
35265.18075045233
35031877991714.348
30261.18472230593
22659.364193354035

Average: 7006375679182.119
Std: 2101912676808.219 %


## 3. Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = LogisticRegression(random_state=42, max_iter=1000)

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))

        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.sum(np.std(scores))/5, 3), "%")


61201.729016587946
58418.65722394957
53109.62855399984
57484.478223607555
51719.80490565923

Average: 56386.85958476083
Std: 698.785 %


In [9]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,0.235294,0.75,0.150685,0.033420,1.0,1.0,1.0,0.0,1.0,...,0.000000,0.0,0.0,0.0,0.00000,0.090909,0.50,1.0,0.8,208500.0
1,2.0,0.000000,0.75,0.202055,0.038795,1.0,1.0,1.0,0.0,0.5,...,0.000000,0.0,0.0,0.0,0.00000,0.363636,0.25,1.0,0.8,181500.0
2,3.0,0.235294,0.75,0.160959,0.046507,1.0,0.0,1.0,0.0,1.0,...,0.000000,0.0,0.0,0.0,0.00000,0.727273,0.50,1.0,0.8,223500.0
3,4.0,0.294118,0.75,0.133562,0.038561,1.0,0.0,1.0,0.0,0.0,...,0.492754,0.0,0.0,0.0,0.00000,0.090909,0.00,1.0,0.0,140000.0
4,5.0,0.235294,0.75,0.215753,0.060576,1.0,0.0,1.0,0.0,0.5,...,0.000000,0.0,0.0,0.0,0.00000,1.000000,0.50,1.0,0.8,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,0.235294,0.75,0.140411,0.030929,1.0,1.0,1.0,0.0,1.0,...,0.000000,0.0,0.0,0.0,0.00000,0.636364,0.25,1.0,0.8,175000.0
1456,1457.0,0.000000,0.75,0.219178,0.055505,1.0,1.0,1.0,0.0,1.0,...,0.000000,0.0,0.0,0.0,0.00000,0.090909,1.00,1.0,0.8,210000.0
1457,1458.0,0.294118,0.75,0.154110,0.036187,1.0,1.0,1.0,0.0,1.0,...,0.000000,0.0,0.0,0.0,0.16129,0.363636,1.00,1.0,0.8,266500.0
1458,1459.0,0.000000,0.75,0.160959,0.039342,1.0,1.0,1.0,0.0,1.0,...,0.202899,0.0,0.0,0.0,0.00000,0.272727,1.00,1.0,0.8,142125.0


In [10]:
df.Id = df.Id.astype(int)

##### NOTE: After a little submission in kaggle its clearly overfitted :(

## 4. XGBoost

In [13]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = xgb.XGBRegressor()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        #acc_score = round(mean_squared_error(y_test, y_predict),3)
        acc_score = round(reg.score(X_test, y_test), 3)
        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3), "%")


0.91
0.845
0.619
0.858
0.87

Average: 0.8203999999999999
Std: 2.06 %
