# Model Selection

## 1. Setup from previous notebook and cross-val

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

PATH = '../data/train.csv'

def encoding_values(df):
    '''
    encode categorical data to float
    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''
    df_objects = (df.dtypes=='object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_values(df):
    '''
    Impute missing values in features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    imputer = SimpleImputer()
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df

def scaling_values(df):
    '''
    Scaling features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    scaler = StandardScaler()
    x_train = df.drop(['Id', 'SalePrice'], axis=1)
    scaler.fit(x_train)
    scaled_data = scaler.transform(x_train)
    scaled_data = pd.DataFrame(scaled_data, columns=x_train.columns)
    scaled_data.insert(loc=0, column='Id', value=df['Id'])
    scaled_data['SalePrice'] = df['SalePrice']
    return scaled_data

def apply_preprocessing(df):
    '''
    Applying data cleaning functions to data sets
    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''
    # since some columns have to much NaN we will drop them here
    df = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
    df = encoding_values(df)
    df = impute_values(df)
    df = scaling_values(df)
    return df

def get_df():
    '''
    Sharing dataframe after aplying preprocessing
    Returns:
        pandas.DataFrame
    '''
    df = pd.read_csv(PATH)
    df = apply_preprocessing(df)
    return df

In [2]:
df = get_df()

In [3]:
X = df.drop(['SalePrice', 'Id'], axis=1)

In [4]:
LABEL = 'SalePrice'
y = df[LABEL]

In [5]:
from sklearn.model_selection import KFold
k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

## 2. Linear Regression

In [6]:
scores = []

In [7]:
from sklearn.linear_model import LinearRegression

import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = LinearRegression()

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = round(reg.score(X_test, y_test),3)

        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", round(100*np.mean(scores), 1), "%")
print("Std:", round(100*np.std(scores), 1), "%")


0.849
0.816
-6.7139016279836034e+22
0.856
0.902

Average: -1.3427803255967208e+24 %
Std: 2.685560651193442e+24 %


## 3. Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
scores = []
import numpy as np
for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = LogisticRegression(random_state=42, max_iter=1000)

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = reg.score(X_test, y_test)

        print(acc_score)

        scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5, "%")
print("Std:", np.std(scores)/5, "%")


0.0136986301369863
0.010273972602739725
0.02054794520547945
0.010273972602739725
0.003424657534246575

Average: 0.011643835616438355 %
Std: 0.0011128819732378029 %


In [9]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1.0,0.073375,-0.045532,-0.229372,-0.207142,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.313867,0.208502,208500.0
1,2.0,-0.872563,-0.045532,0.451936,-0.091886,0.064238,0.750731,0.314667,-0.02618,-0.628316,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.489110,-0.614439,0.313867,0.208502,181500.0
2,3.0,0.073375,-0.045532,-0.093110,0.073480,0.064238,-1.378933,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.313867,0.208502,223500.0
3,4.0,0.309859,-0.045532,-0.456474,-0.096897,0.064238,-1.378933,0.314667,-0.02618,-1.861302,...,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,0.313867,-3.426284,140000.0
4,5.0,0.073375,-0.045532,0.633618,0.375148,0.064238,-1.378933,0.314667,-0.02618,-0.628316,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.313867,0.208502,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456.0,0.073375,-0.045532,-0.365633,-0.260560,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.620891,-0.614439,0.313867,0.208502,175000.0
1456,1457.0,-0.872563,-0.045532,0.679039,0.266407,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,1.645210,0.313867,0.208502,210000.0
1457,1458.0,0.309859,-0.045532,-0.183951,-0.147810,0.064238,0.750731,0.314667,-0.02618,0.604670,...,-0.359325,-0.116339,-0.270208,-0.068692,4.953112,-0.489110,1.645210,0.313867,0.208502,266500.0
1458,1459.0,-0.872563,-0.045532,-0.093110,-0.080160,0.064238,0.750731,0.314667,-0.02618,0.604670,...,1.473789,-0.116339,-0.270208,-0.068692,-0.087688,-0.859110,1.645210,0.313867,0.208502,142125.0


In [10]:
df.Id = df.Id.astype(int)

##### NOTE: After a little submission in kaggle its clearly overfitted :(