In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model, svm
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, normalize
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import scipy.stats
import matplotlib.pyplot as plt
import xgboost
import seaborn as sns
from sklearn.neural_network import MLPRegressor
import tqdm
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

import warnings # supress warnings
warnings.filterwarnings('ignore')




In [2]:
raw_training_dataset = pd.read_csv('../EDA/train.csv')
raw_test_dataset = pd.read_csv('../EDA/test.csv')

In [3]:
submission_id = np.array(raw_test_dataset['Id'])

In [4]:
numerical_features = ['MSSubClass',
                    'LotFrontage',
                    'LotArea',
                    'OverallQual',
                    'OverallCond',
                    'YearBuilt',
                    'YearRemodAdd',
                    'MasVnrArea',
                    'BsmtFinSF1',
                    'BsmtFinSF2',
                    'BsmtUnfSF',
                    'TotalBsmtSF',
                    '1stFlrSF',
                    '2ndFlrSF',
                    'LowQualFinSF',
                    'GrLivArea',
                    'BsmtFullBath',
                    'BsmtHalfBath',
                    'FullBath',
                    'HalfBath',
                    'BedroomAbvGr',
                    'KitchenAbvGr',
                    'TotRmsAbvGrd',
                    'Fireplaces',
                    'GarageYrBlt',
                    'GarageCars',
                    'GarageArea',
                    'WoodDeckSF',
                    'OpenPorchSF',
                    'EnclosedPorch',
                    '3SsnPorch',
                    'ScreenPorch',
                    'PoolArea',
                    'MiscVal',
                    'MoSold',
                    'YrSold',]

In [5]:
categorical_features = [col for col in raw_training_dataset.columns if col not in numerical_features+['Id', 'SalePrice']]

###  Experiment 1 - Features with correlation of 0.5 and above with p-value of less than or equal to 0.05

In [8]:
correlation_dict = {}
for col in numerical_features:
    # print(f"Correlation between {col} and SalesPrice = {scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])}")
    raw_training_dataset[col] = raw_training_dataset[col].fillna(0)
    raw_test_dataset[col] = raw_test_dataset[col].fillna(0) 
    correlation_dict[col] = [round(scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])[0], 5), 
                            round(scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])[1], 5)]

# correlation_dict
relevant_numerical_features = []
for key, value in correlation_dict.items():
    # picking only columns that have absolute correlation >= 0.5 and with a p-value of < 0.05
    if abs(correlation_dict[key][0]) >= 0.5 and correlation_dict[key][1] <= 0.05:
        relevant_numerical_features.append(key)

print(f"Relevant Numerical Features: {relevant_numerical_features}")


correlation_dict_categorical = {}
for col in categorical_features:
    one_hot_df = pd.get_dummies(raw_training_dataset[col])
    one_hot_col_names = one_hot_df.columns
    for ohc in one_hot_col_names:
        correlation_dict_categorical[f'{col}_{ohc}'] = [round(scipy.stats.pearsonr(one_hot_df[ohc], raw_training_dataset['SalePrice'])[0], 5), 
                                            round(scipy.stats.pearsonr(one_hot_df[ohc], raw_training_dataset['SalePrice'])[1], 5)]

# correlation_dict_categorical
relevant_categorical_features = []
for key in correlation_dict_categorical.keys():
    # picking only columns that have absolute correlation >= 0.5 and with a p-value of <= 0.05
    if abs(correlation_dict_categorical[key][0]) >= 0.5 and correlation_dict_categorical[key][1] <= 0.05:
        relevant_categorical_features.append(key)


print(f"Relevant Categorical Features: {relevant_categorical_features}")




Relevant Numerical Features: ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']
Relevant Categorical Features: ['ExterQual_TA', 'BsmtQual_Ex', 'KitchenQual_Ex', 'KitchenQual_TA']


In [11]:
one_hot_df = pd.get_dummies(raw_training_dataset[[c.split("_")[0] for c in relevant_categorical_features]])

normalized_df = raw_training_dataset[relevant_numerical_features]
n_rows = normalized_df.shape[0]
n_cols = normalized_df.shape[1]
for i in range(n_cols):
    normalized_df.iloc[:,i] = normalize([normalized_df.iloc[:,i]]).reshape(-1,1)

normalized_onehot_df = pd.concat([normalized_df, one_hot_df], axis=1)

X = normalized_onehot_df.values
y = raw_training_dataset.iloc[:,-1:].values

normalized_test_df = raw_test_dataset[relevant_numerical_features]
# normalized_test_df.fillna(0)
n_cols = normalized_test_df.shape[1]
for i in range(n_cols):
    normalized_test_df.iloc[:,i] = normalize([normalized_test_df.iloc[:,i]]).reshape(-1,1)

one_hot_df_test = pd.get_dummies(raw_test_dataset[[c.split("_")[0] for c in relevant_categorical_features]])

normalized_onehot_df_test = pd.concat([normalized_test_df, one_hot_df_test], axis=1)

x_test = normalized_onehot_df_test.values

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=41)

In [13]:
# Linear Regression
linear_reg_model = linear_model.LinearRegression()
linear_reg_model.fit(x_train, y_train)
linear_pred = linear_reg_model.predict(x_val)

print(f'Linear model - mean_squared_error: {round(mean_squared_error(y_val, linear_pred),2)}')
print(f'Linear model - mean_squared_log_error: {mean_squared_log_error(y_val, linear_pred)}')


# SVM
svm_model = svm.SVR()
svm_model.fit(x_train, y_train.reshape(-1,1))
svm_pred = svm_model.predict(x_val)

print(f'SVM - mean_squared_error: {round(mean_squared_error(y_val, svm_pred),2)}')
print(f'SVM - mean_squared_log_error: {mean_squared_log_error(y_val, svm_pred)}')


# XGBoost
xgboost_model = xgboost.XGBRegressor()
xgboost_model.fit(x_train, y_train)
xgb_pred = xgboost_model.predict(x_val)

print(f'XGBoost - mean_squared_error: {round(mean_squared_error(y_val, xgb_pred),2)}')
print(f'XGBoost - mean_squared_log_error: {mean_squared_log_error(y_val, xgb_pred)}')


# MLP
mlp_regressor_model = MLPRegressor(hidden_layer_sizes=100, activation='relu', solver='adam', max_iter=50000)
mlp_regressor_model.fit(x_train, y_train.reshape(-1,1))
mlp_pred = mlp_regressor_model.predict(x_val)
print(f'MLP - mean_squared_error: {round(mean_squared_error(y_val.reshape(-1,1), mlp_pred),2)}')
print(f'MLP - mean_squared_log_error: {mean_squared_log_error(y_val.reshape(-1,1), mlp_pred)}')

Linear model - mean_squared_error: 809538862.02
Linear model - mean_squared_log_error: 0.026489359851081886
SVM - mean_squared_error: 5730838074.66
SVM - mean_squared_log_error: 0.14523856411674407
XGBoost - mean_squared_error: 979621780.47
XGBoost - mean_squared_log_error: 0.028150605417908606
MLP - mean_squared_error: 939569497.42
MLP - mean_squared_log_error: 0.030504611776193232


###  Experiment 2 - Features with correlation of 0.7 and above with p-value of less than or equal to 0.05

In [14]:
correlation_dict = {}
for col in numerical_features:
    # print(f"Correlation between {col} and SalesPrice = {scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])}")
    raw_training_dataset[col] = raw_training_dataset[col].fillna(0)
    raw_test_dataset[col] = raw_test_dataset[col].fillna(0) 
    correlation_dict[col] = [round(scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])[0], 5), 
                            round(scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])[1], 5)]

# correlation_dict
relevant_numerical_features = []
for key, value in correlation_dict.items():
    # picking only columns that have absolute correlation >= 0.7 and with a p-value of < 0.05
    if abs(correlation_dict[key][0]) >= 0.7 and correlation_dict[key][1] <= 0.05:
        relevant_numerical_features.append(key)

print(f"Relevant Numerical Features: {relevant_numerical_features}")


correlation_dict_categorical = {}
for col in categorical_features:
    one_hot_df = pd.get_dummies(raw_training_dataset[col])
    one_hot_col_names = one_hot_df.columns
    for ohc in one_hot_col_names:
        correlation_dict_categorical[f'{col}_{ohc}'] = [round(scipy.stats.pearsonr(one_hot_df[ohc], raw_training_dataset['SalePrice'])[0], 5), 
                                            round(scipy.stats.pearsonr(one_hot_df[ohc], raw_training_dataset['SalePrice'])[1], 5)]

# correlation_dict_categorical
relevant_categorical_features = []
for key in correlation_dict_categorical.keys():
    # picking only columns that have absolute correlation >= 0.7 and with a p-value of <= 0.05
    if abs(correlation_dict_categorical[key][0]) >= 0.7 and correlation_dict_categorical[key][1] <= 0.05:
        relevant_categorical_features.append(key)


print(f"Relevant Categorical Features: {relevant_categorical_features}")


Relevant Numerical Features: ['OverallQual', 'GrLivArea']
Relevant Categorical Features: []


In [15]:
normalized_df = raw_training_dataset[relevant_numerical_features]
n_rows = normalized_df.shape[0]
n_cols = normalized_df.shape[1]
for i in range(n_cols):
    normalized_df.iloc[:,i] = normalize([normalized_df.iloc[:,i]]).reshape(-1,1)

X = normalized_df.values
y = raw_training_dataset.iloc[:,-1:].values

normalized_test_df = raw_test_dataset[relevant_numerical_features]
n_cols = normalized_test_df.shape[1]
for i in range(n_cols):
    normalized_test_df.iloc[:,i] = normalize([normalized_test_df.iloc[:,i]]).reshape(-1,1)

x_test = normalized_test_df.values

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=41)

In [16]:
# Linear Regression
linear_reg_model = linear_model.LinearRegression()
linear_reg_model.fit(x_train, y_train)
linear_pred = linear_reg_model.predict(x_val)

print(f'Linear model - mean_squared_error: {round(mean_squared_error(y_val, linear_pred),2)}')
print(f'Linear model - mean_squared_log_error: {mean_squared_log_error(y_val, linear_pred)}')


# SVM
svm_model = svm.SVR()
svm_model.fit(x_train, y_train.reshape(-1,1))
svm_pred = svm_model.predict(x_val)

print(f'SVM - mean_squared_error: {round(mean_squared_error(y_val, svm_pred),2)}')
print(f'SVM - mean_squared_log_error: {mean_squared_log_error(y_val, svm_pred)}')


# XGBoost
xgboost_model = xgboost.XGBRegressor()
xgboost_model.fit(x_train, y_train)
xgb_pred = xgboost_model.predict(x_val)

print(f'XGBoost - mean_squared_error: {round(mean_squared_error(y_val, xgb_pred),2)}')
print(f'XGBoost - mean_squared_log_error: {mean_squared_log_error(y_val, xgb_pred)}')


# MLP
mlp_regressor_model = MLPRegressor(hidden_layer_sizes=100, activation='relu', solver='adam', max_iter=50000)
mlp_regressor_model.fit(x_train, y_train.reshape(-1,1))
mlp_pred = mlp_regressor_model.predict(x_val)
print(f'MLP - mean_squared_error: {round(mean_squared_error(y_val.reshape(-1,1), mlp_pred),2)}')
print(f'MLP - mean_squared_log_error: {mean_squared_log_error(y_val.reshape(-1,1), mlp_pred)}')

Linear model - mean_squared_error: 1254167558.26
Linear model - mean_squared_log_error: 0.042042309837848
SVM - mean_squared_error: 5733793982.48
SVM - mean_squared_log_error: 0.1452314508925064
XGBoost - mean_squared_error: 1246801872.71
XGBoost - mean_squared_log_error: 0.039042448486032263
MLP - mean_squared_error: 3651351031.52
MLP - mean_squared_log_error: 0.10205991350076908


###  Experiment 3 - Numerical features with correlation of 0.5 and above

In [17]:
correlation_dict = {}
for col in numerical_features:
    # print(f"Correlation between {col} and SalesPrice = {scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])}")
    raw_training_dataset[col] = raw_training_dataset[col].fillna(0)
    raw_test_dataset[col] = raw_test_dataset[col].fillna(0) 
    correlation_dict[col] = [round(scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])[0], 5), 
                            round(scipy.stats.pearsonr(raw_training_dataset[col], raw_training_dataset['SalePrice'])[1], 5)]

# correlation_dict
relevant_numerical_features = []
for key, value in correlation_dict.items():
    # picking only columns that have absolute correlation >= 0.5 and with a p-value of < 0.05
    if abs(correlation_dict[key][0]) >= 0.5 and correlation_dict[key][1] <= 0.05:
        relevant_numerical_features.append(key)

print(f"Relevant Numerical Features: {relevant_numerical_features}")



Relevant Numerical Features: ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']


In [18]:
normalized_df = raw_training_dataset[relevant_numerical_features]
n_rows = normalized_df.shape[0]
n_cols = normalized_df.shape[1]
for i in range(n_cols):
    normalized_df.iloc[:,i] = normalize([normalized_df.iloc[:,i]]).reshape(-1,1)

X = normalized_df.values
y = raw_training_dataset.iloc[:,-1:].values

normalized_test_df = raw_test_dataset[relevant_numerical_features]
n_cols = normalized_test_df.shape[1]
for i in range(n_cols):
    normalized_test_df.iloc[:,i] = normalize([normalized_test_df.iloc[:,i]]).reshape(-1,1)

x_test = normalized_test_df.values

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=41)

In [19]:
# Linear Regression
linear_reg_model = linear_model.LinearRegression()
linear_reg_model.fit(x_train, y_train)
linear_pred = linear_reg_model.predict(x_val)

print(f'Linear model - mean_squared_error: {round(mean_squared_error(y_val, linear_pred),2)}')
print(f'Linear model - mean_squared_log_error: {mean_squared_log_error(y_val, linear_pred)}')


# SVM
svm_model = svm.SVR()
svm_model.fit(x_train, y_train.reshape(-1,1))
svm_pred = svm_model.predict(x_val)

print(f'SVM - mean_squared_error: {round(mean_squared_error(y_val, svm_pred),2)}')
print(f'SVM - mean_squared_log_error: {mean_squared_log_error(y_val, svm_pred)}')


# XGBoost
xgboost_model = xgboost.XGBRegressor()
xgboost_model.fit(x_train, y_train)
xgb_pred = xgboost_model.predict(x_val)

print(f'XGBoost - mean_squared_error: {round(mean_squared_error(y_val, xgb_pred),2)}')
print(f'XGBoost - mean_squared_log_error: {mean_squared_log_error(y_val, xgb_pred)}')


# MLP
mlp_regressor_model = MLPRegressor(hidden_layer_sizes=100, activation='relu', solver='adam', max_iter=50000)
mlp_regressor_model.fit(x_train, y_train.reshape(-1,1))
mlp_pred = mlp_regressor_model.predict(x_val)
print(f'MLP - mean_squared_error: {round(mean_squared_error(y_val.reshape(-1,1), mlp_pred),2)}')
print(f'MLP - mean_squared_log_error: {mean_squared_log_error(y_val.reshape(-1,1), mlp_pred)}')

Linear model - mean_squared_error: 884673636.14
Linear model - mean_squared_log_error: 0.030812049124109512
SVM - mean_squared_error: 5738562507.13
SVM - mean_squared_log_error: 0.14541297117259308
XGBoost - mean_squared_error: 929663020.91
XGBoost - mean_squared_log_error: 0.027251699300227403
MLP - mean_squared_error: 1267751422.45
MLP - mean_squared_log_error: 0.04710244907350764
