## Importing Libraries

In [3]:
# data prerprocessing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# machine learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# metrics libraries
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# other libraries
import time

## Data Preprocessing

### Data Loading

In [5]:
# loading dataset
dataset = pd.read_csv('train.csv')

# splitting the target column
target = dataset['SalePrice']

# dropping unnecessary columns from the dataset
dataset = dataset.drop(columns = ['Id', 'SalePrice', 'MiscFeature', 'MiscVal'])

### Data Exploration

In [6]:
# total features
n_features = len(dataset.columns)
print(f"TOTAL FEATURES: {n_features}")

# feature column names
print(f"\nFEATURES: {dataset.columns}")

# features and their data types
data_types = list(dataset.dtypes.unique())
dataset_dtypes = dict(dataset.dtypes)

for datatype in data_types:
    print(f"\nFEATURES WITH {datatype} DATATYPE:\n")
    print([k for k, v in dataset_dtypes.items() if v == datatype])

# missing values in the dataset
print("\nMISSING VALUES:\n")
for feature in dataset.columns:
    if dataset[feature].isna().sum() > 0:
        print(f"{feature}: {dataset[feature].isna().sum()}: {dataset_dtypes[feature]}")

TOTAL FEATURES: 77

FEATURES: Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageC

### Imputing Missing Values

In [7]:
missing_vals = {"LotFrontage": np.mean(dataset["LotFrontage"]),
               "Alley": "Pave",
               "MasVnrType": "None",
               "MasVnrArea": np.mean(dataset["MasVnrArea"]),
               "BsmtQual": dataset["BsmtQual"].mode(dropna = True).values[0],
               "BsmtCond": dataset["BsmtCond"].mode(dropna = True).values[0],
               "BsmtExposure": "No",
               "BsmtFinType1": "Unf",
               "BsmtFinType2": "No",
               "Electrical": dataset["Electrical"].mode(dropna = True).values[0],
               "FireplaceQu": dataset["FireplaceQu"].mode(dropna = True).values[0],
               "GarageType": dataset["GarageType"].mode(dropna = True).values[0],
               "GarageYrBlt": dataset["GarageYrBlt"].median(skipna = True),
               "GarageFinish": dataset["GarageFinish"].mode(dropna = True).values[0],
               "GarageQual": dataset["GarageQual"].mode(dropna = True).values[0],
               "GarageCond": dataset["GarageCond"].mode(dropna = True).values[0],
               "PoolQC": dataset["PoolQC"].mode(dropna = True).values[0],
               "Fence": dataset["Fence"].mode(dropna = True).values[0]}

dataset = dataset.fillna(value = missing_vals)

print("MISSING VALUES:\n")
if dataset.isna().values.any() == False:
    print("NO MISSING VALUES")
else:
    print("MISSING VALUES FOUND. PLEASE CHECK DATASET AGAIN")

MISSING VALUES:

NO MISSING VALUES


### Encoding Categorical Data

In [8]:
dataset = pd.get_dummies(dataset)

### Splitting Data into Train and Test Set

In [9]:
X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = 0.2)

## Linear Regression

In [None]:
# modelling
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# predicting
ypred_linearRegression = linear_regressor.predict(X_test)

# evaluation
linearRegression_r2 = r2_score(y_test, ypred_linearRegression)
print(f"R2 Score: {linearRegression_r2}")

linearRegression_mae = mean_absolute_error(y_test, ypred_linearRegression)
print(f"Mean Absolute Error (MAE): {linearRegression_mae}")

R2 Score: 0.8811433982648353
Mean Absolute Error (MAE): 18053.572344074524


## Support Vector Regression

In [None]:
# grid search
svr = SVR()
svr_params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
          'degree': [1, 3, 5]}
scoring = {'r2', 'neg_mean_absolute_error'}
svr_gridsearch = GridSearchCV(svr, svr_params, scoring = scoring, refit = 'neg_mean_absolute_error', cv = 5)

# modelling
svr_gridsearch.fit(X_train, y_train)
print("Best Parameters: ", svr_gridsearch.best_params_)

# predicting
ypred_SVR = svr_gridsearch.predict(X_test)

# evaluation
svr_r2 = r2_score(y_test, ypred_SVR)
print(f"R2 Score: {svr_r2}")

svr_mae = mean_absolute_error(y_test, ypred_SVR)
print(f"Mean Absolute Error (MAE): {svr_mae}")

Best Parameters:  {'degree': 1, 'kernel': 'linear'}
R2 Score: 0.7796873450211113
Mean Absolute Error (MAE): 24332.061036062158


## Decision Tree Regression

In [None]:
# grid search
dtr = DecisionTreeRegressor()
dtr_params = {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
              'splitter': ['random', 'best'],
              'min_samples_split': [3, 0.5],
              'max_features': [2, 0.5, 'sqrt', 'log2']}
scoring = {'r2', 'neg_mean_absolute_error'}
dtr_gridsearch = GridSearchCV(dtr, dtr_params, scoring = 'neg_mean_absolute_error', cv = 5)

# modelling
dtr_gridsearch.fit(X_train, y_train)
print("Best Parameters: ", dtr_gridsearch.best_params_)

# predicting
ypred_DTR = dtr_gridsearch.predict(X_test)

# evaluation
dtr_r2 = r2_score(y_test, ypred_DTR)
print(f"R2 Score: {dtr_r2}")

dtr_mae = mean_absolute_error(y_test, ypred_DTR)
print(f"Mean Absolute Error (MAE): {dtr_mae}")

Best Parameters:  {'criterion': 'absolute_error', 'max_features': 0.5, 'min_samples_split': 3, 'splitter': 'best'}
R2 Score: 0.7451960949127842
Mean Absolute Error (MAE): 28223.755136986303


## Random Forest Regression

In [None]:
# grid search
rf = RandomForestRegressor()
rf_params = {'n_estimators': [100, 200],
             'min_samples_split': [3, 0.5],
             'max_depth': [3, 5],
             }
rf_gridsearch = GridSearchCV(rf, rf_params, scoring = 'neg_mean_absolute_error', cv = 5)

# modelling
rf_gridsearch.fit(X_train, y_train)
print("Best Parameters: ", rf_gridsearch.best_params_)

# predicting
ypred_RF = rf_gridsearch.predict(X_test)

# evaluation
rf_r2 = r2_score(y_test, ypred_RF)
print(f"R2 Score: {rf_r2}")

rf_mae = mean_absolute_error(y_test, ypred_RF)
print(f"Mean Absolute Error (MAE): {rf_mae}")

Best Parameters:  {'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 200}
R2 Score: 0.8428823251541103
Mean Absolute Error (MAE): 21381.583523691235


## XGBoost

In [14]:
# grid search
xgb = XGBRegressor()
xgb_params = {'n_estimators': [100, 300],
              'max_depth': [3, 5, 7],
              'eta': [0.3, 0.1, 0.01],
              'subsample': [0.5, 1],
              'colsample_bytree': [0.5, 1]}
xgb_gridsearch = GridSearchCV(xgb, xgb_params, scoring = 'neg_mean_absolute_error', cv = 5)

# modelling
xgb_gridsearch.fit(X_train, y_train)
print("Best Parameters: ", xgb_gridsearch.best_params_)

# predicting
ypred_XGB = xgb_gridsearch.predict(X_test)

# evaluation
xgb_r2 = r2_score(y_test, ypred_XGB)
print(f"R2 Score: {xgb_r2}")

xgb_mae = mean_absolute_error(y_test, ypred_XGB)
print(f"Mean Absolute Error (MAE): {xgb_mae}")

Best Parameters:  {'colsample_bytree': 0.5, 'eta': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1}
R2 Score: 0.927513948978635
Mean Absolute Error (MAE): 14120.409581014555
