In [33]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [34]:
# loading data from csv
df = pd.read_csv('Housing.csv')

In [35]:
# printing 5 rows
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [36]:
# creating feature matrix and target vector
X = df.drop('price', axis=1)
Y = df['price']

In [37]:
# creating numerical and categorical columns
numerical_features = [features for features in X.columns if X[features].dtype!='O']
categorical_features = [features for features in X.columns if X[features].dtype=='O']

In [40]:
numerical_features

['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

In [38]:
def print_unique_categories(df):
    for feature in categorical_features:
        print(f"{feature} has these values {df[feature].unique()}")

print_unique_categories(df)        

mainroad has these values ['yes' 'no']
guestroom has these values ['no' 'yes']
basement has these values ['no' 'yes']
hotwaterheating has these values ['no' 'yes']
airconditioning has these values ['yes' 'no']
prefarea has these values ['yes' 'no']
furnishingstatus has these values ['furnished' 'semi-furnished' 'unfurnished']


In [41]:
categorical_features

['mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'prefarea',
 'furnishingstatus']

In [39]:
# creating the data transformers

numeric_scaler = StandardScaler()
encoder = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", encoder, categorical_features),
        ("StandardScaler", numeric_scaler, numerical_features),
    ]
)

In [19]:
# Spliting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=8)

In [20]:
# Applying the preprocessor
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [43]:
type(Y_test)

pandas.core.series.Series

In [21]:
# shape of  X_train and X_test
X_train.shape, X_test.shape

((436, 20), (109, 20))

In [22]:
# creating the evaluation metrics
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.round(np.sqrt(mean_squared_error(true, predicted)), 2)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [44]:
# list of models

models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(),
    "RandomForestRegressor" : RandomForestRegressor(),
    "AdaBoostRegressor" : AdaBoostRegressor(),
    "XGBRegressor" : XGBRegressor(),
    "CatBoosting Regressor" : CatBoostRegressor(verbose=False),
}

model_name_performance = {
    
}

for key, value in models.items():
    model = value
    model.fit(X_train, Y_train)
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    # training performance
    mae_train, rmse_train, r2_square_train = evaluate_model(Y_train, Y_train_pred)

    print(f"{model} Training Performance")
    print("RMSE:", rmse_train)
    print("MAE:", mae_train)
    print("R2 score", r2_square_train)
    
    # testing performance
    mae_test, rmse_test, r2_square_test = evaluate_model(Y_test, Y_test_pred)

    print("Model Testing Performance")
    print("RMSE:", rmse_test)
    print("MAE:", mae_test)
    print("R2 score", r2_square_test)

    print("#"*50)
    print("\n")

    model_name_performance[key] = r2_square_test

LinearRegression() Training Performance
RMSE: 1050423.55
MAE: 792696.256880734
R2 score 0.675635303803966
Model Testing Performance
RMSE: 1125627.08
MAE: 783373.4311926606
R2 score 0.669900102299245
##################################################


Lasso() Training Performance
RMSE: 1044328.99
MAE: 787614.6302454305
R2 score 0.6793883134225902
Model Testing Performance
RMSE: 1125348.1
MAE: 769038.9090689028
R2 score 0.670063706326443
##################################################


Ridge() Training Performance
RMSE: 1044360.62
MAE: 787447.7948986582
R2 score 0.6793688950180845
Model Testing Performance
RMSE: 1122945.19
MAE: 766580.029064419
R2 score 0.671471201608598
##################################################


KNeighborsRegressor() Training Performance
RMSE: 997760.61
MAE: 694501.756880734
R2 score 0.7073440318361278
Model Testing Performance
RMSE: 1105909.01
MAE: 753629.6330275229
R2 score 0.6813637953340408
##################################################


Decision

In [24]:
# sorting the dict with r2 score and get the name of the model with highest r2 score
sorted_model_performance = sorted(model_name_performance.items(), key=lambda x: x[1], reverse=True)
highest_r2_model = sorted_model_performance[0][0]

In [25]:
highest_r2_model

'CatBoosting Regressor'

In [31]:
print(model_name_performance[highest_r2_model])

0.7352097642710412


In [27]:
# Defining the hyperparameters
params={
                "Decision Tree": {
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'splitter':['best','random'],
                    # 'max_features':['sqrt','log2'],
                },
                "Random Forest":{
                    # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                 
                    # 'max_features':['sqrt','log2',None],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Gradient Boosting":{
                    # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    # 'criterion':['squared_error', 'friedman_mse'],
                    # 'max_features':['auto','sqrt','log2'],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Linear Regression":{},
                "XGBRegressor":{
                    'learning_rate':[.1,.01,.05,.001],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "CatBoosting Regressor":{
                    'depth': [6,8,10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor":{
                    'learning_rate':[.1,.01,0.5,.001],
                    # 'loss':['linear','square','exponential'],
                    'n_estimators': [8,16,32,64,128,256]
                }
}

In [28]:
# hyperparameter tuning

model_parameters = params[highest_r2_model]
print(model_parameters)

{'depth': [6, 8, 10], 'learning_rate': [0.01, 0.05, 0.1], 'iterations': [30, 50, 100]}


In [30]:
# creating the grid search
model = models[highest_r2_model]
grid_search = GridSearchCV(model, model_parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# print the training performance
y_train_pred = grid_search.predict(X_train)

mae_train, rmse_train, r2_square_train = evaluate_model(Y_train, Y_train_pred)

print(f"{model} Training Performance")
print("RMSE:", rmse_train)
print("MAE:", mae_train)
print("R2 score", r2_square_train)

# print the testing performance
y_test_pred = grid_search.predict(X_test)

mae_test, rmse_test, r2_square_test = evaluate_model(Y_test, Y_test_pred)

print(f"{model} Testing Performance")
print("RMSE:", rmse_test)
print("MAE:", mae_test)
print("R2 score", r2_square_test)



# best parameters
best_params = grid_search.best_params_
print(best_params)


<catboost.core.CatBoostRegressor object at 0x16a8331c0> Training Performance
RMSE: 404982.34
MAE: 306331.8256984977
R2 score 0.9517855897230467
<catboost.core.CatBoostRegressor object at 0x16a8331c0> Testing Performance
RMSE: 1008144.58
MAE: 722382.1396022813
R2 score 0.7352097642710412
{'depth': 6, 'iterations': 100, 'learning_rate': 0.1}


In [32]:
# training the final model with the best params
final_model = grid_search.best_estimator_
final_model.fit(X_train, Y_train)

<catboost.core.CatBoostRegressor at 0x16a824af0>