## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading Data

In [2]:
df = pd.read_csv("/content/cleaned_house_data.csv")
df = df.drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,Dwell_Type,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


## Separate out the data into X features and y target label

In [3]:
X = df.drop(['Property_Sale_Price_natural_log','Property_Sale_Price'],axis=1)
y = df['Property_Sale_Price']
log_y = df['Property_Sale_Price_natural_log']

## Perform a Train|Test split on the data, with a 10% test size. Note: The solution uses a random state of 101

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)
X_train_log, X_test_log, log_y_train, log_y_test = train_test_split(X, log_y, test_size=0.1, random_state=101)

## Scale the X train and X test data.

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [7]:
scaler_log = StandardScaler()
scaled_X_train_log = scaler.fit_transform(X_train_log)
scaled_X_test_log = scaler.transform(X_test_log)

## Use a GridSearchCV to run a grid search for the best GradientBoostingRegressor() parameters.

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

In [9]:
model = GradientBoostingRegressor()

In [14]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5],
    "min_samples_split": [5],
     "loss": ["squared_error"]
}

In [15]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

In [16]:
grid_search.fit(scaled_X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [17]:
grid_search.best_params_

{'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'min_samples_split': 5,
 'n_estimators': 300}

In [18]:
grid_preds = grid_search.predict(scaled_X_test)

## Evaluate your model's performance on the unseen 10% scaled test set. Using MAE and a RMSE

In [19]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [20]:
MAE = mean_absolute_error(y_test,grid_preds)
#% error from mean
(MAE *100)/180149.242279

9.462642253494211

In [21]:
MSE = mean_squared_error(y_test,grid_preds)
RMSE = np.sqrt(MSE)
#% error from mean
(RMSE *100)/180149.242279

14.956783764347051

In [22]:
r2 = r2_score(y_test, grid_preds)
#% error from mean
(r2 *100)/180149.242279

0.0004967341841815462

## Working with normal y

In [23]:
model = GradientBoostingRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5],
    "min_samples_split": [5],
     "loss": ["squared_error"]
}

In [24]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

In [25]:
grid_search.fit(scaled_X_train_log,log_y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [26]:
grid_search.best_params_

{'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'min_samples_split': 5,
 'n_estimators': 300}

In [27]:
grid_preds_log = grid_search.predict(scaled_X_test_log)

## Evaluate your model's performance on the unseen 10% scaled test set. Using MAE and a RMSE

In [28]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [29]:
MAE = mean_absolute_error(log_y_test,grid_preds_log)
#% error from mean
(MAE*100)/12.021984

0.6694302758531387

In [30]:
MSE = mean_squared_error(log_y_test,grid_preds_log)
RMSE = np.sqrt(MSE)
#% error from mean
(RMSE*100)/12.021984

0.9283336890283864

In [31]:
r2 = r2_score(log_y_test,grid_preds_log)
#% error from mean
(r2*100)/12.021984

7.731141881398281