# Project 3: House Price Prediction

Objective: Predict house prices based on various features like location, size, and house
characteristics.

## Model Building: Train and evaluate at least TWO machine learning models to predict the target variable.

## Imports

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading Data

In [30]:
# Read the cleaned house data and discard the 'Unnamed: 0' column in one step
# (presumably a row index saved by an earlier export — verify).
df = pd.read_csv("../otherSolution/cleaned_house_data.csv").drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Dwell_Type,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


## Gradient Boosting Regression Model

**Two targets are modelled: the raw sale price (`y`) and its natural-log transform (`log_y`).**

**Separate out the data into X features and y target label**

In [31]:
# Features: every column except the two price columns, which are the targets.
X = df.drop(columns=['Property_Sale_Price', 'Property_Sale_Price_natural_log'])
# Raw-price target and its natural-log counterpart.
y = df.loc[:, 'Property_Sale_Price']
log_y = df.loc[:, 'Property_Sale_Price_natural_log']

**Perform a Train|Test split on the data, with a 10% test size. Note: The solution uses a random state of 101**

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
from sklearn.model_selection import train_test_split

# Split the features and BOTH targets in a single call: every array is cut
# with the same shuffled indices, so the raw-price and log-price rows are
# guaranteed to line up with the same feature rows (10% held out, seed 101).
(X_train, X_test,
 y_train, y_test,
 log_y_train, log_y_test) = train_test_split(X, y, log_y, test_size=0.1, random_state=101)

# The log-target workflow uses exactly the same feature rows; the *_log names
# are kept so the cells below continue to work unchanged.
X_train_log, X_test_log = X_train, X_test


**Scale the X train and X test data.**

In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
# Learn the scaling statistics from the training features only, then apply
# that same transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [36]:
# Dedicated scaler for the log-target workflow. Fit and transform with
# scaler_log itself so the `scaler` fitted for the raw-price workflow is not
# silently refit and scaler_log is not left unused.
scaler_log = StandardScaler()
scaled_X_train_log = scaler_log.fit_transform(X_train_log)
scaled_X_test_log = scaler_log.transform(X_test_log)

**Use a GridSearchCV to run a grid search for the best GradientBoostingRegressor() parameters.**

In [37]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [38]:
# Seed the estimator so repeated runs of the grid search give the same
# result; 101 matches the seed used for the train/test split.
model = GradientBoostingRegressor(random_state=101)

In [11]:
# Hyper-parameter values explored by the grid search
# (3 x 3 x 2 x 1 x 1 = 18 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5],
    min_samples_split=[5],
    loss=["squared_error"],
)

In [40]:
# 5-fold cross-validated search over param_grid; candidates are scored by
# negative mean squared error and n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)


**Working with the untransformed y (raw sale price)**

In [41]:
# Run the grid search on the scaled training features against the raw sale price.
grid_search.fit(scaled_X_train,y_train)


KeyboardInterrupt: 

In [14]:
# Show the parameter combination selected by the grid search.
grid_search.best_params_

{'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'min_samples_split': 5,
 'n_estimators': 200}

In [15]:
# Predict raw sale prices for the held-out, scaled test features.
grid_preds = grid_search.predict(scaled_X_test)

**Evaluate the model's performance on the unseen 10% scaled test set using MAE, RMSE, and R².**

In [16]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [17]:
MAE = mean_absolute_error(y_test, grid_preds)
# MAE expressed as a percentage of the mean sale price. The mean is taken from
# the loaded data instead of a hard-coded literal, so it stays correct if the
# dataset changes (the literal used before was 180149.242279 — presumably this
# same mean; verify the displayed value is unchanged).
(MAE * 100) / y.mean()

np.float64(9.451134860679408)

In [18]:
MSE = mean_squared_error(y_test, grid_preds)
RMSE = np.sqrt(MSE)
# RMSE expressed as a percentage of the mean sale price. The mean is taken from
# the loaded data instead of a hard-coded literal (previously 180149.242279 —
# presumably this same mean; verify the displayed value is unchanged).
(RMSE * 100) / y.mean()

np.float64(14.911817368749944)

In [19]:
r2 = r2_score(y_test, grid_preds)
# Coefficient of determination (R^2) of the raw-price predictions on the test set.
r2

0.895494091707424

**Working with the natural-log transform of y**

In [20]:
# Fresh estimator for the log-target run, seeded so the search is reproducible
# (101 matches the seed used for the train/test split).
model = GradientBoostingRegressor(random_state=101)

# Same search space as the raw-price run, so the two models are tuned on
# equal terms.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5],
    'min_samples_split': [5],
    'loss': ['squared_error'],
}

In [21]:
# 5-fold cross-validated search for the log-target model, scored by negative
# mean squared error with the fits run in parallel.
# NOTE: this rebinds `grid_search`, replacing the search object that was
# fitted on the raw sale price.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [22]:
# Run the grid search on the scaled training features against the natural-log sale price.
grid_search.fit(scaled_X_train_log,log_y_train)

In [23]:
# Show the parameter combination selected for the log-target model.
grid_search.best_params_

{'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'min_samples_split': 5,
 'n_estimators': 300}

In [24]:
# Predict log sale prices for the held-out, scaled test features.
grid_preds_log = grid_search.predict(scaled_X_test_log)

**Evaluate the log-target model's performance on the unseen 10% scaled test set using MAE, RMSE, and R² (all computed on the log scale).**

In [25]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [26]:
MAE = mean_absolute_error(log_y_test, grid_preds_log)
# MAE expressed as a percentage of the mean log sale price. The mean is taken
# from the loaded data instead of a hard-coded literal (previously 12.021984 —
# presumably this same mean; verify the displayed value is unchanged).
# NOTE(review): this percentage is on the log scale, so it is not directly
# comparable with the raw-price percentage; back-transform the predictions
# with np.exp to compare the two models in price units.
(MAE * 100) / log_y.mean()

np.float64(0.6704464296878929)

In [27]:
MSE = mean_squared_error(log_y_test, grid_preds_log)
RMSE = np.sqrt(MSE)
# RMSE expressed as a percentage of the mean log sale price. The mean is taken
# from the loaded data instead of a hard-coded literal (previously 12.021984 —
# presumably this same mean; verify the displayed value is unchanged).
# NOTE(review): this percentage is on the log scale, so it is not directly
# comparable with the raw-price percentage.
(RMSE * 100) / log_y.mean()

np.float64(0.9250863024036519)

In [28]:
r2 = r2_score(log_y_test,grid_preds_log)
# Coefficient of determination (R^2) of the log-price predictions on the test set.
r2

0.9299294492713883

----