In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("kc_house_data.csv")

In [4]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

### Data Preparation

#### Dropping 'id' and 'date' columns

In [6]:
df.drop(columns=["id", "date"], inplace=True)

In [7]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
df.shape

(21613, 19)

#### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_df, test_df = train_test_split(df, train_size=0.8)

train_df.shape, test_df.shape

((17290, 19), (4323, 19))

In [11]:
x_train = train_df.drop(columns="price")
y_train = train_df.price

x_test = test_df.drop(columns="price")
y_test = test_df.price

In [12]:
x_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17632,4,2.5,3170,4979,2.0,0,0,4,7,2570,600,1925,0,98103,47.6655,-122.34,2060,5000
4967,4,3.5,5830,131116,2.0,0,0,3,11,5830,0,2005,0,98024,47.5986,-121.949,5340,207206
8403,3,1.0,1070,5750,1.0,0,0,3,7,1070,0,1952,0,98178,47.5071,-122.255,1420,6500
4206,3,2.5,1900,7604,2.0,0,0,3,8,1900,0,1990,0,98033,47.7002,-122.188,1980,9583
4889,4,2.25,2460,7620,1.0,0,0,3,7,1230,1230,1969,0,98106,47.5285,-122.345,2090,7620


In [15]:
y_train.head()

17632     748000.0
4967     1575000.0
8403      239300.0
4206      512500.0
4889      325000.0
Name: price, dtype: float64

#### The value of target variables are huge so let's scale it for better representation

In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [21]:
y_train = y_train.values.reshape((y_train.shape[0], 1))
y_test = y_test.values.reshape((y_test.shape[0], 1))

In [22]:
scaler = MinMaxScaler()

y_train = scaler.fit_transform(y_train)

y_test = scaler.transform(y_test)

### Model Building

In [25]:
from sklearn.ensemble import AdaBoostRegressor

In [47]:
adb = AdaBoostRegressor(n_estimators=100, learning_rate=0.5)

adb.fit(x_train, y_train)

AdaBoostRegressor(learning_rate=0.5, n_estimators=100)

In [48]:
y_train_pred = adb.predict(x_train)

In [49]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [50]:
r2_score(y_train, y_train_pred)

0.26118472629902323

#### Let's do some hyperparameter tuning to get the feel of the performance

#### We are skipping the hyperparameter tuning using gridsearchcv here because the dataset is big and it will run the pc at 100% for quite some time. 

#### We already know how to do hyperparameter tuning using gridsearchcv so there's nothing new we would be doing here. 