In [34]:
import pandas as pd 
import numpy as np 

In [14]:
# Load the raw housing table; the path is relative to the notebook's folder.
housing_csv_path = '../Dataset/Housing.csv'
house_df = pd.read_csv(housing_csv_path)

In [15]:
# Peek at five random rows. The draw is seeded (same seed as the train/test
# split) so the preview is identical after Restart Kernel -> Run All.
house_df.sample(5, random_state=13)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
434,3290000,3792,4,1,2,yes,no,no,no,no,0,no,semi-furnished
436,3290000,2145,3,1,2,yes,no,yes,no,no,0,yes,furnished
202,4900000,4120,2,1,1,yes,no,yes,no,no,1,no,semi-furnished
334,3920000,3290,2,1,1,yes,no,no,yes,no,1,no,furnished
538,1890000,3649,2,1,1,yes,no,no,no,no,0,no,unfurnished


In [17]:
# Check column dtypes and non-null counts before any preprocessing.
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


### Data Preprocessing 

In [19]:
# List the distinct furnishingstatus labels so each one can be given a code.
house_df['furnishingstatus'].unique()

array(['furnished', 'semi-furnished', 'unfurnished'], dtype=object)

In [20]:
# Ordinal encoding for furnishingstatus: a higher code means more furnished.
mapping_dict = {
    'furnished' : 2,
    'semi-furnished' : 1,
    'unfurnished' : 0
}

# Series.map turns every value missing from the dict into NaN, so a plain
# re-run of this cell would wipe the already-encoded column. Encode only while
# the raw labels are still present, and fail loudly on labels we do not know.
furnishing_values = house_df['furnishingstatus']
furnishing_present = furnishing_values.dropna()
already_encoded = furnishing_present.isin(list(mapping_dict.values())).all()

if not already_encoded:
    unexpected_labels = sorted(
        set(furnishing_present.unique()) - set(mapping_dict), key=str
    )
    if unexpected_labels:
        raise ValueError(
            f"Unmapped furnishingstatus values: {unexpected_labels}"
        )
    house_df['furnishingstatus'] = furnishing_values.map(mapping_dict)

In [21]:
# Preview the first rows to confirm furnishingstatus now holds numeric codes.
house_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,2
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,2
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,1
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,2
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,2


In [31]:
# Encoding for the yes/no flag columns.
mapping_dict_for_binary_columns = {
    'yes' : 1,
    'no' : 0
}

# Look at every non-numeric column rather than testing `dtype == object`:
# text columns stored with pandas' dedicated string dtype are not `object`
# and would otherwise be skipped without any error.
for column in house_df.select_dtypes(exclude='number').columns:
    observed_values = set(house_df[column].dropna().unique())
    # Encode only genuine yes/no columns. Mapping any other column would
    # silently replace all of its values with NaN, so those are left as-is.
    if observed_values <= set(mapping_dict_for_binary_columns):
        house_df[column] = house_df[column].map(mapping_dict_for_binary_columns)

In [32]:
# Re-check dtypes after encoding to confirm the former text columns are numeric.
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             545 non-null    int64
 1   area              545 non-null    int64
 2   bedrooms          545 non-null    int64
 3   bathrooms         545 non-null    int64
 4   stories           545 non-null    int64
 5   mainroad          545 non-null    int64
 6   guestroom         545 non-null    int64
 7   basement          545 non-null    int64
 8   hotwaterheating   545 non-null    int64
 9   airconditioning   545 non-null    int64
 10  parking           545 non-null    int64
 11  prefarea          545 non-null    int64
 12  furnishingstatus  545 non-null    int64
dtypes: int64(13)
memory usage: 55.5 KB


In [33]:
# Preview the first rows after the yes/no columns were encoded as 1/0.
house_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,2
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,2
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,2
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,2


### Import required libraries

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [36]:
# Features are every column except the target. Selecting by name (instead of
# "everything after the first column") keeps the target out of X even if the
# column order of the frame changes.
X = house_df.drop(columns='price')
y = house_df['price']

In [39]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

In [41]:
# Sanity-check the split sizes: train/test features, then train/test targets.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(436, 12)
(109, 12)
(436,)
(109,)


### Linear Regression. 

In [43]:
# Unregularised linear regression fitted on the training split; used as the
# reference point for the penalised models in the later sections.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [44]:
# Predict prices for the held-out rows. Note: `y_pred` is overwritten by each
# later model section, so it always refers to the most recently run model.
y_pred = lr_model.predict(X_test)

In [47]:
# Report held-out R2 followed by the fitted coefficients and intercept.
for report_label, report_value in (
    ('R2 Score : ', r2_score(y_test, y_pred)),
    ('Coff_ are : ', lr_model.coef_),
    ('Intercept_ is : ', lr_model.intercept_),
):
    print(report_label, report_value)

R2 Score :  0.6457161339804305
Coff_ are :  [2.27917971e+02 1.13189385e+05 9.59054811e+05 4.35602396e+05
 5.02158806e+05 2.99226143e+05 4.13489752e+05 1.03020630e+06
 8.49228123e+05 3.53692839e+05 6.12476827e+05 2.15229695e+05]
Intercept_ is :  -283217.1943577323


### Apply Ridge Regression.

In [49]:
# Ridge regression with penalty strength alpha=1, fitted on the training split.
# NOTE(review): no feature-scaling step is visible before this fit, so the
# penalty acts on columns with very different scales (area in the thousands
# vs. 0/1 flags) — confirm this is intended.
rd_model = Ridge(alpha=1)
rd_model.fit(X_train, y_train)

In [50]:
# Predict held-out prices with the ridge model (replaces the previous y_pred).
y_pred = rd_model.predict(X_test)

In [51]:
# Report held-out R2 followed by the ridge coefficients and intercept.
for report_label, report_value in (
    ('R2 Score : ', r2_score(y_test, y_pred)),
    ('Coff_ are : ', rd_model.coef_),
    ('Intercept_ is : ', rd_model.intercept_),
):
    print(report_label, report_value)

R2 Score :  0.6468149113618538
Coff_ are :  [2.29147297e+02 1.15750653e+05 9.50963134e+05 4.37192170e+05
 4.92149493e+05 2.97514848e+05 4.12289204e+05 9.76492885e+05
 8.37000692e+05 3.55028153e+05 6.05735547e+05 2.16921035e+05]
Intercept_ is :  -274610.69564767927


### Apply LASSO Regression.

In [62]:
# Lasso regression with alpha=0.3 and the solver capped at 400 iterations.
# NOTE(review): the features are not scaled before this fit and max_iter is
# set explicitly to 400 — check the cell output for a ConvergenceWarning and
# confirm that alpha is meaningful at this price scale.
ls_model = Lasso(alpha=0.3, max_iter=400)
ls_model.fit(X_train, y_train)

In [63]:
# Predict held-out prices with the lasso model (replaces the previous y_pred).
y_pred = ls_model.predict(X_test)

In [64]:
# Report held-out R2 followed by the lasso coefficients and intercept.
for report_label, report_value in (
    ('R2 Score : ', r2_score(y_test, y_pred)),
    ('Coff_ are : ', ls_model.coef_),
    ('Intercept_ is : ', ls_model.intercept_),
):
    print(report_label, report_value)

R2 Score :  0.6457163706250292
Coff_ are :  [2.27918299e+02 1.13189214e+05 9.59054232e+05 4.35602501e+05
 5.02156352e+05 2.99224820e+05 4.13489260e+05 1.03019914e+06
 8.49226592e+05 3.53692794e+05 6.12475678e+05 2.15229671e+05]
Intercept_ is :  -283214.13476149365


### ElasticNet Regression.

In [66]:
# Elastic net with alpha=1 and l1_ratio=0.5, fitted on the training split.
# NOTE(review): as with the other penalised models, no feature scaling is
# visible before this fit — confirm the penalty is intended to act on the
# raw column scales.
el_model = ElasticNet(alpha=1, l1_ratio=0.5)
el_model.fit(X_train, y_train)

In [67]:
# Predict held-out prices with the elastic-net model (replaces the previous y_pred).
y_pred = el_model.predict(X_test)

In [68]:
# Report held-out R2 followed by the elastic-net coefficients and intercept.
for report_label, report_value in (
    ('R2 Score : ', r2_score(y_test, y_pred)),
    ('Coff_ are : ', el_model.coef_),
    ('Intercept_ is : ', el_model.intercept_),
):
    print(report_label, report_value)

R2 Score :  0.5873993211855472
Coff_ are :  [3.45125048e+02 1.99994676e+05 3.67769762e+05 3.54041532e+05
 1.20452362e+05 1.43351425e+05 1.84924798e+05 7.87474861e+04
 2.82396502e+05 2.57190941e+05 2.04079067e+05 2.02669694e+05]
Intercept_ is :  602382.6460501547
