In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/nirshad97/Research-Project/main/cleansing-viz/ToPythonData.csv")
data.head()

Unnamed: 0,location,area,unit,bedrooms,bathrooms_w_cs,floor_area,no_of_floors,car_parking_spaces,availability,price,district
0,rajagiriya,6.0,perches,3,,1500.0,,,Available now,35.0,Colombo
1,colombo 5,20.0,perches,5,5.0,,2.0,2.0,Available now,300.0,Colombo
2,colombo 5,8.0,perches,4,3.0,2750.0,2.0,1.0,Available now,70.0,Colombo
3,colombo 3,10.0,perches,4,3.0,3200.0,2.0,2.0,Available now,210.0,Colombo
4,padukka,153.0,perches,5,5.0,5200.0,3.0,6.0,Available now,70.0,Colombo


In [3]:
def outlier_detection(colname):
  upper_limit = data[colname].mean() + 3*data[colname].std()
  lower_limit = data[colname].mean() - 3*data[colname].std()
  return upper_limit, lower_limit


In [4]:
cleaned_df = data[(data["price"] > outlier_detection("price")[1]) & (data["price"] < outlier_detection("price")[0])]
cleaned_df = cleaned_df[(cleaned_df["no_of_floors"] > outlier_detection("no_of_floors")[1]) & (cleaned_df["no_of_floors"] < outlier_detection("no_of_floors")[0])]
cleaned_df = cleaned_df[(cleaned_df["bathrooms_w_cs"] > outlier_detection("bathrooms_w_cs")[1]) & (cleaned_df["bathrooms_w_cs"] < outlier_detection("bathrooms_w_cs")[0])]

In [5]:
cleaned_df["derived_col"] = cleaned_df["unit"].apply(lambda x: 160 if x=="acres" else 1)
cleaned_df["derived_area"] = cleaned_df["derived_col"] * cleaned_df["area"]

In [6]:
COLS_NEEDED = ["derived_area", "bedrooms", "bathrooms_w_cs", "floor_area", "no_of_floors", "car_parking_spaces",
               "price", "district"]
df = cleaned_df[COLS_NEEDED]

In [7]:
df["car_parking_spaces"] = pd.to_numeric(df["car_parking_spaces"].replace("None", "")) 

In [8]:
df[["bathrooms_w_cs", "no_of_floors", "car_parking_spaces"]] = df[["bathrooms_w_cs", "no_of_floors", "car_parking_spaces"]].fillna(0)

In [9]:

MEAN_AREA_DA = np.mean(df["derived_area"])
MEAN_AREA_FA = np.mean(df["floor_area"])

In [10]:
df["derived_area"] = df["derived_area"].fillna(MEAN_AREA_DA)
df["floor_area"] = df["floor_area"].fillna(MEAN_AREA_FA)

In [11]:
df["no_of_floors"].max()

7.0

In [12]:
COLS_TO_ONE_HOT = ["bathrooms_w_cs", "district", "no_of_floors", 'car_parking_spaces']
df1 = df[COLS_TO_ONE_HOT]

In [13]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown="ignore")
ohe_values = ohe.fit_transform(df1).toarray()
ohe_columns = ohe.get_feature_names_out()
df2 = pd.DataFrame(ohe_values, columns=ohe_columns)

In [14]:
df2.head()

Unnamed: 0,bathrooms_w_cs_1.0,bathrooms_w_cs_2.0,bathrooms_w_cs_3.0,bathrooms_w_cs_4.0,bathrooms_w_cs_5.0,bathrooms_w_cs_6.0,bathrooms_w_cs_7.0,district_Anuradhapura,district_Badulla,district_Colombo,district_Galle,district_Gampaha,district_Kalutara,district_Kandy,district_Kegalle,district_Kurunegala,district_Matale,district_Matara,district_Polonnaruwa,district_Puttalam,district_Ratnapura,district_Trincomalee,no_of_floors_1.0,no_of_floors_2.0,no_of_floors_3.0,no_of_floors_4.0,no_of_floors_5.0,no_of_floors_7.0,car_parking_spaces_0.0,car_parking_spaces_1.0,car_parking_spaces_2.0,car_parking_spaces_3.0,car_parking_spaces_4.0,car_parking_spaces_5.0,car_parking_spaces_6.0
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
df.head()

Unnamed: 0,derived_area,bedrooms,bathrooms_w_cs,floor_area,no_of_floors,car_parking_spaces,price,district
1,20.0,5,5.0,3758.769499,2.0,2.0,300.0,Colombo
2,8.0,4,3.0,2750.0,2.0,1.0,70.0,Colombo
3,10.0,4,3.0,3200.0,2.0,2.0,210.0,Colombo
4,153.0,5,5.0,5200.0,3.0,6.0,70.0,Colombo
5,9.15,5,3.0,1800.0,2.0,2.0,26.5,Colombo


In [16]:
df[["derived_area", "floor_area"]]

Unnamed: 0,derived_area,floor_area
1,20.000000,3758.769499
2,8.000000,2750.000000
3,10.000000,3200.000000
4,153.000000,5200.000000
5,9.150000,1800.000000
...,...,...
2989,11.830000,3167.000000
2991,15.000000,3758.769499
2992,28.220961,2500.000000
2994,17.000000,2000.000000


In [17]:
from sklearn.preprocessing import Normalizer

In [18]:
norm = Normalizer()

In [19]:
norm_values = norm.fit_transform(df[["derived_area", "floor_area"]])
norm_cols = ["norm_derived_area", "norm_floor_area"]

In [20]:


df3 = pd.DataFrame(norm_values, columns=norm_cols)

In [21]:
df3.head()

Unnamed: 0,norm_derived_area,norm_floor_area
0,0.005321,0.999986
1,0.002909,0.999996
2,0.003125,0.999995
3,0.02941,0.999567
4,0.005083,0.999987


In [22]:
final_df = pd.concat([df2, df3], axis=1)

In [23]:
X = final_df
y = df["price"].fillna(value=np.mean(df["price"]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
pred_values = lin_reg.predict(X_test)

In [24]:
final_df.columns

Index(['bathrooms_w_cs_1.0', 'bathrooms_w_cs_2.0', 'bathrooms_w_cs_3.0',
       'bathrooms_w_cs_4.0', 'bathrooms_w_cs_5.0', 'bathrooms_w_cs_6.0',
       'bathrooms_w_cs_7.0', 'district_Anuradhapura', 'district_Badulla',
       'district_Colombo', 'district_Galle', 'district_Gampaha',
       'district_Kalutara', 'district_Kandy', 'district_Kegalle',
       'district_Kurunegala', 'district_Matale', 'district_Matara',
       'district_Polonnaruwa', 'district_Puttalam', 'district_Ratnapura',
       'district_Trincomalee', 'no_of_floors_1.0', 'no_of_floors_2.0',
       'no_of_floors_3.0', 'no_of_floors_4.0', 'no_of_floors_5.0',
       'no_of_floors_7.0', 'car_parking_spaces_0.0', 'car_parking_spaces_1.0',
       'car_parking_spaces_2.0', 'car_parking_spaces_3.0',
       'car_parking_spaces_4.0', 'car_parking_spaces_5.0',
       'car_parking_spaces_6.0', 'norm_derived_area', 'norm_floor_area'],
      dtype='object')

In [25]:
mean_squared_error(y_test, pred_values)

8.36906005943641e+27

In [26]:
final_df.shape

(1474, 37)

In [29]:
def final_model(model):
  model = model()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  error_term = mean_squared_error(y_test, pred_values)
  print("Error: ", error_term)


In [30]:
final_model(LinearRegression)

Error:  8.36906005943641e+27
