In [73]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [94]:
housing =  pd.read_csv("BHP.csv")

In [95]:
housing_df = housing.copy()

In [96]:
housing_df.dropna(subset=["size","bath","location"],inplace=True)

In [97]:
housing_df.loc[housing_df["availability"]!="Ready To Move","availability"] = "Not Ready To Move"
housing_df.loc[housing_df["balcony"].isna(),"balcony"] = 0
housing.society.fillna("No Society",inplace=True)

In [98]:
housing_df = housing_df.astype({"area_type":"str","availability":"str","location":"str","size":"str","society":"str","total_sqft":"str","bath":"int","balcony":"int"})

In [99]:
housing_df = housing_df[~housing_df["total_sqft"].str.contains("[a-zA-Z]")]

In [108]:
total_sqft = []

for i in housing_df["total_sqft"]:
    if(len(i.split("-"))>1):
        total_sqft.append((float(i.split("-")[0])+float(i.split("-")[1]))/2)
    else:
        total_sqft.append(float(i))

housing_df["total_sqft"] = total_sqft

In [113]:
housing_df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,3,0,416,13,456,1056.0,2,1,39.07
1,2,1,314,19,2417,2600.0,5,3,120.00
2,0,1,1173,16,2665,1440.0,2,3,62.00
3,3,1,753,16,2166,1521.0,3,1,95.00
4,3,1,712,13,2665,1200.0,2,1,51.00
...,...,...,...,...,...,...,...,...,...
13315,0,1,1246,22,206,3453.0,4,0,231.00
13316,3,1,998,18,2665,3600.0,5,0,400.00
13317,0,1,966,13,1206,1141.0,2,1,60.00
13318,3,0,901,18,2185,4689.0,4,1,488.00


In [111]:
le = LabelEncoder()
area_type = le.fit_transform( housing_df["area_type"])
housing_df["area_type"] = area_type
area_type = le.fit_transform( housing_df["availability"])
housing_df["availability"] = area_type
area_type = le.fit_transform( housing_df["location"])
housing_df["location"] = area_type
area_type = le.fit_transform( housing_df["size"])
housing_df["size"] = area_type
area_type = le.fit_transform( housing_df["society"])
housing_df["society"] = area_type

In [114]:
X = housing_df.iloc[:,[0,1,2,3,4,5,6,7]]
y = housing_df.iloc[:,[8]]

In [115]:
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.25, random_state=0)

In [116]:
scaler = StandardScaler()
X_train = scaler.fit_transform(xtrain)
X_test = scaler.transform(xtest)

In [117]:
X_train

array([[ 0.62004487, -1.98125544,  1.58567706, ..., -0.71357072,
        -1.24345109, -1.76318785],
       [ 0.62004487,  0.50473047, -1.31068099, ..., -0.41824953,
        -0.50847194, -0.60118814],
       [-0.25624981,  0.50473047, -1.05786795, ...,  0.03250386,
         0.96148636, -0.60118814],
       ...,
       [-2.00883917,  0.50473047,  1.0195957 , ...,  1.92955391,
         0.96148636,  0.56081158],
       [ 0.62004487,  0.50473047,  1.04707537, ...,  0.18638174,
         0.22650721,  1.7228113 ],
       [-0.25624981,  0.50473047, -1.20625821, ..., -0.68248428,
         0.96148636, -0.60118814]])

In [147]:
lr = LinearRegression()
lr.fit(X_train,ytrain)

In [148]:
predictions = []
for i in X_test:
    predictions.append(lr.predict([i])[0])

In [149]:
predictions

[array([104.99796311]),
 array([142.52243898]),
 array([163.20414912]),
 array([58.78145212]),
 array([38.44771877]),
 array([73.61090957]),
 array([262.03473131]),
 array([133.10354955]),
 array([120.54832035]),
 array([105.66547331]),
 array([6.61103595]),
 array([6.50625538]),
 array([61.65965588]),
 array([86.89966056]),
 array([70.26857873]),
 array([132.8343517]),
 array([167.66099418]),
 array([61.79787083]),
 array([14.03058454]),
 array([118.48096658]),
 array([111.41564117]),
 array([73.57435105]),
 array([318.22868264]),
 array([65.38399501]),
 array([79.44415542]),
 array([207.56467889]),
 array([116.86221156]),
 array([65.28078331]),
 array([96.33704953]),
 array([296.07881818]),
 array([80.47137335]),
 array([89.60198267]),
 array([72.58020832]),
 array([180.24886122]),
 array([169.62231283]),
 array([77.88312219]),
 array([75.34434249]),
 array([60.96983283]),
 array([80.55716916]),
 array([72.24260827]),
 array([4.94981021]),
 array([110.22415631]),
 array([76.90517704]

In [150]:
list(ytest["price"])

[265.0,
 175.0,
 185.0,
 36.0,
 26.0,
 52.0,
 335.0,
 139.0,
 142.0,
 78.0,
 38.0,
 39.0,
 30.0,
 75.0,
 47.39,
 115.0,
 180.0,
 82.0,
 57.0,
 80.0,
 69.0,
 48.5,
 465.0,
 33.0,
 70.0,
 550.0,
 70.25,
 46.0,
 115.0,
 249.0,
 59.0,
 30.0,
 60.0,
 239.0,
 135.0,
 81.0,
 46.13,
 35.0,
 93.0,
 48.0,
 15.0,
 72.54,
 88.0,
 58.0,
 150.0,
 54.79,
 175.0,
 60.0,
 50.0,
 56.12,
 160.0,
 360.0,
 118.0,
 95.0,
 344.0,
 175.0,
 75.0,
 43.09,
 65.0,
 45.0,
 43.0,
 46.0,
 240.0,
 145.0,
 95.0,
 75.0,
 63.0,
 240.0,
 44.0,
 180.0,
 40.0,
 40.08,
 95.0,
 550.0,
 25.0,
 44.0,
 85.0,
 400.0,
 30.0,
 240.0,
 105.0,
 78.57,
 65.0,
 58.0,
 62.0,
 57.0,
 45.0,
 290.0,
 95.0,
 55.0,
 44.5,
 230.0,
 32.0,
 44.6,
 81.0,
 64.0,
 48.0,
 50.66,
 55.0,
 44.0,
 175.0,
 59.96,
 35.0,
 98.0,
 44.0,
 65.0,
 48.45,
 125.0,
 60.0,
 200.0,
 43.5,
 40.0,
 230.0,
 120.0,
 75.0,
 180.0,
 85.0,
 230.0,
 125.0,
 150.0,
 73.0,
 44.6,
 25.88,
 90.0,
 55.0,
 60.0,
 25.0,
 63.5,
 46.8,
 49.8,
 68.0,
 270.0,
 46.0,
 180.0,
 86.0,


In [151]:
from sklearn.metrics import mean_squared_error
import math
y_actual = list(ytest["price"])
y_predicted = predictions
 
MSE = mean_squared_error(y_actual, y_predicted)
 
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)

Root Mean Square Error:

112.17603138725364
