# Bangalore House Sales Price Prediction

In [1]:
# Importing libraries
import numpy as np
import pandas as pd

In [2]:
#Import all the libraries from sklearn 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [3]:
#importing the dataset
data = pd.read_csv("Bengaluru_House_Data.csv")
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


# Simple Analysis of the dataset

In [6]:
data["location"].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Dominic Layout                      1
Nanjappa Layout Vidyaranyapura      1
Hal old airport road                1
vinayakanagar                       1
Havanur extension                   1
Name: location, Length: 1305, dtype: int64

In [7]:
data["size"].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 Bedroom       2
10 BHK           2
11 BHK           2
13 BHK           1
19 BHK           1
14 BHK           1
16 BHK           1
27 BHK           1
12 Bedroom       1
18 Bedroom       1
43 Bedroom       1
Name: size, dtype: int64

# Handling Missing Values

In [8]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [9]:
data_clean = data.copy()

In [10]:
data_clean[pd.isnull(data_clean["size"])]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
579,Plot Area,Immediate Possession,Sarjapur Road,,Asiss B,1200 - 2400,,,34.185
1775,Plot Area,Immediate Possession,IVC Road,,Orana N,2000 - 5634,,,124.0
2264,Plot Area,Immediate Possession,Banashankari,,,2400,,,460.0
2809,Plot Area,Immediate Possession,Sarjapur Road,,AsdiaAr,1200 - 2400,,,28.785
2862,Plot Area,Immediate Possession,Devanahalli,,Ajleyor,1500 - 2400,,,46.8
5333,Plot Area,Immediate Possession,Devanahalli,,Emngs S,2100 - 5405,,,177.115
6423,Plot Area,Immediate Possession,Whitefield,,SRniaGa,2324,,,26.73
6636,Plot Area,Immediate Possession,Jigani,,S2enste,1500,,,25.49
6719,Plot Area,Immediate Possession,Hoskote,,SJowsn,800 - 2660,,,28.545
7680,Plot Area,Immediate Possession,Kasavanhalli,,,5000,,,400.0


In [11]:
data_clean.dropna(axis=0, thresh = 7, inplace = True)

In [12]:
data_clean.isnull().sum()

area_type          0
availability       0
location           1
size               0
society         5499
total_sqft         0
bath              57
balcony          593
price              0
dtype: int64

In [13]:
data_clean[pd.isnull(data_clean["location"])]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
568,Super built-up Area,Ready To Move,,3 BHK,Grare S,1600,3.0,2.0,86.0


In [14]:
data_clean[data_clean["society"] == "Grare S"]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
568,Super built-up Area,Ready To Move,,3 BHK,Grare S,1600,3.0,2.0,86.0
12238,Carpet Area,Ready To Move,Anantapura,3 BHK,Grare S,1600,3.0,2.0,77.0


In [15]:
data_clean["location"] =  data_clean["location"].replace(to_replace = np.nan, value = "Anantapura")

In [16]:
data_clean.isnull().sum()

area_type          0
availability       0
location           0
size               0
society         5499
total_sqft         0
bath              57
balcony          593
price              0
dtype: int64

In [17]:
data_clean["size"].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [18]:
data_clean["bhk"] = data_clean["size"].apply(lambda x: int(x.split(" ")[0]))

In [19]:
data_clean[pd.isnull(data_clean["bath"])]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk
56,Built-up Area,20-Feb,Devanahalli,4 Bedroom,BrereAt,3010 - 3410,,,192.0,4
81,Built-up Area,18-Oct,Hennur Road,4 Bedroom,Gollela,2957 - 3450,,,224.5,4
224,Super built-up Area,19-Dec,Devanahalli,3 BHK,Jurdsig,1520 - 1740,,,74.82,3
344,Super built-up Area,21-Dec,Kanakpura Road,1 BHK,PrarePa,525,,,21.53,1
669,Super built-up Area,18-Dec,JP Nagar,5 BHK,Pehtsa,4400 - 6640,,,375.0,5
702,Super built-up Area,18-Dec,JP Nagar,5 BHK,Pehtsa,4400 - 6800,,,548.5,5
801,Super built-up Area,18-Dec,JP Nagar,4 BHK,Pehtsa,4000 - 5249,,,453.0,4
941,Super built-up Area,Ready To Move,Whitefield,4 Bedroom,PrOakSi,3606 - 5091,,,304.0,4
1264,Built-up Area,18-May,Hennur,3 Bedroom,Asoilul,2264,,,155.0,3
1267,Super built-up Area,18-Jun,Yelahanka,3 BHK,Shalkri,1440 - 1884,,,67.98,3


In [20]:
def isFloat(x):
    try:
        float(x)
    except:
        return False
    return True

data_clean[~data_clean["total_sqft"].apply(isFloat)]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,19-Dec,Yelahanka,4 BHK,LedorSa,2100 - 2850,4.0,0.0,186.000,4
56,Built-up Area,20-Feb,Devanahalli,4 Bedroom,BrereAt,3010 - 3410,,,192.000,4
81,Built-up Area,18-Oct,Hennur Road,4 Bedroom,Gollela,2957 - 3450,,,224.500,4
122,Super built-up Area,18-Mar,Hebbal,4 BHK,SNontle,3067 - 8156,4.0,0.0,477.000,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,2 BHK,Vaarech,1042 - 1105,2.0,0.0,54.005,2
...,...,...,...,...,...,...,...,...,...,...
12990,Super built-up Area,18-May,Talaghattapura,3 BHK,Sodgere,1804 - 2273,3.0,0.0,122.000,3
13059,Super built-up Area,Ready To Move,Harlur,2 BHK,Shodsir,1200 - 1470,2.0,0.0,72.760,2
13240,Super built-up Area,Ready To Move,Devanahalli,1 BHK,Pardsri,1020 - 1130,,,52.570,1
13265,Super built-up Area,20-Sep,Hoodi,2 BHK,Ranuetz,1133 - 1384,2.0,0.0,59.135,2


In [21]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None   

In [22]:
data_clean["sqft"] = data_clean["total_sqft"].apply(convert_sqft_to_num)

In [23]:
data_clean["sqft"].describe()

count    13258.000000
mean      1558.812282
std       1238.432276
min          1.000000
25%       1100.000000
50%       1275.000000
75%       1680.000000
max      52272.000000
Name: sqft, dtype: float64

In [24]:
data_clean.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk,sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,2,1056.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,4,2600.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0,3,1440.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,3,1521.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0,2,1200.0


In [25]:
data_clean.isnull().sum()

area_type          0
availability       0
location           0
size               0
society         5499
total_sqft         0
bath              57
balcony          593
price              0
bhk                0
sqft              46
dtype: int64

In [26]:
bhk_groupby_bathroom = data_clean.groupby("bhk")["bath"].value_counts()

In [27]:
def FillBathrooms(bhk_groupby_bathroom, row):
    if pd.isnull(row["bath"]):
        return int(bhk_groupby_bathroom[row["bhk"]].index[0]) 
    else:
        return int(row["bath"])

In [28]:
bhk_groupby_bathroom = data_clean.groupby("bhk")["bath"].value_counts()
data_clean["bath"] = data_clean.apply(lambda row: FillBathrooms(bhk_groupby_bathroom, row), axis=1)

In [29]:
def FillBalcony(bhk_groupby_balcony, row):
    if pd.isnull(row["balcony"]):
        return int(bhk_groupby_bathroom[row["bhk"]].index[0]) 
    else:
        return int(row["balcony"])

In [30]:
bhk_groupby_balcony = data_clean.groupby("bhk")["balcony"].value_counts()
data_clean["balcony"] = data_clean.apply(lambda row: FillBalcony(bhk_groupby_balcony, row), axis=1)

In [31]:
data_clean.drop(["society", "size", "total_sqft"], inplace = True, axis=1)

In [32]:
data_clean['sqft'] = data_clean['sqft'].fillna(data_clean['sqft'].mean())

In [33]:
data_clean.isnull().sum()

area_type       0
availability    0
location        0
bath            0
balcony         0
price           0
bhk             0
sqft            0
dtype: int64

In [34]:
data_clean.head()

Unnamed: 0,area_type,availability,location,bath,balcony,price,bhk,sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2,1,39.07,2,1056.0
1,Plot Area,Ready To Move,Chikka Tirupathi,5,3,120.0,4,2600.0
2,Built-up Area,Ready To Move,Uttarahalli,2,3,62.0,3,1440.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,1,95.0,3,1521.0
4,Super built-up Area,Ready To Move,Kothanur,2,1,51.0,2,1200.0


In [35]:
def Availability(x):
    values = x.split("-")
    try:
        if len(values) > 1:
            return "Soon to be Vacated"
        else:
            return x
    except:
            return ""

In [36]:
data_clean["availability"] = data_clean["availability"].apply(Availability)

# Feature Selection for Outlier Dection 

In [37]:
data_clean.location = data_clean.location.apply(lambda x: x.strip())
location_stats = data_clean['location'].value_counts(ascending=False)
location_stats

Whitefield                            540
Sarjapur  Road                        397
Electronic City                       304
Kanakpura Road                        273
Thanisandra                           237
                                     ... 
Manganahalli                            1
Double Road                             1
Kirloskar layout, Basaveshwarnagar      1
Bettadasanapura                         1
Subhash Nagar                           1
Name: location, Length: 1293, dtype: int64

In [38]:
len(location_stats[location_stats>10])

241

In [39]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

Basapura                              10
Naganathapura                         10
1st Block Koramangala                 10
Dodsworth Layout                      10
Nagadevanahalli                       10
                                      ..
Manganahalli                           1
Double Road                            1
Kirloskar layout, Basaveshwarnagar     1
Bettadasanapura                        1
Subhash Nagar                          1
Name: location, Length: 1052, dtype: int64

In [40]:
data_clean.location = data_clean.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(data_clean.location.unique())

242

In [41]:
data_clean.head(10)

Unnamed: 0,area_type,availability,location,bath,balcony,price,bhk,sqft
0,Super built-up Area,Soon to be Vacated,Electronic City Phase II,2,1,39.07,2,1056.0
1,Plot Area,Ready To Move,Chikka Tirupathi,5,3,120.0,4,2600.0
2,Built-up Area,Ready To Move,Uttarahalli,2,3,62.0,3,1440.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,1,95.0,3,1521.0
4,Super built-up Area,Ready To Move,Kothanur,2,1,51.0,2,1200.0
5,Super built-up Area,Ready To Move,Whitefield,2,1,38.0,2,1170.0
6,Super built-up Area,Soon to be Vacated,Old Airport Road,4,4,204.0,4,2732.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4,4,600.0,4,3300.0
8,Super built-up Area,Ready To Move,Marathahalli,3,1,63.25,3,1310.0
9,Plot Area,Ready To Move,other,6,6,370.0,6,1020.0


In [42]:
data_clean["price_per_sqft"] = data_clean["price"] * 100000 / data_clean["sqft"]

In [43]:
data_clean["sqft_per_bhk"] = data_clean["sqft"] / data_clean["bhk"]

In [44]:
data_clean.head()

Unnamed: 0,area_type,availability,location,bath,balcony,price,bhk,sqft,price_per_sqft,sqft_per_bhk
0,Super built-up Area,Soon to be Vacated,Electronic City Phase II,2,1,39.07,2,1056.0,3699.810606,528.0
1,Plot Area,Ready To Move,Chikka Tirupathi,5,3,120.0,4,2600.0,4615.384615,650.0
2,Built-up Area,Ready To Move,Uttarahalli,2,3,62.0,3,1440.0,4305.555556,480.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,1,95.0,3,1521.0,6245.890861,507.0
4,Super built-up Area,Ready To Move,Kothanur,2,1,51.0,2,1200.0,4250.0,600.0


In [45]:
# Removing samples of sqft_per_room less than 300 and more than 1500
data_clean[data_clean["sqft_per_bhk"] < 300].count()

area_type         748
availability      748
location          748
bath              748
balcony           748
price             748
bhk               748
sqft              748
price_per_sqft    748
sqft_per_bhk      748
dtype: int64

In [46]:
data_clean[data_clean["sqft_per_bhk"] > 1500].count()

area_type         92
availability      92
location          92
bath              92
balcony           92
price             92
bhk               92
sqft              92
price_per_sqft    92
sqft_per_bhk      92
dtype: int64

In [47]:
data_clean = data_clean[~(data_clean["sqft_per_bhk"] < 300)]
data_clean = data_clean[~(data_clean["sqft_per_bhk"] > 1500)]

In [48]:
data_clean.sort_values(["price"], ascending=False)

Unnamed: 0,area_type,availability,location,bath,balcony,price,bhk,sqft,price_per_sqft,sqft_per_bhk
13067,Plot Area,Ready To Move,other,13,10,3600.0,10,7150.000000,50349.650350,715.000000
13200,Plot Area,Ready To Move,other,6,3,2800.0,6,8000.000000,35000.000000,1333.333333
12443,Plot Area,Ready To Move,other,8,4,2600.0,4,4350.000000,59770.114943,1087.500000
6421,Plot Area,Soon to be Vacated,Bommenahalli,3,2,2250.0,4,2940.000000,76530.612245,735.000000
7727,Super built-up Area,Ready To Move,other,6,4,1900.0,4,5422.000000,35042.419771,1355.500000
...,...,...,...,...,...,...,...,...,...,...
1471,Built-up Area,Soon to be Vacated,Kengeri,1,1,10.0,1,340.000000,2941.176471,340.000000
4113,Super built-up Area,Soon to be Vacated,BTM Layout,3,2,10.0,3,1558.812282,641.514063,519.604094
5410,Super built-up Area,Ready To Move,Attibele,1,1,10.0,1,400.000000,2500.000000,400.000000
11091,Built-up Area,Ready To Move,Attibele,1,1,10.0,1,410.000000,2439.024390,410.000000


In [49]:
data_clean.shape

(12464, 10)

In [50]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

In [51]:
data_clean = remove_bhk_outliers(data_clean)

In [52]:
data_clean = data_clean[data_clean.bath < data_clean.bhk+2]
data_clean.shape

(8514, 10)

# Dropping Correlations based on Attributes 

In [53]:
data_clean.corr()["price"].sort_values(ascending=False)

price             1.000000
sqft              0.753997
price_per_sqft    0.743811
bath              0.583275
bhk               0.578439
sqft_per_bhk      0.443535
balcony           0.258580
Name: price, dtype: float64

In [54]:
data_clean.drop(["price_per_sqft", "sqft_per_bhk", "balcony"], axis = 1, inplace = True)

In [55]:
price = data_clean["price"]
data_clean.drop(["price"], axis = 1, inplace = True)

In [56]:
data_clean = data_clean[["sqft", "bhk", "bath", "availability", "area_type", "location"]]

# Handling Categorical Features

In [57]:
le_avail = LabelEncoder()
data_availability = le_avail.fit_transform(data_clean.iloc[:,3])

In [58]:
le_areaty = LabelEncoder()
data_area_type = le_areaty.fit_transform(data_clean.iloc[:,4])

In [59]:
le_loc = LabelEncoder()
data_location = le_loc .fit_transform(data_clean.iloc[:,5])

In [60]:
ohe1 = OneHotEncoder()
data_availability = ohe1.fit_transform(data_availability.reshape(-1,1))
data_availability = pd.DataFrame(data_availability.toarray(), columns=le_avail.classes_)

In [61]:
ohe_areaty = OneHotEncoder()
data_area_type = ohe_areaty.fit_transform(data_area_type.reshape(-1,1))
data_area_type = pd.DataFrame(data_area_type.toarray(), columns=le_areaty.classes_)

In [62]:
ohe_loc = OneHotEncoder()
data_location = ohe_loc.fit_transform(data_location.reshape(-1,1))
data_location = pd.DataFrame(data_location.toarray(), columns=le_loc.classes_)

# Model for Price Prediction

In [63]:
data_availability.drop([data_availability.columns[len(data_availability.columns)-1]], axis=1, inplace = True)
data_area_type.drop([data_area_type.columns[len(data_area_type.columns)-1]], axis=1, inplace = True)
data_location.drop([data_location.columns[len(data_location.columns)-1]], axis=1, inplace = True)

In [64]:
data_num_features = data_clean.iloc[:, 0:3].reset_index()
data_num_features.drop(["index"], axis = 1, inplace = True)

In [65]:
std_scaler = StandardScaler()
data_num_scaled_features = pd.DataFrame(std_scaler.fit_transform(data_num_features), columns=data_num_features.columns)

In [66]:
data_for_model = pd.concat([data_num_scaled_features, data_availability, data_area_type, data_location], axis=1)

In [67]:
data_for_model

Unnamed: 0,sqft,bhk,bath,Ready To Move,Built-up Area,Carpet Area,Plot Area,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,-0.592729,-0.577824,-0.497286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.106833,0.419661,-0.497286,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.004339,0.419661,0.466057,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.410518,-0.577824,-0.497286,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.528004,1.417146,1.429401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8509,0.241139,0.419661,0.466057,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8510,2.626331,1.417146,2.392745,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8511,-0.485174,-0.577824,-0.497286,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8512,4.004301,1.417146,1.429401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
X, y = data_for_model, price
X_train, X_test, y_train, y_test = train_test_split(data_for_model, price, test_size = 0.2, random_state = 42)

In [69]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6811, 248), (1703, 248), (6811,), (1703,))

In [70]:
y_test

6886      60.00
7054      70.00
9260      68.00
12896    680.00
1468      65.65
          ...  
11602    110.00
858       92.00
8403     150.00
1198      97.42
10703     50.00
Name: price, Length: 1703, dtype: float64

In [71]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [72]:
ridge_reg = Ridge(alpha = 0.1)
ridge_reg.fit(X_train, y_train)
ridge_reg.score(X_test, y_test)

0.7487772996671358

In [73]:
lasso_reg = Lasso(alpha = 0.1)
lasso_reg.fit(X_train, y_train)
lasso_reg.score(X_test, y_test)

0.7305152333892115

# Linear Regression(since it performs better)

In [74]:
pred = ridge_reg.predict(X_test)
pred

array([ 76.88814816,  70.98005541,  91.53681665, ..., 184.65805111,
       118.2405308 ,  69.4501586 ])

In [76]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return ridge_reg.predict([x])[0]

In [77]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

88387.3146519857

In [78]:
predict_price('Vishveshwarya Layout',1500, 1, 2)

132363.0868769958

In [79]:
predict_price('Whitefield',1300, 2, 3)

114796.933941944

In [80]:
pred = lasso_reg.predict(X_test)
pred

array([ 70.50818569,  71.46953598, 113.23335915, ..., 135.56058856,
       151.78811097,  11.67057862])

In [81]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lasso_reg.predict([x])[0]

In [1]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

NameError: name 'predict_price' is not defined

In [83]:
predict_price('Vishveshwarya Layout',1500, 1, 2)

136523.30157764748

In [84]:
predict_price('Whitefield',1300, 2, 3)

118321.04729705869