In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import the dataset
dataset_path = 'dataset\Bengaluru_House_Data.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

# Working on columns

In [4]:
for col in df.select_dtypes(include='object').columns:
    unique = df[col].unique()
    print(f"Unique values in column '{col}': {unique}")
    print()

Unique values in column 'area_type': ['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']

Unique values in column 'availability': ['19-Dec' 'Ready To Move' '18-May' '18-Feb' '18-Nov' '20-Dec' '17-Oct'
 '21-Dec' '19-Sep' '20-Sep' '18-Mar' '20-Feb' '18-Apr' '20-Aug' '18-Oct'
 '19-Mar' '17-Sep' '18-Dec' '17-Aug' '19-Apr' '18-Jun' '22-Dec' '22-Jan'
 '18-Aug' '19-Jan' '17-Jul' '18-Jul' '21-Jun' '20-May' '19-Aug' '18-Sep'
 '17-May' '17-Jun' '21-May' '18-Jan' '20-Mar' '17-Dec' '16-Mar' '19-Jun'
 '22-Jun' '19-Jul' '21-Feb' 'Immediate Possession' '19-May' '17-Nov'
 '20-Oct' '20-Jun' '19-Feb' '21-Oct' '21-Jan' '17-Mar' '17-Apr' '22-May'
 '19-Oct' '21-Jul' '21-Nov' '21-Mar' '16-Dec' '22-Mar' '20-Jan' '21-Sep'
 '21-Aug' '14-Nov' '19-Nov' '15-Nov' '16-Jul' '15-Jun' '17-Feb' '20-Nov'
 '20-Jul' '16-Sep' '15-Oct' '15-Dec' '16-Oct' '22-Nov' '15-Aug' '17-Jan'
 '16-Nov' '20-Apr' '16-Jan' '14-Jul']

Unique values in column 'location': ['Electronic City Phase II' 'Chikka Tirupathi' 'Uttar

In [5]:
df['society'].unique().shape

(2689,)

In [6]:
df['society'].value_counts()
# Large no. of varying societies across the dataset

society
GrrvaGr    80
PrarePa    76
Prtates    59
Sryalan    59
GMown E    56
           ..
PronsAs     1
NantsGa     1
Vemit S     1
Shamzm      1
SJces R     1
Name: count, Length: 2688, dtype: int64

In [7]:
df['location'].unique().shape

(1306,)

In [8]:
df['location'].value_counts()

location
Whitefield                 540
Sarjapur  Road             399
Electronic City            302
Kanakpura Road             273
Thanisandra                234
                          ... 
Park View Layout             1
Xavier Layout                1
Air View Colony              1
akshaya nagar t c palya      1
mvj engineering college      1
Name: count, Length: 1305, dtype: int64

In [9]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

### Conclusion
#### 1. Society and availability have a large variation in the data and can't be analyzed easily and also it does not play an important role in the pricing of the houses so they can be excluded
#### 2. There are 4 types of area type that have been defined
#### 3. Location has a great impact on the pricing of the houses so we need to consider the location very well
#### 4. Size also has a great impact on the pricing of the houses (how many BHK) and also there is the variation in the Size like its not properly coded
#### 5. total_sqft has a great impact on pricing but the units of the total sqft are vaying so we need to deal with it
#### 6. There are null values in only location, size, society, bath and balcony

## Cleaning the data

In [10]:
# Removing the availability and the society columns from the data
df = df.drop(['availability','society'],axis=1)

In [11]:
# Dealing with the ambigious data in BHK
def check_bhk(x):
    try : 
        arr = x.split(' ')
        return int(arr[0])
    except :
        return 0

df['bhk'] = df['size'].astype(str).apply(check_bhk)
df = df[df['bhk']>0]
df = df.drop(['size'],axis=1)
df.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Built-up Area,Uttarahalli,1440,2.0,3.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Super built-up Area,Kothanur,1200,2.0,1.0,51.0,2


In [12]:
# Dealing with ambigious data in total_sqft
def rectify(x):
    try:
        arr = x.split('-')
        if len(arr)==2:
            return (float(arr[0]) + float(arr[1]))/2
        elif 'Sq. Meter' in x or 'Sq. Meters' in x:
            val = float(x.split()[0])
            return val * 10.7639
        elif 'Acre' in x or 'Acres' in x:
            val = float(x.split()[0])
            return val * 43560
        return float(x)
    except:
        return 0

df['sqft'] = df['total_sqft'].astype(str).apply(rectify)
df = df.drop(['total_sqft'],axis=1)
df.sample(4)

Unnamed: 0,area_type,location,bath,balcony,price,bhk,sqft
11005,Super built-up Area,8th Phase JP Nagar,2.0,1.0,59.0,2,1245.0
3841,Plot Area,Lakkasandra,9.0,,166.0,9,4500.0
5443,Super built-up Area,Panathur,2.0,2.0,85.0,2,1210.0
6506,Plot Area,Sadahalli,2.0,,130.0,3,3800.0


## Dealing with NA values

In [13]:
# Filling the balcony with the median as it has many missing values as compared to other cols
median = df['balcony'].median()
df['new'] = df['balcony'].fillna(median)

In [14]:
# rest of the NA values in the colns can be dropped as those are small in no.
df = df.dropna()

In [15]:
df.isnull().sum()

area_type    0
location     0
bath         0
balcony      0
price        0
bhk          0
sqft         0
new          0
dtype: int64

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12710 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_type  12710 non-null  object 
 1   location   12710 non-null  object 
 2   bath       12710 non-null  float64
 3   balcony    12710 non-null  float64
 4   price      12710 non-null  float64
 5   bhk        12710 non-null  int64  
 6   sqft       12710 non-null  float64
 7   new        12710 non-null  float64
dtypes: float64(5), int64(1), object(2)
memory usage: 893.7+ KB


In [17]:
df['balcony'] = df['balcony'].astype(int)   # Converting the floating type of balcony to int

In [18]:
df['bath'] = df['bath'].astype(int)   # Converting the floating type of the bath to the int

## Handling Location

In [19]:
df.shape

(12710, 8)

In [20]:
df_location_count = df['location'].value_counts().reset_index()

loc_greater_than_10 = list(df_location_count.loc[df_location_count['count'] > 10, 'location'])
len(loc_greater_than_10)

237

In [21]:
df['location'] = df['location'].apply(lambda x : x if x in loc_greater_than_10 else 'Other' )

In [22]:
len(df['location'].unique())
# We have converted the location of the rows to others if the total count of that particular location is less than 10

238

## Creating columns price_per_sqft , sqft_per_bhk

In [23]:
df['price_per_sqft'] = df['price']*100000 / df['sqft']      # because prices are in lakhs (100K)
df.head()

Unnamed: 0,area_type,location,bath,balcony,price,bhk,sqft,new,price_per_sqft
0,Super built-up Area,Electronic City Phase II,2,1,39.07,2,1056.0,1.0,3699.810606
1,Plot Area,Chikka Tirupathi,5,3,120.0,4,2600.0,3.0,4615.384615
2,Built-up Area,Uttarahalli,2,3,62.0,3,1440.0,3.0,4305.555556
3,Super built-up Area,Lingadheeranahalli,3,1,95.0,3,1521.0,1.0,6245.890861
4,Super built-up Area,Kothanur,2,1,51.0,2,1200.0,1.0,4250.0


In [24]:
df['sqft_per_bhk'] = df['sqft'] / df['bhk']
df.head()

Unnamed: 0,area_type,location,bath,balcony,price,bhk,sqft,new,price_per_sqft,sqft_per_bhk
0,Super built-up Area,Electronic City Phase II,2,1,39.07,2,1056.0,1.0,3699.810606,528.0
1,Plot Area,Chikka Tirupathi,5,3,120.0,4,2600.0,3.0,4615.384615,650.0
2,Built-up Area,Uttarahalli,2,3,62.0,3,1440.0,3.0,4305.555556,480.0
3,Super built-up Area,Lingadheeranahalli,3,1,95.0,3,1521.0,1.0,6245.890861,507.0
4,Super built-up Area,Kothanur,2,1,51.0,2,1200.0,1.0,4250.0,600.0


# Outlier Detection

### We can remove all the rows whose sqft_per_bhk is less than 300

In [25]:
df = df[df['sqft_per_bhk']>=300]

In [26]:
df.shape

(12013, 10)

In [27]:
min = df['price_per_sqft'].quantile(0.0015)
max = df['price_per_sqft'].quantile(0.9985)

In [28]:
df = df[(df['price_per_sqft']>=min) & df['price_per_sqft']<=max]

In [29]:
df.shape

(12013, 10)

In [30]:
# Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')
df = remove_bhk_outliers(df)

df.shape

(8368, 10)

# Model Training

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8368 entries, 0 to 13319
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   area_type       8368 non-null   object 
 1   location        8368 non-null   object 
 2   bath            8368 non-null   int64  
 3   balcony         8368 non-null   int64  
 4   price           8368 non-null   float64
 5   bhk             8368 non-null   int64  
 6   sqft            8368 non-null   float64
 7   new             8368 non-null   float64
 8   price_per_sqft  8368 non-null   float64
 9   sqft_per_bhk    8368 non-null   float64
dtypes: float64(5), int64(3), object(2)
memory usage: 719.1+ KB


In [32]:
df['location'] = df['location'].apply(lambda x: x.strip())
# Removing whitespaces from the string of the location

In [33]:
# If we notice carefully then we will find a mistake in the area_type column
df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [34]:
df.area_type = df.area_type.apply(lambda x : str(x).replace('  ',' '))

In [35]:
df = df.drop(['price_per_sqft','sqft_per_bhk'],axis=1)

In [36]:
# Copying the df to another variable so as to use for future purposes
df_before_encoding = df.copy()

In [37]:
# One hot Encoding of the dataset
df = pd.get_dummies(df,dtype=pd.Int16Dtype())
df.columns = df.columns.str.lower()
df = df.drop(['location_other','area_type_plot area'],axis=1)

In [38]:
X = df.drop(['price'],axis=1)
y = df['price']

# Train Test Split

In [39]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=0)

# Scaling

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


def find_best_model_using_gridsearchcv(X, y, cv_splits=8, test_size=0.2, random_state=0, scoring='r2'):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {}
        },
        'lasso': {
            'model': Lasso(max_iter=10000, random_state=random_state),
            'params': {
                'alpha': [0.001, 0.01, 0.1, 1],
                'selection': ['cyclic']
            }
        },
        'ridge': {
            'model': Ridge(max_iter=10000, random_state=random_state),
            'params': {
                'alpha': [0.1, 1, 10]
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'max_depth': [None, 10, 20],
                'min_samples_leaf': [1, 4]
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state, n_jobs=-1),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [10, 20],
                'max_features': ['sqrt', 0.8]
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits=cv_splits, test_size=test_size, random_state=random_state)

    for algo_name, config in algos.items():
        model = config['model']
        params = config.get('params', {})

        if not params:
            cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
            scores.append({
                'model': algo_name,
                'best_score': cv_scores.mean(),
                'best_params': {}
            })
            continue

        gs = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=-1)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    
find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.599939,{}
1,lasso,0.600059,"{'alpha': 0.001, 'selection': 'cyclic'}"
2,ridge,0.600769,{'alpha': 1}
3,decision_tree,0.630611,"{'max_depth': 10, 'min_samples_leaf': 4}"
4,random_forest,0.712926,"{'max_depth': 10, 'max_features': 0.8, 'n_esti..."


# Pipeline

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
best_random_forest_parameters = {
    'n_estimators': 200,
    'max_depth': 10,
    'max_features': 0.8,
    'random_state': 0,
    'n_jobs': -1
}

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(**best_random_forest_parameters))
])

In [43]:
pipe.fit(X_train,Y_train)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [44]:
pipe.score(X_train,Y_train)

0.8772256889705086

In [45]:
pipe.score(X_test,Y_test)

0.7347321473456743

In [46]:
def predict_price(area_type,location,total_sqft,bath,balcony,bhk):
    x=pd.DataFrame(columns=X.columns)
    x.loc[0]=np.zeros(len(X.columns))
    x['sqft'] = total_sqft
    x['bath'] = bath
    x['balcony'] = balcony
    x['bhk'] = bhk
    loc='location_'+location.lower()
    area='area_type_'+area_type.lower()
    if loc in x.columns:
        x[loc]=1
    if area in x.columns:
        x[area]=1
        
#     print(x.where(x > 0).dropna(axis=1))
    
    return pipe.predict(x)[0]

In [55]:
df.sample(4)

Unnamed: 0,bath,balcony,price,bhk,sqft,new,area_type_built-up area,area_type_carpet area,area_type_super built-up area,location_1st block jayanagar,...,location_vijayanagar,location_vishveshwarya layout,location_vishwapriya layout,location_vittasandra,location_whitefield,location_yelachenahalli,location_yelahanka,location_yelahanka new town,location_yelenahalli,location_yeshwanthpur
3412,2,1,90.0,3,1372.0,1.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9773,2,2,75.0,2,1172.0,2.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3531,2,1,98.0,3,1200.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8767,3,2,207.0,3,2266.0,2.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
df_before_encoding.sample(4,random_state=2)

Unnamed: 0,area_type,location,bath,balcony,price,bhk,sqft,new
5621,Super built-up Area,Kadugodi,3,3,112.0,3,1762.0,3.0
5067,Plot Area,Other,2,2,280.0,3,1350.0,2.0
2199,Super built-up Area,Sonnenahalli,2,1,73.0,2,1268.0,1.0
10637,Super built-up Area,Kannamangala,2,1,55.0,2,957.0,1.0


In [56]:
predict_price('Super built-up Area','Kadugodi',1762,3,3,3)

np.float64(122.04919491107395)

# Exporting model

In [50]:
import joblib

In [57]:
joblib.dump(pipe,'pipeline.pkl')

['pipeline.pkl']

In [59]:
import json
locations = list(df_before_encoding.location.unique())
area = list(df_before_encoding.area_type.unique())
locations.remove('Other')
data_columns = {
    'columns' : X.columns.to_list(),
    'locations' : locations,
    'area type' : area
}
with open('columns.json','w') as file:
    json.dump(data_columns,file,indent=4)