In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestRegressor

In [126]:
# Read the house-rent dataset from the CSV next to the notebook and preview
# the first five rows (five is the default for DataFrame.head).
csv_path = './house_rent_final.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,BHK,Price_per_Sqft,Main_Locality,Sqft,Rent
0,2,3800.0,pal,1173.684211,12636.67
1,2,3600.0,palanpur,1250.0,11250.0
2,3,3392.0,vesu,1300.117925,16170.0
3,2,3751.0,palanpur,1181.018395,11075.0
4,2,3200.0,palanpur,1250.0,10000.0


In [127]:
# Display the column labels of the freshly loaded frame.
data.columns

Index(['BHK', 'Price_per_Sqft', 'Main_Locality', 'Sqft', 'Rent'], dtype='object')

In [128]:
# Remove the 'Price_per_Sqft' column from `data`.
# Rebinding `data` (rather than mutating with inplace=True) and passing
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError because the column has already been removed.
data = data.drop(columns=['Price_per_Sqft'], errors='ignore')

In [129]:
# Confirm which columns remain after dropping 'Price_per_Sqft'.
data.columns

Index(['BHK', 'Main_Locality', 'Sqft', 'Rent'], dtype='object')

In [130]:
# Separate the target from the features without touching `data` itself:
#   y -> the 'Rent' column (independent copy)
#   x -> every remaining column, in the original order
y = data['Rent'].copy()
x = data.drop(columns=['Rent'])

In [131]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

In [132]:
# Hold out a validation set: 80% of the rows go to training, the remainder to
# validation. random_state=0 makes the shuffle reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=0,
)

In [133]:
# Names of the training columns whose dtype is int64 or float64.
num_cols = [
    name
    for name, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [134]:
# Names of the training columns stored with the generic 'object' dtype.
cate_cols = [
    name
    for name, col_dtype in X_train.dtypes.items()
    if col_dtype == 'object'
]

In [135]:
# Report both column groups, one per line.
print(
    f'Numerical columns: {num_cols}',
    f'Categorical columns: {cate_cols}',
    sep='\n',
)

Numerical columns: ['BHK', 'Sqft']
Categorical columns: ['Main_Locality']


In [136]:
# Numeric columns: fill missing values with the column median.
num_tra = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' is passed so that categories
# not seen during fit do not raise at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_tra = Pipeline(steps=cat_steps)

In [137]:
# Route each column group to its transformer:
#   'num' -> num_tra applied to num_cols
#   'cat' -> cat_tra applied to cate_cols
column_steps = [
    ('num', num_tra, num_cols),
    ('cat', cat_tra, cate_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [138]:
# Baseline regressor: a random forest with 100 trees; random_state=0 fixes the
# seed so repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [139]:
# Chain preprocessing and the regressor so both are fitted and applied
# together through a single estimator object.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
clf = Pipeline(steps=pipeline_steps)

In [140]:
# Fit on the training split, predict the validation split, and report the
# mean absolute error between predictions and the true validation targets.
preds = clf.fit(X_train, y_train).predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print('MAE :', mae)

MAE : 2194.538568197202


In [141]:
# Second experiment: the same preprocessing, but a forest of 1000 trees.
# `model` and `clf` are rebound, so later cells see this larger pipeline.
model = RandomForestRegressor(random_state=0, n_estimators=1000)
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Train, predict on the validation split, and print the validation MAE.
clf.fit(X_train, y_train)
preds = clf.predict(X_valid)
print('MAE :', mean_absolute_error(y_true=y_valid, y_pred=preds))

MAE : 2156.3183485682707


In [142]:
import joblib

# Persist the fitted pipeline `clf` to disk. The call is left as the cell's
# final expression so the list of written filenames is displayed.
joblib.dump(value=clf, filename='model.pkl')

['model.pkl']

In [143]:
# Column labels of `data`; the features used for training are all of these
# except 'Rent', which was split off as the target.
data.columns

Index(['BHK', 'Main_Locality', 'Sqft', 'Rent'], dtype='object')

In [144]:
# Distinct values of the 'Main_Locality' column, in order of first appearance.
pd.unique(data['Main_Locality'])

array(['pal', 'palanpur', 'vesu', 'adajan', 'bhatar', 'varachha', 'udhna',
       'katargam'], dtype=object)