In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR


df=pd.read_csv('House_Rent_Dataset.csv')
df = df[['BHK', 'Size', 'Bathroom', 'City', 'Furnishing Status', 'Tenant Preferred', 'Rent','Floor','Area Type','Point of Contact']]

# Handle missing values if any
df = df.dropna()

print(df.dtypes)

BHK                   int64
Size                  int64
Bathroom              int64
City                 object
Furnishing Status    object
Tenant Preferred     object
Rent                  int64
Floor                object
Area Type            object
Point of Contact     object
dtype: object


In [2]:
 
df.rename(columns = {'Furnishing Status':'Furnishing_Status', 'Tenant Preferred':'Tenant_Preferred',
                              'Area Type':'Area_Type'}, inplace = True)

In [3]:
df.head()

Unnamed: 0,BHK,Size,Bathroom,City,Furnishing_Status,Tenant_Preferred,Rent,Floor,Area_Type,Point of Contact
0,2,1100,2,Kolkata,Unfurnished,Bachelors/Family,10000,Ground out of 2,Super Area,Contact Owner
1,2,800,1,Kolkata,Semi-Furnished,Bachelors/Family,20000,1 out of 3,Super Area,Contact Owner
2,2,1000,1,Kolkata,Semi-Furnished,Bachelors/Family,17000,1 out of 3,Super Area,Contact Owner
3,2,800,1,Kolkata,Unfurnished,Bachelors/Family,10000,1 out of 2,Super Area,Contact Owner
4,2,850,1,Kolkata,Unfurnished,Bachelors,7500,1 out of 2,Carpet Area,Contact Owner


In [5]:
print(df['Area_Type'].unique())
print(df['Furnishing_Status'].unique())
print(df['Tenant_Preferred'].unique())
print(df['Floor'].unique())
print(df['Point of Contact'].unique())

['Super Area' 'Carpet Area' 'Built Area']
['Unfurnished' 'Semi-Furnished' 'Furnished']
['Bachelors/Family' 'Bachelors' 'Family']
['Ground out of 2' '1 out of 3' '1 out of 2' 'Ground out of 1'
 'Ground out of 4' '1 out of 4' '1 out of 1' 'Ground out of 3'
 '2 out of 3' '4 out of 5' '2 out of 2' '2 out of 5' '4 out of 14'
 '3 out of 3' '5 out of 5' '4 out of 4' '7 out of 8' '2 out of 4'
 '3 out of 4' '1 out of 5' '8 out of 5' 'Ground out of 6' '2 out of 1'
 'Upper Basement out of 4' 'Ground out of 5' '3 out of 5' '11 out of 19'
 '5 out of 10' '11 out of 14' 'Lower Basement out of 2' '2 out of 7'
 '4 out of 10' '7 out of 10' '2 out of 13' '6 out of 7' '4 out of 7'
 '14 out of 14' '43 out of 78' '2 out of 8' '13 out of 18' '5 out of 12'
 '18 out of 24' '3 out of 7' '17 out of 31' '11 out of 21' '7 out of 19'
 '14 out of 23' '9 out of 20' 'Upper Basement out of 9' '19 out of 24'
 '3 out of 21' '1 out of 22' '8 out of 8' '6 out of 12' '4 out of 58'
 'Upper Basement out of 16' '60 out of 66' 

In [6]:
df=df.drop('Floor', axis=1)

In [7]:
df=pd.get_dummies(df,drop_first=True)
df.head()

Unnamed: 0,BHK,Size,Bathroom,Rent,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Furnishing_Status_Semi-Furnished,Furnishing_Status_Unfurnished,Tenant_Preferred_Bachelors/Family,Tenant_Preferred_Family,Area_Type_Carpet Area,Area_Type_Super Area,Point of Contact_Contact Builder,Point of Contact_Contact Owner
0,2,1100,2,10000,0,0,0,1,0,0,1,1,0,0,1,0,1
1,2,800,1,20000,0,0,0,1,0,1,0,1,0,0,1,0,1
2,2,1000,1,17000,0,0,0,1,0,1,0,1,0,0,1,0,1
3,2,800,1,10000,0,0,0,1,0,0,1,1,0,0,1,0,1
4,2,850,1,7500,0,0,0,1,0,0,1,0,0,1,0,0,1


In [12]:
y=df['Rent']
x=df.drop(['Rent'], axis=1)


In [13]:
scaler = StandardScaler()
# Fit the inputs (calculate the mean and standard deviation feature-wise)
scaler.fit(x)
x = scaler.transform(x)

In [15]:
x

array([[-0.10077301,  0.2089605 ,  0.03859399, ...,  0.9696962 ,
        -0.01451717,  0.68974363],
       [-0.10077301, -0.26412451, -1.09206691, ...,  0.9696962 ,
        -0.01451717,  0.68974363],
       [-0.10077301,  0.0512655 , -1.09206691, ...,  0.9696962 ,
        -0.01451717,  0.68974363],
       ...,
       [ 1.10090711,  1.233978  ,  1.16925489, ..., -1.03125082,
        -0.01451717, -1.44981405],
       [ 1.10090711,  0.8397405 ,  0.03859399, ..., -1.03125082,
        -0.01451717, -1.44981405],
       [-0.10077301,  0.0512655 ,  0.03859399, ..., -1.03125082,
        -0.01451717,  0.68974363]])

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [18]:
enr=ElasticNet(random_state=42)
gbr=GradientBoostingRegressor(random_state=42)

svr=SVR()
enr_param={'alpha': [1.0,0],
           'l1_ratio': [0.5, 1],
           'fit_intercept': [True,False]}
gbr_param={'learning_rate': [0.1,0.01,0.001],
           'max_depth': [2,3,4],
           'n_estimators':[500,1000,2000]}
svr_param={'kernel' : ['linear', 'poly', 'rbf'],
           'C':[0.01, 0.1, 0.5, 1, 2, 5, 10],
           }

In [19]:
enr_grid = GridSearchCV(estimator=enr, 
                        param_grid=enr_param,
                        scoring='r2',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True,
                        verbose=True)
gbr_grid = GridSearchCV(estimator=gbr, 
                        param_grid=gbr_param,
                        scoring='r2',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True,
                        verbose=True)
svr_grid = GridSearchCV(estimator=svr, 
                        param_grid=svr_param,
                        scoring='r2',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True,
                        verbose=True)

In [20]:
enr_grid_fit = enr_grid.fit(x_train, y_train)
gbr_grid_fit = gbr_grid.fit(x_train, y_train)
svr_grid_fit = svr_grid.fit(x_train, y_train)
cv_results_enr = pd.DataFrame.from_dict(enr_grid_fit.cv_results_)
cv_results_gbr = pd.DataFrame.from_dict(gbr_grid_fit.cv_results_)
cv_results_svr = pd.DataFrame.from_dict(svr_grid_fit.cv_results_)
print(cv_results_enr)
print(cv_results_gbr)
print(cv_results_svr)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


  model = cd_fast.enet_coordinate_descent(


Fitting 10 folds for each of 27 candidates, totalling 270 fits
Fitting 10 folds for each of 21 candidates, totalling 210 fits
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_alpha  \
0       0.054155      0.044062         0.004959        0.007031         1.0   
1       0.132378      0.034724         0.007716        0.010412         1.0   
2       0.000483      0.001450         0.000000        0.000000         1.0   
3       0.104756      0.022653         0.000100        0.000299         1.0   
4       0.125713      0.034243         0.000099        0.000296           0   
5       0.118233      0.040905         0.000197        0.000591           0   
6       0.107437      0.035386         0.001301        0.002712           0   
7       0.096729      0.020845         0.000000        0.000000           0   

  param_fit_intercept param_l1_ratio  \
0                True            0.5   
1                True              1   
2               False            0.5   
3 

In [21]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",enr_grid_fit.best_estimator_)
print("\n The best score across ALL searched params:\n",enr_grid_fit.best_score_)
print("\n The best parameters across ALL searched params:\n",enr_grid_fit.best_params_)
print('---------------')
print("\n The best estimator across ALL searched params:\n",gbr_grid_fit.best_estimator_)
print("\n The best score across ALL searched params:\n",gbr_grid_fit.best_score_)
print("\n The best parameters across ALL searched params:\n",gbr_grid_fit.best_params_)
print('---------------------------')
print("\n The best estimator across ALL searched params:\n",svr_grid_fit.best_estimator_)
print("\n The best score across ALL searched params:\n",svr_grid_fit.best_score_)
print("\n The best parameters across ALL searched params:\n",svr_grid_fit.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 ElasticNet(l1_ratio=1, random_state=42)

 The best score across ALL searched params:
 0.45500491035202756

 The best parameters across ALL searched params:
 {'alpha': 1.0, 'fit_intercept': True, 'l1_ratio': 1}
---------------

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.01, max_depth=2, n_estimators=500,
                          random_state=42)

 The best score across ALL searched params:
 0.5777030586073183

 The best parameters across ALL searched params:
 {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 500}
---------------------------

 The best estimator across ALL searched params:
 SVR(C=10, kernel='linear')

 The best score across ALL searched params:
 0.24303197186095898

 The best parameters across ALL searched params:
 {'C': 10, 'kernel': 'linear'}


In [23]:
gbr_best=GradientBoostingRegressor(learning_rate=0.01, max_depth =2, n_estimators= 500)

In [24]:
model=gbr_best.fit(x_train,y_train)

In [25]:
model.score

<bound method RegressorMixin.score of GradientBoostingRegressor(learning_rate=0.01, max_depth=2, n_estimators=500)>

In [26]:
y_predict=model.predict(x_test)

In [28]:
from sklearn.metrics import r2_score
print(r2_score(y_test, y_predict))

0.6558211708130982
