In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as snsa
from sklearn.model_selection import train_test_split 

In [2]:
# Load the housing dataset; the path is relative, so "Housing.csv" must sit in
# the notebook's working directory.
housing=pd.read_csv("Housing.csv")

In [3]:
# Preview the first rows to inspect the raw columns before any encoding.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Encode the binary yes/no columns as integers (yes -> 1, no -> 0).
yes_no_codes = {'yes': 1, 'no': 0}
for binary_column in ('mainroad', 'guestroom', 'basement',
                      'hotwaterheating', 'airconditioning', 'prefarea'):
    housing[binary_column] = housing[binary_column].map(yes_no_codes)

In [5]:
# One-hot encode furnishingstatus. drop_first=True drops the first category
# level, so the remaining dummy columns are read relative to that baseline.
# dtype=int keeps the dummies as 0/1 integers: pandas >= 2.0 defaults to bool,
# and the min-max normalize() step later in this notebook subtracts the column
# minimum, which numpy does not support for boolean arrays.
status = pd.get_dummies(housing['furnishingstatus'], drop_first=True, dtype=int)

In [6]:
# Join the dummy columns and the original frame column-wise (axis=1); the
# dummies end up as the leftmost columns.
housing=pd.concat([status , housing] , axis=1)

In [9]:
# Remove the original text column; only its dummy encoding is kept.
housing = housing.drop(columns=['furnishingstatus'])

In [10]:
# Preview the frame again to confirm every column is now numeric.
housing.head()

Unnamed: 0,semi-furnished,unfurnished,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea
0,0,0,13300000,7420,4,2,3,1,0,0,0,1,2,1
1,0,0,12250000,8960,4,4,4,1,0,0,0,1,3,0
2,1,0,12250000,9960,3,2,2,1,0,1,0,0,2,1
3,0,0,12215000,7500,4,2,2,1,0,1,0,1,3,1
4,0,0,11410000,7420,4,1,2,1,1,1,0,1,2,0


In [11]:
# Engineered feature: area available per bedroom.
area_per_bedroom = housing['area'].div(housing['bedrooms'])
housing['areaperbedroom'] = area_per_bedroom


In [12]:
# Engineered feature: number of bathrooms per bedroom.
bathrooms_per_bedroom = housing['bathrooms'].div(housing['bedrooms'])
housing['bbratio'] = bathrooms_per_bedroom


In [13]:
#using min-max-scaler to normalize.
def normalize(x):
    """Min-max scale a numeric column to the [0, 1] range.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric (or boolean) values to rescale.

    Returns
    -------
    Same container type as ``x``, holding floats ``(x - min) / (max - min)``.
    A constant column (``max == min``) maps to all zeros instead of NaN.
    For a pandas Series, missing values are skipped when finding the bounds
    and remain NaN in the output.
    """
    # Cast first: boolean arrays do not support subtraction, and this keeps
    # the result dtype float regardless of the input dtype.
    values = x.astype(float)
    # Take both bounds from the container itself. The built-in min()/max()
    # iterate element by element and only yield NaN when a NaN happens to come
    # first, which made the result depend on row order.
    lowest = values.min()
    highest = values.max()
    spread = highest - lowest
    if spread == 0:
        # Avoid 0/0: every value equals the minimum, so scale them all to 0.
        return values - lowest
    return (values - lowest) / spread

# Apply normalize() column by column to the whole frame.
# NOTE(review): this rescales every column, including the `price` target, and
# the min/max come from the full dataset before the train/test split further
# down, so test-set statistics leak into the scaling — confirm this is intended.
housing=housing.apply(normalize)

In [14]:
# Predictor columns: the original attributes, the furnishing dummies and the
# two engineered ratios.
feature_columns = [
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'parking', 'prefarea', 'semi-furnished',
    'unfurnished', 'areaperbedroom', 'bbratio',
]
X = housing[feature_columns]

# Response variable.
y = housing['price']

In [15]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [17]:
import statsmodels.api as sm
# Separate name for the statsmodels design matrix. This is an alias, not a
# copy; X_train itself is still used for the RFE step later on.
X_train_sm=X_train

In [18]:
# Add an explicit intercept column (`const`, all ones) to the design matrix.
X_train_sm=sm.add_constant(X_train_sm)

In [19]:
# Check that the `const` column was added alongside the scaled features.
X_train_sm.head()

Unnamed: 0,const,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
180,1.0,0.195876,0.6,0.333333,0.0,0.0,0.0,1.0,0.0,1.0,0.666667,0.0,1.0,0.0,0.119633,0.4
189,1.0,0.129897,0.2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.223348,0.4
93,1.0,0.381443,0.4,0.333333,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.32465,0.6
444,1.0,0.101031,0.4,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105966,0.2
81,1.0,0.161512,0.4,0.333333,0.333333,1.0,0.0,1.0,0.0,1.0,0.333333,0.0,1.0,0.0,0.153133,0.6


In [20]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


In [22]:
# Base estimator that RFE will refit while eliminating features.
lm=LinearRegression()
# (rows, columns) of the training matrix, to see how many features are candidates.
X_train.shape

(381, 15)

In [25]:
# Recursive feature elimination around the linear model, keeping 9 features.
rfe = RFE(estimator=lm, n_features_to_select=9)

In [26]:
# Fit the selector on the training split only, then show the boolean
# keep/drop mask followed by the ranking (1 = selected).
rfe = rfe.fit(X_train, y_train)
for rfe_result in (rfe.support_, rfe.ranking_):
    print(rfe_result)

[ True  True  True  True False False  True  True  True  True False False
 False  True False]
[1 1 1 1 2 6 1 1 1 1 3 7 4 1 5]


In [27]:
# Names of the columns RFE kept (boolean mask applied to the column index).
col=X_train.columns[rfe.support_]

In [28]:
# Display the selected feature names.
col

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'basement',
       'hotwaterheating', 'airconditioning', 'parking', 'areaperbedroom'],
      dtype='object')