In [1]:
import pickle

import joblib
import numpy as np
import pandas as pd
import sklearn.externals
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw listings and discard, in one step, every column that is not
# used further down: the unnamed first column (presumably a saved index —
# confirm against the CSV) and the descriptive fields.
columns_to_discard = [
    "Unnamed: 0",
    "Location",
    "Property type",
    "Property subtype",
    "Type of sale",
    "Kitchen",
    "Furnished",
    "Open fireplace",
    "Terrace",
    "Terrace orientation",
    "Garden",
    "Garden orientation",
    "Pool",
    "Condition",
]
df = pd.read_csv("./data/houses.csv").drop(columns=columns_to_discard)

df.head(10)

Unnamed: 0,Price,Number of bedrooms,Living area,Surface area land,Number of facades
0,296607.0,3.0,130.0,239.0,3.0
1,560000.0,4.0,,626.0,3.0
2,299000.0,5.0,200.0,1150.0,
3,300000.0,0.0,270.0,498.0,3.0
4,195000.0,3.0,135.0,165.0,2.0
5,175000.0,3.0,128.0,260.0,2.0
6,415000.0,3.0,207.0,534.0,3.0
7,405000.0,3.0,207.0,382.0,2.0
8,415000.0,3.0,207.0,532.0,3.0
9,415000.0,3.0,194.0,457.0,2.0


In [3]:
# Turn string cells matching the pattern 'Unknown' into missing values so the
# null counts and dropna below can see them. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
df = df.replace('Unknown', np.nan, regex=True)

In [4]:
# Preview the first rows after the 'Unknown' replacement.
df.head()

Unnamed: 0,Price,Number of bedrooms,Living area,Surface area land,Number of facades
0,296607.0,3.0,130.0,239.0,3.0
1,560000.0,4.0,,626.0,3.0
2,299000.0,5.0,200.0,1150.0,
3,300000.0,0.0,270.0,498.0,3.0
4,195000.0,3.0,135.0,165.0,2.0


In [5]:
# (rows, columns) before any rows with missing values are removed.
df.shape

(10082, 5)

In [6]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Price                  124
Number of bedrooms     112
Living area           2332
Surface area land      112
Number of facades     2307
dtype: int64


In [7]:
# Column labels that remain after the drop in the loading cell.
df.columns

Index(['Price', 'Number of bedrooms', 'Living area', 'Surface area land',
       'Number of facades'],
      dtype='object')

In [8]:
# Keep only the rows that have a value in every modelling column.
# `thresh` is intentionally not passed: pandas raises a TypeError when both
# `how` and `thresh` are supplied, even if thresh is None. `inplace=False`
# is the default and is omitted as well.
required_columns = [
    "Price",
    "Number of bedrooms",
    "Living area",
    "Number of facades",
    "Surface area land",
]
df = df.dropna(axis=0, how='any', subset=required_columns)

df.shape

(6323, 5)

In [9]:
# Split the cleaned frame into the predictor matrix X and the target vector y.

featurs = [
    "Number of bedrooms",
    "Living area",
    "Number of facades",
    "Surface area land",
]
X = df.loc[:, featurs]
y = df.loc[:, 'Price']

In [10]:
# Show the predictor and target shapes, one per line.
print(X.shape, y.shape, sep="\n")

(6323, 4)
(6323,)


In [12]:
# Pairwise correlation between the columns of the cleaned frame.
# NOTE(review): the recorded output lists only four columns and omits
# 'Number of facades' — presumably that column was not of numeric dtype when
# this cell ran; confirm its dtype before relying on this table.
correlation= df.corr()
correlation

Unnamed: 0,Price,Number of bedrooms,Living area,Surface area land
Price,1.0,0.392449,0.570532,0.158391
Number of bedrooms,0.392449,1.0,0.412824,0.072894
Living area,0.570532,0.412824,1.0,0.134033
Surface area land,0.158391,0.072894,0.134033,1.0


In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible. `train_test_split` is imported with the notebook's other
# imports in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Report the predictor/target shapes of each split.
for split_name, split_X, split_y in (
    ('Train', X_train, y_train),
    ('Test', X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)

Train (5058, 4) (5058,)
Test (1265, 4) (1265,)


In [15]:
# Create a linear regression model; `LinearRegression` is imported with the
# notebook's other imports in the first cell.
regressor = LinearRegression()

# Fit on the training split. The call is left as the cell's last expression,
# so the fitted estimator's repr is displayed as the cell output.
regressor.fit(X_train, y_train)

LinearRegression()

In [16]:
# Model score on the training split (for scikit-learn regressors, score() reports R^2).
regressor.score(X_train, y_train)

0.3907144453905257

In [17]:
# Model score on the held-out test split (for scikit-learn regressors, score() reports R^2).
regressor.score(X_test, y_test)

0.2956051548818358

In [18]:
# Predict prices for the held-out test rows and display the resulting array.

price_predict= regressor.predict(X_test)
price_predict

array([372911.35571806, 468808.02692676, 679084.89082624, ...,
       345672.63012018, 476417.10250819, 391085.0567743 ])

In [19]:
# Save the fitted model as house_price_prediction.pkl in the current working
# directory. The `with` block closes the file handle as soon as the dump
# finishes instead of leaving it open for the rest of the session.
with open('house_price_prediction.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)