## House Price Predictor

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         501 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.4 KB


In [5]:
data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284341,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.705587,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.884,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.208,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.625,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


### Missing attributes

In [6]:
data.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         5
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [7]:
X = data.drop('MEDV',axis=1)
y = data.MEDV

In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_imp = imputer.fit_transform(X)

### Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)

### Creating a Pipeline

In [10]:
from sklearn.pipeline import Pipeline

In [11]:
my_pipeline = Pipeline([('imputer',SimpleImputer(strategy='median')),
                        ('std_scaler',StandardScaler())])


In [12]:
X = my_pipeline.fit_transform(X)

### Train-Test Splitting

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Selecting a desired model

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB

In [15]:
LR = LinearRegression()
DTR = DecisionTreeRegressor()
RFR = RandomForestRegressor()
SVM = SVR()

In [16]:
models = [LR,DTR,RFR,SVM]

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
for model in models:
    print('Model name:', model)
    model.fit(X_train, y_train)
    print('Score:', model.score(X_test, y_test)*100)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print('Root mean squared error:', np.sqrt(mse))
    print()

Model name: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
Score: 66.65030487253156
Root mean squared error: 4.945360766355321

Model name: DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
Score: 87.27026525002441
Root mean squared error: 3.055355311323279

Model name: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Score: 88.08196749029064
Root mean



### Using better evaluation technique - Cross Validation

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
for model in models:
    scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    print("Scores:", rmse_scores)
    print("Mean: ", rmse_scores.mean())
    print("Standard deviation: ", rmse_scores.std())
    print()

Scores: [ 3.06416301  3.80125803  3.7521992   5.94248753  5.65601713  4.46518092
  3.14148523 12.99357193  5.76880834  3.31553668]
Mean:  5.1900708016212125
Standard deviation:  2.8065033386676856

Scores: [4.00881382 3.47495239 3.67014986 6.72227786 4.14795012 6.69480337
 2.98221394 9.81636389 7.99132029 6.14423307]
Mean:  5.565307860981443
Standard deviation:  2.1441004742946865

Scores: [3.22007794 2.32914105 2.83364208 5.18574309 3.43881021 5.53102054
 2.8000225  9.90523074 5.34222351 3.93756143]
Mean:  4.452347309542196
Standard deviation:  2.1188761430820295





Scores: [ 2.96528092  2.74891954  2.37995901  9.52650064  5.60795941  8.2098187
  2.51915652 10.79735561  6.27747431  3.45229877]
Mean:  5.448472342260309
Standard deviation:  2.983517725738844





## Saving a model

##### Random Forest Regressor Shows the highest scoring predictions with minimal error

In [21]:
model = RandomForestRegressor()
model.fit(X, y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [22]:
from joblib import dump
dump(model, 'Model.joblib')

['Model.joblib']

## Using the model

In [23]:
from joblib import load
model = load('Model.joblib')

In [24]:
features = X[:5]

In [25]:
model.predict(features)

array([24.51, 21.97, 34.96, 33.95, 35.14])

In [26]:
y[:5]

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64