In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
housing = pd.read_csv('data.csv')

In [3]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7


In [4]:
housing.shape

(506, 14)

In [5]:
housing.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [6]:
housing.CHAS.value_counts()

0    471
1     35
Name: CHAS, dtype: int64

In [7]:
# housing.hist(figsize=(20,15))

# StratifiedShuffleSplit 

StratifiedShuffleSplit method returns training and testing sets that have same proportions of class labels as the input dataset

In [8]:
split = StratifiedShuffleSplit(test_size=0.2)
for train_index, test_index in split.split(housing, housing.CHAS):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [9]:
print(strat_test_set.CHAS.value_counts())
print(strat_train_set.CHAS.value_counts())

0    95
1     7
Name: CHAS, dtype: int64
0    376
1     28
Name: CHAS, dtype: int64


# Corelations in data

corr() returns the matrix that shows the relationship between the attributes of the data

In [10]:
corr_matrix = strat_train_set.corr()
# corr_matrix
corr_matrix.MEDV.sort_values()

LSTAT     -0.735415
PTRATIO   -0.514007
INDUS     -0.486211
TAX       -0.471979
NOX       -0.410658
RAD       -0.380789
CRIM      -0.368532
AGE       -0.363453
CHAS       0.161472
DIS        0.223116
B          0.325838
ZN         0.350817
RM         0.710751
MEDV       1.000000
Name: MEDV, dtype: float64

In [11]:
from sklearn.impute import SimpleImputer # imputer is used to handle missing values
imputer = SimpleImputer(strategy='median') # returns median values for all the attributes
# imputer.fit(strat_train_set)
strat_train_set_transformed =imputer.fit_transform(strat_train_set) #fits and also transforms the data at the same time
imputer.statistics_ 
# strat_train_set_transformed

array([2.4751e-01, 0.0000e+00, 9.6900e+00, 0.0000e+00, 5.3800e-01,
       6.1890e+00, 7.7700e+01, 3.1423e+00, 5.0000e+00, 3.3000e+02,
       1.9000e+01, 3.9182e+02, 1.1170e+01, 2.1050e+01])

# selecting a model and training the model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
model = RandomForestRegressor()

In [13]:
X = strat_train_set_transformed[:, 0:13]
X.shape

(404, 13)

In [14]:
y = strat_train_set_transformed[:, -1]
y.shape

(404,)

In [15]:
model.fit(X,y)

RandomForestRegressor()

In [16]:
strat_test_set_tran = imputer.fit_transform(strat_test_set)

In [17]:
test_x = strat_test_set_tran[:, 0:13]
test_y = strat_test_set_tran[:, -1]

In [18]:
predictions = model.predict(test_x)
predictions

array([15.349,  7.677, 20.467, 27.08 , 23.142,  8.977, 14.464, 20.016,
       24.012, 17.3  , 30.894, 10.338, 20.796, 19.162, 24.167, 25.472,
       15.05 , 13.277, 14.171, 20.   , 13.   , 25.536, 19.6  , 33.603,
       20.825, 11.078, 10.107, 26.328, 22.111, 26.475, 10.197,  9.356,
       23.48 , 14.793, 31.842, 22.57 , 34.036, 16.323, 15.397, 14.369,
       27.863, 17.751, 24.621, 11.469, 14.255, 11.985, 21.295, 16.893,
       20.769, 32.241, 22.81 , 31.38 , 28.388, 21.07 , 22.049, 33.327,
       14.603, 21.083, 26.833, 22.705, 47.112, 27.706, 30.597, 22.818,
       21.172, 18.06 , 20.416, 33.244, 38.35 , 14.271, 23.44 , 20.153,
       17.982, 14.59 , 20.211, 34.377, 19.329, 20.65 , 28.452, 26.554,
       18.213, 21.953, 23.994, 21.185, 33.32 , 12.03 , 24.117, 11.937,
       39.422, 22.618, 19.713, 17.761, 20.269, 26.045, 21.926, 29.733,
       28.395, 13.855, 20.673, 13.315,  9.946, 12.001])

# Evaluating the model

In [19]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test_y, predictions)
mse

10.295950999999981

In [20]:
from sklearn.model_selection import cross_val_score
nmse = cross_val_score(model, test_x, test_y, scoring='neg_mean_squared_error')
root_nmse = np.sqrt(-nmse)

In [21]:
root_nmse.mean()

4.603463858122303

# Saving the model

In [22]:
from joblib import dump, load
dump(model, 'boston.joblib')

['boston.joblib']