## Real estate-price predictor

In [1]:
import pandas as pd
import numpy as np

In [2]:
housing=pd.read_csv("data.csv")

In [3]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


import numpy as np
def split_train_test(data,test_ratio):
    np.random.seed(42)
    shuffled= np.random.permutation(len(data))
    test_size= int(len(data)*test_ratio)
    test_indices= shuffled[:test_size]
    train_indices= shuffled[test_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train_set: {len(train_set)}\nRows in test_set: {len(test_set)} \n")

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split.split(housing,housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [5]:
housing=strat_train_set.copy()

## Looking for corelations

In [6]:
corr_matrix= housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.679894
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [7]:
housing= strat_train_set.drop("MEDV",axis=1)
housing_labels=strat_train_set['MEDV'].copy()

## Creating a pipline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
my_pipeline=Pipeline([
    ('Imputer',SimpleImputer(strategy='median')),
    ('Std',StandardScaler()),
])
housing_tr=my_pipeline.fit_transform(housing)

In [9]:
housing_tr.shape

(404, 13)

## Creating a design model

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#model=LinearRegression()
model=RandomForestRegressor()
#model=DecisionTreeRegressor()
model.fit(housing_tr,housing_labels)

In [11]:
some_data=housing.iloc[:5]
some_labels=housing_labels[:5]
prepared_data=my_pipeline.transform(some_data)

In [12]:
model.predict(prepared_data)

array([22.351, 25.27 , 16.378, 23.396, 23.485])

In [13]:
list(some_labels)

[21.9, 24.5, 16.7, 23.1, 23.0]

## Evaluating the model

In [14]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(model,housing_tr,housing_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores=np.sqrt(-scores)
rmse_scores

array([2.75845151, 2.67801325, 4.40166405, 2.52655835, 3.39469419,
       2.62657429, 4.65833584, 3.33100799, 3.2302105 , 3.46852407])

In [15]:
def print_scores(scores):
    print("Scores: ",scores)
    print("Mean: ",scores.mean())
    print("Standard deviation: ",scores.std())

In [16]:
print_scores(rmse_scores)

Scores:  [2.75845151 2.67801325 4.40166405 2.52655835 3.39469419 2.62657429
 4.65833584 3.33100799 3.2302105  3.46852407]
Mean:  3.307403404320379
Standard deviation:  0.6952156288534955


## Saving the model

In [17]:
from joblib import dump,load
dump(model,'RealEstate.joblib')

['RealEstate.joblib']

## Testing the model

In [19]:
x_test=strat_test_set.drop("MEDV",axis=1)
y_test=strat_test_set['MEDV'].copy()
x_test_prepared=my_pipeline.transform(x_test)
final_predictions=model.predict(x_test_prepared)
final_mse=cross_val_score(model,x_test_prepared,y_test,scoring="neg_mean_squared_error",cv=10)
final_rmse=np.sqrt(-final_mse)
final_rmse

array([4.04190808, 4.23463115, 4.9226931 , 4.77102382, 2.73551432,
       5.33986217, 5.10918283, 4.41573449, 2.92697366, 4.58806369])

## Using the model

In [3]:
from joblib import load
import numpy as np
model=load('RealEstate.joblib')
features=np.array([[-5.43942006, 4.12628155, -1.6165014, -0.67288841, -1.42262747,
       -11.44443979304, -49.31238772,  7.61111401, -26.0016879 , -0.5778192 ,
       -0.97491834,  0.41164221, -66.86091034]])
model.predict(features)


array([25.386])