## Dragon Reral estate price predictor Project

In [None]:
import pandas as pd
housing=pd.read_csv("housingdata.csv") # getting the data
print(housing.head()) #checking top 5 rows of data

In [None]:
housing.info() #Exploring data types and null values

In [None]:
housing['CHAS'].value_counts() #Exploring categorical attributes and their counts

## Train test splitting of data

In [515]:
# 
#  # Function to split the data into training and testing sets
# #for learning purpises we are implementing our own function instead of using sklearn's train_test_split
# def split_train_test(data, test_ratio):
#     import numpy as np
#     np.random.seed(42)  # for reproducibility
#     shuffled_indices = np.random.permutation(len(data))
#     print(shuffled_indices)
#     test_set_size = int(len(data) * test_ratio)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]


In [516]:

# train_set, test_set=split_train_test(housing,0.2) #splitting the data into train and test sets with 20% data in test set
# print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")


## Using Sci-Kit Learn library for Train and test Split Data

In [None]:
import sklearn.model_selection as model_selection #importing sklearn model_selection for train test split

train_set, test_set=model_selection.train_test_split(housing,test_size=0.2, random_state=42)#splitting the data into train and test sets with 20% data in test set
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

In [None]:
train_set.describe()

In [519]:
#stratified sampling based on CHAS attribute, which is a categorical attribute 0,1# we do this to ensure that both train and test sets have similar distribution of CHAS attribute
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)#stratified splitting based on CHAS attribute
for train_index,test_index in split.split(housing,housing['CHAS']):
    strat_train_set=housing.loc[train_index]
    strat_test_set=housing.loc[test_index]
#this will ensure that both train and test sets have similar distribution of CHAS attribute


In [None]:
strat_test_set.info()

In [None]:
strat_train_set.info()

In [None]:
strat_train_set.describe()

In [None]:
strat_train_set['CHAS'].value_counts()  #checking the distribution of CHAS attribute in stratified train set


In [None]:
strat_test_set['CHAS'].value_counts()# checking the distribution of CHAS attribute in stratified test set

##Missing Attribute Handling

In [525]:
#To handle missing values in the dataset, we may choose following options:
#1. Remove the rows with missing values
#2. Remove the entire attribute/column with missing values
#3. Fill the missing values with some value(mean, median, mode etc)

<!-- #To take care of the missing attributes we have 3 options
1. Remove missing data points, whole Row
2. Remove whole Attribute /Column
3. Set/ impute  values to  0 or meam or mediam, or other values  -->

In [526]:
#deletring the rows with missing values
a=strat_train_set.dropna(subset=["RAD"]) #option 1
print(a.shape)

(402, 14)


In [527]:
#deleting the entire attribute/column with missing values
b=strat_train_set.drop("RAD",axis=1) #option 2
print(b.shape)
#now there no RAD column in the dataset

(404, 13)


In [528]:
# # print(strat_train_set.shape)
# #feilling the missing values with median value of the attribute
# median=strat_train_set['RAD'].median() #calculating median of RAD attribute
# print(median)
# strat_train_set['RAD'].fillna(median,inplace=True) #option 3

In [None]:
from sklearn.impute import SimpleImputer    
imputer=SimpleImputer(strategy="median") #creating imputer object with median strategy
imputer.fit(strat_train_set) #fitting the imputer object to our housing data

In [None]:
imputer.statistics_ #checking the statistics calculated by imputer object


In [None]:
x=imputer.transform(strat_train_set) #transforming the data by replacing missing values with median values
strat_train_set_tr=pd.DataFrame(x,columns=strat_train_set.columns) #converting the numpy array returned by transform() to a pandas DataFrame
strat_train_set_tr.info() #checking if there are any missing values now

In [None]:
strat_train_set.describe() #Getting the statistical measures of the data

In [None]:
strat_train_set_tr.describe() #Getting the statistical measures of the test data

In [534]:
import matplotlib.pyplot as plt #importing matplotlib for plotting graphs

In [None]:
strat_train_set_tr.hist(bins=50, figsize=(20,15))   #plotting histograms for all attributes, to understand the distribution of data
plt.show()

## Looking for Correlations, Here we will check how different feature are varying with respect to one feature


In [536]:
correlation_matrix=strat_train_set_tr.corr() #calculating the correlation matrix to see how the attributes are correlated with each other
print(correlation_matrix['MEDV'].sort_values(ascending=False)) #checking the correlation of all attributes with MEDV attribute

MEDV       1.000000
RM         0.679894
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374004
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64


In [None]:
scatter_matrix=pd.plotting.scatter_matrix(strat_train_set_tr,figsize=(20,15)) #plotting scatter matrix to see the relationships between attributes
plt.show()


In [None]:
attribute=['MEDV','RM','ZN','LSTAT','PTRATIO'] #selecting some attributes which have high correlation with MEDV attribute
scatter_matrix=pd.plotting.scatter_matrix(strat_train_set_tr[attribute],figsize=(20,15)) #plotting scatter matrix to see the relationships between selected attributes
plt.show()

In [None]:
strat_train_set_tr.plot(kind="scatter",x="RM",y="MEDV",alpha=0.8) #plotting scatter plot between RM and MEDV attributes#alpha is used to set the transparency darkness of points
plt.show()

#such plots help us to understand the relationships between different attributes and how they affect the target variable MEDV (Median value of owner-occupied homes in $1000's).
#also it helps us to identify any outliers or patterns in the data that may be useful for building a predictive model for house prices.


## Tryout Attribute combinations

In [None]:
# Creating new attributes to see if they have better correlation with MEDV attribute
strat_train_set_tr['TAXRM']=strat_train_set_tr['TAX']/strat_train_set_tr['RM'] #creating new attribute TAXRM which is ratio of TAX and RM attributes
strat_train_set_tr['PTRATIORM']=strat_train_set_tr['PTRATIO']/strat_train_set_tr['RM'] #creating new attribute PTRATIORM which is ratio of PTRATIO and RM attributes
correlation_matrix=strat_train_set_tr.corr() #calculating the correlation matrix again to see how the new attributes are correlated with MEDV attribute
print(correlation_matrix['MEDV'].sort_values(ascending=False)) #checking the correlation of all attributes with MEDV attribute again

In [None]:
strat_train_set_tr.plot(kind="scatter",x="TAXRM",y="MEDV",alpha=0.8) #plotting scatter plot between TAXRM and MEDV attributes shows Tax per room vs Median value of owner-occupied homes
strat_train_set_tr.plot(kind="scatter",x="PTRATIORM",y="MEDV",alpha=0.8) #plotting scatter plot between PTRATIORM and MEDV attributes shows Pupil-Teacher ratio per room vs Median value of owner-occupied homes
plt.show()

In [542]:
strat_train_set_tr.shape


(404, 16)

In [543]:
strat_train_set.shape

(404, 14)

## Sci-Kit Learn
Primarily 3 types objects
1. Estimators (Data prep):  estimate parameters based on dataset, example- Imputer, Feature scaling, and calculate internal parameters
2. Transformers (Data fitting to Model): takes input and returns output based in the learnings from fit(). It has convenience function fit_transform()
3. Predictors (Prediction):  LInerarRgression model- has fit() and predict() as comon finctions. It also has score() function which evaluate the predictions.

## Feature scaling
Model performes better when attributes/features are on almost same numeric ranges, so we use suitable scaling mentods to have all feature values in similar numeric ranges
Primarily Two types of feature scaling methods:
1. Min- Max scaling (Normalization):        (Value- Min)/(Max-Min) : SkLearn has a class MinMaxScaler for this.
2. Standardization: (Value- Mean)/Std       : SkLearn has a class Standardscaler for this, it makes variance withing range 0-1

## Pipeline for prepareing data

In [None]:
#pipeline -- Series of transformations applied to the data.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler   
my_pipeline=Pipeline([
    ('imputer',SimpleImputer(strategy="median")), #filling missing values with median
    # can add more transformations here as needed
    ('std_scaler',StandardScaler()), #standardizing the data
])  
#process for preparing the data for model training

In [545]:
final_train_data=strat_train_set.drop("MEDV",axis=1) #separating the target attribute MEDV from the training data
final_train_labels=strat_train_set['MEDV'].copy() #copying the target attribute MEDV to a separate variable

In [546]:
housing_num_tr=my_pipeline.fit_transform(final_train_data) #applying the pipeline to the training data excluding the target attribute MEDV
print(housing_num_tr.shape)

(404, 13)


In [547]:
strat_test_set.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
342,0.02498,0.0,1.89,0,0.518,6.54,59.7,6.2669,1.0,422,15.9,389.96,8.65,16.5
379,17.8667,0.0,18.1,0,0.671,6.223,100.0,1.3861,24.0,666,20.2,393.74,21.78,10.2
223,0.6147,0.0,6.2,0,0.507,6.618,80.8,3.2721,8.0,307,17.4,396.9,7.6,30.1
219,0.11425,0.0,13.89,1,0.55,6.373,92.4,3.3633,5.0,276,16.4,393.74,10.5,23.0
48,0.25387,0.0,6.91,0,0.448,5.399,95.3,5.87,3.0,233,17.9,396.9,30.81,14.4


In [None]:
housing_num_tr #transformed training data after applying the pipeline

array([[-0.43942006,  3.12628155, -1.12165014, ..., -0.97491834,
         0.41164221, -0.86091034],
       [-0.44352175,  3.12628155, -1.35893781, ..., -0.69277865,
         0.39131918, -0.94116739],
       [ 0.15682292, -0.4898311 ,  0.98336806, ...,  0.81196637,
         0.44624347,  0.81480158],
       ...,
       [-0.43525657, -0.4898311 , -1.23083158, ..., -0.22254583,
         0.41831233, -1.27603303],
       [ 0.14210728, -0.4898311 ,  0.98336806, ...,  0.81196637,
        -3.15239177,  0.73869575],
       [-0.43974024, -0.4898311 ,  0.37049623, ..., -0.97491834,
         0.41070422,  0.09940681]], shape=(404, 13))

: 

## Selecting a desired model for Hosing Price Prediction

In [549]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor      
#model=LinearRegression() #creating linear regression model
#model=DecisionTreeRegressor() #creating decision tree regressor model  
model=RandomForestRegressor() #creating random forest regressor model
model.fit(housing_num_tr,final_train_labels) #training the model on the training data or fitting the model

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


## Initial validation of model with some train data comparing prediction vs actual

In [550]:
some_data=final_train_data.iloc[:5] #taking first 5 rows of training data for testing the model
some_labels=final_train_labels.iloc[:5] #taking first 5 labels of training data for testing the model

In [551]:
prepared_data=my_pipeline.transform(some_data) #preparing the data using the pipeline

In [552]:
model.predict(prepared_data) #predicting the labels for the prepared data

array([22.472, 25.728, 16.473, 23.415, 23.407])

In [553]:
list(some_labels)

[21.9, 24.5, 16.7, 23.1, 23.0]

In [554]:
from sklearn.metrics import mean_squared_error  
import numpy as np
housing_predictions=model.predict(housing_num_tr) #predicting the labels for the training data
mse=mean_squared_error(final_train_labels,housing_predictions) #calculating mean squared error between actual and predicted labels
rmse=np.sqrt(mse) #calculating root mean squared error
print(mse)
print(rmse)

1.4926562945544528
1.2217431377153107


## Using better evaluation technique - Cross Validation

In [555]:
# devide train data into 10 folds and validate the model on each fold and get the average rmse
from sklearn.model_selection import cross_val_score
scores=cross_val_score(model,housing_num_tr,final_train_labels,scoring="neg_root_mean_squared_error",cv=10) #cross validating the model using 10 folds
rmse_scores=np.sqrt(-scores) #calculating root mean squared error for each fold

In [556]:
rmse_scores

array([1.68366185, 1.61114882, 2.11068378, 1.64606897, 1.82932648,
       1.63065133, 2.17675369, 1.82073261, 1.82180008, 1.77662216])

In [557]:
def print_scores(scores):
    print("Scores:",scores)
    print("Mean:",scores.mean())
    print("Standard Deviation:",scores.std())

In [558]:
print_scores(rmse_scores)

Scores: [1.68366185 1.61114882 2.11068378 1.64606897 1.82932648 1.63065133
 2.17675369 1.82073261 1.82180008 1.77662216]
Mean: 1.8107449763348797
Standard Deviation: 0.18471543777172544


##  Saving the model

In [560]:
import joblib #load model
joblib.dump(model, 'HousePricePredict.joblib')

['HousePricePredict.joblib']

## Testing the model on Test Data

In [570]:
X_test=strat_test_set.drop("MEDV",axis=1) #separating the target attribute MEDV from the test data  
y_test=strat_test_set['MEDV'].copy() #copying the target attribute MEDV to a separate variable from the test data
X_test_prepared=my_pipeline.transform(X_test) #preparing the test data using the pipeline
final_predictions=model.predict(X_test_prepared) #predicting the labels for the test data
final_mse=mean_squared_error(y_test,final_predictions) #calculating mean squared error between actual and predicted labels for test data
final_rmse=np.sqrt(final_mse) #calculating root mean squared error for test data    
print(final_rmse)
print(final_predictions, y_test.values)

2.9507805530993
[24.638 11.361 25.789 22.717 18.672 15.212 19.961 14.418 31.679 41.582
 19.978 12.022 23.753 28.209 19.54  10.982 31.615 14.717 23.651 18.638
 19.77  17.779 16.92  21.976 18.469 30.785 16.281 32.704  8.892 33.467
 24.092 21.196 23.269 10.763 20.926 11.143 42.97  24.541 23.523 41.622
 23.743 29.648 20.661 20.869 19.253 33.514 44.69  20.08  20.388 21.997
 20.953 14.391 21.224 15.131 24.858 33.152 42.292 28.525 19.47  20.978
 47.482  9.883 18.72  25.069 15.145 32.769 19.509 17.928 18.551 33.787
 26.649 22.94  21.221 22.458 34.981 12.839 15.866 20.11  21.069 21.669
 22.283 21.437 14.494 22.93  20.874 21.101 13.882 21.134 21.696 23.107
 18.618 26.928  7.426 26.354 19.312 29.877 19.92  31.454 14.613 26.545
 21.031 20.399] [16.5 10.2 30.1 23.  14.4 15.6 19.4 14.1 30.3 35.2 23.1 13.8 25.  27.9
 19.5 12.3 32.2 13.5 23.8 21.7 19.2 19.5 10.4 23.2 18.6 28.5 15.2 32.
  7.2 34.6 20.1 20.6 23.6 13.1 23.8 12.7 43.1 24.7 22.2 44.  28.1 31.
 21.7 23.4 19.5 33.1 41.7 18.7 19.9 20.6 21.2 1

## Using model for Price prediction

In [577]:
prepared_data [0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24141041, -1.31238772,  2.61111401, -1.00262815, -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

In [582]:
#simple test data to test the model

features=np.array([[-1.02731,0.1,7.07,-1.0,0.469,6.421,78.9,6.9671,2.0,242.0,19.8,396.90,1.14]])

In [584]:
import numpy as np
from joblib import dump, load
model=load('HousePricePredict.joblib')

#features=np.array([[-1.02731,0.1,7.07,-1.0,0.469,6.421,78.9,6.9671,2.0,242.0,19.8,396.90,1.14]])
model.predict(features)

array([23.358])