In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import numpy as np                # mathematical calculations
import pandas as pd               # manipulation of raw data 
import matplotlib.pyplot as plt   # plotting graphs
%matplotlib inline 

In [3]:
path ='/gdrive/My Drive/Online_ML : Weekends Nov 2020/Assignments/Data/'

In [4]:
raw_data = pd.read_csv(path + 'Chennai_house_price_multivariate_train.csv')
raw_data.head()

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,...,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE
0,P03210,Karapakkam,1004,04-05-2011,131,1.0,1.0,3,AbNormal,Yes,...,AllPub,Paved,A,4.0,3.9,4.9,4.33,380000,144400,7600000
1,P09411,Anna Nagar,1986,19-12-2006,26,2.0,1.0,5,AbNormal,No,...,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,760122,304049,21717770
2,P01812,Adyar,909,04-02-2012,70,1.0,1.0,3,AbNormal,Yes,...,ELO,Gravel,RL,4.1,3.8,2.2,3.09,421094,92114,13159200
3,P05346,Velachery,1855,13-03-2010,14,3.0,2.0,5,Family,No,...,NoSewr,Paved,I,4.7,3.9,3.6,4.01,356321,77042,9630290
4,P06210,Karapakkam,1226,05-10-2009,84,1.0,1.0,3,AbNormal,Yes,...,AllPub,Gravel,C,3.0,2.5,4.1,3.29,237000,74063,7406250


In [5]:
#Check Null value of train data

raw_data.isnull().sum(axis=0)

PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
DIST_MAINROAD     0
N_BEDROOM         1
N_BATHROOM        5
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       48
REG_FEE           0
COMMIS            0
SALES_PRICE       0
dtype: int64

In [6]:
#Missing value in QS_Overall and N_Bed Room & N_Bathroom with the mean

raw_data = raw_data.fillna({'N_BEDROOM': raw_data['N_BEDROOM'].mean(), 'N_BATHROOM': raw_data['N_BATHROOM'].mean(), 'QS_OVERALL': raw_data['QS_OVERALL'].mean()})

In [7]:
raw_data.isnull().sum(axis=0)

PRT_ID           0
AREA             0
INT_SQFT         0
DATE_SALE        0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACIL       0
DATE_BUILD       0
BUILDTYPE        0
UTILITY_AVAIL    0
STREET           0
MZZONE           0
QS_ROOMS         0
QS_BATHROOM      0
QS_BEDROOM       0
QS_OVERALL       0
REG_FEE          0
COMMIS           0
SALES_PRICE      0
dtype: int64

In [8]:
# Split output and input variable

y = raw_data['SALES_PRICE']
y = np.array(y)
y = y.reshape(-1,1)
y.shape

(7109, 1)

In [9]:

x_raw_data = raw_data.drop(columns=['PRT_ID','DATE_SALE', 'SALES_PRICE', 'DATE_BUILD'])
x_raw_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,Karapakkam,1004,131,1.0,1.0,3,AbNormal,Yes,Commercial,AllPub,Paved,A,4.0,3.9,4.9,4.33,380000,144400
1,Anna Nagar,1986,26,2.0,1.0,5,AbNormal,No,Commercial,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,760122,304049
2,Adyar,909,70,1.0,1.0,3,AbNormal,Yes,Commercial,ELO,Gravel,RL,4.1,3.8,2.2,3.09,421094,92114
3,Velachery,1855,14,3.0,2.0,5,Family,No,Others,NoSewr,Paved,I,4.7,3.9,3.6,4.01,356321,77042
4,Karapakkam,1226,84,1.0,1.0,3,AbNormal,Yes,Others,AllPub,Gravel,C,3.0,2.5,4.1,3.29,237000,74063


In [10]:
x_raw_data.shape

(7109, 18)

In [11]:
x_raw_data.dtypes

AREA              object
INT_SQFT           int64
DIST_MAINROAD      int64
N_BEDROOM        float64
N_BATHROOM       float64
N_ROOM             int64
SALE_COND         object
PARK_FACIL        object
BUILDTYPE         object
UTILITY_AVAIL     object
STREET            object
MZZONE            object
QS_ROOMS         float64
QS_BATHROOM      float64
QS_BEDROOM       float64
QS_OVERALL       float64
REG_FEE            int64
COMMIS             int64
dtype: object

In [12]:
# Encode CATEGORICAL variable 
from sklearn.preprocessing import LabelEncoder

area_labelencoder = LabelEncoder()
x_raw_data['AREA'] = area_labelencoder.fit_transform(x_raw_data['AREA'])

Sale_labelencoder = LabelEncoder()
x_raw_data['SALE_COND'] = Sale_labelencoder.fit_transform(x_raw_data['SALE_COND'])

Parking_labelencoder = LabelEncoder()
x_raw_data['PARK_FACIL'] = Parking_labelencoder.fit_transform(x_raw_data['PARK_FACIL'])

Buildtype_labelencoder = LabelEncoder()
x_raw_data['BUILDTYPE'] = Buildtype_labelencoder.fit_transform(x_raw_data['BUILDTYPE'])

Utility_labelencoder = LabelEncoder()
x_raw_data['UTILITY_AVAIL'] = Buildtype_labelencoder.fit_transform(x_raw_data['UTILITY_AVAIL'])

Street_labelencoder = LabelEncoder()
x_raw_data['STREET'] = Street_labelencoder.fit_transform(x_raw_data['STREET'])

mz_labelencoder = LabelEncoder()
x_raw_data['MZZONE'] = mz_labelencoder.fit_transform(x_raw_data['MZZONE'])


In [13]:
x_raw_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,12,1004,131,1.0,1.0,3,1,2,0,1,4,0,4.0,3.9,4.9,4.33,380000,144400
1,4,1986,26,2.0,1.0,5,1,0,0,1,0,3,4.9,4.2,2.5,3.765,760122,304049
2,0,909,70,1.0,1.0,3,1,2,0,2,0,4,4.1,3.8,2.2,3.09,421094,92114
3,15,1855,14,3.0,2.0,5,4,0,2,4,4,2,4.7,3.9,3.6,4.01,356321,77042
4,12,1226,84,1.0,1.0,3,1,2,2,1,0,1,3.0,2.5,4.1,3.29,237000,74063


In [14]:
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler()
y_scaler = StandardScaler()

x = x_scaler.fit_transform(x_raw_data)
y = y_scaler.fit_transform(y)

x.shape, y.shape

((7109, 18), (7109, 1))

In [15]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((5331, 18), (1778, 18), (5331, 1), (1778, 1))

we will  explore multiple model and test out which are is the best fit model on this dataset.

Linear Regression: Linear regression is a simple and interpretable model that can be effective for predicting house prices. It assumes a linear relationship between the independent variables (features) and the target variable (house price).

Random Forest Regression: Random forest regression is an ensemble learning method that combines multiple decision trees to make predictions. It can handle nonlinear relationships and interactions between features effectively. Random forest models often provide good predictive performance and are robust to outliers and overfitting.


Gradient Boosting Regression: Gradient boosting regression algorithms, such as XGBoost and LightGBM, are powerful models for predicting house prices. They work by iteratively improving the predictions by focusing on the samples with higher errors. Gradient boosting models are known for their high predictive accuracy and ability to handle complex relationships in the data.

Support Vector Regression: Support Vector Regression (SVR) is a variant of Support Vector Machines (SVM) that is used for regression tasks. It can capture nonlinear relationships and works well in high-dimensional spaces. SVR is particularly useful when there are complex patterns and nonlinearity in the house price data.

Neural Networks: Neural networks, especially deep learning models like multi-layer perceptron (MLP) or convolutional neural networks (CNN), can be powerful for predicting house prices. They can capture intricate patterns and nonlinear relationships in the data. However, neural networks typically require more data and computational resources for training.

In [16]:
#import libraries for all above models

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [17]:
linear_regression = LinearRegression()
random_forest = RandomForestRegressor()
gradient_boosting = GradientBoostingRegressor()
support_vector = SVR()
neural_network = MLPRegressor()


In [18]:
#Fit each model to the training data using the fit() method
linear_regression.fit(x_train, y_train)
random_forest.fit(x_train, y_train)
gradient_boosting.fit(x_train, y_train)
support_vector.fit(x_train, y_train)
neural_network.fit(x_train, y_train)


  random_forest.fit(x_train, y_train)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
#Make predictions using each model on the testing data using the predict() method.

linear_regression_pred = linear_regression.predict(x_test)
random_forest_pred = random_forest.predict(x_test)
gradient_boosting_pred = gradient_boosting.predict(x_test)
support_vector_pred = support_vector.predict(x_test)
neural_network_pred = neural_network.predict(x_test)


In [20]:

# Evaluate the models
linear_regression_mse = mean_squared_error(y_test, linear_regression_pred)
linear_regression_mae = mean_absolute_error(y_test, linear_regression_pred)
linear_regression_r2 = r2_score(y_test, linear_regression_pred)

random_forest_mse = mean_squared_error(y_test, random_forest_pred)
random_forest_mae = mean_absolute_error(y_test, random_forest_pred)
random_forest_r2 = r2_score(y_test, random_forest_pred)

gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_pred)
gradient_boosting_mae = mean_absolute_error(y_test, gradient_boosting_pred)
gradient_boosting_r2 = r2_score(y_test, gradient_boosting_pred)

support_vector_mse = mean_squared_error(y_test, support_vector_pred)
support_vector_mae = mean_absolute_error(y_test, support_vector_pred)
support_vector_r2 = r2_score(y_test, support_vector_pred)

neural_network_mse = mean_squared_error(y_test, neural_network_pred)
neural_network_mae = mean_absolute_error(y_test, neural_network_pred)
neural_network_r2 = r2_score(y_test, neural_network_pred)

# Compare the performance
print("Linear Regression:")
print("MSE:", linear_regression_mse)
print("MAE:", linear_regression_mae)
print("R2:", linear_regression_r2)
print()

print("Random Forest:")
print("MSE:", random_forest_mse)
print("MAE:", random_forest_mae)
print("R2:", random_forest_r2)
print()

print("Gradient Boosting:")
print("MSE:", gradient_boosting_mse)
print("MAE:", gradient_boosting_mae)
print("R2:", gradient_boosting_r2)
print()

print("Support Vector Regression:")
print("MSE:", support_vector_mse)
print("MAE:", support_vector_mae)
print("R2:", support_vector_r2)
print()

print("Neural Network:")
print("MSE:", neural_network_mse)
print("MAE:", neural_network_mae)
print("R2:", neural_network_r2)


Linear Regression:
MSE: 0.11390874123671328
MAE: 0.26976292021155446
R2: 0.879727636792276

Random Forest:
MSE: 0.027964516934712105
MAE: 0.1313434028304673
R2: 0.9704732182869894

Gradient Boosting:
MSE: 0.0328611115745272
MAE: 0.14008502073833404
R2: 0.965303070652955

Support Vector Regression:
MSE: 0.021570074589588265
MAE: 0.11174865815181738
R2: 0.9772248923367041

Neural Network:
MSE: 0.010246126679754835
MAE: 0.08043582148366712
R2: 0.9891814635459896


From the above metrics, we can see the Neural Network MSE, MAE are the lowest among other four models also the R-2 is nearest to 1. we will choose this model to further in our prediction

In [21]:
# Import the required libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Scale the input data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create a neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(x_train_scaled, y_train, epochs=10, batch_size=32)

# Make predictions using the trained model
y_pred = model.predict(x_test_scaled)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Save the trained model
model.save('neural_network_model.h5')

Now load the test data set and predict the values



In [23]:
test_path ='/gdrive/My Drive/Online_ML : Weekends Nov 2020/Assignments/Data/'
raw_test_data = pd.read_csv(test_path + 'Chennai_house_price_multivariate_test.csv')
raw_test_data.head()

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,...,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,P05996,Chrompet,958,13-11-2009,185,1,1,3,AbNormal,No,...,Others,NoSewr,Paved,RH,3.1,4.4,3.9,3.86,203260,93813
1,P09294,Anna Nagar,1807,29-01-2007,108,2,1,5,AdjLand,Yes,...,Others,NoSeWa,No Access,RM,2.6,4.1,2.1,2.775,370410,222246
2,P03807,Karapakkam,1658,07-01-2011,59,2,2,4,AdjLand,No,...,Others,NoSeWa,Paved,I,2.5,2.3,3.2,2.72,387972,113159
3,P00539,Anna Nagar,1592,22-01-2014,102,1,1,4,AbNormal,Yes,...,Others,AllPub,Gravel,RL,4.1,4.8,2.5,3.635,408134,84442
4,P01448,Chrompet,857,01-05-2012,62,1,1,3,Family,No,...,House,ELO,Gravel,RH,3.0,4.5,2.9,3.57,181212,34849


In [24]:
raw_test_data.isnull().sum(axis=0)


PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
DIST_MAINROAD     0
N_BEDROOM         0
N_BATHROOM        0
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       19
REG_FEE           0
COMMIS            0
dtype: int64

In [25]:
raw_test_data = raw_test_data.fillna({ 'QS_OVERALL': raw_test_data['QS_OVERALL'].mean()})
raw_test_data.isnull().sum(axis=0)

PRT_ID           0
AREA             0
INT_SQFT         0
DATE_SALE        0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACIL       0
DATE_BUILD       0
BUILDTYPE        0
UTILITY_AVAIL    0
STREET           0
MZZONE           0
QS_ROOMS         0
QS_BATHROOM      0
QS_BEDROOM       0
QS_OVERALL       0
REG_FEE          0
COMMIS           0
dtype: int64

In [26]:
raw_test_data = raw_test_data.drop(columns=['PRT_ID','DATE_SALE', 'DATE_BUILD'])
raw_test_data.head()

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS
0,Chrompet,958,185,1,1,3,AbNormal,No,Others,NoSewr,Paved,RH,3.1,4.4,3.9,3.86,203260,93813
1,Anna Nagar,1807,108,2,1,5,AdjLand,Yes,Others,NoSeWa,No Access,RM,2.6,4.1,2.1,2.775,370410,222246
2,Karapakkam,1658,59,2,2,4,AdjLand,No,Others,NoSeWa,Paved,I,2.5,2.3,3.2,2.72,387972,113159
3,Anna Nagar,1592,102,1,1,4,AbNormal,Yes,Others,AllPub,Gravel,RL,4.1,4.8,2.5,3.635,408134,84442
4,Chrompet,857,62,1,1,3,Family,No,House,ELO,Gravel,RH,3.0,4.5,2.9,3.57,181212,34849


In [27]:
# Encode CATEGORICAL variable 
from sklearn.preprocessing import LabelEncoder

area_labelencoder = LabelEncoder()
raw_test_data['AREA']=area_labelencoder.fit_transform(raw_test_data['AREA'])

Sale_labelencoder = LabelEncoder()
raw_test_data['SALE_COND'] = Sale_labelencoder.fit_transform(raw_test_data['SALE_COND'])

Parking_labelencoder = LabelEncoder()
raw_test_data['PARK_FACIL'] = Parking_labelencoder.fit_transform(raw_test_data['PARK_FACIL'])

Buildtype_labelencoder = LabelEncoder()
raw_test_data['BUILDTYPE'] = Buildtype_labelencoder.fit_transform(raw_test_data['BUILDTYPE'])

Utility_labelencoder = LabelEncoder()
raw_test_data['UTILITY_AVAIL'] = Buildtype_labelencoder.fit_transform(raw_test_data['UTILITY_AVAIL'])

Street_labelencoder = LabelEncoder()
raw_test_data['STREET'] = Street_labelencoder.fit_transform(raw_test_data['STREET'])

mz_labelencoder = LabelEncoder()
raw_test_data['MZZONE'] = mz_labelencoder.fit_transform(raw_test_data['MZZONE'])


In [28]:
# Load the trained neural network model
from tensorflow.keras.models import load_model
model = load_model('neural_network_model.h5')

In [29]:
new_x_test = x_scaler.fit_transform(raw_test_data)
new_x_test.shape

(2925, 18)

In [30]:
# Reshape the input data to have the shape (n_samples, n_features)
new_x_test_reshaped = new_x_test.reshape(-1, 18)

# Make predictions using the reshaped input data
new_y_pred = neural_network.predict(new_x_test_reshaped)

In [31]:
new_y_pred = new_y_pred.reshape(-1, 1)


In [38]:
new_y = y_scaler.inverse_transform(new_y_pred)
     

In [39]:
# Reshape the predicted values array to be 1-dimensional
new_y_reshaped = new_y.reshape(-1)

# Create a DataFrame with the predicted values
output_df = pd.DataFrame({"Predicted": new_y_reshaped})

# Save the DataFrame to a CSV file
output_df.to_csv("predictions.csv", index=False)


In [40]:
# Print the DataFrame
print(output_df)

         Predicted
0     7.509619e+06
1     1.335363e+07
2     7.511076e+06
3     1.369561e+07
4     7.186466e+06
...            ...
2920  1.227874e+07
2921  9.597046e+06
2922  1.808187e+07
2923  1.526779e+07
2924  1.302344e+07

[2925 rows x 1 columns]


Summary on business use case which can apply the same model

Real Estate: Real estate agents, property developers, and investors can use house price prediction models to estimate the value of properties. This information helps them make informed decisions about buying, selling, or investing in real estate. It can also assist in determining the optimal pricing strategy for selling properties.

Mortgage Lending: Mortgage lenders use house price prediction models to assess the value of properties when evaluating loan applications. The predicted house prices help lenders determine the loan amount, interest rates, and terms for potential borrowers. This information helps in managing risk and making lending decisions.

Property Valuation: Property valuation companies utilize house price prediction models to provide accurate property valuation services. By analyzing various factors and features of a property, these models can estimate its market value, which is essential for appraisal, taxation, insurance, and investment purposes.

Market Analysis: House price prediction models can be used to analyze housing market trends and patterns. Real estate market researchers, economists, and government agencies can utilize these models to understand the dynamics of the housing market, identify emerging trends, and make predictions about future market conditions.

Risk Assessment: House price prediction models can be utilized by insurance companies to assess the risk associated with insuring properties. By considering factors such as location, property features, and historical price trends, insurers can determine the appropriate coverage and premiums for insuring a property against risks like damage, theft, or natural disasters.

Urban Planning: City planners and policymakers can leverage house price prediction models to understand housing affordability and plan for future development. These models can assist in identifying areas with rising house prices, predicting housing demand, and making decisions related to infrastructure development, zoning regulations, and housing policies.

Overall, predicting house prices has a wide range of applications in the real estate industry, financial sector, urban planning, and market analysis. It enables stakeholders to make informed decisions, mitigate risks, and optimize their strategies based on accurate estimations of property values.
