In [245]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [246]:
# Reading the data
df = pd.read_csv('train_house_prediction.csv')

In [247]:
# Dropping columns with more than 40% null values
df_temp = pd.DataFrame(100*(df.isnull().sum()/df.shape[0]))
df_temp = df_temp[df_temp[0] <= 40]
req_features = df_temp.index

In [248]:
# Creating the dataframe after dropping columns
df = df[req_features]
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPo

In [249]:
df_temp = pd.DataFrame(100*(df.isnull().sum(axis = 1)/df.shape[0]))
print(df_temp.max())
# As we can 0.68% is the maximum value so we don't need to drop any rows

0    0.684932
dtype: float64


In [250]:
# Selecting the Features that are most correlated with the Sale Price of the house
df_temp = pd.DataFrame(df.iloc[:, 1:].corr()['SalePrice'].sort_values(ascending = False))
df_temp = df_temp.iloc[1:]
df_temp = df_temp[df_temp['SalePrice'] >= 0.30]
req_features = df_temp.index
req_features

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
       '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
       'GarageYrBlt', 'MasVnrArea', 'Fireplaces', 'BsmtFinSF1', 'LotFrontage',
       'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF'],
      dtype='object')

In [252]:
df = df[req_features]
df.shape

(1460, 18)

In [253]:
df.isnull().sum()

OverallQual       0
GrLivArea         0
GarageCars        0
GarageArea        0
TotalBsmtSF       0
1stFlrSF          0
FullBath          0
TotRmsAbvGrd      0
YearBuilt         0
YearRemodAdd      0
GarageYrBlt      81
MasVnrArea        8
Fireplaces        0
BsmtFinSF1        0
LotFrontage     259
WoodDeckSF        0
2ndFlrSF          0
OpenPorchSF       0
dtype: int64

In [254]:
# Filling the NaN values using the median
df.fillna(df.median(), inplace = True)

In [255]:
df.isnull().sum()

OverallQual     0
GrLivArea       0
GarageCars      0
GarageArea      0
TotalBsmtSF     0
1stFlrSF        0
FullBath        0
TotRmsAbvGrd    0
YearBuilt       0
YearRemodAdd    0
GarageYrBlt     0
MasVnrArea      0
Fireplaces      0
BsmtFinSF1      0
LotFrontage     0
WoodDeckSF      0
2ndFlrSF        0
OpenPorchSF     0
dtype: int64

In [257]:
X = df.iloc[:, :].values
y = df.iloc[:, :].values

In [258]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [259]:
################################

In [260]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [261]:
y_pred = regressor.predict(X_test)

In [262]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

Mean Absolute Error: 1.5303741091679044e-13


In [263]:
# Reading test data
test_data = pd.read_csv('test_house_prediction.csv')
test_data = test_data[req_features]
test_data.fillna(test_data.median(), inplace = True)
test_data1 = test_data.iloc[:].values

In [264]:
print(test_data1.shape)
print(X_train.shape)

(1459, 18)
(1168, 18)


In [268]:
y_pred_testdata = regressor.predict(test_data)

In [270]:
df_test_results = pd.DataFrame(y_pred_testdata)

In [273]:
df_test_results.to_csv('df_test_results.csv')