# Project 3: House Price Prediction

Objective: Predict house prices based on various features like location, size, and house
characteristics.

## Imports

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data

We will open the .csv file to perform exploratory data analysis (EDA) to understand the dataset and
visualize trends.

In [18]:
df = pd.read_csv("./data/house_prices.csv")

In [19]:
df.head()

Unnamed: 0,Id,Dwell_Type,Zone_Class,LotFrontage,LotArea,Road_Type,Alley,Property_Shape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Property_Sale_Price
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Drop Columns

In [20]:
df = df.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1)
#df.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType'], axis=1)

In [22]:
df.columns


Index(['Dwell_Type', 'Zone_Class', 'LotFrontage', 'LotArea', 'Road_Type',
       'Property_Shape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'Dwelling_Type',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDriv

## Fill NAN Values - Categorical Data

In [23]:
len(df)

1460

In [24]:
df['MasVnrType'].fillna('NotExist', inplace=True)
df['FireplaceQu'].fillna('NotExist', inplace=True)
df['GarageFinish'].fillna('NotExist', inplace=True)
df['GarageQual'].fillna('NotExist', inplace=True)
df['GarageType'].fillna('NotExist', inplace=True)
df['GarageCond'].fillna('NotExist', inplace=True)
df['BsmtExposure'].fillna('NotExist', inplace=True)
df['BsmtFinType1'].fillna('NotExist', inplace=True)
df['BsmtFinType2'].fillna('NotExist', inplace=True)
df['BsmtQual'].fillna('NotExist', inplace=True)
df['BsmtCond'].fillna('NotExist', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MasVnrType'].fillna('NotExist', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FireplaceQu'].fillna('NotExist', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

In [25]:
df['Electrical'].unique()
print(df['Electrical'].isnull().sum())
#drop nan rows
df = df.dropna(subset=['Electrical'])


1


## Fill NAN Values - Numeric Data

In [26]:
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)


In [27]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))


In [28]:
df["MasVnrArea"] = df["MasVnrArea"].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["MasVnrArea"] = df["MasVnrArea"].fillna(0)


## Use log Transform on Property_Sale_Price for better distribution

In [29]:
df['Property_Sale_Price_natural_log'] = np.log(df['Property_Sale_Price'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Property_Sale_Price_natural_log'] = np.log(df['Property_Sale_Price'])


## Dealing with Outliers

In [54]:
df[(df['SaleCondition'].isin(['Normal','Abnorml'])) & (df['Property_Sale_Price']>700000)]

Unnamed: 0,Dwell_Type,Zone_Class,LotFrontage,LotArea,Road_Type,Property_Shape,LandContour,Utilities,LotConfig,LandSlope,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Property_Sale_Price,Property_Sale_Price_natural_log
691,60,RL,104.0,21535,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,1,2007,WD,Normal,755000,13.534473
1182,60,RL,160.0,15623,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,555,0,7,2007,WD,Abnorml,745000,13.521139


In [55]:
df[(df['GarageYrBlt']==0)]

Unnamed: 0,Dwell_Type,Zone_Class,LotFrontage,LotArea,Road_Type,Property_Shape,LandContour,Utilities,LotConfig,LandSlope,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Property_Sale_Price,Property_Sale_Price_natural_log
39,90,RL,65.000000,6040,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,6,2008,WD,AdjLand,82000,11.314475
48,190,RM,33.000000,4456,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,6,2009,New,Partial,113000,11.635143
78,90,RL,72.000000,10778,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,4,2010,WD,Normal,136500,11.824080
88,50,C (all),105.000000,8470,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,10,2009,ConLD,Abnorml,85000,11.350407
89,20,RL,60.000000,8070,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,8,2007,WD,Normal,123600,11.724806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,70,RM,50.000000,5250,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,12,2008,WD,Normal,122000,11.711776
1407,20,RL,70.083333,8780,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,3,2009,WD,Normal,112000,11.626254
1449,180,RM,21.000000,1533,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,8,2006,WD,Abnorml,92000,11.429544
1450,90,RL,60.000000,9000,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,9,2009,WD,Normal,136000,11.820410


## Export Cleaned Data to CSV File

In [31]:
df.to_csv('cleaned_house_data.csv')

----