# Pre-Processing: Housing Price Predicition

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from library.sb_utils import save_file

In [4]:
data = pd.read_csv("Data Files/train_data_engineered.csv")

In [5]:
data.shape

(1137, 65)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1137 entries, 0 to 1136
Data columns (total 65 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SalePrice      1137 non-null   int64  
 1   LotFrontage    1137 non-null   float64
 2   LotArea        1137 non-null   int64  
 3   MasVnrArea     1137 non-null   float64
 4   BsmtFinSF1     1137 non-null   int64  
 5   BsmtUnfSF      1137 non-null   int64  
 6   TotalBsmtSF    1137 non-null   int64  
 7   1stFlrSF       1137 non-null   int64  
 8   2ndFlrSF       1137 non-null   int64  
 9   GrLivArea      1137 non-null   int64  
 10  BsmtFullBath   1137 non-null   int64  
 11  FullBath       1137 non-null   int64  
 12  HalfBath       1137 non-null   int64  
 13  BedroomAbvGr   1137 non-null   int64  
 14  TotRmsAbvGrd   1137 non-null   int64  
 15  Fireplaces     1137 non-null   int64  
 16  GarageCars     1137 non-null   int64  
 17  GarageArea     1137 non-null   int64  
 18  WoodDeck

In [7]:
data.columns

Index(['SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'MiscVal', 'MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle',
       'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass', 'OverallQual',
       'OverallCond', 'House_Age', 'Remod_Age', 'Garage_Age'],
      dtype='objec

In [8]:
features = ['BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars','MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle',
       'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass', 'OverallQual',
       'OverallCond']
dummies = pd.get_dummies(data[features])
data_dummies = pd.concat([data,dummies],axis=1)
data_encoded = data_dummies.drop(['BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars','MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle',
       'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass', 'OverallQual',
       'OverallCond'], axis=1)

In [9]:
data_encoded.shape

(1137, 251)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data_encoded.drop(columns='SalePrice'), 
                                                    data_encoded.SalePrice, test_size=0.3, 
                                                    random_state=123)

In [14]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_scaled = scaler.transform(X_train)
X_te_scaled = scaler.transform(X_test)

In [19]:
#Saving preprocessed data to new csv
datapath = 'C:\Springboard_\CapstoneTwo\Data Files'
save_file(data_encoded, 'data_preprocessed.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "C:\Springboard_\CapstoneTwo\Data Files\data_preprocessed.csv"
