## Acknowledgments

- https://www.kaggle.com/satishgunjal/advanced-reg-techniques-linear-models-top-6


# Setting up Code 

In [1]:
# Imports
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings

from pylab import rcParams
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from scipy.special import boxcox1p
from pprint import pprint

# Add base dir path to sys.path so that I can do local exports
sys.path.append("../..")
import utils

In [8]:
# Notebook-wide configuration

# Logger for this notebook, obtained from the local `utils` helper module
LOGGER = utils.get_logger("house-prices")

# Suppress all warnings
warnings.filterwarnings(action="ignore")

# Render floats in pandas output with four decimal places
pd.set_option("display.float_format", "{:.4f}".format)

# Default matplotlib figure size (width, height) in inches
rcParams["figure.figsize"] = (12, 6)

# Display the value of every expression statement in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Gather data + Preprocessing

In [9]:
# Index every entry in the competition data directory by its base name,
# so later cells can look a file up as FILE_PATH_DICT["train.csv"].
FILE_PATH_DICT = dict(
    (os.path.basename(path), path)
    for path in glob.glob("../../data/house-prices-advanced-regression-techniques/*")
)
pprint(FILE_PATH_DICT)

{'data_description.txt': '../../data/house-prices-advanced-regression-techniques/data_description.txt',
 'sample_submission.csv': '../../data/house-prices-advanced-regression-techniques/sample_submission.csv',
 'test.csv': '../../data/house-prices-advanced-regression-techniques/test.csv',
 'train.csv': '../../data/house-prices-advanced-regression-techniques/train.csv'}


In [10]:
# Read the training split, then show its dimensions and first rows
LOGGER.info("Train data")
train_df = pd.read_csv(filepath_or_buffer=FILE_PATH_DICT["train.csv"])
train_df.shape
train_df.head()

# Read the test split, then show its dimensions and first rows
LOGGER.info("Test data")
test_df = pd.read_csv(filepath_or_buffer=FILE_PATH_DICT["test.csv"])
test_df.shape
test_df.head()

2021-04-17 21:05:51,560 [house-prices] [INFO] Train data


(1460, 81)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


2021-04-17 21:05:51,584 [house-prices] [INFO] Test data


(1459, 80)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [11]:
# Remove the Id column from both data frames, as it is of no use here.
# The result is reassigned (instead of inplace=True) and errors="ignore" is
# passed so this cell can be re-run safely: the in-place version raised a
# KeyError on a second execution because "Id" had already been dropped.
train_df = train_df.drop(columns="Id", errors="ignore")
test_df = test_df.drop(columns="Id", errors="ignore")