In [23]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split

In [28]:
# Load the raw data set.
df = pd.read_csv('../data/raw.csv')

# Names of all columns that pandas loaded with the generic object dtype.
column_dtypes = df.dtypes
all_cat = column_dtypes[column_dtypes == 'object'].index

# Columns whose labels are given an explicit order. Each list defines the
# order used for the integer codes: the first label becomes 0, the next 1,
# and so on.
ordered_cat = {
    "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
    "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtCond": ["Po", "Fa", "TA", "Gd"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
    "ExterQual": ["Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "PoolQC": ["Fa", "Gd", "Ex"],
    "CentralAir": ["N", "Y"],
    "Utilities": ["NoSeWa", "AllPub"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "Alley": ["Grvl", "Pave"],
    "Street": ["Grvl", "Pave"]}

# Object-typed columns that have no declared order.
unordered_cat = list(set(all_cat).difference(ordered_cat))

# Replace every ordered column by its integer codes.
# NOTE: missing values, and any label not present in the declared list,
# receive the code -1 (pandas' code for "not a category").
for column, levels in ordered_cat.items():
    ordered_dtype = CategoricalDtype(categories=levels, ordered=True)
    as_categorical = df[column].astype(ordered_dtype)
    df[column] = as_categorical.cat.codes

# One-hot encode whatever categorical columns are still left.
df = pd.get_dummies(df)

# Impute the remaining gaps with the first row of the per-column modes.
most_frequent = df.mode().iloc[0, :]
df = df.fillna(most_frequent)

# Swap the raw sale price for its natural log, stored as 'target'.
sale_price = df.pop('SalePrice')
df['target'] = np.log(sale_price)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=1773, test_size=0.2)

# Persist both partitions without the row index.
for subset, path in ((train, '../data/train.csv'), (test, '../data/test.csv')):
    subset.to_csv(path, index=False)