In [90]:
import pandas as pd
from utils.chi_square import perform_chi_square_test
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [91]:

# Load the training split and run the project's chi-square helper on every
# object-dtype column against 'SalePrice'.
# NOTE(review): perform_chi_square_test lives in utils.chi_square and is not
# visible here; this cell reads element [1] of its result as the p-value and
# element [2] as the significance flag -- confirm against the helper.
df = pd.read_csv('./data/train.csv')
categorical = df.select_dtypes(include=['object']).columns
col_significance = [
    (name, perform_chi_square_test(df, name, 'SalePrice'))
    for name in categorical
]

# Collect every column whose flag equals False, report it, then remove the
# whole set from the frame in one call. dropped_cols is reused later so the
# test split can drop the same columns.
dropped_cols = []
for name, result in col_significance:
    # Explicit comparison with False is kept on purpose: a flag of None would
    # not be treated as "insignificant" here.
    if result[2] == False:
        dropped_cols.append(name)
        print("Dropping column due to insignificant values: ", name, " p-value: ", result[1])
df = df.drop(columns=dropped_cols)



Dropping column due to insignificant values:  Alley  p-value:  0.20972415885759163
Dropping column due to insignificant values:  LandContour  p-value:  0.08674645041917711
Dropping column due to insignificant values:  Utilities  p-value:  1.0
Dropping column due to insignificant values:  LandSlope  p-value:  0.10508638737793884
Dropping column due to insignificant values:  Condition1  p-value:  1.0
Dropping column due to insignificant values:  Condition2  p-value:  0.07598640644469955
Dropping column due to insignificant values:  BldgType  p-value:  0.9999860714473023
Dropping column due to insignificant values:  HouseStyle  p-value:  0.6482615179447816
Dropping column due to insignificant values:  RoofStyle  p-value:  1.0
Dropping column due to insignificant values:  RoofMatl  p-value:  1.0
Dropping column due to insignificant values:  Exterior1st  p-value:  0.9999839433628513
Dropping column due to insignificant values:  Exterior2nd  p-value:  0.8469189757654344
Dropping column due t

In [92]:

# Columns that are replaced by integer codes from LabelEncoder.
categorical_numeric = [
    'MSZoning', 'Street', 'LotShape', 'LotConfig',
    'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'Heating', 'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'SaleCondition',
]
# Columns that are expanded into indicator (one-hot) columns.
categorical_onehot = ['Neighborhood', 'CentralAir', 'SaleType']

# A single encoder object is reused: fit_transform refits it for every column,
# so after the loop it only remembers the classes of the last column encoded.
le = LabelEncoder()
for label_col in categorical_numeric:
    codes = le.fit_transform(df[label_col])
    df[label_col] = codes

# Swap each one-hot column for its indicator columns (dummy_na=True also adds
# a NaN indicator); the new columns are appended at the right edge of the frame.
for onehot_col in categorical_onehot:
    indicators = pd.get_dummies(df[onehot_col], prefix=onehot_col, dummy_na=True)
    df = df.drop(columns=onehot_col).join(indicators)


In [93]:
# Impute every column that still contains NaN with that column's own median.
cols_with_missing = df.columns[df.isna().any()].tolist()
for missing_col in cols_with_missing:
    median_value = df[missing_col].median(skipna=True)
    df[missing_col] = df[missing_col].fillna(median_value)

In [94]:
# 'Id' is removed before the frame is written out.
df = df.drop(columns='Id')

In [95]:
# Persist the preprocessed training frame; the index is not written.
df.to_csv(path_or_buf='./data/train_preprocessed.csv', index=False)

In [100]:
def prep_test_data(df_test, categorical_numeric, categorical_onehot, dropped_cols,
                   encoders=None, fill_values=None, train_columns=None):
    """Run the preprocessing steps on a test DataFrame and return a new frame.

    Steps, in order: drop ``dropped_cols``; integer-encode the
    ``categorical_numeric`` columns; one-hot encode the ``categorical_onehot``
    columns (with a NaN indicator); drop ``'Id'``; optionally align the columns
    to ``train_columns``; fill remaining NaNs.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Raw test data. Must contain ``'Id'`` and every column named in the
        three column lists.
    categorical_numeric : iterable of str
        Columns to replace with integer codes.
    categorical_onehot : iterable of str
        Columns to replace with indicator columns.
    dropped_cols : iterable of str
        Columns to remove up front.
    encoders : mapping of str -> fitted encoder, optional
        Per-column objects exposing ``transform``. When a column has an entry,
        it is used so the codes match the ones learned elsewhere (e.g. on the
        training data). Columns without an entry are encoded with a fresh
        ``LabelEncoder`` fitted on the test column itself, which is the
        original behaviour and can yield codes that differ from training.
    fill_values : mapping or pandas.Series, optional
        Per-column replacement for NaN. Columns without an entry fall back to
        the median of the test column (the original behaviour).
    train_columns : iterable of str, optional
        If given, the result is reindexed to exactly these columns, in this
        order; columns missing from the test data are created and filled with
        0, and extra columns are discarded. Pass feature columns only: any
        name listed here (including a target column) will be created.

    Returns
    -------
    pandas.DataFrame
        The prepared copy of ``df_test``.

    Raises
    ------
    KeyError
        If ``'Id'`` or a column named in ``dropped_cols``,
        ``categorical_numeric`` or ``categorical_onehot`` is absent.
    """
    # Copy first so none of the steps below can touch the caller's frame.
    prepared = df_test.copy()
    prepared = prepared.drop(columns=list(dropped_cols))

    for col in categorical_numeric:
        if encoders is not None and col in encoders:
            prepared[col] = encoders[col].transform(prepared[col])
        else:
            # A fresh encoder per column: no reliance on a module-level object.
            prepared[col] = LabelEncoder().fit_transform(prepared[col])

    for col in categorical_onehot:
        onehot = pd.get_dummies(prepared[col], prefix=col, dummy_na=True)
        prepared = prepared.drop(columns=col).join(onehot)

    prepared = prepared.drop(columns='Id')

    if train_columns is not None:
        # Indicator columns for categories unseen in the test data become 0.
        prepared = prepared.reindex(columns=list(train_columns), fill_value=0)

    for col in prepared.columns[prepared.isna().any()].tolist():
        if fill_values is not None and col in fill_values:
            replacement = fill_values[col]
        else:
            replacement = prepared[col].median(skipna=True)
        prepared[col] = prepared[col].fillna(replacement)

    return prepared
# Read the raw test split, push it through prep_test_data with the column
# lists built for the training data, and write the result next to the input.
df_test = pd.read_csv('./data/test.csv')
df_test_prep = prep_test_data(
    df_test,
    categorical_numeric,
    categorical_onehot,
    dropped_cols,
)
df_test_prep.to_csv('./data/test_preprocessed.csv', index=False)

In [98]:
# Sanity check: names of training columns that still contain at least one NaN.
train_missing_mask = df.isna().any()
df.columns[train_missing_mask].tolist()

[]

In [99]:
# Sanity check: names of prepared test columns that still contain at least one NaN.
# NOTE(review): this cell's execution count is lower than that of the cell
# defining prep_test_data, so any recorded output is likely stale -- re-run
# after that cell to confirm.
test_missing_mask = df_test_prep.isna().any()
df_test_prep.columns[test_missing_mask].tolist()

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']