In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from utils.data_analysis import get_data_for_preprocessing

In [23]:
# Column lists computed from the training CSV by the project helper.
# `treshhold` is spelled exactly as in the existing call; presumably it is the
# helper's own keyword name -- confirm before "fixing" the spelling here.
(
    uncorrelated_col,
    insignificant_col,
    missing_values_col,
) = get_data_for_preprocessing('./data/train.csv', treshhold=0.05)

In [24]:
def drop_columns(df,insignificant_col,uncorrelated_col):
    """Return ``df`` without the columns named in the two given collections.

    The groups are dropped one after the other, so a name that is missing
    from the frame (or already removed by the first group) raises
    ``KeyError``. The frame passed in is left untouched; a new frame is
    returned.

    Args:
        df: DataFrame to strip columns from.
        insignificant_col: column labels removed first.
        uncorrelated_col: column labels removed second.

    Returns:
        A new DataFrame without the listed columns.
    """
    for unwanted in (insignificant_col, uncorrelated_col):
        df = df.drop(columns=unwanted)
    return df

def fill_missing_values(df,missing_values_col):
    """Fill NaNs in the listed columns of ``df`` and return ``df``.

    Each entry of ``missing_values_col`` is indexed as ``entry[0]`` (column
    name) and ``entry[1]`` (a flag). For non-object columns the flag selects
    the statistic: ``False`` -> median, ``True`` -> mean. Object columns are
    filled with their most frequent value regardless of the flag.
    Presumably the flag marks columns considered normally distributed --
    TODO confirm against ``get_data_for_preprocessing``.

    Entries whose column is not present in ``df`` are ignored. An object
    column that contains no non-null value has no mode and is left as is
    (previously this raised on ``mode()[0]``).

    Note: the columns are assigned on ``df`` itself, so the frame passed in
    is modified; the same object is returned for convenience.

    Args:
        df: DataFrame to fill.
        missing_values_col: iterable of ``(column_name, flag, ...)`` entries.

    Returns:
        The same DataFrame with the selected columns filled.
    """
    numeric_names = df.select_dtypes(exclude=['object']).columns
    object_names = df.select_dtypes(include=['object']).columns

    for col in missing_values_col:
        name, use_mean = col[0], col[1]

        if name in numeric_names:
            # Equality (not truthiness) is kept on purpose: a flag that is
            # neither True nor False leaves the column untouched, as before.
            if use_mean == False:  # noqa: E712
                fill_value = df[name].median()
            elif use_mean == True:  # noqa: E712
                fill_value = df[name].mean()
            else:
                continue
        elif name in object_names:
            modes = df[name].mode()
            if modes.empty:
                # All values are missing: there is nothing to fill with.
                continue
            fill_value = modes.iloc[0]
        else:
            continue

        df[name] = df[name].fillna(fill_value)
    return df
    
            


In [25]:
# Load the raw training data, remove the unwanted columns, then fill the
# columns reported as having missing values.
df = (
    pd.read_csv('./data/train.csv')
    .pipe(drop_columns, insignificant_col, uncorrelated_col)
    .pipe(fill_missing_values, missing_values_col)
)


In [26]:

# Columns replaced by integer codes via LabelEncoder.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so the
# codes do not follow any quality ordering of the labels -- confirm this is
# what the downstream model expects.
categorical_numeric = [
    'MSZoning', 'Street', 'LotShape', 'LotConfig',
    'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'Heating', 'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'SaleCondition',
]

# Columns expanded into one indicator column per value.
categorical_onehot = ['Neighborhood', 'CentralAir', 'SaleType']

# One encoder instance is re-fitted for every column, so after the loop it
# only remembers the classes of the last column processed.
le = LabelEncoder()
for col in categorical_numeric:
    df[col] = le.fit_transform(df[col])

# dummy_na=True also emits a `<col>_nan` indicator column for each feature.
for col in categorical_onehot:
    onehot = pd.get_dummies(df[col], prefix=col, dummy_na=True)
    df = df.drop(columns=[col]).join(onehot)


In [27]:
# Write the encoded training frame to disk; index=False leaves the row index
# out of the CSV.
df.to_csv(path_or_buf='./data/train_preprocessed.csv', index=False)

In [28]:
def prep_test_data(df_test,categorical_numeric,categorical_onehot):
    """Run the notebook's preprocessing steps on a test DataFrame.

    Steps, in order:
      1. drop the columns listed in the module-level ``insignificant_col``
         and ``uncorrelated_col``;
      2. fill the columns listed in the module-level ``missing_values_col``;
      3. label-encode every column in ``categorical_numeric``;
      4. one-hot encode every column in ``categorical_onehot``
         (with an extra ``<col>_nan`` indicator);
      5. fill whatever NaNs remain, column by column.

    NOTE(review): the label encoders are fitted on the test column itself,
    and the one-hot columns come from the values present in ``df_test``.
    Codes and columns therefore only match the training output when both
    files contain the same set of values -- verify before feeding the result
    to a model trained on the training CSV.

    Args:
        df_test: raw test DataFrame; it is not modified.
        categorical_numeric: column names to label-encode.
        categorical_onehot: column names to one-hot encode.

    Returns:
        A new, preprocessed DataFrame.
    """
    df_test = drop_columns(df_test, insignificant_col, uncorrelated_col)
    df_test = fill_missing_values(df_test, missing_values_col)

    for col in categorical_numeric:
        # A fresh encoder per column yields the same codes as re-fitting a
        # shared one, without overwriting the state of the module-level `le`.
        df_test[col] = LabelEncoder().fit_transform(df_test[col])

    for col in categorical_onehot:
        onehot = pd.get_dummies(df_test[col], prefix=col, dummy_na=True)
        df_test = df_test.drop(columns=[col]).join(onehot)

    # Fill NaNs that survived the steps above (columns that had no missing
    # values in the training data are not covered by missing_values_col).
    for col in df_test.columns[df_test.isna().any()].tolist():
        column = df_test[col]
        try:
            fill_value = column.median(skipna=True)
        except (TypeError, ValueError):
            # Non-numeric column: a median is undefined, so use the most
            # frequent value instead of aborting the whole run.
            modes = column.mode()
            if modes.empty:
                continue
            fill_value = modes.iloc[0]
        df_test[col] = column.fillna(fill_value)

    return df_test
# Preprocess the held-out test file with the same column lists and write the
# result next to the training output.
df_test = pd.read_csv('./data/test.csv')
df_test_prep = prep_test_data(
    df_test=df_test,
    categorical_numeric=categorical_numeric,
    categorical_onehot=categorical_onehot,
)
df_test_prep.to_csv(path_or_buf='./data/test_preprocessed.csv', index=False)