In [51]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
from src.data_cleaning import group_categorical_features
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

sns.set_theme(style="whitegrid")
sns.set_palette(palette="Paired")

# Load Data

In [52]:
import yaml
import pandas as pd

from data_cleaning import drop_correlated_features
from data_cleaning import group_categorical_features
from data_cleaning import  prepare_data

# Read config file
with open("../../configs/config.yml", "r") as ymlfile:
    cfg = yaml.load(ymlfile, yaml.FullLoader)

# Check if data in config exists and read it
print("Read and Load Data ...")
train_values_path = "../../data/raw/train_values.csv"
train_labels_path = "../../data/raw/train_labels.csv"
test_values_path = "../../data/raw/test_values.csv"
result_path = cfg["paths"]["result"]

# Load data
print("Loading Data ...")
train_values = pd.read_csv(train_values_path)
train_labels = pd.read_csv(train_labels_path)
test_values = pd.read_csv(test_values_path)
train_values.set_index("building_id", inplace=True)
test_values.set_index("building_id", inplace=True)

# Data cleaning
# Prepare raw data
print("Cleaning Train Data ...")
binary_encoded_cols = [x for x in train_values.columns if x.startswith("has_")]
columns_to_ignore = cfg.get("data_cleaning", "NO DATA CLEANING DEFINED!").get("columns_to_ignore")
train_data_cleaned = prepare_data(df=train_values, config=cfg,
                                  ignore_cols=columns_to_ignore+binary_encoded_cols,
                                  outlier_method="replace")
print("Cleaning Test Data ...")
test_data_cleaned = prepare_data(df=test_values, config=cfg,
                                  ignore_cols=columns_to_ignore+binary_encoded_cols,
                                  outlier_method="replace")

# Correlated features
print("Drop correlated features...")
train_data_cleaned = drop_correlated_features(data=train_data_cleaned, config=cfg["data_cleaning"]["correlations"])
test_data_cleaned = drop_correlated_features(data=test_data_cleaned, config=cfg["data_cleaning"]["correlations"])

# Group categorical features with rarely occurring realizations
print("Grouping categorical features ...")
train_data_cleaned = group_categorical_features(df=train_data_cleaned, default_val="others", verbose=False)
test_data_cleaned = group_categorical_features(df=test_data_cleaned, default_val="others", verbose=False)

Read and Load Data ...
Loading Data ...
Cleaning Train Data ...
Found 33899 outliers,using method'replace'to handle them:
##########
Count per column: {'age': 12499, 'area_percentage': 13557, 'height_percentage': 7843}
Lower bound: {'age': -20.0, 'area_percentage': -1.0, 'height_percentage': 1.0}
Upper bound: {'age': 60.0, 'area_percentage': 15.0, 'height_percentage': 9.0}
Cleaning Test Data ...
Found 11365 outliers,using method'replace'to handle them:
##########
Count per column: {'age': 4279, 'area_percentage': 4442, 'height_percentage': 2644}
Lower bound: {'age': -20.0, 'area_percentage': -1.0, 'height_percentage': 1.0}
Upper bound: {'age': 60.0, 'area_percentage': 15.0, 'height_percentage': 9.0}
Drop correlated features...
Grouping categorical features ...


In [53]:
train_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 26 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   geo_level_1_id                          260601 non-null  int64   
 1   geo_level_2_id                          260601 non-null  int64   
 2   geo_level_3_id                          260601 non-null  int64   
 3   count_floors_pre_eq                     260601 non-null  int64   
 4   land_surface_condition                  260601 non-null  category
 5   foundation_type                         260601 non-null  category
 6   roof_type                               260601 non-null  category
 7   ground_floor_type                       260601 non-null  object  
 8   other_floor_type                        260601 non-null  category
 9   position                                260601 non-null  category
 10  plan_configuration         

# Encoding using OHE

In [54]:
def encode_train_data(x_train: pd.DataFrame):
    """
    Encodes the train data using sklearn One Hot Encoded and returns fitted OHE object.
    We need the fitted OHE object to transform our test data thus we also return the OHE object.

    :param x_train:
    :return: Encoded DataFrame and fitted OHE object
    """
    x_train_cats = x_train.select_dtypes(['object', 'category'])

    # Fit One Hot Encoding Object
    ohe = OneHotEncoder(handle_unknown="ignore", dtype=np.int64)
    x_train_cats_encoded = ohe.fit_transform(x_train_cats).toarray()
    # Transform encoded data to pandas dataframe
    x_train_cats_encoded = pd.DataFrame(x_train_cats_encoded, columns=ohe.get_feature_names_out(), index=x_train.index)
    # Drop old features
    feats_to_drop = list(ohe.feature_names_in_)
    x_train = x_train.drop(columns=feats_to_drop, axis=1)
    # Concat old dataframe with new encoded features
    x_train_encoded = pd.concat([x_train, x_train_cats_encoded], axis=1)

    return x_train_encoded, ohe

def encode_test_data(x_test: pd.DataFrame, ohe: OneHotEncoder) -> pd.DataFrame:
    """
    Applies the already fitted OHE object on the test dataframe x_test.
    First extracts categorical columns from the x_test and transforms them using the ohe object.
    Then the encoded data gets concatenated with the remaining not encoded features.

    :param x_test: Test DataFrame to transform using the fitted OHE object
    :param ohe: Fitted OneHotEncoder Object yielded from 'encode_train_data()' function
    :return: Encoded DataFrame
    """
    # Get categorical columns and transform them using already fitted OHE object
    x_test_cats = x_test.select_dtypes(['object', 'category'])
    x_test_cats_encoded = ohe.transform(x_test_cats).toarray()
    # Transform to pandas DataFrame
    x_test_cats_encoded = pd.DataFrame(x_test_cats_encoded, columns=ohe.get_feature_names_out(), index=x_test.index)
    # Drop old features
    feats_to_drop = list(ohe.feature_names_in_)
    x_test = x_test.drop(columns=feats_to_drop, axis=1)
    # Concat old dataframe with new encoded features
    x_test_encoded = pd.concat([x_test, x_test_cats_encoded], axis=1)
    return x_test_encoded

train_data_cleaned_encoded, ohe = encode_train_data(x_train=train_data_cleaned)
test_data_cleaned_encoded = encode_test_data(x_test=test_data_cleaned, ohe=ohe)

In [58]:
def normalize_train_data(x_train: pd.DataFrame, scaler):
    """
    Function to normalize the train data.
    Fits StandardScaler on given train DataFrame and also outputs the scaler.

    :param x_train: train DataFrame
    :param scaler: Either MinMax() object or StandardScaler() object
    :return: Scaled Train DataFrame
    """
    x_train_scaled = scaler.fit_transform(x_train)
    # Transform back to pandas DataFrame
    x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns)
    return x_train_scaled, scaler

def normalize_test_data(x_test: pd.DataFrame, scaler) -> pd.DataFrame:
    """
    Function to normalize the test data. Uses already fitted StandardScaler or MinMax scaler object
    to transform given DataFrame.

    :param x_test: test DataFrame
    :param scaler: Fitted StandardScaler or MinMax Scaler object, yielded from 'normalize_train_data' function
    :return: Scaled Test DataFrame
    """

    x_test_scaled = scaler.transform(x_test)
    # Transform back to pandas DataFrame
    x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns)
    return x_test_scaled

train_data_cleaned_encoded_scaled, scaler = normalize_train_data(train_data_cleaned_encoded, StandardScaler())
test_data_cleaned_encoded_scaled = normalize_test_data(test_data_cleaned_encoded, scaler)

In [59]:
train_data_cleaned_encoded_scaled

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,...,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_d,plan_configuration_others,legal_ownership_status_others,legal_ownership_status_v
0,-0.983414,-0.518705,1.629055,-0.178274,3.206391,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,-1.858462,2.252816,0.205192,-0.205192,-0.196223,0.196223
1,-0.734459,0.481998,-0.945017,-0.178274,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
2,0.883744,-0.819158,0.744612,-0.178274,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,2.235620,-0.231741,-0.095043,-1.858462,2.252816,0.205192,-0.205192,-0.196223,0.196223
3,1.008221,-0.685893,1.216589,-0.178274,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,2.235620,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
4,-0.361028,-1.381296,-1.308119,1.195989,3.206391,-1.789003,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,2.235620,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,1.381653,1.536007,-1.271644,-1.552536,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,-4.873478,4.873478,-0.196223,0.196223
260597,0.385835,0.033741,-1.151250,-0.178274,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
260598,0.385835,-1.575137,0.522472,1.195989,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
260599,1.506130,-1.604213,-1.208568,-0.178274,-0.311877,-1.789003,-0.188554,-0.136284,-0.270442,3.505111,...,4.546009,-0.447303,4.315161,-0.095043,-1.858462,-0.443889,0.205192,-0.205192,-0.196223,0.196223


In [62]:
test_data_cleaned_encoded_scaled

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,...,other_floor_type_s,other_floor_type_x,position_j,position_o,position_s,position_t,plan_configuration_d,plan_configuration_others,legal_ownership_status_others,legal_ownership_status_v
0,0.385835,-0.254597,1.384702,1.195989,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
1,-0.983414,-1.357066,1.571189,-0.178274,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
2,1.008221,-1.652673,1.038329,-0.178274,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
3,1.506130,-1.604213,-1.542599,-1.552536,-0.311877,-1.789003,-0.188554,-0.136284,-0.270442,3.505111,...,-0.219973,-0.447303,-0.231741,-0.095043,-1.858462,2.252816,0.205192,-0.205192,-0.196223,0.196223
4,0.385835,-0.998461,0.469543,1.195989,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,-1.858462,2.252816,0.205192,-0.205192,-0.196223,0.196223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,-1.232368,-0.232790,-0.722604,1.195989,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,-1.858462,2.252816,0.205192,-0.205192,5.096248,-5.096248
86864,-0.485505,1.710464,1.549249,1.195989,3.206391,0.558971,5.303508,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
86865,1.008221,1.053828,0.398788,-1.552536,-0.311877,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,-0.196223,0.196223
86866,-0.983414,0.823642,-1.466085,-0.178274,3.206391,0.558971,-0.188554,-0.136284,-0.270442,-0.285298,...,-0.219973,-0.447303,-0.231741,-0.095043,0.538079,-0.443889,0.205192,-0.205192,5.096248,-5.096248
