In [1]:
import pandas as pd

In [2]:
# Load the provided raw files: training features, training labels and test features
x_train, y_train, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/raw/train_values.csv",
        "../../data/raw/train_labels.csv",
        "../../data/raw/test_values.csv",
    )
)

# Idea
Look up (in external resources) the quality/risk/resistance of each superstructure type against earthquakes.

- https://whereisthenorth.com/4-methods-to-achieve-earthquake-resistant-buildings/
  Steel, wood, reinforced concrete and bamboo are good
- https://www.yashkrishi.com/earthquake-its-effects-design-construction-of-buildings
  brick, stone or mud is bad
- https://www.quora.com/Are-brick-buildings-safe-in-an-earthquake
  bricks are not good
  
It was also pointed out that the foundation of a building has a big influence on its resistance against earthquakes.

### In our case...
I took the superstructure information we have in the dataset and flagged each type as good, bad or no idea:

good:
- bamboo
- rc_non_engineered
- rc_engineered
- timber

bad:
- adobe mud
- mud mortar stone
- stone flag
- mud mortar brick
- cement mortar brick
- cement mortar stone

no idea:
- other

In [30]:
# Collect every binary flag column that describes the superstructure material
has_superstructure_features = [
    column_name
    for column_name in x_train.columns
    if column_name.startswith("has_superstructure_")
]
has_superstructure_features

['has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_stone_flag',
 'has_superstructure_cement_mortar_stone',
 'has_superstructure_mud_mortar_brick',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_bamboo',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'has_superstructure_other']

In [31]:
# Ordinal encoding of the superstructure: good = 1, no idea = 0, bad = -1.
# Any mix of categories (good+bad, good+other, bad+other) is encoded as 0 as well.

# Row-wise masks: does the building use at least one material of each category?
is_good = (
    x_train[
        [
            "has_superstructure_bamboo",
            "has_superstructure_rc_engineered",
            "has_superstructure_rc_non_engineered",
            "has_superstructure_timber",
        ]
    ]
    .eq(1)
    .any(axis=1)
)
is_bad = (
    x_train[
        [
            "has_superstructure_adobe_mud",
            "has_superstructure_mud_mortar_stone",
            "has_superstructure_cement_mortar_stone",
            "has_superstructure_mud_mortar_brick",
            "has_superstructure_cement_mortar_brick",
            "has_superstructure_stone_flag",
        ]
    ]
    .eq(1)
    .any(axis=1)
)
is_other = x_train["has_superstructure_other"].eq(1)

# Start from -1 so that purely "bad" rows need no further update
x_train["superstructure_quality"] = -1

# Rows with at least one good material
x_train.loc[is_good, "superstructure_quality"] = 1

# "other" (alone or combined with anything) and good+bad mixes are neutral
x_train.loc[is_other | (is_good & is_bad), "superstructure_quality"] = 0

In [32]:
# Peek at the first rows of the freshly encoded feature
x_train[["building_id", "superstructure_quality"]].head(20)

Unnamed: 0,building_id,superstructure_quality
0,802906,-1
1,28830,-1
2,94947,-1
3,590882,0
4,201944,-1
5,333020,-1
6,728451,-1
7,475515,0
8,441126,0
9,989500,-1


# Define function

In [26]:
def get_quality_of_superstructure(raw_data=None, df_to_add_info=None):
    """
    Derive the ordinal feature 'superstructure_quality' from the has_superstructure_X flags.

    The superstructure used has an influence on the resistance of a building against an earthquake.
    After some research the result was as follows:
    Good superstructures: Steel, bamboo, timber, reinforced concrete
    Bad superstructures: Bricks, stone, mud

    Encoding (per row of raw_data):
      1  -> at least one good flag is set, no bad flag and no 'other' flag
      0  -> the 'other' flag is set (alone or in any combination), or good and bad flags are mixed
      -1 -> every remaining row (only bad flags set, or no flag set at all)

    raw_data is NOT modified. If df_to_add_info already contains a 'superstructure_quality'
    column, that column is replaced by the freshly computed one, so the function can be
    called repeatedly and with the same frame for both arguments.

    :param raw_data The raw dataframe including 'building_id' and the has_superstructure_X columns
    :param df_to_add_info The dataframe (with a 'building_id' column) where to add the information to;
                          defaults to raw_data when omitted

    :returns A new dataframe with the additional feature 'superstructure_quality'
    """
    feature_name = "superstructure_quality"
    good_columns = [
        "has_superstructure_bamboo",
        "has_superstructure_rc_engineered",
        "has_superstructure_rc_non_engineered",
        "has_superstructure_timber",
    ]
    bad_columns = [
        "has_superstructure_adobe_mud",
        "has_superstructure_mud_mortar_stone",
        "has_superstructure_cement_mortar_stone",
        "has_superstructure_mud_mortar_brick",
        "has_superstructure_cement_mortar_brick",
        "has_superstructure_stone_flag",
    ]

    if df_to_add_info is None:
        df_to_add_info = raw_data

    # Row-wise masks: is at least one flag of the respective category set?
    has_good = raw_data[good_columns].eq(1).any(axis=1)
    has_bad = raw_data[bad_columns].eq(1).any(axis=1)
    has_other = raw_data["has_superstructure_other"].eq(1)

    # Build the feature in a separate Series so the caller's raw_data stays untouched.
    # Default to -1 --> all purely bad rows are already right.
    quality = pd.Series(-1, index=raw_data.index)
    quality.loc[has_good] = 1
    # 'other' and every mix of categories (good+bad, good+other, bad+other) are neutral
    quality.loc[has_other | (has_good & has_bad)] = 0

    quality_by_building = pd.DataFrame(
        {feature_name: quality.values},
        index=pd.Index(raw_data["building_id"], name="building_id"),
    )

    # Drop a stale copy of the feature first; otherwise the join fails on overlapping columns
    target = df_to_add_info.drop(columns=feature_name, errors="ignore")

    # Join new info to df
    result = target.set_index("building_id").join(quality_by_building)
    return result.reset_index()

In [27]:
# Remove the inline-computed feature again so the function can be tried on the plain frame
x_train = x_train.drop(columns=["superstructure_quality"])

In [28]:
# Try the function: read the flags from x_train and attach the feature to a copy of it
test = get_quality_of_superstructure(
    raw_data=x_train,
    df_to_add_info=x_train.copy(),
)

In [29]:
# Compare with the inline result above: the first rows should be identical
test[["building_id", "superstructure_quality"]].head(20)

Unnamed: 0,building_id,superstructure_quality
0,802906,-1
1,28830,-1
2,94947,-1
3,590882,0
4,201944,-1
5,333020,-1
6,728451,-1
7,475515,0
8,441126,0
9,989500,-1
