### This notebook contains the code to generate the submission for the "Richter's Predictor: Modeling Earthquake Damage" competition.

The submission should contain the building_id together with the predicted damage_grade, an ordinal variable with values 1, 2, and 3.

Submissions are scored using the micro-averaged F1 score.

This notebook contains the code for the model that is trained on the data and used to create the submission. There are different model options to try: a classifier, a regressor, and a hybrid of the two.

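This notebook implements the hybrid, which is an ordinal classifier. The idea is to split the ordinal target into multiple binary classification tasks (here "is damage_grade above 1?" and "is damage_grade above 2?") and to train a separate classifier on each of them. The predicted grade is then 1 plus the number of thresholds that are predicted to be exceeded.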

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [2]:
# Read the preprocessed training features and the training labels.
# building_id is removed from the features straight away: it doesn't help the model.
X = pd.read_csv('train_values_preprocessed.csv').drop(columns='building_id')
y = pd.read_csv('train_labels.csv')
X.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
# Convert the ordinal grade into cumulative binary targets: for every threshold i from 1 up to
# (max grade - 1), add a column damage_grade_above_<i> that is 1 when damage_grade > i, else 0.
max_grade = y['damage_grade'].max()
for threshold in range(1, max_grade):
    # Vectorised comparison instead of a per-row Python lambda; the explicit int64 cast
    # keeps the 0/1 integer columns the rest of the notebook expects.
    y[f'damage_grade_above_{threshold}'] = (y['damage_grade'] > threshold).astype('int64')
y.head(10)

Unnamed: 0,building_id,damage_grade,damage_grade_above_1,damage_grade_above_2
0,802906,3,1,1
1,28830,2,1,0
2,94947,3,1,1
3,590882,2,1,0
4,201944,3,1,1
5,333020,2,1,0
6,728451,3,1,1
7,475515,1,0,0
8,441126,2,1,0
9,989500,1,0,0


In [4]:
# Keep only the binary threshold targets: the id and the original damage_grade are removed.
y = y.drop(columns=['building_id', 'damage_grade'])
y.head()

Unnamed: 0,damage_grade_above_1,damage_grade_above_2
0,1,1
1,1,0
2,1,1
3,1,0
4,1,1


In [5]:
# Pull each threshold target out as its own Series, one per binary classifier.
y_target_1 = y.loc[:, "damage_grade_above_1"]
y_target_2 = y.loc[:, "damage_grade_above_2"]
y_target_2.head()

0    1
1    0
2    1
3    0
4    1
Name: damage_grade_above_2, dtype: int64

In [6]:
# Split the features and both binary targets in a single call, so all three are guaranteed
# to share exactly the same row partition (80% train / 20% validation, fixed seed).
# Previously two separate calls relied on an identical random_state to stay aligned.
X_train, X_valid, y_train_1, y_valid_1, y_train_2, y_valid_2 = train_test_split(
    X, y_target_1, y_target_2, test_size=0.2, random_state=42
)

In [7]:
# Train one binary classifier per threshold target. Both use the same settings;
# each fit is given its own validation target as the evaluation set.
n_est = 1000
lr = 0.15
xgb_settings = dict(
    n_estimators=n_est,
    learning_rate=lr,
    n_jobs=-1,
    early_stopping_rounds=5,
    random_state=42,
)

# Threshold 1: is the damage grade above 1?
model_1 = XGBClassifier(**xgb_settings)
model_1.fit(X_train, y_train_1, eval_set=[(X_valid, y_valid_1)], verbose=False)

# Threshold 2: is the damage grade above 2?
model_2 = XGBClassifier(**xgb_settings)
model_2.fit(X_train, y_train_2, eval_set=[(X_valid, y_valid_2)], verbose=False)

In [8]:
# Predict both thresholds on the validation set and rebuild the grade as
# 1 + (number of thresholds predicted as exceeded).
# NOTE(review): X_valid was also the eval_set used while fitting, so these scores
# are measured on data the training procedure has already seen - likely optimistic.
y_pred_1 = model_1.predict(X_valid)
y_pred_2 = model_2.predict(X_valid)
y_pred_grade = 1 + y_pred_1 + y_pred_2
print(y_pred_grade)

# Rebuild the true grade from the binary targets in the same way, then score the combination.
y_valid_grade = 1 + y_valid_1 + y_valid_2
score = f1_score(y_true=y_valid_grade, y_pred=y_pred_grade, average='micro')
print(score)

# Micro-averaged F1 of each binary model on its own threshold.
score_1 = f1_score(y_true=y_valid_1, y_pred=y_pred_1, average='micro')
score_2 = f1_score(y_true=y_valid_2, y_pred=y_pred_2, average='micro')
print(score_1, score_2)

[3 2 2 ... 2 3 2]
0.7380326547840602
0.928915408376662 0.8071027033249554


In [14]:
# Optimise hyperparameters: coarse grid over the boosting-round cap and the learning rate.
# Both threshold models share each setting and are scored on the combined grade.
# NOTE: early stopping is enabled, so n_estimators is only an upper bound on the rounds
# actually trained; raising it changes nothing once stopping triggers earlier.
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.05, 0.15, 0.25]
}

# Rebuild the true validation grade here so this cell does not depend on an earlier
# evaluation cell having been run.
y_valid_grade = y_valid_1 + y_valid_2 + 1

scores = []
grid_results = []  # one (n_estimators, learning_rate, score) record per combination
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        model_1 = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model_1.fit(X_train, y_train_1,
                  eval_set=[(X_valid, y_valid_1)],
                  verbose=False)

        model_2 = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model_2.fit(X_train, y_train_2,
                  eval_set=[(X_valid, y_valid_2)],
                  verbose=False)

        # Combined grade = 1 + number of thresholds predicted as exceeded.
        y_pred_1 = model_1.predict(X_valid)
        y_pred_2 = model_2.predict(X_valid)
        y_pred_grade = y_pred_1 + y_pred_2 + 1

        score = f1_score(y_valid_grade, y_pred_grade, average='micro')
        print(n_est, lr, score)
        scores.append(score)
        grid_results.append((n_est, lr, score))
print(max(scores))

# Also report which setting produced the best score (the first one wins on ties),
# so the winner does not have to be looked up by eye in the printed table.
best_n_est, best_lr = max(grid_results, key=lambda record: record[2])[:2]
print('best setting:', best_n_est, best_lr)

500 0.05 0.7227604996066844
500 0.15 0.7377256767905451
500 0.25 0.7386466107710904
1000 0.05 0.7333704265075497
1000 0.15 0.7380326547840602
1000 0.25 0.7386466107710904
1200 0.05 0.7344064772356632
1200 0.15 0.7380326547840602
1200 0.25 0.7386466107710904
0.7386466107710904


In [15]:
# Increasing n_est does not change score, higher lr does, check some more.
# Second, narrower grid around the learning rates that looked promising.
# NOTE: early stopping is enabled, so n_estimators is only an upper bound on the rounds
# actually trained; raising it changes nothing once stopping triggers earlier.
param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.2, 0.3]
}

# Rebuild the true validation grade here so this cell does not depend on an earlier
# evaluation cell having been run.
y_valid_grade = y_valid_1 + y_valid_2 + 1

scores = []
grid_results = []  # one (n_estimators, learning_rate, score) record per combination
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        model_1 = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model_1.fit(X_train, y_train_1,
                  eval_set=[(X_valid, y_valid_1)],
                  verbose=False)

        model_2 = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model_2.fit(X_train, y_train_2,
                  eval_set=[(X_valid, y_valid_2)],
                  verbose=False)

        # Combined grade = 1 + number of thresholds predicted as exceeded.
        y_pred_1 = model_1.predict(X_valid)
        y_pred_2 = model_2.predict(X_valid)
        y_pred_grade = y_pred_1 + y_pred_2 + 1

        score = f1_score(y_valid_grade, y_pred_grade, average='micro')
        print(n_est, lr, score)
        scores.append(score)
        grid_results.append((n_est, lr, score))
print(max(scores))

# Also report which setting produced the best score (the first one wins on ties),
# so the winner does not have to be looked up by eye in the printed table.
best_n_est, best_lr = max(grid_results, key=lambda record: record[2])[:2]
print('best setting:', best_n_est, best_lr)

500 0.2 0.7375146294200035
500 0.3 0.7362675313213484
1000 0.2 0.7375146294200035
1000 0.3 0.7362675313213484
0.7375146294200035


In [16]:
# Refit both threshold models with the setting picked from the searches
# (n_estimators=1000, learning_rate=0.25) and re-check the combined validation score.
final_settings = dict(
    n_estimators=1000,
    learning_rate=0.25,
    n_jobs=-1,
    early_stopping_rounds=5,
    random_state=42,
)

model_fin_1 = XGBClassifier(**final_settings)
model_fin_1.fit(X_train, y_train_1, eval_set=[(X_valid, y_valid_1)], verbose=False)

model_fin_2 = XGBClassifier(**final_settings)
model_fin_2.fit(X_train, y_train_2, eval_set=[(X_valid, y_valid_2)], verbose=False)

# Combined grade = 1 + number of thresholds predicted as exceeded.
y_pred_1 = model_fin_1.predict(X_valid)
y_pred_2 = model_fin_2.predict(X_valid)
y_pred_grade = 1 + y_pred_1 + y_pred_2
score = f1_score(y_true=y_valid_grade, y_pred=y_pred_grade, average='micro')
print(score)

0.7386466107710904


In [17]:
# Read the preprocessed test features.
X_test = pd.read_csv('test_values_preprocessed.csv')

# Move building_id out of the features and into the submission frame in one step:
# pop() removes the column from X_test and hands it back as a Series.
output = X_test.pop('building_id').to_frame()

# Predict each threshold and combine: grade = 1 + number of thresholds predicted as exceeded.
predictions_1 = model_fin_1.predict(X_test)
predictions_2 = model_fin_2.predict(X_test)
damage_grade_pred = 1 + predictions_1 + predictions_2
output['damage_grade'] = damage_grade_pred.astype(int)
output.head()

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3


In [18]:
# Write the submission (building_id, damage_grade) to CSV, leaving out the DataFrame index.
submission_path = 'submission_ordinalS.csv'
output.to_csv(submission_path, index=False)

### Final note:

This model achieved a submission score of 0.7364, so it does not outperform either the normal classifier or the ordinal classifier that uses only a single trained model.