### This notebook contains the code to generate the submission for the "Richter's Predictor: Modeling Earthquake Damage" competition.

The submission should contain the building_id with the predicted damage_grade, an ordinal variable with values 1,2,3.

The score is evaluated using the micro averaged F1 score.

This notebook contains the code for the model that is trained on the data and created the submission. There are different options for models to try: a classifier, regressor, and hybrid of the two. 

This notebook contains the regressor.

(In this script I forgot to remove the building_id column in X and did not apply the StandardScaler. However, as it is very unlikely this will improve this model beyond the classifier, I won't bother implementing it.)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBRegressor

In [25]:
# Load the data
X = pd.read_csv('train_values_preprocessed.csv')
y = pd.read_csv('train_labels.csv')
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,802906,6,487,12198,2,30,6,5,1,1,...,0,0,0,0,0,0,0,0,1,0
1,28830,8,900,2812,2,10,8,7,0,1,...,0,0,0,0,0,0,0,0,1,0
2,94947,21,363,8973,2,10,5,5,0,1,...,0,0,0,0,0,0,0,0,1,0
3,590882,22,418,10694,2,10,6,5,0,1,...,0,0,0,0,0,0,0,0,1,0
4,201944,11,131,1488,3,30,8,9,1,0,...,0,0,0,0,0,0,0,0,1,0


In [26]:
y.drop('building_id', axis=1, inplace=True)
# The model only works when classes are [0,1,2] so we subtract 1
y = y - 1
y.head()

Unnamed: 0,damage_grade
0,2
1,1
2,2
3,1
4,2


In [27]:
# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Define a test model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1, early_stopping_rounds=5, random_state=42)
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=False)

In [29]:
y_pred = model.predict(X_valid)
y_pred = y_pred + 1  # add 1 again
# Now we need to round the predictions to the nearest integer
y_pred = y_pred.round().astype(int)
print(y_pred)

y_valid = y_valid + 1  # add 1 again
score = f1_score(y_valid, y_pred, average='micro')
print(score)

[3 2 2 ... 2 3 2]
0.7270006331421116


In [15]:
# Optimise hyperparameters
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.01, 0.05, 0.1]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBRegressor(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_pred = y_pred.round().astype(int)
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

500 0.01 0.6841388307975672
500 0.05 0.7171197789758447
500 0.1 0.726540166151839
1000 0.01 0.6993726137257535
1000 0.05 0.7270006331421116
1000 0.1 0.7279983116210357
1200 0.01 0.7029604190249612
1200 0.05 0.7286506398572552
1200 0.1 0.7279983116210357
0.7286506398572552


In [16]:
# Like for the classifier, it seems increasing learning rate helps here, although it is not the case for the 1200
# Still try the 0.25
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.25]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBRegressor(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_pred = y_pred.round().astype(int)
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

500 0.25 0.7315669307956486
1000 0.25 0.7315669307956486
1200 0.25 0.7315669307956486
0.7315669307956486


In [17]:
# Check more estimators
param_grid = {
    'n_estimators': [1500],
    'learning_rate': [0.05, 0.15, 0.25]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBRegressor(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_pred = y_pred.round().astype(int)
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

1500 0.05 0.7286506398572552
1500 0.15 0.7312215805529441
1500 0.25 0.7315669307956486
0.7315669307956486


In [18]:
# So at 0.25 the estimators don't matter, all give the same score
# Try other rates as well again
param_grid = {
    'n_estimators': [1000],
    'learning_rate': [0.2, 0.3]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBRegressor(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_pred = y_pred.round().astype(int)
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

1000 0.2 0.7245064369448015
1000 0.3 0.7270198192667063
0.7270198192667063


In [30]:
# This gets worse again, so we'll go with 1000 0.25 again
y_valid = y_valid - 1  # redo this
model_fin = XGBRegressor(n_estimators=1000, learning_rate=0.25, n_jobs=-1, early_stopping_rounds=5, random_state=42)
model_fin.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

y_pred = model_fin.predict(X_valid)
y_pred = y_pred + 1  # add 1 again
# Now we need to round the predictions to the nearest integer
y_pred = y_pred.round().astype(int)
print(y_pred)

y_valid = y_valid + 1  # add 1 again
score = f1_score(y_valid, y_pred, average='micro')
print(score)

[3 2 2 ... 2 3 2]
0.7315669307956486


In [32]:
# Load the test data
X_test = pd.read_csv('test_values_preprocessed.csv')

# Initialize the output
output = pd.DataFrame(X_test['building_id'])

# Make the predictions
predictions = model_fin.predict(X_test)
predictions = predictions + 1  # add 1 again
predictions = predictions.round().astype(int)
output['damage_grade'] = predictions
output.head()

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3


In [33]:
# Check if all grades are 1,2,3
print(output['damage_grade'].unique())

[3 2 1]


In [34]:
# Save the output as csv
output.to_csv('submission_regressor.csv', index=False)

### Final note:

This model got a submission score of 0.7302, while the best is 0.7558.
It is slightly worse than the classifier model score.