### This notebook contains the code to generate the submission for the "Richter's Predictor: Modeling Earthquake Damage" competition.

The submission should contain the building_id with the predicted damage_grade, an ordinal variable with values 1,2,3.

The score is evaluated using the micro averaged F1 score.

This notebook contains the code for the model that is trained on the data and created the submission. There are different options for models to try: a classifier, regressor, and hybrid of the two.

This notebook contains the classifier. Now v2, where we apply StandardScaler as well, and get rid of the id's in X. (Note that in the end the scaler did not change the score while the id removal increased it.)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [3]:
# Load the data
X = pd.read_csv('train_values_preprocessed.csv')
y = pd.read_csv('train_labels.csv')
X.drop('building_id', axis=1, inplace=True)  # no info in this
X.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
y.drop('building_id', axis=1, inplace=True)
# The model only works when classes are [0,1,2] so we subtract 1
y = y - 1
y.head()

Unnamed: 0,damage_grade
0,2
1,1
2,2
3,1
4,2


In [14]:
# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the StandardScaler now
scaling_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), scaling_columns)], 
    remainder='passthrough'  # keep other column unchanged
)
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# Transform to dataframe again
X_train = pd.DataFrame(X_train, columns=X.columns)
X_valid = pd.DataFrame(X_valid, columns=X.columns)
X_train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,-0.111023,-1.077522,0.574734,1.196103,-0.020259,-1.369184,-0.226463,0.038141,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.982874,0.907934,0.807815,1.196103,0.184847,0.452923,1.33835,-2.356188,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.858323,0.330964,1.224894,-0.177786,0.253216,-0.002603,-0.226463,0.038141,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.858323,-0.355097,-0.957847,-0.177786,0.048109,-0.230367,-0.748067,0.038141,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.387178,-0.047218,-1.430591,1.196103,0.048109,-0.45813,0.816746,0.038141,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# Define a test model
model = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1, early_stopping_rounds=5, random_state=42)
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=False)

In [16]:
y_pred = model.predict(X_valid)
y_pred = y_pred + 1  # add 1 again
print(y_pred)

y_valid = y_valid + 1  # add 1 again
score = f1_score(y_valid, y_pred, average='micro')
print(score)

[3 2 2 ... 2 3 2]
0.7315861169202433


In [8]:
# Optimise hyperparameters
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.01, 0.05, 0.1]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

500 0.01 0.6910842079008461
500 0.05 0.7191151359336928
500 0.1 0.731662861418622
1000 0.01 0.7038621668809117
1000 0.05 0.7310680915561866
1000 0.1 0.7366512538132423
1200 0.01 0.7064906659503847
1200 0.05 0.7335047293797126
1200 0.1 0.7366512538132423
0.7366512538132423


In [9]:
# So weirdly enough both 1000 and 1200 for 0.1 give the same best score
# There also seems to be an improvement with learning_rate though, so try some larger values of that
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.15, 0.2]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

500 0.15 0.735289038967019
500 0.2 0.7364402064427006
1000 0.15 0.735289038967019
1000 0.2 0.7368623011837839
1200 0.15 0.735289038967019
1200 0.2 0.7368623011837839
0.7368623011837839


In [10]:
# Seems like increasing to 1200 does not improve anything, so drop that; 500 is also the same at 0.15 now even
# There is still an improvement with learning_rate, try even larger values
param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.25, 0.3, 0.35]
}

scores = []
for n_est in param_grid["n_estimators"]:
    for lr in param_grid["learning_rate"]:
        y_valid = y_valid - 1  # redo this
        model = XGBClassifier(n_estimators=n_est, learning_rate=lr, n_jobs=-1, early_stopping_rounds=5, random_state=42)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=False)
        y_pred = model.predict(X_valid)
        y_pred = y_pred + 1
        y_valid = y_valid + 1
        score = f1_score(y_valid, y_pred, average='micro')
        print(n_est, lr, score)
        scores.append(score)
print(max(scores))

500 0.25 0.7371884653018936
500 0.3 0.7355192724621553
500 0.35 0.7356152030851288
1000 0.25 0.7371884653018936
1000 0.3 0.7355192724621553
1000 0.35 0.7356152030851288
0.7371884653018936


In [17]:
# So now we do see larger learning rates decrease score
# We take 1000, 0.25 as best hyperparameters
y_valid = y_valid - 1  # redo this
model_fin = XGBClassifier(n_estimators=1000, learning_rate=0.25, n_jobs=-1, early_stopping_rounds=5, random_state=42)
model_fin.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

In [18]:
y_pred = model_fin.predict(X_valid)
y_pred = y_pred + 1  # add 1 again
print(y_pred)

y_valid = y_valid + 1  # add 1 again
score = f1_score(y_valid, y_pred, average='micro')
print(score)

[3 2 2 ... 2 3 2]
0.7410640624700217


In [19]:
# Load the test data
X_test = pd.read_csv('test_values_preprocessed.csv')

# Initialize the output
output = pd.DataFrame(X_test['building_id'])
X_test.drop('building_id', axis=1, inplace=True)

# Apply the StandardScaler
X_test = preprocessor.transform(X_test)
X_test = pd.DataFrame(X_test, columns=X.columns)

# Make the predictions
predictions = model_fin.predict(X_test)
predictions = predictions + 1  # add 1 again
output['damage_grade'] = predictions
output.head()

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,1
4,421793,3


In [20]:
# Save the output as csv
output.to_csv('submission_classifier_v2.csv', index=False)

### Final note:

This model got a submission score of 0.7398, while the best is 0.7558.
It is ranked 1026/2420 (at time of writing).