# Improving the benchmark analysis of earthquake damage using machine learning

This notebook builds on the benchmark analysis to improve the predictions of building damage. This includes adding more features, trying other algorithms, fine-tuning the hyperparameters, and dealing with the data imbalance.

In [3]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

Define data paths and load data.

In [4]:
# Directory holding the raw input CSVs and directory used for submission files.
DATA_DIR = Path('data') / 'raw'
SUB_DIR = Path('submissions')

# Read the training features and the training labels; both frames are indexed
# by the `building_id` column.
train_values, train_labels = (
    pd.read_csv(DATA_DIR / csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv')
)

## Classification model

In [14]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# the model
from sklearn.linear_model import LogisticRegression

# for combining the preprocess with model training
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# evaluation metric (f1-score)
from sklearn.metrics import f1_score

Start by adding more features:

In [6]:
# Columns handled as numeric inputs.
num_features = ['geo_level_1_id',
                'geo_level_2_id',
                'geo_level_3_id',
                'age',
                'area_percentage',
                'height_percentage',
                'count_floors_pre_eq',
                'count_families']

# Columns handled as categorical inputs.
cat_features = ['land_surface_condition',
                'foundation_type',
                'ground_floor_type']

# Full list of model inputs: numeric columns first, then categorical ones.
features = num_features + cat_features

train_values_subset = train_values[features]

# create labels variable
# Select the labels by building_id, in the same row order as the feature rows,
# instead of relying on both CSV files being sorted identically. A building
# without a label raises a KeyError rather than silently shifting the targets.
labels = train_labels.loc[train_values_subset.index, 'damage_grade'].to_numpy()

Create the preprocessing and model pipeline:

In [7]:
# Numeric columns go through a single scaling step.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown="ignore" keeps the
# encoder from raising on categories it did not see while fitting.
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Send each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# End-to-end estimator: preprocessing followed by logistic regression.
pipe = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(random_state=0)),
    ]
)

In [17]:
# Hyperparameter grid for the "classifier" step of `pipe`
# (3 C values x 2 penalties x 2 solvers x 2 iteration limits = 24 candidates).
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__max_iter': [100, 200]
}

# 5-fold cross-validation that preserves the class proportions in every fold.
stratified_kfold = StratifiedKFold(n_splits=5)

# Score with micro-averaged F1 so the search optimises the metric this notebook
# declares as its evaluation metric. For single-label multiclass targets the
# micro-averaged F1 is numerically equal to accuracy, so scores are unchanged.
# n_jobs=-1 spreads the 24 x 5 fits over all available cores.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring='f1_micro',
    n_jobs=-1,
)
grid_search.fit(train_values_subset, labels)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)




Best parameters found:  {'classifier__C': 1, 'classifier__max_iter': 200, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Best cross-validation score:  0.5728412374841273




## Prepare and save submission

Some boilerplate code for running the model on the test dataset and saving the predictions for submission:

In [19]:
# Load the test features and keep the same columns the model was trained on.
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')
test_values_subset = test_values[features]

# Predict with the refitted best estimator from the grid search. `pipe` itself
# is never fitted in this notebook (GridSearchCV fits clones of it), so calling
# `pipe.predict` would fail on a fresh kernel and would ignore the tuned
# hyperparameters.
predictions = grid_search.best_estimator_.predict(test_values_subset)

# The submission template supplies the expected column name(s) and row index.
submission_format = pd.read_csv(SUB_DIR / 'submission_format.csv', index_col='building_id')

my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

my_submission.to_csv(SUB_DIR / 'submission.csv')

# Kept as the last expression of the cell so the preview is actually displayed.
my_submission.head()