# Building a machine learning model to predict earthquake damage to buildings in Kavrepalanchok, Nepal

In [None]:
# Import libraries here
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

In [None]:
# Build a wrangle function to load and clean the dataset stored in a SQLite database
def wrangle(db_path):
    """Load building records for district 3 from SQLite and prepare them for modelling.

    The query joins ``id_map``, ``building_structure`` and ``building_damage``
    on ``building_id`` and keeps the rows where ``district_id = 3``.

    Parameters
    ----------
    db_path : str or path-like
        Location of the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``b_id``. Compared with the raw query result it has a new
        binary ``severe_damage`` column (1 when the damage grade is above 3)
        and no longer contains: any column whose name includes ``"post_eq"``,
        ``building_id``, ``damage_grade`` or ``count_floors_pre_eq``.
    """
    query = """
        SELECT DISTINCT(i.building_id) AS b_id,
        s.*,
        d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s
        ON i.building_id = s.building_id
        JOIN building_damage AS d
        ON i.building_id = d.building_id
        WHERE district_id = 3
    """

    # Read query results into a DataFrame; always release the connection,
    # even when the query fails, so the database file is not left open.
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    # Create the binary target from the last character of the grade label
    # (the code takes ``.str[-1]`` and casts it to int).
    grade = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (grade > 3).astype(int)

    # Leaky columns: anything recorded after the earthquake
    drop_cols = [col for col in df.columns if "post_eq" in col]
    drop_cols += [
        "building_id",  # high-cardinality / redundant with the index
        "damage_grade",  # old target, replaced by severe_damage
        "count_floors_pre_eq",  # dropped to avoid multicollinearity
    ]

    return df.drop(columns=drop_cols)

In [None]:
# Use the wrangle function to query the database, then preview the first rows
df = wrangle(db_path="/home/jovyan/nepal.sqlite")
df.head()

In [None]:
# Bar chart of the relative frequency of each class of the label, "severe_damage"
class_balance = df['severe_damage'].value_counts(normalize=True)
class_balance.plot.bar()
plt.xlabel('Severe Damage')
plt.ylabel('Relative Frequency')
plt.title('Kavrepalanchok, Class Balance')

In [None]:
# Boxplot of the plinth area distribution for each class of the label
sns.boxplot(x='severe_damage', y='plinth_area_sq_ft', data=df)
plt.xlabel('Severe Damage')
plt.ylabel('Plinth Area [sq. ft.]')
plt.title('Kavrepalanchok, Plinth Area vs Building Damage')

# Don't delete the code below
# Save only after the labels and title are set; saving earlier writes an
# image without them.
plt.savefig("images/4-5-7.png", dpi=150);

In [None]:
# Share of severely damaged buildings for each roof type, sorted ascending.
# The mean of the 0/1 "severe_damage" column is the proportion of 1s.
# The string "mean" is used instead of np.mean because recent pandas versions
# emit a FutureWarning when a NumPy callable is passed as aggfunc.
roof_pivot = pd.pivot_table(
    df, index='roof_type', values='severe_damage', aggfunc='mean'
).sort_values(by='severe_damage')
roof_pivot

In [None]:
# Separate the label from the features, then hold out 20% of rows for validation
target = 'severe_damage'
y = df[target]
X = df.drop(columns=[target])
print("X shape:", X.shape)
print("y shape:", y.shape)

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

In [None]:
# Baseline accuracy: the score obtained by always predicting the majority class.
# It is computed from the training labels only, so the validation rows do not
# influence the benchmark the models are compared against.
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 2))

In [None]:
# Fit a logistic regression on one-hot encoded features
lr_steps = (
    OneHotEncoder(use_cat_names=True),
    LogisticRegression(max_iter=1000),
)
model_lr = make_pipeline(*lr_steps)
model_lr.fit(X_train, y_train)

In [None]:
# Accuracy of the logistic regression model on the training and validation sets
lr_train_acc = accuracy_score(y_train, model_lr.predict(X_train))
lr_val_acc = accuracy_score(y_val, model_lr.predict(X_val))

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_val_acc)

In [None]:
# Fit one decision tree per candidate max_depth (1 to 15) and record the
# training and validation accuracy of each
depth_hyperparams = range(1, 16)
training_acc, validation_acc = [], []
for depth in depth_hyperparams:
    model_dt = make_pipeline(
        OrdinalEncoder(),
        DecisionTreeClassifier(random_state=42, max_depth=depth),
    )
    model_dt.fit(X_train, y_train)
    training_acc.append(model_dt.score(X_train, y_train))
    validation_acc.append(model_dt.score(X_val, y_val))

In [None]:
# Plot the validation curve for the decision tree model. Training and
# validation accuracy are drawn together so the depth at which the two curves
# diverge (over-fitting) can be read off when choosing max_depth.
plt.plot(depth_hyperparams, training_acc, label='training')
plt.plot(depth_hyperparams, validation_acc, label='validation')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy Score')
plt.title('Validation Curve, Decision Tree Model')
plt.legend();

In [None]:
# Refit the decision tree pipeline with the chosen depth (max_depth=10)
final_model_dt = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeClassifier(random_state=42, max_depth=10),
)
final_model_dt.fit(X_train, y_train)

In [None]:
# Plot the Gini importance of every feature in the final decision tree.
# make_pipeline names each step after its lower-cased class name, so the
# fitted tree is available under "decisiontreeclassifier".
# NOTE(review): assumes the OrdinalEncoder keeps the columns of X_train in
# their original order, so importances line up with X_train.columns - confirm.
importances = final_model_dt.named_steps["decisiontreeclassifier"].feature_importances_
feat_imp = pd.Series(importances, index=X_train.columns).sort_values()
feat_imp.plot(kind='barh')
plt.xlabel('Gini Importance')
plt.ylabel('Feature')
plt.title('Kavrepalanchok Decision Tree, Feature Importance')