In [288]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as p9
import os
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from lib.lib import Import_data

path = Import_data()
# get all files in the directory
files = os.listdir(path)
print(files)

Path to dataset files: C:\Users\kskdu\.cache\kagglehub\datasets\arashnic\earthquake-magnitude-damage-and-impact\versions\6
['csv_building_damage_assessment.csv', 'csv_building_ownership_and_use.csv', 'csv_building_structure.csv', 'csv_household_demographics.csv', 'csv_household_earthquake_impact.csv', 'csv_household_resources.csv', 'csv_individual_demographics.csv', 'mapping.csv', 'ward_vdcmun_district_name_mapping.csv']


In [289]:
building_structure = pd.read_csv(path + "/csv_building_structure.csv")
dmg_assesment = pd.read_csv(path + "/csv_building_damage_assessment.csv")

building_structure.keys()



Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'count_floors_post_eq', 'age_building',
       'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'condition_post_eq', 'damage_grade', 'technical_solution_proposed'],
      dtype='object')

In [290]:
# Filter out uneeded rows
dmg_assesment = dmg_assesment.drop(columns=[col for col in dmg_assesment.columns if 'has_geotechnical_risk_' not in col and col != 'building_id'])

In [291]:
dmg_assesment.keys()

Index(['building_id', 'has_geotechnical_risk_land_settlement',
       'has_geotechnical_risk_fault_crack',
       'has_geotechnical_risk_liquefaction', 'has_geotechnical_risk_landslide',
       'has_geotechnical_risk_rock_fall', 'has_geotechnical_risk_flood',
       'has_geotechnical_risk_other'],
      dtype='object')

In [292]:
building_structure.keys()

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'count_floors_post_eq', 'age_building',
       'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'condition_post_eq', 'damage_grade', 'technical_solution_proposed'],
      dtype='object')

In [293]:
# Merge dmg assesment into building structure on building id column
building_structure = building_structure.merge(dmg_assesment, on='building_id')
building_structure.keys()

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'count_floors_post_eq', 'age_building',
       'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'condition_post_eq', 'damage_grade', 'technical_solution_proposed',
       'has_geotechnical_risk_land_settlement',
       'has_geotechnical_risk_fault_crack',
       'has_geotechnical_risk_liquefaction', 'ha

In [294]:
len(building_structure)

762106

In [295]:
# Remove id columns, columns containing post_eq and technical_solution_proposed and position
building_structure = building_structure.drop(columns=[col for col in building_structure.columns if 'id' in col or 'post_eq' in col or 'technical_solution_proposed' in col or 'position' in col])

In [296]:
len(building_structure)

762106

In [297]:
# Dropping floors above 6
building_structure = building_structure[building_structure["count_floors_pre_eq"] < 6]

In [298]:
len(building_structure)

761661

In [299]:
# Dropping rows with plinith area above 1500
building_structure = building_structure[building_structure["plinth_area_sq_ft"] <= 1500]

In [300]:
# One hot encode the relevant columns
building_structure_onehot = pd.get_dummies(building_structure, columns=["foundation_type", "plan_configuration", 'land_surface_condition', 'roof_type', 'ground_floor_type', 'other_floor_type'] )

In [301]:
# Changing damage categories into numerical values
building_structure["damage_grade"] = building_structure["damage_grade"].replace({"Grade 1": 0, "Grade 2": 1, "Grade 3": 2, "Grade 4": 3, "Grade 5": 4})



In [302]:
building_structure.keys()

Index(['count_floors_pre_eq', 'age_building', 'plinth_area_sq_ft',
       'height_ft_pre_eq', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'damage_grade', 'has_geotechnical_risk_land_settlement',
       'has_geotechnical_risk_fault_crack',
       'has_geotechnical_risk_liquefaction', 'has_geotechnical_risk_rock_fall',
       'has_geotechnical_risk_flood', 'has_geotechnical_risk_other'],
      dtype='object')

In [303]:
# Translating category columns to category type for native category support
categorical_columns = [
    "land_surface_condition", 
    "foundation_type", 
    "roof_type", 
    "ground_floor_type",
    "other_floor_type",
    "plan_configuration",
]

for col in categorical_columns:
    building_structure[col] = building_structure[col].astype('category')

In [304]:
print(building_structure.isna().sum())
building_structure = building_structure.dropna()

count_floors_pre_eq                        0
age_building                               0
plinth_area_sq_ft                          0
height_ft_pre_eq                           0
land_surface_condition                     0
foundation_type                            0
roof_type                                  0
ground_floor_type                          0
other_floor_type                           0
plan_configuration                         1
has_superstructure_adobe_mud               0
has_superstructure_mud_mortar_stone        0
has_superstructure_stone_flag              0
has_superstructure_cement_mortar_stone     0
has_superstructure_mud_mortar_brick        0
has_superstructure_cement_mortar_brick     0
has_superstructure_timber                  0
has_superstructure_bamboo                  0
has_superstructure_rc_non_engineered       0
has_superstructure_rc_engineered           0
has_superstructure_other                   0
damage_grade                              12
has_geotec

In [305]:
def train(X, y, params, rounds):
    # Split the data into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Create the LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train the model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=rounds,
        valid_sets=[train_data, val_data],
    )

    return model, X_test, y_test

In [306]:
def test(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_class = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred_class)
    report = classification_report(y_test, y_pred_class)

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    # Feature importance
    importance = model.feature_importance(importance_type='split')
    feature_names = X.columns.tolist()
    feature_importance = sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True)

    print("Feature Importance (top 10):")
    for feature, importance in feature_importance[:10]:
        print(f"{feature}: {importance}")

In [None]:
X = building_structure.drop(columns=["damage_grade", "plan_configuration"])
y = building_structure["damage_grade"].astype("Int64")

In [308]:
# Version 1: Convert 0, 1, 2 to 0 and 3, 4 to 1
y_binary = y.copy()
y_binary = y_binary.apply(lambda x: 0 if x <= 2 else 1)

# Version 2: Convert 0 to -1, 1-3 to 0, and 4 to 1
y_three_class = y.copy()
y_three_class = y_three_class.apply(lambda x: -1 if x == 0 else (1 if x == 4 else 0))

In [309]:
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y)),  # Number of damage grade classes
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'max_depth': -1,        # -1 means no limit
    'num_leaves': 31,       # Maximum tree leaves for base learners
    'feature_fraction': 0.8, # Randomly select a subset of features on each iteration
    'bagging_fraction': 0.8, # Randomly select a subset of data without resampling
    'bagging_freq': 5,       # Perform bagging every 5 iterations
    'verbose': -1,          # Suppress printing messages
    'boosting_type': 'gbdt'  # Traditional Gradient Boosting Decision Tree
}

rounds=1000

model, X_test, y_test = train(X, y, params, rounds)
test(model, X_test, y_test)

Accuracy: 0.4750
Classification Report:
              precision    recall  f1-score   support

         0.0       0.55      0.65      0.60     11642
         1.0       0.38      0.12      0.18     12860
         2.0       0.37      0.14      0.21     20396
         3.0       0.40      0.22      0.29     27486
         4.0       0.49      0.87      0.63     41353

    accuracy                           0.48    113737
   macro avg       0.44      0.40      0.38    113737
weighted avg       0.44      0.48      0.42    113737

Feature Importance (top 10):
plinth_area_sq_ft: 35405
height_ft_pre_eq: 24585
age_building: 24526
land_surface_condition: 6444
roof_type: 6281
count_floors_pre_eq: 5823
has_superstructure_timber: 4895
has_superstructure_mud_mortar_stone: 3916
ground_floor_type: 3693
other_floor_type: 3621


In [None]:
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_binary)),  # Number of damage grade classes
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'max_depth': -1,        # -1 means no limit
    'num_leaves': 31,       # Maximum tree leaves for base learners
    'feature_fraction': 0.8, # Randomly select a subset of features on each iteration
    'bagging_fraction': 0.8, # Randomly select a subset of data without resampling
    'bagging_freq': 5,       # Perform bagging every 5 iterations
    'verbose': -1,          # Suppress printing messages
    'boosting_type': 'gbdt'  # Traditional Gradient Boosting Decision Tree
}

rounds=1000

#model, X_test, y_test = train(X, y_binary, params, rounds)
#test(model, X_test, y_test)

Accuracy: 0.7494
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.49      0.61     44898
           1       0.73      0.92      0.82     68839

    accuracy                           0.75    113737
   macro avg       0.77      0.70      0.71    113737
weighted avg       0.76      0.75      0.73    113737

Feature Importance (top 10):
plinth_area_sq_ft: 12761
height_ft_pre_eq: 9655
age_building: 8307
land_surface_condition: 2746
roof_type: 2556
count_floors_pre_eq: 2479
has_superstructure_timber: 2174
other_floor_type: 1947
ground_floor_type: 1939
has_superstructure_mud_mortar_stone: 1805


In [None]:
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_three_class)),  # Number of damage grade classes
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'max_depth': -1,        # -1 means no limit
    'num_leaves': 31,       # Maximum tree leaves for base learners
    'feature_fraction': 0.8, # Randomly select a subset of features on each iteration
    'bagging_fraction': 0.8, # Randomly select a subset of data without resampling
    'bagging_freq': 5,       # Perform bagging every 5 iterations
    'verbose': -1,          # Suppress printing messages
    'boosting_type': 'gbdt'  # Traditional Gradient Boosting Decision Tree
}

rounds=1000

# model, X_test, y_test = train(X, y_three_class, params, rounds)
# test(model, X_test, y_test)

LightGBMError: Label must be in [0, 3), but found -1 in label