# Setup

In [54]:
import pandas as pd
import plotly.express as px

# Load the train/test CSV files, using building_id as the row index of each frame.
train_features = pd.read_csv("data/train_values.csv", index_col="building_id")
train_labels = pd.read_csv("data/train_labels.csv", index_col="building_id")
test_features = pd.read_csv("data/test_values.csv", index_col="building_id")

# helper functions
def ids():
    """Print a list of every module-level name that does not start with an underscore."""
    public_names = []
    for name in globals():
        if name.startswith("_"):
            continue
        public_names.append(name)
    print(public_names)

# Keep a handle on the column index of the training feature frame.
columns = train_features.columns
# Disabled experiment: would bind every feature column to a same-named global.
# for col in columns:
#     globals()[col] = train_features[col]

# Print the public names currently defined in the notebook namespace.
ids()




# Explore

## outliers

In [55]:
# Summary statistics of the training features, shown with two decimals.
pd.options.display.precision = 2
print(train_features.describe())
# Candidate outlier columns (high side): age, area_percentage, height_percentage

       geo_level_1_id  geo_level_2_id  geo_level_3_id  count_floors_pre_eq  \
count       260601.00       260601.00       260601.00            260601.00   
mean            13.90          701.07         6257.88                 2.13   
std              8.03          412.71         3646.37                 0.73   
min              0.00            0.00            0.00                 1.00   
25%              7.00          350.00         3073.00                 2.00   
50%             12.00          702.00         6270.00                 2.00   
75%             21.00         1050.00         9412.00                 2.00   
max             30.00         1427.00        12567.00                 9.00   

             age  area_percentage  height_percentage  \
count  260601.00        260601.00          260601.00   
mean       26.54             8.02               5.43   
std        73.57             4.39               1.92   
min         0.00             1.00               2.00   
25%        10.00 

In [56]:
# Closer look at the three columns flagged above; uncomment the histogram
# call to render one plot per column.
for col in ("age", "area_percentage", "height_percentage"):
    # px.histogram(train_features, x = col).show()
    pass

# A row is treated as an outlier when ANY of these holds:
#   age > 100, area_percentage > 31, height_percentage > 10
too_old = train_features["age"] > 100
too_large = train_features["area_percentage"] > 31
too_tall = train_features["height_percentage"] > 10
delete_rows = too_old | too_large | too_tall

# Share of rows flagged for removal (about 2.4% in the recorded run).
print(delete_rows.mean().round(3))

# Row count before and after removing the flagged rows.
print(len(train_features))
train_features2 = train_features.loc[~delete_rows]
print(len(train_features2))


0.024
260601
254462


## binary `has_*` variables

In [57]:
# Screen the `has_*` indicator columns and collect the ones to drop:
#   * columns that do not take exactly two distinct values;
#   * columns whose 0-count / 1-count ratio is below 0.05 or above 20.

delete_cols = []

for col in filter(lambda col: col.startswith("has"), train_features2):
    value_counts = train_features2[col].value_counts()
    if value_counts.shape[0] != 2:
        # Not binary: flag it and skip the balance check. The ratio below
        # needs both a 0 count and a 1 count; falling through would raise a
        # KeyError for a single-valued column or flag the column twice.
        delete_cols.append(col)
        continue

    # Number of 0s relative to number of 1s.
    # NOTE(review): the lookup is by value label, so this assumes the flags
    # are coded as integers 0/1 - confirm against the data dictionary.
    ratio = value_counts[0] / value_counts[1]
    if ratio < 0.05 or ratio > 20:
        delete_cols.append(col)

train_features3 = train_features2.drop(columns=delete_cols)
train_features3.shape

(254462, 24)

## `geo_level`

In [58]:
# print(train_features.describe())
# Plan: drop the level-2 and level-3 geo ids; one-hot encoding of level 1 is
# prepared below but currently not merged into the feature frame.
train_features4 = train_features3.drop(["geo_level_2_id", "geo_level_3_id"], axis="columns")

# One-hot columns for geo_level_1_id (only used by the disabled concat below).
dummies = pd.get_dummies(train_features4["geo_level_1_id"], prefix="geo_level_cat")
# print(dummies.columns, dummies.shape, sep="\n")

# Disabled variant that swaps geo_level_1_id for its dummy columns:
# train_features5 = pd.concat([train_features4, dummies], axis=1).drop(columns="geo_level_1_id")
train_features5 = train_features4.copy()
print(train_features5.columns)
print(train_features5.shape)

Index(['geo_level_1_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'legal_ownership_status', 'count_families',
       'has_secondary_use', 'has_secondary_use_agriculture'],
      dtype='object')
(254462, 22)


# LightGBM

In [59]:
import lightgbm as lgb

# Hyperparameters for a first multiclass run.
lgbm_params = {
    "objective": "multiclass",
    "num_class": 3,
    "learning_rate": 0.1,      # commonly used starting value
    "n_estimators": 100,       # number of trees for the initial run
    "max_depth": -1,           # -1 = no depth limit (revisit if overfitting)
    "num_leaves": 31,          # default; tune to the complexity of the data
    "min_data_in_leaf": 20,    # minimum samples per leaf, guards against overfitting
    "feature_fraction": 0.8,   # share of features sampled for each tree
}
lgbc = lgb.LGBMClassifier(**lgbm_params)

# Columns excluded from this model (named as object-dtype columns).
object_dtype_cols = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

train_features6 = train_features5.drop(object_dtype_cols, axis="columns")
# Select the label rows whose building ids are still present in the features.
train_labels_filtered = train_labels.loc[train_features6.index]

lgbc.fit(train_features6, train_labels_filtered)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 254462, number of used features: 14
[LightGBM] [Info] Start training from score -2.369256
[LightGBM] [Info] Start training from score -0.563184
[LightGBM] [Info] Start training from score -1.087506


In [66]:
# Restrict the test frame to the columns the model was fitted on, then predict.
test_features2 = test_features.loc[:, train_features6.columns]
predicted_grades = lgbc.predict(test_features2)
test_labels = pd.DataFrame(predicted_grades)



In [75]:
# Re-attach the building ids so every prediction is keyed like the test features.
test_labels.index = test_features2.index

# Quick sanity check: distribution of predicted values and the first rows.
grade_counts = test_labels.value_counts()
first_rows = test_labels.head()
print(grade_counts, first_rows)

# Name the prediction column and write the predictions to disk.
test_labels.columns = ["damage_grade"]
test_labels.to_csv("data/test_labels.csv")

''' 
11:00am:    0.6814 (imbalanced added)
02:00am:    0.6785
'''

2    60137
3    21908
1     4823
Name: count, dtype: int64              0
building_id   
300051       3
99355        2
890251       2
745817       1
421793       3
