# Mini Competition: Richter's Predictor

Import modules and data

In [1]:
# Modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Import scripts
import helper_functions # various helper functions
import lgb_optimized # tree-based model script (name suggests LightGBM, not a random forest)
import lgb_optimized_CBN # provides the LGBM wrapper used in the model section
import log_regression # Simple regression model

In [3]:
# Load data: the helper returns three objects, unpacked here as the labelled
# features (X), the labels (y) and the features to predict on (X_test)
X, y, X_test = helper_functions.imports()

### Data Analysis

Start with a basic analysis of the data to get a general feeling of what we are dealing with.

In [4]:
# Report the (rows, columns) shape of each loaded DataFrame.
# X and y hold the labelled data that is later split into train/validation;
# X_test holds the unlabelled rows used for the final prediction.
print(f"Proportions of the DataFrame X, containing the features for training: {X.shape}")
print(f"Proportions of the DataFrame y, containing the target value for training: {y.shape}")
print(f"Proportions of the DataFrame X_test, containing the features for the prediction: {X_test.shape}")

Proportions of the DataFrame X, containing the features for testing: (260601, 39)
Proportions of the DataFrage y, containing the target value for testing: (260601, 2)
Proportions of the DataFrame X, containing the features for the prediction: (86868, 39)


In [5]:
# Preview the first rows of the feature table X
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Preview the first rows of the label table y
y.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [7]:
# Preview the first rows of X_test, the features used for the final prediction
X_test.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,...,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,...,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,...,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,...,0,0,0,0,0,0,0,0,0,0


Check whether X and X_test have the same columns

In [8]:
# List the column names of X
X.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [9]:
# Compare the columns of X and X_test via the project helper (it prints the verdict)
helper_functions.test_column_equality(X, X_test)

Both DataFrames have the same columns.


In [10]:
# Show the dtype of every column in X
X.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [11]:
# Count the distinct values in each column of X
X.nunique()

building_id                               260601
geo_level_1_id                                31
geo_level_2_id                              1414
geo_level_3_id                             11595
count_floors_pre_eq                            9
age                                           42
area_percentage                               84
height_percentage                             27
land_surface_condition                         3
foundation_type                                5
roof_type                                      3
ground_floor_type                              5
other_floor_type                               4
position                                       4
plan_configuration                            10
has_superstructure_adobe_mud                   2
has_superstructure_mud_mortar_stone            2
has_superstructure_stone_flag                  2
has_superstructure_cement_mortar_stone         2
has_superstructure_mud_mortar_brick            2
has_superstructure_c

From the description, we assume Geo Level 3 is the most precise, whereas Geo Level 1 is the least precise.
Following that logic, we expect more unique data points in Level 3 than in Level 1.

In [12]:
# Print the number of distinct ids for each geographic level, coarsest first
for geo_col in ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'):
    print(f"Unique data points in {geo_col}: {X[geo_col].nunique()}")

Unique data points in geo_level_1_id: 31
Unique data points in geo_level_2_id: 1414
Unique data points in geo_level_3_id: 11595


Data Cleaning

In [13]:
#First cleaning (before split - if required)

In [14]:
# Reduce the label frame to the target column only.
# The guard keeps this cell idempotent: after the first run `y` is already a
# Series, and indexing it with 'damage_grade' again would raise a KeyError.
if isinstance(y, pd.DataFrame):
    y = y['damage_grade']

In [15]:
# Hold out 20% of the labelled rows as a validation set (fixed seed for reproducibility)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
#More cleaning (after split)

Visualization

## Models

In [17]:
# FOR NOW (to test tree gen): restrict every feature frame to a few columns only.
# `mask` is a list of column names, not a boolean mask.
mask = [
    'has_superstructure_adobe_mud',
    'age',
    'count_floors_pre_eq',
    'area_percentage',
    'height_percentage',
    'has_secondary_use',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
]

# Select the same columns from the full, training, validation and test frames
X = X[mask]
X_train, X_valid = X_train[mask], X_valid[mask]
X_test_pred = X_test[mask]

Simple estimation using logistic regression

In [18]:
# Build the logistic-regression wrapper from the training split
# (presumably fitted on construction; confirm in the log_regression module)
lr = log_regression.lr(X_train, y_train)
# Predict the damage grade for the validation features
y_valid_pred_lr = lr.make_prediction(X_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training and hyperparameter optimization of the LightGBM model

In [19]:
# Initiate and optimize the LightGBM model on the train/validation split.
# `lgb_optimized_CBN` is imported with the other project scripts in the
# import cell at the top of the notebook, so imports stay in one place.
lgb = lgb_optimized_CBN.LGBM(X_train, X_valid, y_train, y_valid)

[I 2023-10-06 16:08:33,795] A new study created in memory with name: no-name-47e20f58-3214-4a43-8ba9-565ba4bdd150
[I 2023-10-06 16:08:34,778] Trial 0 finished with value: 0.3020279733696591 and parameters: {'learning_rate': 0.07596524039650988, 'subsample': 0.7665867235123348, 'num_leaves': 60, 'min_data_in_leaf': 14, 'max_depth': 7, 'lambda_l2': 0.44341722296697694}. Best is trial 0 with value: 0.3020279733696591.
[I 2023-10-06 16:08:35,522] Trial 1 finished with value: 0.2984017958212621 and parameters: {'learning_rate': 0.1401879756136447, 'subsample': 0.8233708123067554, 'num_leaves': 94, 'min_data_in_leaf': 19, 'max_depth': 7, 'lambda_l2': 0.7959182975786243}. Best is trial 0 with value: 0.3020279733696591.
[I 2023-10-06 16:08:36,227] Trial 2 finished with value: 0.2960419024961148 and parameters: {'learning_rate': 0.22217241459460926, 'subsample': 0.8879553611946891, 'num_leaves': 57, 'min_data_in_leaf': 13, 'max_depth': 14, 'lambda_l2': 0.47284033298411465}. Best is trial 0 with

Number of finished trials: 100
Best trial:
--------------------------------
Best F1 Score: 0.34114848141823834
--------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152
[LightGBM] [Info] Number of data points in the train set: 260601, number of used features: 9
[LightGBM] [Info] Start training from score -2.339167
[LightGBM] [Info] Start training from score -0.564030
[LightGBM] [Info] Start training from score -1.094580


In [20]:
# Predict on the validation split with the optimized model.
# NOTE(review): `y_pred_lgb` is reassigned to the test-set predictions further
# down, so the validation F1 cell must run before that reassignment.
y_pred_lgb = lgb.make_prediction(X_valid)



#### Evaluation

f1 Score using Logistic Regression

In [21]:
# Macro-averaged F1 of the logistic-regression predictions on the validation split.
# NOTE(review): the DrivenData leaderboard reportedly scores micro-averaged F1;
# confirm that macro averaging is intended here.
f1_lr = f1_score(
    y_true=y_valid,
    y_pred=y_valid_pred_lr,
    average='macro',
)
f1_lr

0.2817870207071116

F1 score for the LightGBM model

In [22]:
# Macro-averaged F1 of the model's predictions stored in `y_pred_lgb`,
# scored against the validation labels
f1_lgb = f1_score(
    y_true=y_valid,
    y_pred=y_pred_lgb,
    average='macro',
)
f1_lgb

0.3040032657323409

### Create prediction on testset

In [23]:
# Predict on the column-reduced test features.
# NOTE(review): this overwrites the validation predictions stored under the same name.
y_pred_lgb = lgb.make_prediction(X_test_pred)



Export

In [24]:
# Export the predictions to a CSV following the DrivenData.org data standards
helper_functions.write_output(X_test, y_pred_lgb)

In [25]:
# Display the session's package versions for reproducibility
import session_info
session_info.show()

Export

In [27]:
# Export the predictions to a CSV following the DrivenData.org data standards.
# NOTE(review): this cell repeats the earlier export call with the same
# arguments; it is a candidate for removal.
helper_functions.write_output(X_test, y_pred_lgb)