# Mini Competition: Richter's Predictor

Import modules and data

In [55]:
# Modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [56]:
# Import project-local scripts
import helper_functions # various helper functions
import log_regression # logistic regression baseline model
import lgb_optimized # LightGBM model wrapper (provides the LGBM class used below)

In [57]:
# Load data
X, y, X_test = helper_functions.imports()

### Data Analysis

Start with a basic analysis of the data to get a general feeling of what we are dealing with.

In [58]:
# Report the shape (rows, columns) of each loaded DataFrame.
# X / y are the labelled data that is later split into train and validation sets;
# X_test holds the unlabelled features the final prediction is made for.
print(f"Proportions of the DataFrame X, containing the features for training: {X.shape}")
print(f"Proportions of the DataFrame y, containing the target value for training: {y.shape}")
print(f"Proportions of the DataFrame X_test, containing the features for the prediction: {X_test.shape}")

Proportions of the DataFrame X, containing the features for testing: (260601, 39)
Proportions of the DataFrage y, containing the target value for testing: (260601, 2)
Proportions of the DataFrame X, containing the features for the prediction: (86868, 39)


In [59]:
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [60]:
y.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [61]:
X_test.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,...,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,...,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,...,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,...,0,0,0,0,0,0,0,0,0,0


Check whether X and X_test have the same columns

In [62]:
X.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [63]:
helper_functions.test_column_equality(X, X_test)

Both DataFrames have the same columns.


In [64]:
X.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [65]:
X.nunique()

building_id                               260601
geo_level_1_id                                31
geo_level_2_id                              1414
geo_level_3_id                             11595
count_floors_pre_eq                            9
age                                           42
area_percentage                               84
height_percentage                             27
land_surface_condition                         3
foundation_type                                5
roof_type                                      3
ground_floor_type                              5
other_floor_type                               4
position                                       4
plan_configuration                            10
has_superstructure_adobe_mud                   2
has_superstructure_mud_mortar_stone            2
has_superstructure_stone_flag                  2
has_superstructure_cement_mortar_stone         2
has_superstructure_mud_mortar_brick            2
has_superstructure_c

From the description, we assume Geo Level 3 is the most precise, whereas Geo Level 1 is the least precise.
Following that logic, we expect more unique data points in Level 3 than in Level 1.

In [66]:
# Print the number of distinct ids for each geo level, coarsest to finest.
for level in ("geo_level_1_id", "geo_level_2_id", "geo_level_3_id"):
    print(f"Unique data points in {level}: {X[level].nunique()}")

Unique data points in geo_level_1_id: 31
Unique data points in geo_level_2_id: 1414
Unique data points in geo_level_3_id: 11595


## Data Cleaning

In [67]:
# Columns collected in this list are dropped from X and X_test before modelling.
drop_cols = []


def ids():
    """Print the names of the public (non-underscore) globals.

    The first five names are skipped -- presumably notebook/kernel entries;
    TODO confirm, the offset depends on the order globals were created in.
    """
    public_names = [name for name in globals() if not name.startswith("_")]
    print(public_names[5:])


ids()

# Join the target onto the features so both are available in a single frame.
print(X.shape, 'damage_grade' in X.columns)
data = y.merge(X, on='building_id', how='inner')
print(data.shape, 'damage_grade' in data.columns)  # ok, one column was added

cols = data.columns
# Expose every column name as a global variable holding its own name
# (e.g. the global `age` becomes the string 'age').
# NOTE(review): this writes one global per column and can silently shadow
# existing names; the cells below use quoted column names throughout.
for col in cols:
    globals()[col] = col



(260601, 39) False


(260601, 40) True


### `geo_level2` mean encodings

In [68]:
# Collect the names of all geo-level columns of X (column names starting with "geo").
geo_levels = [name for name in X if name.startswith("geo")]
for col in geo_levels:
    print(str(col))

geo_level_1_id
geo_level_2_id
geo_level_3_id


In [69]:
# Mean damage grade per level-2 region, used below as a mean (target) encoding.
# NOTE(review): `data` contains every labelled row, so rows that later land in
# the validation split also contribute to these means.
mean_encodings = data.groupby("geo_level_2_id")["damage_grade"].mean()
mean_encodings.to_frame().head()

Unnamed: 0_level_0,damage_grade
geo_level_2_id,Unnamed: 1_level_1
0,2.763158
1,2.348039
3,2.103896
4,2.520635
5,2.16


In [70]:
# Replace each level-2 region id by the mean damage grade observed for it.
# Region ids that do not occur in the labelled data have no entry in
# `mean_encodings` and would map to NaN; they fall back to the overall mean
# damage grade so that the feature is always populated.
global_damage_mean = data["damage_grade"].mean()
for df in X, X_test:
    df["geo_level_2_enc"] = (
        df["geo_level_2_id"].map(mean_encodings).fillna(global_damage_mean)
    )

print(geo_levels)

['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']


### `geo_level1` dummies

In [71]:
# One-hot encode the coarsest geo level for both frames.
# The two frames are encoded independently, so a level-1 id that is missing from
# one of them would produce differing columns. The test dummies are therefore
# re-indexed onto the training dummy columns: ids absent from X_test become
# all-False columns, ids unknown to the training data are discarded.
geo1_train = pd.get_dummies(X["geo_level_1_id"], prefix="geo_level_cat")
geo1_test = pd.get_dummies(X_test["geo_level_1_id"], prefix="geo_level_cat")
geo1_test = geo1_test.reindex(columns=geo1_train.columns, fill_value=False)

X = pd.concat([X, geo1_train], axis=1)
X_test = pd.concat([X_test, geo1_test], axis=1)

print(X.shape, X_test.shape, sep="\n")

(260601, 71)
(86868, 71)


In [72]:
geo_columns = [col for col in X.columns if col.startswith('geo_level_cat')]
print(X.groupby("geo_level_1_id").first()[geo_columns].iloc[:6,:6])

                geo_level_cat_0  geo_level_cat_1  geo_level_cat_2  \
geo_level_1_id                                                      
0                          True            False            False   
1                         False             True            False   
2                         False            False             True   
3                         False            False            False   
4                         False            False            False   
5                         False            False            False   

                geo_level_cat_3  geo_level_cat_4  geo_level_cat_5  
geo_level_1_id                                                     
0                         False            False            False  
1                         False            False            False  
2                         False            False            False  
3                          True            False            False  
4                         False        

### foundation_type dummies

In [73]:
# One-hot encode foundation_type in both frames (the original column is replaced).
X = pd.get_dummies(X, columns=["foundation_type"], drop_first=False)
X_test = pd.get_dummies(X_test, columns=["foundation_type"], drop_first=False)
# Both frames are encoded independently; force X_test onto the exact column set
# and order of X so a category missing from the test data cannot lead to
# mismatching features (missing indicator columns are filled with False).
X_test = X_test.reindex(columns=X.columns, fill_value=False)

print(X.shape)
print(X_test.shape)
foundation_cols = [col for col in X.columns if col.startswith("foundation")]
print(
    X[foundation_cols].columns,
    X_test[foundation_cols].columns,
    sep="\n")


(260601, 75)
(86868, 75)
Index(['foundation_type_h', 'foundation_type_i', 'foundation_type_r',
       'foundation_type_u', 'foundation_type_w'],
      dtype='object')
Index(['foundation_type_h', 'foundation_type_i', 'foundation_type_r',
       'foundation_type_u', 'foundation_type_w'],
      dtype='object')


In [74]:
# One-hot encode roof_type in both frames (the original column is replaced).
X = pd.get_dummies(X, columns=["roof_type"], drop_first=False)
X_test = pd.get_dummies(X_test, columns=["roof_type"], drop_first=False)
# Both frames are encoded independently; force X_test onto the exact column set
# and order of X so a category missing from the test data cannot lead to
# mismatching features (missing indicator columns are filled with False).
X_test = X_test.reindex(columns=X.columns, fill_value=False)

print(X.shape)
print(X_test.shape)
# Kept in its own variable so the foundation_type column list is not overwritten.
roof_cols = [col for col in X.columns if col.startswith("roof_type")]
print(
    X[roof_cols].columns,
    X_test[roof_cols].columns,
    sep="\n")


(260601, 77)
(86868, 77)
Index(['roof_type_n', 'roof_type_q', 'roof_type_x'], dtype='object')
Index(['roof_type_n', 'roof_type_q', 'roof_type_x'], dtype='object')


In [75]:
# ground --> hot


In [76]:
# Remaining object-dtype columns; they are queued for removal rather than encoded.
object_dtype_cols = [
    "land_surface_condition",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

drop_cols += object_dtype_cols
drop_cols

['land_surface_condition',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [77]:
# Outliers 

### Prepare data for analysis

In [78]:
# Remove the queued columns plus the raw geo-level ids (their encoded
# counterparts stay in the frames) from both feature sets.
drop_cols += geo_levels
print("drop_cols:", drop_cols)

X = X.drop(columns=drop_cols)
print(X.shape)
X_test = X_test.drop(columns=drop_cols)
print(X_test.shape)

print(X.columns, X_test.columns)

drop_cols: ['land_surface_condition', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
(260601, 68)
(86868, 68)
Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_s

In [79]:
# Use building_id as the index of the target and of both feature frames.
# The check on the index name makes the cell safe to run more than once:
# without it a second run fails because the column has already been moved
# into the index.
for df in y, X, X_test:
    if df.index.name != "building_id":
        df.set_index("building_id", inplace=True)


In [80]:

# Hold out 20 % of the labelled rows for validation (fixed seed for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# Independent copy of the test features for prediction: X_test itself is
# modified in place by the export cell (reset_index), which must not alter
# the frame handed to the model.
X_test_pred = X_test.copy()

for name, df in {"X": X, "X_train": X_train, "X_valid": X_valid, "X_test": X_test}.items():
    print(name, *df.shape, sep="\t")

geo_columns = [col for col in X.columns if col.startswith("geo")]
print(X[geo_columns].columns)



X	260601	67
X_train	208480	67
X_valid	52121	67
X_test	86868	67
Index(['geo_level_2_enc', 'geo_level_cat_0', 'geo_level_cat_1',
       'geo_level_cat_2', 'geo_level_cat_3', 'geo_level_cat_4',
       'geo_level_cat_5', 'geo_level_cat_6', 'geo_level_cat_7',
       'geo_level_cat_8', 'geo_level_cat_9', 'geo_level_cat_10',
       'geo_level_cat_11', 'geo_level_cat_12', 'geo_level_cat_13',
       'geo_level_cat_14', 'geo_level_cat_15', 'geo_level_cat_16',
       'geo_level_cat_17', 'geo_level_cat_18', 'geo_level_cat_19',
       'geo_level_cat_20', 'geo_level_cat_21', 'geo_level_cat_22',
       'geo_level_cat_23', 'geo_level_cat_24', 'geo_level_cat_25',
       'geo_level_cat_26', 'geo_level_cat_27', 'geo_level_cat_28',
       'geo_level_cat_29', 'geo_level_cat_30'],
      dtype='object')


Visualization

## Models

Simple baseline estimation using logistic regression

In [81]:
lr = log_regression.lr(X_train, y_train)
y_valid_pred_lr = lr.make_prediction(X_valid)

Fit the LightGBM model (gradient-boosted trees) with hyperparameter optimization

In [82]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [83]:
# Initiate and optimize model (lgb_optimized is imported with the other scripts
# at the top of the notebook).
lgb = lgb_optimized.LGBM(X_train, X_valid, y_train, y_valid)

[I 2023-10-07 11:51:34,122] A new study created in memory with name: no-name-1e32fb04-30d9-4d06-b0bc-40c92579981b
[I 2023-10-07 11:51:43,029] Trial 0 finished with value: 0.15293259914429885 and parameters: {'learning_rate': 0.16845285837667467, 'subsample': 0.7454701310947338, 'num_leaves': 30, 'min_data_in_leaf': 2, 'max_depth': 13, 'lambda_l2': 0.9405746886910722}. Best is trial 0 with value: 0.15293259914429885.
[I 2023-10-07 11:51:51,314] Trial 1 finished with value: 0.1552541202202567 and parameters: {'learning_rate': 0.26944696879776936, 'subsample': 0.9598977267127748, 'num_leaves': 47, 'min_data_in_leaf': 11, 'max_depth': 4, 'lambda_l2': 0.9362880279333154}. Best is trial 1 with value: 0.1552541202202567.


Number of finished trials: 2
Best trial:
--------------------------------
Best F1 Score: 0.1552541202202567
--------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 528
[LightGBM] [Info] Number of data points in the train set: 260601, number of used features: 67
[LightGBM] [Info] Start training from score -2.339167
[LightGBM] [Info] Start training from score -0.564030
[LightGBM] [Info] Start training from score -1.094580


In [84]:
y_pred_lgb = lgb.make_prediction(X_valid)



#### Evaluation

F1 score (macro-averaged) for the logistic regression

In [85]:
f1_lr = f1_score(y_valid, y_valid_pred_lr, average='macro')
f1_lr

0.5844657657233902

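F1 score (macro-averaged) for the LightGBM model. Note: the training log above reports 260601 data points for the final fit, i.e. the training and validation rows together, so this validation score is likely optimistic.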

In [86]:
f1_lgb = f1_score(y_valid, y_pred_lgb, average='macro')
f1_lgb

0.6682076499850466

### Create prediction on testset

In [87]:
y_pred_lgb = lgb.make_prediction(X_test_pred)



Export

In [88]:
# Export prediction to create csv by DrivenData.org datastandards
# reset_index turns the building_id index back into a regular column, in place.
# NOTE(review): this mutates X_test (it gains one column); re-running this cell
# or predicting on X_test afterwards will see the changed frame.
X_test.reset_index(inplace=True)

helper_functions.write_output(X_test, y_pred_lgb)

In [89]:
for df in X,X_train,X_valid,X_test:
    print(df.shape)

X_test.columns

(260601, 67)
(208480, 67)
(52121, 67)
(86868, 68)


Index(['building_id', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'geo_level_2_enc', 'geo_level_cat_0',
       'geo_level_cat_1', 'geo_level_cat_2', 'ge

In [24]:
import session_info
session_info.show()