# Baseline-Model

### Importing the data

In [1]:
# Import the data (training and labels) into Pandas
import pandas as pd

features = pd.read_csv("./data/train_values.csv")
label = pd.read_csv("./data/train_labels.csv")
test = pd.read_csv("./data/test_values.csv")
label.head

<bound method NDFrame.head of         building_id  damage_grade
0            802906             3
1             28830             2
2             94947             3
3            590882             2
4            201944             3
...             ...           ...
260596       688636             2
260597       669485             3
260598       602512             3
260599       151409             2
260600       747594             3

[260601 rows x 2 columns]>

In [5]:
# Merge the features with the labels
merged_data = pd.merge(features, label, on=["building_id"])
merged_data.shape

(260601, 40)

In [6]:
# Remove all rows which have a missing value in any column
non_missing_data = merged_data.dropna()
non_missing_data.shape

(260601, 40)

In [7]:
non_missing_test_data = test.dropna()
non_missing_test_data.shape

(86868, 39)

In [17]:
# Remove all columns which are not integer/ float
integer_float_data = non_missing_data.select_dtypes(include=[int, float])
integer_float_test_data = non_missing_test_data.select_dtypes(include=[int, float])
integer_float_data

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,1,1,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,0,1,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,0,1,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,0,1,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,1,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,0,1,...,0,0,0,0,0,0,0,0,0,2
260597,669485,17,715,2060,2,0,6,5,0,1,...,0,0,0,0,0,0,0,0,0,3
260598,602512,17,51,8163,3,55,6,7,0,1,...,0,0,0,0,0,0,0,0,0,3
260599,151409,26,39,1851,2,10,14,6,0,0,...,0,0,0,0,0,0,0,0,0,2


In [9]:
# Separate features and target
target = integer_float_data.loc[:, "damage_grade"]
features = integer_float_data.drop(columns="damage_grade")
features.shape
target.shape

(260601,)

In [10]:
features.shape

(260601, 31)

### DecisionTreeClassifier

In [19]:
# Load / train and predict a DecisionTreeClassifier on the ENTIRE dataset

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

clf = DecisionTreeClassifier(random_state=0, min_samples_split=250, max_depth=12)
fitted_clf = clf.fit(features, target)
y_pred_clf = fitted_clf.predict(features)
f1_score_result = f1_score(target, y_pred_clf, average='micro')
print('f1_score_micro: ', f1_score_result.round(3))


f1_score_micro:  0.704


### RandomForestClassifier

In [21]:
# Load / train and predict a RandomForestClassifier on the ENTIRE dataset

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

rfc = RandomForestClassifier(random_state=42, oob_score=True, min_samples_split=60, max_depth=11)
fitted_model = rfc.fit(features, target)

# Prediction on train_data
predictions = fitted_model.predict(features)

f1_score_result = f1_score(target, predictions, average='micro')
print('f1_score_micro: ', f1_score_result.round(3))

f1_score_micro:  0.666
