# Import Data

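First we import the data from the CSV files in the `../Data` directory. The project's `importer` module reads them (using the pandas library) and returns three dataframes: the training values, the training labels, and the test values.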

In [1]:
import sys

# Make the project's helper modules under ../src importable from this notebook.
sys.path.append("../src")

import importer

# The importer returns three objects: training values, training labels, test values.
raw_train_values, raw_train_labels, raw_test_values = importer.import_data(directory="../Data")

# Report the shape of each loaded object
shape_report = (
    ("Train values shape: ", raw_train_values),
    ("Train labels shape: ", raw_train_labels),
    ("Test values shape: ", raw_test_values),
)
for caption, loaded in shape_report:
    print(caption, loaded.shape)


Train values shape:  (260601, 39)
Train labels shape:  (260601, 2)
Test values shape:  (86868, 39)


# Encode Data

In [2]:
import encoder

# Build the encoder from the training values only, then reuse that same
# fitted encoder for both the training and the test values.
fitted_enc = encoder.create_encoder(raw_train_values)

train_data, test_data = (
    encoder.encode(raw_values, fitted_enc)
    for raw_values in (raw_train_values, raw_test_values)
)

encoded_sets = (("Train", train_data), ("Test", test_data))

# Shapes of the encoded data
print("Train data shape: ", encoded_sets[0][1].shape)
print("Test data shape: ", encoded_sets[1][1].shape)

# Column labels of the encoded data
print("Train data columns: ", encoded_sets[0][1].columns)
print("Test data columns: ", encoded_sets[1][1].columns)

Train data shape:  (260601, 81)
Test data shape:  (86868, 81)
Train data columns:  Index(['building_id', 'geo_level_1_id_0', 'geo_level_1_id_1',
       'geo_level_1_id_2', 'geo_level_1_id_3', 'geo_level_1_id_4',
       'geo_level_2_id_0', 'geo_level_2_id_1', 'geo_level_2_id_2',
       'geo_level_2_id_3', 'geo_level_2_id_4', 'geo_level_2_id_5',
       'geo_level_2_id_6', 'geo_level_2_id_7', 'geo_level_2_id_8',
       'geo_level_2_id_9', 'geo_level_2_id_10', 'geo_level_3_id_0',
       'geo_level_3_id_1', 'geo_level_3_id_2', 'geo_level_3_id_3',
       'geo_level_3_id_4', 'geo_level_3_id_5', 'geo_level_3_id_6',
       'geo_level_3_id_7', 'geo_level_3_id_8', 'geo_level_3_id_9',
       'geo_level_3_id_10', 'geo_level_3_id_11', 'geo_level_3_id_12',
       'geo_level_3_id_13', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'land_surface_condition_0',
       'land_surface_condition_1', 'foundation_type_0', 'foundation_type_1',
       'foundation_type_2', 'roof_type

# Clean Data

We clean the data by removing rows based on their categorical data. This is a fast implementation of the data cleaning process.

In [3]:
import cleaner

# The cleaner receives both the encoded training data and the raw labels.
train_cleaned = cleaner.clean(train_data, raw_train_labels)

# Show the shape after cleaning so it can be compared with the encoded data
cleaned_shape = train_cleaned.shape
print("Train data shape: ", cleaned_shape)

Train data shape:  (260601, 81)


# Normalize Data

In [4]:
import normalizer

# Normalize the cleaned training data and the encoded test data in one call.
# NOTE(review): `test_data` is rebound to the normalized result, so re-running
# this cell passes already-normalized test data back in — confirm that
# normalizer.normalize tolerates this, or re-run the encode cell first.
normalized_pair = normalizer.normalize(train_cleaned, test_data)
train_normalized, test_data = normalized_pair

# Train CV Split

In [5]:
import splitter

# splitter.split yields four pieces, bound here in the order
# X_train, X_val, y_train, y_val for the training and evaluation cells below.
split_parts = splitter.split(train_normalized)
X_train, X_val, y_train, y_val = split_parts

# Create a model

We create a model using the `XGBoost` helper from the project's `model` module, trained on the training split.

In [6]:
# Import the project module under an alias. The trained model is bound to the
# name `model` (later cells read it), which would otherwise shadow the module
# and leave it reachable only after another `import model`.
import model as model_module

# Train the project's XGBoost model on the training split.
model = model_module.XGBoost(X_train, y_train)


# Evaluate the model

In [None]:
import evaluator

# Score the trained model on the held-out validation pieces; the evaluator
# returns two values, bound here as accuracy and micro F1 (in that order).
validation_scores = evaluator.evaluate(model, X_val, y_val)
accuracy, micro_f1 = validation_scores

In [None]:
# Report both validation metrics, each formatted to five decimal places.
print(
    f'Micro F1 score: {micro_f1:.5f}',
    f'Accuracy: {accuracy:.5f}',
    sep='\n',
)

# Model without splitting

In [None]:
# Import the project module under an alias so that binding the trained model
# to the name `model` (read by the prediction cell) does not shadow the module.
import model as model_module

# Separate the features from the `damage_grade` target using the whole
# normalized training set (no validation hold-out).
# NOTE(review): this overwrites the X_train / y_train produced by the split
# cell; re-running the evaluation cell afterwards would score a model on data
# it was trained on.
X_train = train_normalized.drop(columns=['damage_grade'])
y_train = train_normalized['damage_grade']

# Train the final model on all available training data
model = model_module.XGBoost(X_train, y_train)

# Predicting

In [None]:
import predictor

# Run the trained model on the (normalized) test data.
predictions = predictor.predict(model, test_data)

# Display the first rows of the predictions as the cell output
predictions_preview = predictions.head()
predictions_preview

# Submit

In [None]:
# save predictions with timestamp to folder Submissions
from datetime import datetime

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f'predicted_{timestamp}.csv'

predictions.to_csv(f'../Submissions/{filename}', index=False)