# Import Data

First we import the data from the csv file. We use the pandas library to read the csv file and store it in a dataframe.

In [1]:
import sys

# The project's helper modules live in ../src; expose that directory to the
# import system so `importer` (and the modules used in later cells) resolve.
sys.path.append("../src")

import importer

# Load the raw training values, training labels and test values from ../Data.
raw_train_values, raw_train_labels, raw_test_values = importer.import_data(directory="../Data")

# Report the dimensions of each loaded dataset
for _caption, _dataset in (
    ("Train values shape: ", raw_train_values),
    ("Train labels shape: ", raw_train_labels),
    ("Test values shape: ", raw_test_values),
):
    print(_caption, _dataset.shape)


Train values shape:  (260601, 39)
Train labels shape:  (260601, 2)
Test values shape:  (86868, 39)


# Clean Data

We clean the data by removing the columns that contain categorical data. This is a fast implementation of the data cleaning process.

In [2]:
import cleaner

# Run the project's cleaning step over the raw inputs; it returns one training
# dataset and one test dataset.
train_data, test_data = cleaner.clean(raw_train_values, raw_train_labels, raw_test_values)

# Report the dimensions after cleaning
for _caption, _dataset in (
    ("Train data shape: ", train_data),
    ("Test data shape: ", test_data),
):
    print(_caption, _dataset.shape)

Train data shape:  (260601, 28)
Test data shape:  (86868, 28)


# Encode Data

In [3]:
import encoder

# Encode both datasets with the project's encoder.
# Note: this rebinds train_data/test_data to the encoder's output, so running
# this cell a second time passes already-encoded data back into encode().
train_data, test_data = encoder.encode(train_data, test_data)

# Report the dimensions after encoding
for _caption, _dataset in (
    ("Train data shape: ", train_data),
    ("Test data shape: ", test_data),
):
    print(_caption, _dataset.shape)

# Report the column labels after encoding
for _caption, _dataset in (
    ("Train data columns: ", train_data),
    ("Test data columns: ", test_data),
):
    print(_caption, _dataset.columns)

Train data shape:  (260601, 29)
Test data shape:  (86868, 29)
Train data columns:  Index(['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'damage_grade_0', 

# Create a model

We create the model with the sklearn library, using its RandomForestClassifier.

In [4]:
# Import the project's model module under an alias. The result below is bound
# to the name `model` (later cells rely on that), and importing the module as
# plain `model` would leave one name standing for two different things and
# make the module unreachable once the cell has run.
import model as model_lib

# Build the random-forest model from the encoded training data.
model = model_lib.random_forest(train_data)

# Evaluate the model

In [5]:
import evaluator

# Print the evaluation summary for the model built above and keep the value it
# returns as `predictions` for the submission step.
# NOTE(review): test_data derives from raw_test_values, for which this notebook
# imports no labels — confirm which data the reported F-score is computed on.
predictions = evaluator.print_model_summary(model, test_data)

F-Score:  0.9762168532704775


# Submit

In [6]:
import pandas as pd

# Create a data frame with the predictions, one column per damage-grade output
predictions = pd.DataFrame(predictions, columns=['damage_grade_0', 'damage_grade_1'])

# Add the building_id column.
# The ids are attached by position (.to_numpy()) rather than as a Series:
# assigning a Series aligns on index labels, so any difference between the
# index of raw_test_values and the fresh index of `predictions` would silently
# yield NaN or misplaced ids. A positional assignment raises instead if the
# lengths differ.
predictions['building_id'] = raw_test_values['building_id'].to_numpy()

# Rearrange the columns so building_id comes first
predictions = predictions[['building_id', 'damage_grade_0', 'damage_grade_1']]

# Save the predictions in a csv file (without the row index)
predictions.to_csv('../Data/predictions.csv', index=False)

# Preview the first rows of the submission
predictions.head()

Unnamed: 0,building_id,damage_grade_0,damage_grade_1
0,300051,1,1
1,99355,1,0
2,890251,1,0
3,745817,1,1
4,421793,1,0
