# Sample notebook: training and scoring a model on the earthquake competition data

## Package import

In [None]:
import boto3
import io
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

## Define the S3 location where the data is stored

In [None]:
# Bucket that holds the competition data and the key prefix of the dataset inside it.
usecase_bucket = "das-usecases-eu-west-1-24fc"
data_directory = "da-store-demos/earthquake-competition/"

# Object keys of the individual CSV files; each one is data_directory + file name.
train_values_file, test_values_file, train_labels_file, submission_file = (
    data_directory + file_name
    for file_name in (
        "train_values.csv",
        "test_values.csv",
        "train_labels.csv",
        "submission_format.csv",
    )
)

# S3 client reused by the following cells to download the objects.
s3 = boto3.client('s3')

In [None]:
# Download the submission template from S3 and parse it as CSV.
# No index_col is passed here, so pandas assigns its default integer index.
obj = s3.get_object(Bucket=usecase_bucket, Key=submission_file)
submission_csv = io.BytesIO(obj['Body'].read())
submission_format = pd.read_csv(submission_csv)

## optimizing the hyperparameters of the pipeline

Load the training features from S3.

In [None]:
# Download the training feature table from S3 and parse it as CSV,
# using the building_id column as the row index.
obj = s3.get_object(Bucket=usecase_bucket, Key=train_values_file)
train_values_csv = io.BytesIO(obj['Body'].read())
train_values = pd.read_csv(train_values_csv, index_col='building_id')

In [None]:
# Show the first five rows of the training features as the cell output.
train_values.head(5)

Load the training labels from S3.

In [None]:
# Download the training label table from S3 and parse it as CSV,
# using the building_id column as the row index.
obj = s3.get_object(Bucket=usecase_bucket, Key=train_labels_file)
train_labels_csv = io.BytesIO(obj['Body'].read())
train_labels = pd.read_csv(train_labels_csv, index_col='building_id')

In [None]:
# Print a summary of the training feature table.
# NOTE(review): this cell follows the label download but inspects train_values;
# confirm whether train_labels.info() was intended here.
train_values.info()

## Resample data to get a uniform representation of the labels (not implemented yet: the cell below only selects and one-hot encodes a subset of features)

In [None]:
# Restrict the model input to a small, hand-picked set of columns.
selected_features = [
    'foundation_type',
    'area_percentage',
    'height_percentage',
    'count_floors_pre_eq',
    'land_surface_condition',
    'has_superstructure_cement_mortar_stone',
]

# Keep only the selected columns and expand them with pandas' dummy
# (one-hot) encoding in a single step.
train_values_subset = pd.get_dummies(train_values[selected_features])

## Training a random forest with a grid search over its hyperparameters

In [None]:
# Candidate hyperparameter values explored by the grid search.
param_grid = {
    'n_estimators': [5, 10],
    'min_samples_leaf': [1, 5],
}

# Random forest with a fixed seed, tuned with 5-fold cross-validation.
rf = RandomForestClassifier(random_state=2018)
gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

# The label frame's values are flattened with ravel() before being passed as the target.
gs.fit(train_values_subset, train_labels.values.ravel())

# Micro-averaged F1 computed on the same rows the search was fitted on,
# i.e. an in-sample score rather than a held-out estimate.
in_sample_preds = gs.predict(train_values_subset)
in_sample_f1 = f1_score(train_labels, in_sample_preds, average='micro')
print(in_sample_f1)

## Load the test data, predict, and build the submission

In [None]:
# Download the test feature table from S3, indexed by building_id like the
# training features.
obj = s3.get_object(Bucket=usecase_bucket, Key=test_values_file)
test_values = pd.read_csv(io.BytesIO(obj['Body'].read()), index_col='building_id')

# Apply the same column selection and dummy encoding as for the training data.
test_values_subset = pd.get_dummies(test_values[selected_features])

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding the test set on its own can yield missing, extra or
# differently ordered columns compared with the matrix the model was fitted
# on. Align to the training columns: categories absent from the test data
# become all-zero columns, and categories unseen during training are dropped.
test_values_subset = test_values_subset.reindex(
    columns=train_values_subset.columns, fill_value=0
)
predictions = gs.predict(test_values_subset)

# Reload the submission template, this time indexed by building_id, so that
# its index and column names can be reused for the output frame.
obj = s3.get_object(Bucket=usecase_bucket, Key=submission_file)
submission_format = pd.read_csv(io.BytesIO(obj['Body'].read()), index_col='building_id')

# NOTE(review): predictions are attached to the template's index positionally;
# this assumes the template rows are in the same order as test_values - confirm.
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

Write the results for submission to a CSV file in the local environment.

In [None]:
# Write the predictions to a local CSV file; the path is relative to the
# notebook's working directory.
submission_path = '../results/submission.csv'
my_submission.to_csv(submission_path)