In [1]:
import pandas as pd
import numpy as np

In [2]:
def str_to_num(string):
    """Return the second whitespace-separated token of ``string`` as an int.

    Example: ``'location 118'`` -> ``118``.

    Raises:
        IndexError: if ``string`` has fewer than two tokens.
        ValueError: if the second token is not an integer literal.
    """
    tokens = string.split()
    number_token = tokens[1]
    return int(number_token)

In [3]:
# Read the train and test splits. The 'location' column is passed through
# str_to_num at parse time, so it arrives as an integer instead of a string.
location_converter = {'location': str_to_num}
train = pd.read_csv('../data/train.csv', converters=location_converter)
test = pd.read_csv('../data/test.csv', converters=location_converter)

In [4]:
# Stack the two splits row-wise (concat's default axis) so that the feature
# tables can be merged onto both of them in a single pass.
data = pd.concat([train, test])

In [5]:
# Load the auxiliary tables. In each file the column named after the table is
# converted with str_to_num, i.e. reduced to the integer in its second token.
event_types = pd.read_csv(
    '../data/event_type.csv',
    converters={'event_type': str_to_num},
)
log_features = pd.read_csv(
    '../data/log_feature.csv',
    converters={'log_feature': str_to_num},
)
resource_types = pd.read_csv(
    '../data/resource_type.csv',
    converters={'resource_type': str_to_num},
)
severity_types = pd.read_csv(
    '../data/severity_type.csv',
    converters={'severity_type': str_to_num},
)

### Merge Location and Event Type Data

In [45]:
# Count the rows for every (id, event_type) pair, then spread event_type
# across the columns; pairs that never occur are filled with 0.
event_counts = event_types.groupby(['id', 'event_type']).size()
event_matrix = event_counts.unstack(level=1).fillna(0)

# Prefix each event-type number so the columns stay identifiable after merging.
event_matrix.columns = ['event_type_%s' % col for col in event_matrix.columns]

# Turn the 'id' index back into a regular column so it can be used as a merge key.
events_grouped = event_matrix.reset_index()

In [46]:
# Attach the per-id event-type counts to the combined train/test frame.
# The left join keeps every row of `data`.
data_merged = pd.merge(data, events_grouped, on='id', how='left')

# Rows with a fault_severity value form the training set; rows where it is
# missing are treated as the test set (presumably the rows from test.csv).
is_labelled = data_merged.fault_severity.notnull()
train_df = data_merged.loc[is_labelled]

# Remove the empty target from the test rows BY NAME. The previous positional
# slice (`columns[1:]`) only worked when fault_severity happened to be the
# first column, which depends on whether pd.concat sorted the columns (it did
# in older pandas releases and no longer does by default). When it is not
# first, the slice silently drops a different column (e.g. the id) instead.
test_df = data_merged.loc[~is_labelled].drop('fault_severity', axis=1)

### Merge Location and Log Feature Data

In [44]:
def _flat_feature_names(columns):
    """Build flat ``feature_type_*`` names from the unstacked column labels.

    ``columns`` is the column index produced by ``groupby(...).sum().unstack(1)``.
    For a DataFrame input this is a two-level index of
    (summed value column, log_feature) tuples. Calling ``str()`` on such a
    tuple yields names like ``"feature_type_('<col>', 68)"``, so the levels
    are flattened explicitly instead:

    * one summed value column  -> ``feature_type_<log_feature>``
    * several value columns    -> ``feature_type_<value column>_<log_feature>``
      (the value column is kept so the names stay unique)
    """
    if not isinstance(columns, pd.MultiIndex):
        return ['feature_type_' + str(col) for col in columns]
    if len(set(columns.get_level_values(0))) == 1:
        return ['feature_type_' + str(col[-1]) for col in columns]
    return ['feature_type_' + '_'.join(str(part) for part in col) for col in columns]


# Sum the remaining columns of log_features per (id, log_feature) pair, then
# pivot log_feature into columns; pairs that never occur are filled with 0.
log_feature_grouped = log_features.groupby(['id', 'log_feature']).sum().unstack(1).fillna(0)
log_feature_grouped.columns = _flat_feature_names(log_feature_grouped.columns)

# Turn the 'id' index back into a regular column so it can be used as a merge key.
log_feature_grouped = log_feature_grouped.reset_index()

In [48]:
# Add the per-id log-feature totals on top of the already merged frame.
# NOTE: this rebinds data_merged, so re-running this cell on its own merges the
# log-feature columns a second time (pandas then appends _x/_y suffixes).
# Re-run the notebook from the top instead.
data_merged = pd.merge(data_merged, log_feature_grouped, on='id', how='left')

# Rows with a fault_severity value form the training set; rows where it is
# missing are treated as the test set (presumably the rows from test.csv).
is_labelled = data_merged.fault_severity.notnull()
train_df = data_merged.loc[is_labelled]

# Remove the empty target from the test rows BY NAME. The previous positional
# slice (`columns[1:]`) only worked when fault_severity happened to be the
# first column, which depends on whether pd.concat sorted the columns (it did
# in older pandas releases and no longer does by default). When it is not
# first, the slice silently drops a different column (e.g. the id) instead.
test_df = data_merged.loc[~is_labelled].drop('fault_severity', axis=1)

In [52]:
# Write both processed splits to disk; index=False leaves the row index out
# of the files.
for split_frame, out_path in (
    (train_df, '../processed/processed_train.csv'),
    (test_df, '../processed/processed_test.csv'),
):
    split_frame.to_csv(out_path, index=False)