In [41]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def str_to_num(string):
    """Return the integer code embedded in a label such as ``'location 118'``.

    The label is split on whitespace and its second token is converted with
    ``int``. Used in this notebook as a ``converters`` callback for
    ``pd.read_csv``.

    Raises:
        IndexError: the label has fewer than two whitespace-separated tokens.
        ValueError: the second token is not an integer literal.
    """
    tokens = string.split()
    return int(tokens[1])

In [52]:
# Read the train and test tables; the 'location' label of every row is decoded
# into its integer code while the file is parsed.
location_converter = {'location': str_to_num}
train = pd.read_csv('../data/train.csv', converters=location_converter)
test = pd.read_csv('../data/test.csv', converters=location_converter)

### Concatenate both train and test data

In [108]:
# Stack the 'location' column of train on top of the one from test, then
# relabel the rows 0..n-1 so that train rows come first and test rows follow.
location_frames = [train[['location']], test[['location']]]
data = pd.concat(location_frames, axis=0)
data.index = np.arange(len(data))

### Load other files

In [54]:
# Load the four auxiliary tables. In each of them the label column is decoded
# into its integer code by str_to_num while the file is parsed.

event_types = pd.read_csv(
    '../data/event_type.csv',
    converters={'event_type': str_to_num},
)
log_features = pd.read_csv(
    '../data/log_feature.csv',
    converters={'log_feature': str_to_num},
)
resource_types = pd.read_csv(
    '../data/resource_type.csv',
    converters={'resource_type': str_to_num},
)
severity_types = pd.read_csv(
    '../data/severity_type.csv',
    converters={'severity_type': str_to_num},
)

## Feature Engineering

In [109]:
# X is the feature table that every later cell extends.
# Its row labels are taken from severity_types.index; the original rationale
# (author's note) is that severity_type.csv has a row for every id that
# appears in the train and test frames.
# NOTE(review): severity_type.csv is read without index_col, so this index is
# presumably the default positional 0..n-1 labels, not the 'id' column — confirm.

X = pd.DataFrame(0, index=severity_types.index, columns= [])
# Each assignment below aligns on row labels; labels absent from the right-hand
# Series become NaN in X.
X['fault_severity'] = train.fault_severity # target variable; must stay the first column (dropped later via X.columns[1:])
X['severity_type'] = severity_types.severity_type # severity type feature
# `data` was relabelled 0..n-1 with the train rows first and the test rows after.
# NOTE(review): the three columns only describe the same id on each row if
# severity_type.csv lists ids in that same train-then-test order — TODO confirm.
X['location'] = data.location

In [120]:
# 1-based running position of each row inside its location group, following the
# current row order of X. cumcount() is the built-in equivalent of numbering
# every group with np.arange(len(group)), without a Python lambda per group.
X['num'] = X.groupby('location').cumcount() + 1

# Number of rows that share each row's location, broadcast back onto the rows.
loc_size = X.groupby('location')['num'].transform('size')

# Three normalisations of the within-location position (n = group size):
#   numsh  = num / (n + 1)  -> strictly between 0 and 1
#   numsh0 = (num - 1) / n  -> starts at 0, stays below 1
#   numsh1 = num / n        -> ends at exactly 1
# Open questions kept from the original notes: why are three normalised
# variants created, and why are these features considered useful when they do
# not, on their own, separate records by fault severity?
X['numsh'] = X['num'] / (loc_size + 1)
X['numsh0'] = (X['num'] - 1) / loc_size
X['numsh1'] = X['num'] / loc_size

In [126]:
# location counts: number of rows (train + test together) seen for each location.
# The Series is renamed *before* it is turned into a frame, so the column is
# always called 'loc_count'. Renaming a 'location' column afterwards silently
# does nothing when value_counts() names its result differently (pandas >= 2.0
# calls it 'count'), which would leave a generic 'count' column in X.
lc = data['location'].value_counts().rename('loc_count').to_frame()

In [128]:
# Attach the per-location row count to every row of X (left join: X's
# 'location' column against the index of lc).
X = X.merge(lc, how='left', left_on='location', right_index=True)
# fillna(0) is applied to the whole merged frame, not just the new column.
X = X.fillna(0)

In [130]:
# Number of event_type rows recorded for each id. The Series is renamed before
# framing so the column is 'nevents' whatever name value_counts() assigns to
# its result (a post-hoc rename of an 'id' column is a silent no-op when the
# counts come back named 'count', as in pandas >= 2.0).
nevents = event_types['id'].value_counts().rename('nevents').to_frame()
# nevents is indexed by id values, so this index-on-index left join treats the
# row labels of X as ids.
# NOTE(review): confirm that X's row labels really are ids.
X = pd.merge(X, nevents, right_index=True, left_index=True, how='left').fillna(0)

In [133]:
# Frequency of every event type. No threshold is applied, so each type that
# occurs at least once counts as "common".
evtypes = event_types['event_type'].value_counts()
common_events = evtypes.index

In [139]:
# Count the rows per (id, event_type) pair, then pivot the event types into one
# 'event_<type>' column each; pairs that never occur get 0.
is_common_event = event_types['event_type'].isin(common_events)
event_counts = (
    event_types.loc[is_common_event]
    .groupby(['id', 'event_type'])['id']
    .count()
)
ohevents = event_counts.unstack().fillna(0).add_prefix('event_')

In [141]:
# Index-on-index left join of the per-event columns; rows of X without a match get 0.
X = X.merge(ohevents, how='left', left_index=True, right_index=True).fillna(0)

In [147]:
# log features
# Row-wise log(volume + 1), stored on log_features for the aggregations that follow.
log_features['logvolume'] = np.log(log_features['volume'] + 1)
# log(1 + total volume) per id; the id-indexed result is aligned onto the row
# labels of X by the column assignment.
volume_per_id = log_features.groupby('id')['volume'].sum()
X['volsumlog'] = np.log1p(volume_per_id)

In [148]:
# Summary statistics of logvolume per id, one 'logvolume_<stat>' column each.
logvol_stats = ['count', 'min', 'mean', 'max', 'std', 'sum']
logvol = (
    log_features.groupby('id')['logvolume']
    .agg(logvol_stats)
    .fillna(0)  # replaces undefined statistics (e.g. std of a single row) with 0
    .add_prefix('logvolume_')
)

In [150]:
# Index-on-index left join of the logvolume statistics; unmatched rows get 0.
X = X.merge(logvol, how='left', left_index=True, right_index=True).fillna(0)

In [158]:
# Mean logvolume per (id, log_feature), pivoted into one 'logfeatvol_<feature>'
# column per feature. Every feature that occurs is kept, so the isin() filter
# does not drop any row.
common_features = log_features['log_feature'].value_counts().index
is_common_feature = log_features['log_feature'].isin(common_features)
ohlog = (
    log_features.loc[is_common_feature]
    .groupby(['id', 'log_feature'])['logvolume']
    .mean()
)
ohlog = ohlog.unstack().fillna(0).add_prefix('logfeatvol_')
X = X.merge(ohlog, how='left', left_index=True, right_index=True).fillna(0)

In [162]:
def _feature_with_min_volume(df):
    """Return the log_feature of the first row holding the group's smallest volume."""
    min_pos = df['volume'].values.argmin()
    return df['log_feature'].values[min_pos]


# One value per id; the id-indexed result is aligned onto X's row labels.
X['logfeatle_min'] = log_features.groupby('id').apply(_feature_with_min_volume)

In [167]:
def rmean(x, window=9):
    """Centred rolling mean of ``x``.

    Args:
        x: object exposing ``.rolling`` (here: one location's slice of a column).
        window: number of neighbouring rows in the window (default 9).

    Returns:
        The rolling mean, same length as ``x``. ``min_periods=1`` means the
        shortened windows at both ends of the group still produce a value.
    """
    return x.rolling(window=window, min_periods=1, center=True).mean()


# Within each location, smooth a column over its 9 neighbouring rows (in the
# current row order of X) and keep the deviation of each row from that local
# mean. The stray transform call whose result was discarded has been removed.
X['logvolume_sum_ma9'] = X.groupby('location')['logvolume_sum'].transform(rmean)
X['logvolume_sum_ma9_diff'] = X['logvolume_sum'] - X['logvolume_sum_ma9']

X['volsumlog_ma9'] = X.groupby('location')['volsumlog'].transform(rmean)
X['volsumlog_ma9_diff'] = X['volsumlog'] - X['volsumlog_ma9']

# For log feature 203 only the deviation is stored, not the moving average itself.
ma = X.groupby('location')['logfeatvol_203'].transform(rmean)
X['logfeatvol_203_ma9_diff'] = X['logfeatvol_203'] - ma

In [170]:
# resource data

# Number of resource_type rows recorded for each id. The Series is renamed
# before framing so the column is 'nresources' whatever name value_counts()
# assigns to its result (a post-hoc rename of an 'id' column is a silent no-op
# when the counts come back named 'count', as in pandas >= 2.0).
nresources = resource_types['id'].value_counts().rename('nresources').to_frame()
# Index-on-index left join; rows of X without a match get 0.
X = pd.merge(X, nresources, right_index=True, left_index=True, how='left').fillna(0)

In [171]:
# one-hot common resources
# Frequency of every resource type. No threshold is applied, so each type that
# occurs at least once counts as "common".
restypes = resource_types['resource_type'].value_counts()
common_resources = restypes.index

In [172]:
# Count the rows per (id, resource_type) pair, then pivot the resource types
# into one 'restype_<type>' column each; pairs that never occur get 0.
is_common_resource = resource_types['resource_type'].isin(common_resources)
resource_counts = (
    resource_types.loc[is_common_resource]
    .groupby(['id', 'resource_type'])['resource_type']
    .count()
)
ohres = resource_counts.unstack().fillna(0.).add_prefix('restype_')

In [174]:
# Index-on-index left join of the per-resource columns; unmatched rows get 0.
X = X.merge(ohres, how='left', left_index=True, right_index=True).fillna(0.)

In [180]:
# Every column except the first one; the first column inserted into X was the
# target, 'fault_severity'.
features = X.columns[1:]

# Positional split: the first len(train) rows are written as the training set
# (target included), the remaining rows as the test set (feature columns only).
n_train = train.shape[0]
train_df = X.iloc[:n_train]
test_df = X.iloc[n_train:][features]

In [184]:
# Persist both processed tables. index=False means the row labels of X are not
# written to the files.
train_df.to_csv(path_or_buf='../processed/processed_train.csv', index=False)
test_df.to_csv(path_or_buf='../processed/processed_test.csv', index=False)