In [1]:
import pandas as pd
import numpy as np

# Load data

Read into pandas dataframe, make integer data types, create sparse features

In [6]:
# Location and fault severity (training rows), indexed by the 'id' column
train = pd.read_csv('train.csv', index_col='id')
# Make sure fault_severity is stored as an integer dtype
train['fault_severity'] = train['fault_severity'].astype(int)
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of train data {}'.format(train.shape))
print('{} unique locations in train set'.format(len(train['location'].unique())))
train.head(3)

Shape of train data (7381, 2)
929 unique locations in train set


Unnamed: 0_level_0,location,fault_severity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
14121,location 118,1
9320,location 91,0
14394,location 152,1


In [5]:
# Location data for the rows to be predicted, indexed by the 'id' column
test = pd.read_csv('test.csv', index_col='id')
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of test data {}'.format(test.shape))
print('{} unique locations in test set'.format(len(test['location'].unique())))
test.head(3)

Shape of test data (11171, 1)
1039 unique locations in test set


Unnamed: 0_level_0,location
id,Unnamed: 1_level_1
11066,location 481
18000,location 962
16964,location 491


In [7]:
# Stack train and test rows. Test rows have no fault_severity, so the NaNs
# created by the concat are replaced with the 'predict!' marker.
# NOTE(review): this leaves fault_severity holding both numbers and a string
# marker; filter on the marker before treating the column as numeric.
label_loc = (
    pd.concat([train, test], axis=0)
    .fillna('predict!')
    .sort_index()  # ascending id
)
# 'location 118' -> 118: keep the text after the 'location ' prefix as an integer
label_loc['location'] = label_loc['location'].str.split('location ').str[1].astype(int)
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of label_loc {}'.format(label_loc.shape))
print('{} unique locations in full data'.format(len(label_loc['location'].unique())))
label_loc.head(5)

Shape of label_loc (18552, 2)
1126 unique locations in full data


Unnamed: 0_level_0,fault_severity,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,601
2,predict!,474
3,predict!,64
4,predict!,645
5,0,460


In [9]:
# One indicator column per distinct location id, named loc_<id>
location = pd.get_dummies(label_loc['location'], prefix='loc')
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of location {}'.format(location.shape))
location.head(5)

Shape of location (18552, 1126)


Unnamed: 0_level_0,loc_1,loc_2,loc_3,loc_4,loc_5,loc_6,loc_7,loc_8,loc_9,loc_10,...,loc_1117,loc_1118,loc_1119,loc_1120,loc_1121,loc_1122,loc_1123,loc_1124,loc_1125,loc_1126
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Location seems like a tricky feature to use because of the number of unique values and because many locations do not appear in the training set, but since this is a simple model, let's leave it in sparse form and see what the machine can do. Also, I do not need to worry about the dummy variable trap (perfect collinearity among the get_dummies columns) since I will be using tree-based methods.

In [12]:
# Event type data
# NOTE(review): this file name is capitalised while the other CSVs are lower
# case; confirm it matches the file on disk (matters on case-sensitive filesystems).
event_type = pd.read_csv('Event_type.csv')
# 'event_type 11' -> 11: keep the text after the prefix as an integer
event_type['event_type'] = event_type['event_type'].str.split('event_type ').str[1].astype(int)
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of event_type {}'.format(event_type.shape))
print('{} unique event types'.format(len(event_type['event_type'].unique())))
# sort_values returns a sorted copy (inplace defaults to False), so event_type keeps its file order
event_type.sort_values('id').head(8)

Shape of event_type (31170, 2)
53 unique event types


Unnamed: 0,id,event_type
18299,1,11
18300,1,13
14535,2,35
14536,2,34
19226,3,11
19536,4,47
13306,5,34
13307,5,35


In [18]:
# One indicator column per event type, named e_<type>
event_indicator = pd.get_dummies(event_type['event_type'], prefix='e')
# Sum the indicator rows per id so each id ends up on a single row.
# groupby sorts by the grouping key and makes it the index by default.
event_dummies = event_indicator.groupby(event_type['id']).sum()
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of event_dummies {}'.format(event_dummies.shape))
event_dummies.head(4)

Shape of event_dummies (18552, 53)


Unnamed: 0_level_0,e_1,e_2,e_3,e_4,e_5,e_6,e_7,e_8,e_9,e_10,...,e_45,e_46,e_47,e_48,e_49,e_50,e_51,e_52,e_53,e_54
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [28]:
# Log feature data
log_feature = pd.read_csv('log_feature.csv')
# 'feature 68' -> 68: keep the text after the 'feature ' prefix as an integer
log_feature['log_feature'] = log_feature['log_feature'].str.split('feature ').str[1].astype(int)
# Make sure volume is stored as an integer dtype
log_feature['volume'] = log_feature['volume'].astype(int)
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('Shape of log_feature {}'.format(log_feature.shape))
print('{} unique log_features'.format(len(log_feature['log_feature'].unique())))
log_feature.sort_values('id').head(8)

Shape of log_feature (58671, 3)
386 unique log_features


Unnamed: 0,id,log_feature,volume
36199,1,179,1
36197,1,68,2
36198,1,345,2
27668,2,312,1
27670,2,233,1
27669,2,235,1
27667,2,315,1
27671,2,313,1


Log feature is a bit more difficult to use since it has two columns of (seemingly related) values. I will create a dummy dataframe from the log_feature column and then multiply each row by its volume.

In [29]:
# One indicator column per log feature, named l_<feature>
log_indicator = pd.get_dummies(log_feature['log_feature'], prefix='l')
# Scale each row by its volume so a row's single non-zero cell holds that volume
log_weighted = log_indicator.multiply(log_feature['volume'].values, axis=0)
# Sum the weighted rows per id so each id ends up on a single row
log_dummies = log_weighted.groupby(log_feature['id']).sum()
# The label names the frame whose shape is printed (the original said
# 'log_feature' while printing log_dummies.shape).
print('Shape of log_dummies {}'.format(log_dummies.shape))
log_dummies.head(4)

Shape of log_feature (18552, 386)


Unnamed: 0_level_0,l_1,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,...,l_377,l_378,l_379,l_380,l_381,l_382,l_383,l_384,l_385,l_386
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
def _one_hot_by_id(frame, column, prefix):
    """Return, for each id, how many rows carry each distinct value of ``column``.

    ``frame[column]`` is one-hot encoded with ``prefix`` and the indicator
    rows are summed per ``frame['id']``; the result is indexed by id.
    """
    indicators = pd.get_dummies(frame[column], prefix=prefix)
    return indicators.groupby(frame['id']).sum()


# Resource type data
resource_type = pd.read_csv('resource_type.csv')
# 'resource_type 8' -> 8: keep the text after the prefix as an integer
resource_type['resource_type'] = resource_type['resource_type'].str.split('resource_type ').str[1].astype(int)
# print() with a single pre-formatted string emits the same text on Python 2 and 3
print('{} unique resource types'.format(len(resource_type['resource_type'].unique())))
resource_dummies = _one_hot_by_id(resource_type, 'resource_type', 'r')
print('Shape of resource_dummies {}'.format(resource_dummies.shape))

# Severity type data - one entry per id
severity_type = pd.read_csv('severity_type.csv')
# 'severity_type 2' -> 2: keep the text after the prefix as an integer
severity_type['severity_type'] = severity_type['severity_type'].str.split('severity_type ').str[1].astype(int)
print('{} unique severity types'.format(len(severity_type['severity_type'].unique())))
severity_dummies = _one_hot_by_id(severity_type, 'severity_type', 's')
print('Shape of severity_dummies {}'.format(severity_dummies.shape))

10 unique resource types
Shape of resource_dummies (18552, 10)
5 unique severity types
Shape of severity_dummies (18552, 5)


Now we can group all our dataframes together

In [1]:
# Assemble the label column and every feature frame side by side.
# axis=1 lines the frames up on their index, so they are joined row by row on it.
feature_frames = [location, event_dummies, resource_dummies, severity_dummies, log_dummies]
merge_data = pd.concat([label_loc['fault_severity']] + feature_frames, axis=1)
print(merge_data.shape)
merge_data.head(20)

NameError: name 'pd' is not defined