# Load Libraries, Modules, and Magics

In [71]:
import pandas as pd
import pandas_profiling as pdp
import numpy as np
import random
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV, f_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, normalize, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn import metrics
import xgboost as xgb
import operator
from scipy.stats import uniform as sp_rand
from scipy.stats import randint as sp_randint
import scipy.stats as st
from scipy.stats.mstats import mode
from matplotlib import pyplot as plt
import seaborn as sns
import time
import math
import datetime
import random
from yellowbrick.features import ParallelCoordinates, Rank2D, JointPlotVisualizer
from yellowbrick.regressor import ResidualsPlot, AlphaSelection, PredictionError

%matplotlib inline

# Load (fixed) Data files

In [72]:
# Read the six CSV inputs from ./assets in one pass. The names on the left are bound
# in the same order as the paths listed below; event_type and resource_type are read
# from the "_fixed" copies of those files.
(train_orig, test_orig, severity_type,
 log_feature, event_type, resource_type) = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        './assets/train.csv',
        './assets/test.csv',
        './assets/severity_type.csv',
        './assets/log_feature.csv',
        './assets/event_type_fixed.csv',
        './assets/resource_type_fixed.csv',
    )
]

# Functions That Make EDA Easier

In [73]:
def norm_rows(df):
    """Scale every row of ``df`` by that row's sum.

    Each value is divided by the sum of its row. Cells that come out as NaN
    (for example 0/0 in a row whose sum is zero) are replaced with 0.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    # Silence numpy's "invalid value" warning for the divisions that yield NaN.
    with np.errstate(invalid='ignore'):
        row_totals = df.sum(axis=1)
        scaled = df.div(row_totals, axis=0)
    return scaled.fillna(0)

In [74]:
def create_feature_map(features, fmap_path='xgb.fmap'):
    """Write a feature-map file with one line per feature.

    Each line has the form ``<index>\\t<feature name>\\tq``, where the index is
    the feature's position in ``features``. The trailing ``q`` is written
    verbatim for every feature (presumably XGBoost's "quantitative" type
    marker -- confirm against the XGBoost feature-map format).

    Parameters
    ----------
    features : iterable of str
        Feature names, in model column order.
    fmap_path : str, optional
        Destination file. Defaults to ``'xgb.fmap'`` in the working directory.
        An existing file at that path is overwritten.
    """
    # The context manager closes the handle even if a write fails part-way.
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [75]:
def print_value_counts(input_df,col_name,is_normalized=False):
    """Print the fault_severity distribution for every distinct value of a column.

    For each distinct value in ``input_df[col_name]`` (in order of first
    appearance) the value itself is printed, followed by the value counts of
    ``fault_severity`` over the rows holding that value. With
    ``is_normalized=True`` the counts are shown as proportions.
    """
    column = input_df[col_name]
    for level in column.unique():
        print(level)
        matching_rows = input_df.loc[column == level]
        severity_counts = matching_rows.fault_severity.value_counts(normalize=is_normalized)
        print(severity_counts)

In [76]:
def print_value_counts_spec(input_df,col_name,col_value,is_normalized=False):
    """Print the fault_severity distribution for one specific value of a column.

    Prints ``col_value`` and then the value counts of ``fault_severity`` over
    the rows where ``input_df[col_name] == col_value``. With
    ``is_normalized=True`` the counts are shown as proportions.
    """
    print(col_value)
    is_match = input_df[col_name] == col_value
    severity = input_df.loc[is_match].fault_severity
    print(severity.value_counts(normalize=is_normalized))

In [77]:
def convert_strings_to_ints(input_df,col_name,output_col_name):
    """Integer-encode a column and report the value -> code mapping.

    ``pd.factorize`` assigns each distinct value of ``input_df[col_name]`` an
    integer code in order of first appearance. The codes are stored in
    ``input_df[output_col_name]``; note that this modifies ``input_df`` in
    place.

    Returns
    -------
    tuple
        ``(mapping, input_df)`` where ``mapping`` maps each original value to
        its code and ``input_df`` is the same (mutated) frame that was passed in.
    """
    codes, _ = pd.factorize(input_df[col_name])
    input_df[output_col_name] = codes
    # Built from the frame after the assignment, pairing each row's value with its code.
    mapping = {
        original: code
        for original, code in zip(input_df[col_name], input_df[output_col_name])
    }
    return (mapping, input_df)

In [78]:
def get_log_loss_row(row):
    """Return the log loss contribution of a single three-class prediction row.

    The result is ``-log(p)`` where ``p`` is the predicted probability stored
    for the row's true class.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'fault_severity'`` (the true class: 0, 1 or 2) and the
        columns ``'predict_0'``, ``'predict_1'`` and ``'predict_2'``.

    Returns
    -------
    float
        ``-1.0 * np.log(row['predict_<class>'])``.

    Raises
    ------
    ValueError
        If ``fault_severity`` is not 0, 1 or 2 (this includes NaN).

    Notes
    -----
    Assumes the three predictions are already row-normalized. No epsilon
    clipping is applied, so a predicted probability of 0 for the true class
    yields ``inf``.
    """
    ans = row['fault_severity']
    for label, column in ((0, 'predict_0'), (1, 'predict_1'), (2, 'predict_2')):
        if ans == label:
            return -1.0 * np.log(row[column])
    # Message kept so existing notebook output is unchanged; the exception is what
    # actually stops processing (the old `return -1000` after it was unreachable).
    print('not_acceptable_value')
    raise ValueError('Not of correct class')

In [79]:
def get_log_loss_row_two_classes(row):
    """Return the log loss contribution of a single two-class prediction row.

    The result is ``-log(p)`` where ``p`` is the predicted probability stored
    for the row's true class: ``'predict_low'`` when ``fault_severity`` is 0
    and ``'predict_high'`` when it is 1.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'fault_severity'`` (0 or 1), ``'predict_low'`` and
        ``'predict_high'``.

    Returns
    -------
    float
        ``-1.0 * np.log`` of the probability for the true class.

    Raises
    ------
    ValueError
        If ``fault_severity`` is neither 0 nor 1 (this includes NaN).

    Notes
    -----
    No epsilon clipping is applied, so a predicted probability of 0 for the
    true class yields ``inf``.
    """
    ans = row['fault_severity']
    for label, column in ((0, 'predict_low'), (1, 'predict_high')):
        if ans == label:
            return -1.0 * np.log(row[column])
    # Message kept so existing notebook output is unchanged; the exception is what
    # actually stops processing (the old `return -1000` after it was unreachable).
    print('not_acceptable_value')
    raise ValueError('Not of correct class')

# EDA

In [80]:
# Profile the training table: per-column types, missing values and value
# distributions, including the fault_severity target.
pdp.ProfileReport(train_orig)

0,1
Number of variables,3
Number of observations,7381
Total Missing (%),0.0%
Total size in memory,173.1 KiB
Average record size in memory,24.0 B

0,1
Numeric,2
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.45021
Minimum,0
Maximum,2
Zeros (%),64.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,2
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.66656
Coef of variation,1.4806
Kurtosis,0.12842
Mean,0.45021
MAD,0.58361
Skewness,1.1793
Sum,3323
Variance,0.4443
Memory size,57.7 KiB

Value,Count,Frequency (%),Unnamed: 3
0,4784,64.8%,
1,1871,25.3%,
2,726,9.8%,

Value,Count,Frequency (%),Unnamed: 3
0,4784,64.8%,
1,1871,25.3%,
2,726,9.8%,

Value,Count,Frequency (%),Unnamed: 3
0,4784,64.8%,
1,1871,25.3%,
2,726,9.8%,

0,1
Distinct count,7381
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9264.6
Minimum,1
Maximum,18550
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,938
Q1,4647
Median,9222
Q3,13885
95-th percentile,17637
Maximum,18550
Range,18549
Interquartile range,9238

0,1
Standard deviation,5349.3
Coef of variation,0.57739
Kurtosis,-1.1902
Mean,9264.6
MAD,4624.5
Skewness,0.012208
Sum,68382375
Variance,28615000
Memory size,57.7 KiB

Value,Count,Frequency (%),Unnamed: 3
4094,1,0.0%,
11591,1,0.0%,
7497,1,0.0%,
17738,1,0.0%,
3403,1,0.0%,
13644,1,0.0%,
15693,1,0.0%,
11599,1,0.0%,
5456,1,0.0%,
15701,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.0%,
5,1,0.0%,
6,1,0.0%,
8,1,0.0%,
13,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18539,1,0.0%,
18542,1,0.0%,
18543,1,0.0%,
18548,1,0.0%,
18550,1,0.0%,

0,1
Distinct count,929
Unique (%),12.6%
Missing (%),0.0%
Missing (n),0

0,1
location 821,85
location 1107,78
location 734,75
Other values (926),7143

Value,Count,Frequency (%),Unnamed: 3
location 821,85,1.2%,
location 1107,78,1.1%,
location 734,75,1.0%,
location 1008,71,1.0%,
location 126,71,1.0%,
location 810,66,0.9%,
location 600,64,0.9%,
location 704,61,0.8%,
location 242,60,0.8%,
location 124,59,0.8%,

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


In [81]:
# Profile the test table, which holds only id and location (no fault_severity column).
pdp.ProfileReport(test_orig)

0,1
Number of variables,2
Number of observations,11171
Total Missing (%),0.0%
Total size in memory,174.6 KiB
Average record size in memory,16.0 B

0,1
Numeric,1
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,11171
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9284.3
Minimum,2
Maximum,18552
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,926
Q1,4630
Median,9316
Q3,13926
95-th percentile,17612
Maximum,18552
Range,18550
Interquartile range,9296

0,1
Standard deviation,5360.1
Coef of variation,0.57732
Kurtosis,-1.2063
Mean,9284.3
MAD,4646.8
Skewness,-0.0080346
Sum,103715253
Variance,28730000
Memory size,87.4 KiB

Value,Count,Frequency (%),Unnamed: 3
2132,1,0.0%,
7513,1,0.0%,
7529,1,0.0%,
11623,1,0.0%,
16642,1,0.0%,
3427,1,0.0%,
17762,1,0.0%,
11615,1,0.0%,
3419,1,0.0%,
1370,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,
7,1,0.0%,
9,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18546,1,0.0%,
18547,1,0.0%,
18549,1,0.0%,
18551,1,0.0%,
18552,1,0.0%,

0,1
Distinct count,1039
Unique (%),9.3%
Missing (%),0.0%
Missing (n),0

0,1
location 734,109
location 653,108
location 126,106
Other values (1036),10848

Value,Count,Frequency (%),Unnamed: 3
location 734,109,1.0%,
location 653,108,1.0%,
location 126,106,0.9%,
location 1107,106,0.9%,
location 810,102,0.9%,
location 704,102,0.9%,
location 821,98,0.9%,
location 684,97,0.9%,
location 793,96,0.9%,
location 1019,95,0.9%,

Unnamed: 0,id,location
0,11066,location 481
1,18000,location 962
2,16964,location 491
3,4795,location 532
4,3392,location 600


In [82]:
# Profile the severity_type table (id plus a categorical severity_type label).
pdp.ProfileReport(severity_type)

0,1
Number of variables,2
Number of observations,18552
Total Missing (%),0.0%
Total size in memory,290.0 KiB
Average record size in memory,16.0 B

0,1
Numeric,1
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,18552
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9276.5
Minimum,1
Maximum,18552
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,928.55
Q1,4638.8
Median,9276.5
Q3,13914.0
95-th percentile,17624.0
Maximum,18552.0
Range,18551.0
Interquartile range,9275.5

0,1
Standard deviation,5355.6
Coef of variation,0.57733
Kurtosis,-1.2
Mean,9276.5
MAD,4638
Skewness,0
Sum,172097628
Variance,28683000
Memory size,145.0 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
6902,1,0.0%,
17053,1,0.0%,
10912,1,0.0%,
8865,1,0.0%,
15010,1,0.0%,
12963,1,0.0%,
2724,1,0.0%,
677,1,0.0%,
6822,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,
5,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18548,1,0.0%,
18549,1,0.0%,
18550,1,0.0%,
18551,1,0.0%,
18552,1,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
severity_type 2,8737
severity_type 1,8728
severity_type 4,1014
Other values (2),73

Value,Count,Frequency (%),Unnamed: 3
severity_type 2,8737,47.1%,
severity_type 1,8728,47.0%,
severity_type 4,1014,5.5%,
severity_type 5,65,0.4%,
severity_type 3,8,0.0%,

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [83]:
# Profile the log_feature table (id, a categorical log_feature name and a numeric volume).
pdp.ProfileReport(log_feature)

0,1
Number of variables,3
Number of observations,58671
Total Missing (%),0.0%
Total size in memory,1.3 MiB
Average record size in memory,24.0 B

0,1
Numeric,2
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,18552
Unique (%),31.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9271.4
Minimum,1
Maximum,18552
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,910.0
Q1,4658.5
Median,9275.0
Q3,13903.0
95-th percentile,17599.0
Maximum,18552.0
Range,18551.0
Interquartile range,9244.5

0,1
Standard deviation,5355.1
Coef of variation,0.5776
Kurtosis,-1.2
Mean,9271.4
MAD,4639.9
Skewness,0.00054192
Sum,543960485
Variance,28678000
Memory size,458.4 KiB

Value,Count,Frequency (%),Unnamed: 3
5233,20,0.0%,
15184,20,0.0%,
12575,19,0.0%,
9266,18,0.0%,
6516,18,0.0%,
9261,18,0.0%,
11563,18,0.0%,
3418,18,0.0%,
16094,18,0.0%,
626,18,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,3,0.0%,
2,5,0.0%,
3,1,0.0%,
4,1,0.0%,
5,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18548,6,0.0%,
18549,6,0.0%,
18550,2,0.0%,
18551,2,0.0%,
18552,2,0.0%,

0,1
Distinct count,386
Unique (%),0.7%
Missing (%),0.0%
Missing (n),0

0,1
feature 312,5267
feature 232,4754
feature 82,3472
Other values (383),45178

Value,Count,Frequency (%),Unnamed: 3
feature 312,5267,9.0%,
feature 232,4754,8.1%,
feature 82,3472,5.9%,
feature 203,2823,4.8%,
feature 313,2145,3.7%,
feature 233,1901,3.2%,
feature 307,1597,2.7%,
feature 54,1573,2.7%,
feature 170,1526,2.6%,
feature 71,1514,2.6%,

0,1
Distinct count,341
Unique (%),0.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9.6853
Minimum,1
Maximum,1310
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,2
Q3,7
95-th percentile,40
Maximum,1310
Range,1309
Interquartile range,6

0,1
Standard deviation,27.314
Coef of variation,2.8202
Kurtosis,246.87
Mean,9.6853
MAD,11.268
Skewness,11.605
Sum,568246
Variance,746.08
Memory size,458.4 KiB

Value,Count,Frequency (%),Unnamed: 3
1,20713,35.3%,
2,9716,16.6%,
3,4488,7.6%,
4,3713,6.3%,
5,2207,3.8%,
6,2203,3.8%,
8,1403,2.4%,
7,1275,2.2%,
10,1012,1.7%,
9,894,1.5%,

Value,Count,Frequency (%),Unnamed: 3
1,20713,35.3%,
2,9716,16.6%,
3,4488,7.6%,
4,3713,6.3%,
5,2207,3.8%,

Value,Count,Frequency (%),Unnamed: 3
675,1,0.0%,
717,1,0.0%,
814,1,0.0%,
877,1,0.0%,
1310,1,0.0%,

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [84]:
# Profile the resource_type table (id plus a categorical resource_type label).
pdp.ProfileReport(resource_type)

0,1
Number of variables,2
Number of observations,22877
Total Missing (%),0.0%
Total size in memory,357.5 KiB
Average record size in memory,16.0 B

0,1
Numeric,1
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,18552
Unique (%),81.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9263.3
Minimum,1
Maximum,18552
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,941
Q1,4613
Median,9262
Q3,13904
95-th percentile,17627
Maximum,18552
Range,18551
Interquartile range,9291

0,1
Standard deviation,5357.9
Coef of variation,0.5784
Kurtosis,-1.2034
Mean,9263.3
MAD,4644
Skewness,0.0031285
Sum,211915944
Variance,28707000
Memory size,178.8 KiB

Value,Count,Frequency (%),Unnamed: 3
3742,10,0.0%,
11595,10,0.0%,
3456,8,0.0%,
14510,8,0.0%,
3554,8,0.0%,
8283,6,0.0%,
7863,6,0.0%,
11746,6,0.0%,
11817,6,0.0%,
13215,6,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,2,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,
5,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18548,1,0.0%,
18549,1,0.0%,
18550,1,0.0%,
18551,1,0.0%,
18552,1,0.0%,

0,1
Distinct count,11
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
resource_type 8,11584
resource_type 2,9209
resource_type 6,652
Other values (8),1432

Value,Count,Frequency (%),Unnamed: 3
resource_type 8,11584,50.6%,
resource_type 2,9209,40.3%,
resource_type 6,652,2.9%,
resource_type 7,548,2.4%,
resource_type 4,364,1.6%,
resource_type 9,207,0.9%,
resource_type 3,147,0.6%,
resource_type 10,82,0.4%,
resource_type 1,69,0.3%,
resource_type 5,14,0.1%,

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [85]:
# Profile the event_type table (id plus a categorical event_type label).
# NOTE(review): the report shows one missing id and ids rendered as floats
# (e.g. 6597.0), presumably because of that missing value -- confirm before joining on id.
pdp.ProfileReport(event_type)

0,1
Number of variables,2
Number of observations,34083
Total Missing (%),0.0%
Total size in memory,532.6 KiB
Average record size in memory,16.0 B

0,1
Numeric,1
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,54
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
event_type 11,9041
event_type 35,6814
event_type 34,6116
Other values (51),12112

Value,Count,Frequency (%),Unnamed: 3
event_type 11,9041,26.5%,
event_type 35,6814,20.0%,
event_type 34,6116,17.9%,
event_type 15,5003,14.7%,
event_type 20,1594,4.7%,
event_type 54,785,2.3%,
event_type 13,682,2.0%,
event_type 42,529,1.6%,
event_type 44,513,1.5%,
event_type 23,482,1.4%,

0,1
Distinct count,18553
Unique (%),54.4%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9281.3
Minimum,1
Maximum,18552
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,917.0
Q1,4641.2
Median,9302.0
Q3,13936.0
95-th percentile,17623.0
Maximum,18552.0
Range,18551.0
Interquartile range,9294.5

0,1
Standard deviation,5360.5
Coef of variation,0.57756
Kurtosis,-1.2037
Mean,9281.3
MAD,4646.6
Skewness,-0.0045953
Sum,316320000
Variance,28735000
Memory size,266.4 KiB

Value,Count,Frequency (%),Unnamed: 3
14243.0,20,0.1%,
759.0,12,0.0%,
3742.0,12,0.0%,
17876.0,12,0.0%,
10354.0,11,0.0%,
16882.0,10,0.0%,
8046.0,10,0.0%,
15184.0,10,0.0%,
11595.0,10,0.0%,
215.0,9,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1.0,2,0.0%,
2.0,2,0.0%,
3.0,1,0.0%,
4.0,1,0.0%,
5.0,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18548.0,2,0.0%,
18549.0,3,0.0%,
18550.0,1,0.0%,
18551.0,1,0.0%,
18552.0,2,0.0%,

Unnamed: 0,id,event_type
0,6597.0,event_type 11
1,8011.0,event_type 15
2,2597.0,event_type 15
3,5022.0,event_type 15
4,5022.0,event_type 11


### Analysis:
The data files present multiple views into linked data tables. Based on cardinality, my educated guess is that the `id` column is the key between the tables, with `location` and `log_feature` containing the relevant outage data.

# Feature Engineering

## Join the tables into one monolithic table to prepare for modeling, including test and train. I'm not sure why or how they were separated, but I don't trust that they were separated without bias.

In [86]:
# Stack the train and test rows into one frame with a fresh 0..n-1 index.
# test_orig has no fault_severity column, so those rows get NaN there and the
# column becomes float.
# sort=True is passed explicitly: the two frames have different columns, and
# without it pandas emits a FutureWarning because the default column ordering
# is changing. sort=True keeps the alphabetical column order this notebook
# has been producing (fault_severity, id, location).
df = pd.concat([train_orig, test_orig], ignore_index=True, sort=True)
df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,fault_severity,id,location
0,1.0,14121,location 118
1,0.0,9320,location 91
2,1.0,14394,location 152
3,1.0,8218,location 931
4,0.0,14804,location 120


In [87]:
# Sanity check: the row count should equal train rows + test rows
# (7381 + 11171 = 18552 according to the profile reports).
df.shape

(18552, 3)

In [88]:
# Class balance of fault_severity, which will be our target vector.
# value_counts() leaves out missing values by default, so the rows that came
# from the test file (which have no fault_severity) are not counted here.
df.fault_severity.value_counts()

0.0    4784
1.0    1871
2.0     726
Name: fault_severity, dtype: int64

In [89]:
# Attach the event types to the combined train/test frame, keyed on id.
# This is an inner join, and an id can have several event rows, so ids repeat
# in the result (one row per id/event pair).
#
# The result is bound to the name `event_type`, which until now held the raw
# id/event_type table. Remember the raw table the first time this cell runs so
# that re-running the cell merges from the raw table again instead of merging
# the already-merged frame a second time.
if 'event_type_raw' not in globals():
    event_type_raw = event_type
event_type = pd.merge(df, event_type_raw, on='id', how='inner')
event_type.head()

Unnamed: 0,fault_severity,id,location,event_type
0,1.0,14121,location 118,event_type 34
1,1.0,14121,location 118,event_type 35
2,0.0,9320,location 91,event_type 34
3,0.0,9320,location 91,event_type 35
4,0.0,9320,location 91,event_type 34


In [90]:
# Tally how many merged rows carry each distinct event type, most frequent
# first, and keep the tally as a one-column DataFrame indexed by event type.
event_type_counts_series = event_type['event_type'].value_counts()
event_type_unq = event_type_counts_series.to_frame()
del event_type_counts_series
event_type_unq.head()

Unnamed: 0,event_type
event_type 11,9041
event_type 35,6814
event_type 34,6116
event_type 15,5003
event_type 20,1594


In [92]:
# The five rows shown above are the most frequent event types: 11, 35, 34, 15 and 20.

In [95]:
# Now let's deal with the nulls in the merged event_type frame: count the
# missing values per column. Rows that originate from the test file have no
# fault_severity, so that column is where missing values are expected.
event_type.isnull().sum()

fault_severity    20437
id                    0
location              0
event_type            0
dtype: int64

In [105]:
# Non-null count per column among merged rows labelled fault_severity 2.
# This counts id/event rows, not distinct ids, so it can exceed the number of
# ids that have that label.
event_type.loc[event_type['fault_severity'] == 2].count()

fault_severity    1428
id                1428
location          1428
event_type        1428
dtype: int64

In [106]:
# Show the merged rows that have a missing value in at least one column.
event_type.loc[event_type.isnull().any(axis='columns')]

Unnamed: 0,fault_severity,id,location,event_type
13645,,11066,location 481,event_type 35
13646,,11066,location 481,event_type 34
13647,,18000,location 962,event_type 11
13648,,18000,location 962,event_type 15
13649,,18000,location 962,event_type 11
13650,,18000,location 962,event_type 15
13651,,16964,location 491,event_type 35
13652,,16964,location 491,event_type 34
13653,,4795,location 532,event_type 27
13654,,4795,location 532,event_type 10
