In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import functools as fn
%matplotlib inline

In [2]:
event_df = pd.read_csv('event_type.csv')
logfeature_df = pd.read_csv('log_feature.csv')
resource_df = pd.read_csv('resource_type.csv')
severity_df = pd.read_csv('severity_type.csv')
train_df = pd.read_csv('train.csv')

In [3]:
csv_list = [event_df, resource_df, severity_df, logfeature_df, train_df]
data = fn.reduce(lambda  left,right: pd.merge(left,right,on=['id']), csv_list)

In [4]:
data.head()

Unnamed: 0,id,event_type,resource_type,severity_type,log_feature,volume,location,fault_severity
0,8011,event_type 15,resource_type 8,severity_type 2,feature 68,7,location 1,0
1,2588,event_type 15,resource_type 8,severity_type 1,feature 82,9,location 1,0
2,2588,event_type 15,resource_type 8,severity_type 1,feature 201,5,location 1,0
3,2588,event_type 15,resource_type 8,severity_type 1,feature 80,15,location 1,0
4,2588,event_type 15,resource_type 8,severity_type 1,feature 203,5,location 1,0


In [5]:
data = data.replace(['event_type', 'resource_type', 'severity_type', 'feature', 'location', ' '],'',regex=True)

In [6]:
data.head()

Unnamed: 0,id,event_type,resource_type,severity_type,log_feature,volume,location,fault_severity
0,8011,15,8,2,68,7,1,0
1,2588,15,8,1,82,9,1,0
2,2588,15,8,1,201,5,1,0
3,2588,15,8,1,80,15,1,0
4,2588,15,8,1,203,5,1,0


In [7]:
data.shape

(61839, 8)

In [8]:
data = pd.get_dummies(data, drop_first=True)

In [9]:
data.drop('fault_severity', axis=1, inplace=True)

In [10]:
df = data.groupby('id').sum()

In [11]:
df.shape

(7381, 1320)

In [12]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,id,volume,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_18,event_type_19,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
0,1,20,0.0,6.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df = df.merge(train_df, on='id', how='inner')

In [14]:
df.drop('location', axis=1, inplace=True)
df.head()

Unnamed: 0,id,volume,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_18,event_type_19,...,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999,fault_severity
0,1,20,0.0,6.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,5,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,6,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,8,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,13,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [15]:
features = pd.DataFrame()
features['feature'] = df.columns
features['importance'] = list(abs(df.corrwith(df.fault_severity)))
features.sort_values(by=['importance'], ascending=False, inplace=True)
features.set_index('feature', inplace=True)
features.head(10)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
fault_severity,1.0
log_feature_203,0.406703
log_feature_82,0.354069
event_type_15,0.335126
resource_type_8,0.255359
log_feature_312,0.24679
event_type_35,0.246119
event_type_34,0.23144
log_feature_232,0.228029
severity_type_2,0.194623


In [16]:
y = df.fault_severity
x = df.drop('fault_severity', axis=1)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, random_state=10)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((5904, 1321), (1477, 1321), (5904,), (1477,))

In [24]:
gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=3, random_state=10)

In [25]:
gbcmodel = gbc.fit(x_train, y_train)

In [26]:
print(gbc.score(x_test, y_test))

0.7210561949898443
