In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import functools as fn
%matplotlib inline

In [2]:
ls

1533148585_PBL 1 service disruption data files.zip
HackDay3.ipynb
Untitled.ipynb
event_type.csv
log_feature.csv
resource_type.csv
severity_type.csv
train.csv


In [3]:
event_df = pd.read_csv('event_type.csv')
logfeature_df = pd.read_csv('log_feature.csv')
resource_df = pd.read_csv('resource_type.csv')
severity_df = pd.read_csv('severity_type.csv')
train_df = pd.read_csv('train.csv')

In [4]:
event_df.shape

(31170, 2)

In [5]:
logfeature_df.shape

(58671, 3)

In [6]:
resource_df.shape

(21076, 2)

In [7]:
train_df.shape

(7381, 3)

In [8]:
train_df.drop_duplicates(subset= 'id', keep= 'first', inplace = True)

In [9]:
train_size = train_df.shape[0]

In [10]:
train_size

7381

In [11]:
dflist = [event_df, resource_df, severity_df, logfeature_df]

In [12]:
dfin = fn.reduce(lambda  left,right: pd.merge(left,right,on=['id'],
                                            how='outer'), dflist)

In [13]:
dfin.head()

Unnamed: 0,id,event_type,resource_type,severity_type,log_feature,volume
0,6597,event_type 11,resource_type 8,severity_type 2,feature 68,6
1,8011,event_type 15,resource_type 8,severity_type 2,feature 68,7
2,2597,event_type 15,resource_type 8,severity_type 2,feature 68,1
3,5022,event_type 15,resource_type 8,severity_type 1,feature 172,2
4,5022,event_type 15,resource_type 8,severity_type 1,feature 56,1


In [14]:
dfin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146423 entries, 0 to 146422
Data columns (total 6 columns):
id               146423 non-null int64
event_type       146423 non-null object
resource_type    146423 non-null object
severity_type    146423 non-null object
log_feature      146423 non-null object
volume           146423 non-null int64
dtypes: int64(2), object(4)
memory usage: 7.8+ MB


In [15]:
test_df = dfin.replace(['event_type', 'resource_type', 'severity_type', 'feature', ' '],'',regex=True)

In [16]:
test_df.drop_duplicates(subset= 'id', keep= 'first', inplace = True)

In [17]:
test_size = test_df.shape[0]

In [18]:
test_size

18552

In [19]:
test_df.head()

Unnamed: 0,id,event_type,resource_type,severity_type,log_feature,volume
0,6597,11,8,2,68,6
1,8011,15,8,2,68,7
2,2597,15,8,2,68,1
3,5022,15,8,1,172,2
11,6852,11,8,1,201,2


In [20]:
test_df.shape

(18552, 6)

In [21]:
train_df.head()

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


In [22]:
train_df = train_df.replace(['location',' '], '', regex=True)

In [23]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7381 entries, 0 to 7380
Data columns (total 3 columns):
id                7381 non-null int64
location          7381 non-null object
fault_severity    7381 non-null int64
dtypes: int64(2), object(1)
memory usage: 230.7+ KB


In [24]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7381 entries, 0 to 7380
Data columns (total 3 columns):
id                7381 non-null int64
location          7381 non-null object
fault_severity    7381 non-null int64
dtypes: int64(2), object(1)
memory usage: 230.7+ KB


In [25]:
train_df[7375:]

Unnamed: 0,id,location,fault_severity
7375,10455,1075,2
7376,870,167,0
7377,18068,106,0
7378,14111,1086,2
7379,15189,7,0
7380,17067,885,0


In [26]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 146421
Data columns (total 6 columns):
id               18552 non-null int64
event_type       18552 non-null object
resource_type    18552 non-null object
severity_type    18552 non-null object
log_feature      18552 non-null object
volume           18552 non-null int64
dtypes: int64(2), object(4)
memory usage: 1014.6+ KB


In [27]:
dflist = [train_df, test_df]

In [28]:
df = fn.reduce(lambda  left,right: pd.merge(left,right,on=['id'],
                                            how='outer'), dflist)

In [29]:
df.head()

Unnamed: 0,id,location,fault_severity,event_type,resource_type,severity_type,log_feature,volume
0,14121,118,1.0,34,2,2,312,19
1,9320,91,0.0,34,2,2,315,200
2,14394,152,1.0,35,2,2,221,1
3,8218,931,1.0,15,8,1,80,9
4,14804,120,0.0,34,2,1,134,1


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 18551
Data columns (total 8 columns):
id                18552 non-null int64
location          7381 non-null object
fault_severity    7381 non-null float64
event_type        18552 non-null object
resource_type     18552 non-null object
severity_type     18552 non-null object
log_feature       18552 non-null object
volume            18552 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 1.3+ MB


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 18551
Data columns (total 8 columns):
id                18552 non-null int64
location          7381 non-null object
fault_severity    7381 non-null float64
event_type        18552 non-null object
resource_type     18552 non-null object
severity_type     18552 non-null object
log_feature       18552 non-null object
volume            18552 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 1.3+ MB


In [32]:
df.shape

(18552, 8)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 18551
Data columns (total 8 columns):
id                18552 non-null int64
location          7381 non-null object
fault_severity    7381 non-null float64
event_type        18552 non-null object
resource_type     18552 non-null object
severity_type     18552 non-null object
log_feature       18552 non-null object
volume            18552 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 1.3+ MB


In [34]:
df.describe()

Unnamed: 0,id,fault_severity,volume
count,18552.0,7381.0,18552.0
mean,9276.5,0.45021,8.616753
std,5355.645433,0.66656,25.232579
min,1.0,0.0,1.0
25%,4638.75,0.0,1.0
50%,9276.5,0.0,2.0
75%,13914.25,1.0,7.0
max,18552.0,2.0,814.0


In [35]:
abs(df.corr()['fault_severity'].sort_values(ascending = False))

fault_severity    1.000000
volume            0.001013
id                0.010589
Name: fault_severity, dtype: float64

In [36]:
df = pd.get_dummies(df, drop_first=True)

In [37]:
df.head()

Unnamed: 0,id,fault_severity,volume,location_10,location_100,location_1000,location_1002,location_1005,location_1006,location_1007,...,log_feature_84,log_feature_85,log_feature_86,log_feature_87,log_feature_90,log_feature_93,log_feature_94,log_feature_95,log_feature_98,log_feature_99
0,14121,1.0,19,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9320,0.0,200,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14394,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8218,1.0,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14804,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
df.isnull().sum().sum()

11171

In [39]:
df.head(10)

Unnamed: 0,id,fault_severity,volume,location_10,location_100,location_1000,location_1002,location_1005,location_1006,location_1007,...,log_feature_84,log_feature_85,log_feature_86,log_feature_87,log_feature_90,log_feature_93,log_feature_94,log_feature_95,log_feature_98,log_feature_99
0,14121,1.0,19,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9320,0.0,200,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14394,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8218,1.0,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14804,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1080,0.0,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,9731,0.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,15505,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3443,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,13300,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18552 entries, 0 to 18551
Columns: 1263 entries, id to log_feature_99
dtypes: float64(1), int64(2), uint8(1260)
memory usage: 22.9 MB


In [41]:
df.head(10)

Unnamed: 0,id,fault_severity,volume,location_10,location_100,location_1000,location_1002,location_1005,location_1006,location_1007,...,log_feature_84,log_feature_85,log_feature_86,log_feature_87,log_feature_90,log_feature_93,log_feature_94,log_feature_95,log_feature_98,log_feature_99
0,14121,1.0,19,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9320,0.0,200,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14394,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8218,1.0,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14804,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1080,0.0,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,9731,0.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,15505,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3443,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,13300,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
df.isnull().sum()

id                    0
fault_severity    11171
volume                0
location_10           0
location_100          0
                  ...  
log_feature_93        0
log_feature_94        0
log_feature_95        0
log_feature_98        0
log_feature_99        0
Length: 1263, dtype: int64

In [43]:
df.shape

(18552, 1263)

In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [45]:
gbc = GradientBoostingClassifier()

In [46]:
new_train = df[:train_size]
new_test = df[train_size:]

In [47]:
new_test = new_test.drop('fault_severity',axis=1)

In [48]:
y = new_train['fault_severity']
x = new_train.drop('fault_severity',axis=1)

In [49]:
x.shape, y.shape, new_test.shape

((7381, 1262), (7381,), (11171, 1262))

In [50]:
model = gbc.fit(x,y)

In [51]:
y_predgbc = model.predict(new_test)

In [52]:
y_pred_prob_gbc = model.predict_proba(new_test)

In [53]:
len(y_pred_prob_gbc)

11171

In [54]:
len(y_predgbc)

11171

In [55]:
test_size

18552

In [56]:
result = pd.DataFrame({
    "id": new_test.id,
    "Predicted fault_severity": y_predgbc,
    "Predicted_probability_0": y_pred_prob_gbc[:,0],
    "Predicted_probability_1": y_pred_prob_gbc[:,1],
    "Predicted_probability_2": y_pred_prob_gbc[:,2]}
    ,columns=['id', 'Predicted fault_severity', 'Predicted_probability_0', 'Predicted_probability_1', 'Predicted_probability_2'])

In [57]:
result.head()

Unnamed: 0,id,Predicted fault_severity,Predicted_probability_0,Predicted_probability_1,Predicted_probability_2
7381,6597,0.0,0.79173,0.16229,0.04598
7382,2597,0.0,0.585764,0.329771,0.084465
7383,5022,0.0,0.461777,0.405125,0.133098
7384,6852,0.0,0.495739,0.430224,0.074038
7385,5611,0.0,0.597744,0.316064,0.086193
