In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import functools as fn
%matplotlib inline

In [2]:
ls

1533148585_PBL 1 service disruption data files.zip
HackDay3.ipynb
Untitled.ipynb
event_type.csv
log_feature.csv
resource_type.csv
severity_type.csv
train.csv


In [3]:
event_df = pd.read_csv('event_type.csv')
logfeature_df = pd.read_csv('log_feature.csv')
resource_df = pd.read_csv('resource_type.csv')
severity_df = pd.read_csv('severity_type.csv')
train_df = pd.read_csv('train.csv')

In [4]:
event_df.shape

(31170, 2)

In [5]:
logfeature_df.shape

(58671, 3)

In [6]:
resource_df.shape

(21076, 2)

In [7]:
train_df.shape

(7381, 3)

In [8]:
train_size = train_df.shape[0]

In [9]:
train_size

7381

In [10]:
dflist = [event_df, resource_df, severity_df, logfeature_df]

In [11]:
dfin = fn.reduce(lambda  left,right: pd.merge(left,right,on=['id'],
                                            how='outer'), dflist)

In [12]:
dfin.head()

Unnamed: 0,id,event_type,resource_type,severity_type,log_feature,volume
0,6597,event_type 11,resource_type 8,severity_type 2,feature 68,6
1,8011,event_type 15,resource_type 8,severity_type 2,feature 68,7
2,2597,event_type 15,resource_type 8,severity_type 2,feature 68,1
3,5022,event_type 15,resource_type 8,severity_type 1,feature 172,2
4,5022,event_type 15,resource_type 8,severity_type 1,feature 56,1


In [13]:
dfin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146423 entries, 0 to 146422
Data columns (total 6 columns):
id               146423 non-null int64
event_type       146423 non-null object
resource_type    146423 non-null object
severity_type    146423 non-null object
log_feature      146423 non-null object
volume           146423 non-null int64
dtypes: int64(2), object(4)
memory usage: 7.8+ MB


In [14]:
df = dfin.replace(['event_type', 'resource_type', 'severity_type', 'feature', ' '],'',regex=True)

In [15]:
test_size = df.shape[0]

In [16]:
test_size

146423

In [17]:
df.head()

Unnamed: 0,id,event_type,resource_type,severity_type,log_feature,volume
0,6597,11,8,2,68,6
1,8011,15,8,2,68,7
2,2597,15,8,2,68,1
3,5022,15,8,1,172,2
4,5022,15,8,1,56,1


In [18]:
df.shape

(146423, 6)

In [19]:
train_df.head()

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0


In [20]:
train_df = train_df.replace(['location',' '], '', regex=True)

In [21]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7381 entries, 0 to 7380
Data columns (total 3 columns):
id                7381 non-null int64
location          7381 non-null object
fault_severity    7381 non-null int64
dtypes: int64(2), object(1)
memory usage: 173.1+ KB


In [22]:
df = pd.merge(df, train_df)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Data columns (total 8 columns):
id                61839 non-null int64
event_type        61839 non-null object
resource_type     61839 non-null object
severity_type     61839 non-null object
log_feature       61839 non-null object
volume            61839 non-null int64
location          61839 non-null object
fault_severity    61839 non-null int64
dtypes: int64(3), object(5)
memory usage: 4.2+ MB


In [24]:
df.describe()

Unnamed: 0,id,volume,fault_severity
count,61839.0,61839.0,61839.0
mean,9104.379469,8.390336,0.549168
std,5387.274195,22.902491,0.727701
min,1.0,1.0,0.0
25%,4353.0,1.0,0.0
50%,8981.0,2.0,0.0
75%,13664.0,6.0,1.0
max,18550.0,877.0,2.0


In [25]:
abs(df.corr()['fault_severity'].sort_values(ascending = False))

fault_severity    1.000000
volume            0.027196
id                0.045528
Name: fault_severity, dtype: float64

In [26]:
df = pd.get_dummies(df, drop_first=True)

In [27]:
df.head()

Unnamed: 0,id,volume,fault_severity,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_18,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
0,8011,7,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2588,9,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2588,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2588,15,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2588,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df.isnull().sum().sum()

0

In [29]:
df.head(10)

Unnamed: 0,id,volume,fault_severity,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_18,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
0,8011,7,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2588,9,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2588,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2588,15,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2588,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,2588,9,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2588,5,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2588,15,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2588,5,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4848,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Columns: 1322 entries, id to location_999
dtypes: int64(3), uint8(1319)
memory usage: 79.7 MB


In [31]:
df.head(10)

Unnamed: 0,id,volume,fault_severity,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_18,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
0,8011,7,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2588,9,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2588,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2588,15,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2588,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,2588,9,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2588,5,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2588,15,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2588,5,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4848,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
df.isnull().sum()

id                0
volume            0
fault_severity    0
event_type_10     0
event_type_11     0
                 ..
location_994      0
location_995      0
location_996      0
location_998      0
location_999      0
Length: 1322, dtype: int64

In [33]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [34]:
gbc = GradientBoostingClassifier()

In [35]:
new_train = df[:train_size]
new_test = df[train_size:]

In [36]:
new_test = new_test.drop('fault_severity',axis=1)

In [37]:
y = new_train['fault_severity']
x = new_train.drop('fault_severity',axis=1)

In [38]:
x.shape, y.shape, new_test.shape

((7381, 1321), (7381,), (54458, 1321))

In [39]:
model = gbc.fit(x,y)

In [40]:
y_predgbc = model.predict(new_test)

In [41]:
y_pred_prob_gbc = model.predict_proba(new_test)

In [42]:
result = pd.DataFrame({
    "id": df.id,
    "Predicted fault_severity": y_predgbc,
    "Predicted_probability_0": y_pred_prob_gbc[test_size:,0],
    "Predicted_probability_1": y_pred_prob_gbc[test_size:,1],
    "Predicted_probability_2": y_pred_prob_gbc[test_size:,2]}
    ,columns=['id', 'Predicted fault_severity', 'Predicted_probability_0', 'Predicted_probability_1', 'Predicted_probability_2'])

ValueError: array length 54458 does not match index length 61839