# Load Dataset

In [1]:
# Read the preprocessed breach records from CSV into a DataFrame
import pandas as pd

breaches = pd.read_csv(filepath_or_buffer='preprocessed_df.csv')

In [2]:
# Dimensions of the loaded frame as (rows, columns)
breaches.shape

(512, 33)

In [3]:
# Preview the first rows to check that the columns loaded as expected
breaches.head()

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description,Hacking/IT Incident,...,weekofyear,Desktop Computer,Electronic Medical Record,Email,Laptop,Network Server,Other Portable Electronic Device,Paper/Films,Yes,No
0,Providence Health Plan,OR,Health Plan,651.0,2019-03-19,Theft,Laptop,Yes,An unencrypted laptop computer containing the ...,0.0,...,12.0,0,0,0,1,0,0,0,1,0
1,"Lanier Family & Cosmetic Dentistry, P.C.",GA,Healthcare Provider,1950.0,2019-01-29,Unauthorized Access/Disclosure,Email,No,"On October 24, 2018, via a business associate ...",0.0,...,5.0,0,0,1,0,0,0,0,0,1
2,ABB Inc. Active Employee Group Benefit Plan,NC,Health Plan,6877.0,2019-01-18,Unauthorized Access/Disclosure,Paper/Films,Yes,"The covered entity (CE), ABB Inc. Active Emplo...",0.0,...,3.0,0,0,0,0,0,0,1,1,0
3,Lebanon VA Medical Center,PA,Healthcare Provider,1002.0,2019-01-16,Unauthorized Access/Disclosure,Email,No,An employee of the covered entity (CE) inadver...,0.0,...,3.0,0,0,1,0,0,0,0,0,1
4,Humana Inc,KY,Health Plan,684.0,2018-12-31,Theft,Paper/Films,No,"On July 3, 2018, the covered entity’s (CE) sal...",0.0,...,1.0,0,0,0,0,0,0,1,0,1


In [4]:
# Double-check that the data is balanced: count the rows in each class of the
# target column (the two counts should match)
breaches['Hacking/IT Incident'].value_counts()

1.0    256
0.0    256
Name: Hacking/IT Incident, dtype: int64

# I. Naive Bayes Classifier Model

Apply a naive Bayes model to the classification problem of predicting Hacking/IT Incident breach types. (Note: GaussianNB implements the Gaussian Naive Bayes algorithm for classification; the likelihood of the features is assumed to be Gaussian.) See: https://scikit-learn.org/stable/modules/naive_bayes.html

Categorical predictor variables are separated into binary (0/1) indicator columns.

The data is balanced at 512 observations: only 256 records are Hacking/IT Incident breaches, so only 256 records of the other breach types are selected as well, in order to predict accurately.

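Select predictor variables: Individuals Affected, Breach Submission Date (separated into year, month, day, quarter, day of week, weekend flag, and week of year), location of breached information (one indicator column per location), and Business Associate Present (Yes/No indicator columns). The breach-type columns themselves are not included among the predictors, since the target is derived from the type of breach.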

Select target variable: Hacking/IT Incident (1 = Hacking/IT Incident breach, 0 = any other breach type).

# Feature Selection

In [5]:
# Naive Bayes classifier for predicting the type of breach:
# assemble the predictor matrix X and the target vector y.

# Predictor columns, grouped by what they describe
size_features = ['Individuals Affected']
date_features = ['Year', 'Month', 'Day', 'quarter',
                 'dayofweek', 'is_weekend', 'weekofyear']
location_features = ['Desktop Computer', 'Electronic Medical Record', 'Email',
                     'Laptop', 'Network Server',
                     'Other Portable Electronic Device', 'Paper/Films']
associate_features = ['Yes', 'No']  # Business Associate presence

feature_columns = (size_features + date_features
                   + location_features + associate_features)

# Features (columns kept in the order the groups are concatenated above)
X = breaches[feature_columns]

# Target variable
y = breaches['Hacking/IT Incident']

In [6]:
# Feature matrix dimensions as (rows, columns)
X.shape 

(512, 17)

In [7]:
# Length of the target vector; should match the number of rows in X
y.shape 

(512,)

In [8]:
# Libraries for modelling and evaluation
import numpy as np
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the Gaussian Naive Bayes classifier (default parameters) and train it
NB = GaussianNB()
NB.fit(X_train, y_train)

# Predicted classes for the held-out test rows
y_pred = NB.predict(X_test)

# Evaluating Model

Check accuracy using actual and predicted values.

In [9]:
# How often is the classifier correct? Accuracy on both splits, plus recall
# and precision on the test set.
train_accuracy = NB.score(X_train, y_train)
test_accuracy = NB.score(X_test, y_test)
test_recall = recall_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)

print("Accuracy for train set:", train_accuracy)
print("Accuracy test set:", test_accuracy)
print('Recall, test set: %0.2f' % test_recall)
print('Precision, test set: %0.2f' % test_precision)

Accuracy for train set: 0.8557457212713936
Accuracy test set: 0.7475728155339806
Recall, test set: 0.95
Precision, test set: 0.64


In [13]:
# Per-class precision, recall, F1-score and support on the test set
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.95      0.59      0.73        59
         1.0       0.64      0.95      0.76        44

    accuracy                           0.75       103
   macro avg       0.79      0.77      0.75       103
weighted avg       0.81      0.75      0.74       103

