In [2]:
# read in dataset
import pandas as pd
breaches = pd.read_csv('dropped_dataframe.csv')

In [3]:
breaches.shape

(2084, 17)

In [4]:
breaches.head()

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description,Hacking/IT Incident,Other,Unauthorized Access/Disclosure,Theft,Improper Disposal,Loss,Unknown,Name of Covered Entity clean
0,Delta Dental of Illinois,IL,Business Associate,4216.0,2019-04-17,Hacking/IT Incident,Email,Yes,Breach #19-340335 will be consolidated into Br...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Delta Dental of Illinois
1,Providence Health Plan,OR,Health Plan,651.0,2019-03-19,Theft,Laptop,Yes,An unencrypted laptop computer containing the ...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Providence Health Plan
2,ZOLL Services LLC,PA,Healthcare Provider,277319.0,2019-03-18,Hacking/IT Incident,Network Server,Yes,"Sonian, a subcontractor of the covered entity’...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,ZOLL Services
3,Pasquotank-Camden Emergency Medical Service,NC,Healthcare Provider,20420.0,2019-02-28,Hacking/IT Incident,Network Server,No,"On December 14, 2018, Pasquotank-Camden Emerge...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,PasquotankCamden Emergency Medical Service
4,AltaMed Health Services Corporation,CA,Healthcare Provider,6000.0,2019-02-15,Hacking/IT Incident,Network Server,Yes,"On December 31, 2018, ShareCare Health Data Se...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,AltaMed Health Services Corporation


# Preparing Data

# Breach Submission Date, separate column into day, month, year

In [5]:
# Transform object 'Breach Submission Date' to type date time using .to_datetime()
breaches['Breach Submission Date'] = pd.to_datetime(breaches['Breach Submission Date'], infer_datetime_format=True, errors='coerce')
# Confirm 'Breach Submission Date' column is in type date time
print(breaches.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2084 entries, 0 to 2083
Data columns (total 17 columns):
Name of Covered Entity              2084 non-null object
State                               2084 non-null object
Covered Entity Type                 2084 non-null object
Individuals Affected                2084 non-null float64
Breach Submission Date              2083 non-null datetime64[ns]
Type of Breach                      2084 non-null object
Location of Breached Information    2084 non-null object
Business Associate Present          2084 non-null object
Web Description                     2084 non-null object
Hacking/IT Incident                 2082 non-null float64
Other                               2082 non-null float64
Unauthorized Access/Disclosure      2082 non-null float64
Theft                               2082 non-null float64
Improper Disposal                   2082 non-null float64
Loss                                2082 non-null float64
Unknown                

In [6]:
# Split Breach Submission Date into day, month, year
breaches['Year'] = breaches['Breach Submission Date'].dt.year
breaches['Month'] = breaches['Breach Submission Date'].dt.month
breaches['Day'] = breaches['Breach Submission Date'].dt.day

# Extracting other features from Breach Submission Date variable (quarter, day of week, weekend, week of year)

In [7]:
# Quarter from date

breaches['quarter'] = breaches['Breach Submission Date'].dt.quarter
breaches[['Breach Submission Date','quarter']].head()

Unnamed: 0,Breach Submission Date,quarter
0,2019-04-17,2.0
1,2019-03-19,1.0
2,2019-03-18,1.0
3,2019-02-28,1.0
4,2019-02-15,1.0


In [8]:
# Day of the week from Date:
# series.dt.dayofweek attribute returns a numpy array 
# containing the day of the week of the DateTime variable 
# with Monday = 0 & Sunday = 6

breaches['dayofweek'] = breaches['Breach Submission Date'].dt.dayofweek
breaches[['Breach Submission Date','dayofweek']].head()

Unnamed: 0,Breach Submission Date,dayofweek
0,2019-04-17,2.0
1,2019-03-19,1.0
2,2019-03-18,0.0
3,2019-02-28,3.0
4,2019-02-15,4.0


In [9]:
# We can determine whether a day is a weekend or not by again using a simple isin function 
# that assigns ‘Saturday’ and ‘Sunday’ to 1 and the rest of the days to 0.

# import package
import numpy as np

breaches['is_weekend'] = np.where(breaches['dayofweek'].isin([5,6]),1,0)
breaches[['Breach Submission Date','is_weekend']].head()

Unnamed: 0,Breach Submission Date,is_weekend
0,2019-04-17,0
1,2019-03-19,0
2,2019-03-18,0
3,2019-02-28,0
4,2019-02-15,0


In [10]:
# Week of year
# pandas.series.weekofyear

breaches['weekofyear'] = breaches['Breach Submission Date'].dt.weekofyear
breaches[['Breach Submission Date','weekofyear']].head()

Unnamed: 0,Breach Submission Date,weekofyear
0,2019-04-17,16.0
1,2019-03-19,12.0
2,2019-03-18,12.0
3,2019-02-28,9.0
4,2019-02-15,7.0


# Location of breach methods, separate into unique columns with binary observations

In [11]:
# Location of Breach Methods, separate column into unique categories and transform into binary categories 
import numpy as np
# Make a list of unique location of breach values
location_of_breaches = ['Desktop Computer', 
                    'Electronic Medical Record', 
                    'Email',
                    'Laptop', 
                    'Network Server', 
                    'Other',
                    'Other Portable Electronic Device',
                    'Paper/Films',
                   ]

# Create new columns for these values in the dataframe
for value in location_of_breaches:
      breaches[value] = np.where(breaches['Location of Breached Information'].str.contains(value), 1, 0)

# Business Associate present, separate column into unique columns with binary observations

In [12]:
# Business Associate Column, transform into binary categories

# Make a list of unique Business Associate Present values
business_associate_present = ['Yes', 'No']

# Create new columns for these values in the dataframe
for value in business_associate_present:
      breaches[value] = np.where(breaches['Business Associate Present'].str.contains(value), 1, 0)

# Remove NaN values, produced error in model

In [13]:
# Error: ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Locate NaN values that are producing error above
import numpy as np
breaches[np.isnan(breaches['Hacking/IT Incident'])] # but don't i need to check whole df for nan?

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description,Hacking/IT Incident,...,weekofyear,Desktop Computer,Electronic Medical Record,Email,Laptop,Network Server,Other Portable Electronic Device,Paper/Films,Yes,No
1955,Penn Treaty Network America Insurance Company,PA,Health Plan,560.0,2010-08-03,Other,Other,No,Social security numbers were inadvertently pri...,,...,31.0,0,0,0,0,0,0,0,0,1
1956,\,0,1,0.0,NaT,0,0,0,Penn Treaty Network America Insurance Company,,...,,0,0,0,0,0,0,0,0,0


In [14]:
# Remove NaN values 
breaches = breaches[~np.isnan(breaches['Hacking/IT Incident'])]

# Scale date to filter our Individuals Affected > 10000

In [15]:
# use scaled dataset
scaled = breaches[breaches['Individuals Affected'] < 10000]

In [16]:
scaled.shape

(1668, 33)

In [17]:
scaled.head()

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description,Hacking/IT Incident,...,weekofyear,Desktop Computer,Electronic Medical Record,Email,Laptop,Network Server,Other Portable Electronic Device,Paper/Films,Yes,No
0,Delta Dental of Illinois,IL,Business Associate,4216.0,2019-04-17,Hacking/IT Incident,Email,Yes,Breach #19-340335 will be consolidated into Br...,1.0,...,16.0,0,0,1,0,0,0,0,1,0
1,Providence Health Plan,OR,Health Plan,651.0,2019-03-19,Theft,Laptop,Yes,An unencrypted laptop computer containing the ...,0.0,...,12.0,0,0,0,1,0,0,0,1,0
4,AltaMed Health Services Corporation,CA,Healthcare Provider,6000.0,2019-02-15,Hacking/IT Incident,Network Server,Yes,"On December 31, 2018, ShareCare Health Data Se...",1.0,...,7.0,0,0,0,0,1,0,0,1,0
5,"Lanier Family & Cosmetic Dentistry, P.C.",GA,Healthcare Provider,1950.0,2019-01-29,Unauthorized Access/Disclosure,Email,No,"On October 24, 2018, via a business associate ...",0.0,...,5.0,0,0,1,0,0,0,0,0,1
7,ABB Inc. Active Employee Group Benefit Plan,NC,Health Plan,6877.0,2019-01-18,Unauthorized Access/Disclosure,Paper/Films,Yes,"The covered entity (CE), ABB Inc. Active Emplo...",0.0,...,3.0,0,0,0,0,0,0,1,1,0


Moving forward use scaled dataframe.

# Balance data fed into model

In [18]:
# unbalanced, model does not have data to learn and make pred
scaled['Hacking/IT Incident'].value_counts()

0.0    1412
1.0     256
Name: Hacking/IT Incident, dtype: int64

In [19]:
# Balance data to include 256 Hacking/IT incident types and 256 other breach types 
df_other = scaled[scaled['Hacking/IT Incident'] == 0].head(256)
df_hacking = scaled[scaled['Hacking/IT Incident'] == 1]

In [20]:
# Combine filtered breach type dataframes
df_balanced_breach_type = pd.concat([df_other, df_hacking])

In [21]:
# Ensure filtered breach type dataframe is has balanced classes
df_balanced_breach_type['Hacking/IT Incident'].value_counts()

1.0    256
0.0    256
Name: Hacking/IT Incident, dtype: int64

In [22]:
df_balanced_breach_type

Unnamed: 0,Name of Covered Entity,State,Covered Entity Type,Individuals Affected,Breach Submission Date,Type of Breach,Location of Breached Information,Business Associate Present,Web Description,Hacking/IT Incident,...,weekofyear,Desktop Computer,Electronic Medical Record,Email,Laptop,Network Server,Other Portable Electronic Device,Paper/Films,Yes,No
1,Providence Health Plan,OR,Health Plan,651.0,2019-03-19,Theft,Laptop,Yes,An unencrypted laptop computer containing the ...,0.0,...,12.0,0,0,0,1,0,0,0,1,0
5,"Lanier Family & Cosmetic Dentistry, P.C.",GA,Healthcare Provider,1950.0,2019-01-29,Unauthorized Access/Disclosure,Email,No,"On October 24, 2018, via a business associate ...",0.0,...,5.0,0,0,1,0,0,0,0,0,1
7,ABB Inc. Active Employee Group Benefit Plan,NC,Health Plan,6877.0,2019-01-18,Unauthorized Access/Disclosure,Paper/Films,Yes,"The covered entity (CE), ABB Inc. Active Emplo...",0.0,...,3.0,0,0,0,0,0,0,1,1,0
8,Lebanon VA Medical Center,PA,Healthcare Provider,1002.0,2019-01-16,Unauthorized Access/Disclosure,Email,No,An employee of the covered entity (CE) inadver...,0.0,...,3.0,0,0,1,0,0,0,0,0,1
9,Humana Inc,KY,Health Plan,684.0,2018-12-31,Theft,Paper/Films,No,"On July 3, 2018, the covered entity’s (CE) sal...",0.0,...,1.0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,"Gary C. Spinks, DMD, PC",MD,Healthcare Provider,1000.0,2010-12-13,Hacking/IT Incident,"Desktop Computer, Network Server",No,\N,1.0,...,50.0,1,0,0,0,1,0,0,0,1
1906,SW Seattle Orthopaedic and Sports Medicine,WA,Healthcare Provider,9493.0,2010-10-15,Hacking/IT Incident,Network Server,No,"A database web server, containing the electron...",1.0,...,41.0,0,0,0,0,1,0,0,0,1
1947,UNCG Speech and Hearing Center,NC,Healthcare Provider,2300.0,2010-08-09,Hacking/IT Incident,Desktop Computer,No,Computer malware was detected on the covered e...,1.0,...,32.0,1,0,0,0,0,0,0,0,1
1996,"University of Louisville Research Foundation, ...",KY,Healthcare Provider,708.0,2010-06-01,Hacking/IT Incident,Network Server,No,An outside computer’s unique numerical code (I...,1.0,...,22.0,0,0,0,0,1,0,0,0,1


# Logistic regression model

Apply logistic regression model, to classfication problem of predicting Hacking/IT Incident breach types.

Predictor variables are separated into binary categories.

Model is run with unbalanced data 1412 observations for the Other Breach Types and 256 for Hacking/IT Breach Types.

Model is run again with Data that is balanced, selecting 512 observations, in order to accurately predict as we only have 256 values for hacking/it incident breach types, so we select only 256 for other breach types as well.

Select predictor variables: Individuals Affected, Breach Submission Date (separated), unique breach types, unique 
location of breach methods, Business Associate Present.

Select target variable: Hacking/IT Incident breach type, other breach types.

In [23]:
# Define data and target
X = scaled[['Individuals Affected', 
            'Year', # Date features
            'Month', 
            'Day',
            'quarter',
            'dayofweek',
            'is_weekend',
            'weekofyear',
            'Desktop Computer', # Location of breach methods
            'Electronic Medical Record', 
            'Email', 
            'Laptop',
            'Network Server', 
            'Other Portable Electronic Device', 
            'Paper/Films',
            'Yes', 
            'No']]

y = scaled['Hacking/IT Incident'] 

In [24]:
X.shape

(1668, 17)

In [25]:
y.shape

(1668,)

In [38]:
# Pipeline
# Build model + fit model

# import modules
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Classifier
reg = LogisticRegression() 

# train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42) 

# fit
reg.fit(X_train, y_train) 

# predict
y_pred = reg.predict(X_test)  

# metrics/evaluate model
from sklearn.metrics import recall_score, precision_score
print("Accuracy score on the train set:", reg.score(X_train, y_train)) 
print("Accuracy score on the test set: ", reg.score(X_test, y_test))
print('Recall, test set: %0.2f' % recall_score(y_test, y_pred))
print('Precision, test set: %0.2f' % precision_score(y_test, y_pred))

Accuracy score on the train set: 0.8875562218890555
Accuracy score on the test set:  0.8892215568862275
Recall, test set: 0.71
Precision, test set: 0.57




Results indicate overfitting as seen by 88% accuracy.

Run logistic regression classifier model with balanced data.

In [30]:
# Run moddel on balanced data
X_balanced = df_balanced_breach_type[['Individuals Affected', 
            'Year', # Date features
            'Month', 
            'Day',
            'quarter',
            'dayofweek',
            'is_weekend',
            'weekofyear',
            'Desktop Computer',  
            'Electronic Medical Record', 
            'Email', 
            'Laptop',
            'Network Server', 
            'Other Portable Electronic Device', 
            'Paper/Films',
            'Yes', 
            'No']]  

y_balanced = df_balanced_breach_type['Hacking/IT Incident'] # 

In [31]:
X_balanced.shape 

(512, 17)

In [32]:
y_balanced.shape 

(512,)

In [33]:
# Pipeline
# Build model + fit model

# import modules
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 
reg_balanced = LogisticRegression() 

# train/test split 
X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state = 42)

# fit
reg_balanced.fit(X_balanced_train, y_balanced_train) 

# predict
y_balanced_pred = reg.predict(X_balanced_test)  

# metrics/evaluate model
print("Accuracy score on the train set", reg.score(X_balanced_train, y_balanced_train)) 
print("Accuracy score on the test set", reg.score(X_balanced_test, y_balanced_test))
print('Recall, test set: %0.2f' % recall_score(y_balanced_test, y_balanced_pred))
print('Precision, test set: %0.2f' % precision_score(y_balanced_test, y_balanced_pred))

Accuracy score on the train set 0.7603911980440098
Accuracy score on the test set 0.8155339805825242
Recall, test set: 0.68
Precision, test set: 0.86




Interpretation of results:correction of overfitting.

# Tuning Model: Tune regularization parameter 'C'

In [41]:
# Tune regularization parameter: 'C'
# Use GridSearchCV to perform cross validation and grid search to find the best model 
# over the training data

# import packages
from sklearn.model_selection import GridSearchCV

# Instantiate Logistic regression
reg_tuning = LogisticRegression()

# Dictionary of regularization C parameters
parameters = {"C": [0.0001, 0.001, 0.1, 1, 10, 100]} 

# Fit model
fitmodel = GridSearchCV(reg_tuning, param_grid=parameters, cv=5, scoring="accuracy") 
fitmodel.fit(X_train, y_train) 
fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_, fitmodel.cv_results_ 



(LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 {'C': 10},
 0.8853073463268366,
 {'mean_fit_time': array([0.00703993, 0.0038878 , 0.00542264, 0.00551691, 0.00578256,
         0.00590076]),
  'std_fit_time': array([0.00568276, 0.00029956, 0.00082963, 0.00065503, 0.00056162,
         0.00066569]),
  'mean_score_time': array([0.00125566, 0.00101018, 0.00090542, 0.00087628, 0.00088425,
         0.00085645]),
  'std_score_time': array([5.47431769e-04, 1.79375023e-04, 7.22425763e-05, 4.85565879e-05,
         4.19474166e-05, 1.58754552e-05]),
  'param_C': masked_array(data=[0.0001, 0.001, 0.1, 1, 10, 100],
               mask=[False, False, False, False, False, False],
         fill_value='?',
              dtype=o

The parameter C=10 performs the best in this model.