In [2]:
import ast
import math
import statistics as stats

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
# Load the breach dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="databreaches650.csv")

In [4]:
# Create a pandas Series - each row is a numpy array of data classes
def parse_data_classes(raw):
    """Turn one raw "DataClasses" cell into a numpy array of data classes.

    Parameters
    ----------
    raw : str or any
        The text of a Python list literal as stored in the CSV. A value that is
        not a string is returned unchanged, so this cell can be re-run after
        the column has already been parsed.

    Returns
    -------
    numpy.ndarray
        The parsed list as an array (or `raw` itself when it is not a string).

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    if isinstance(raw, str):
        # literal_eval only accepts Python literals, so text read from the
        # CSV is parsed without ever being executed (unlike eval).
        return np.array(ast.literal_eval(raw))
    return raw


data_classes = df["DataClasses"].apply(parse_data_classes)
data_classes

0      [Email addresses, IP addresses, Names, Passwords]
1      [Email addresses, IP addresses, Names, Passwor...
2                           [Email addresses, Passwords]
3      [Device information, Email addresses, IP addre...
4                [Email addresses, Passwords, Usernames]
                             ...                        
645    [Dates of birth, Email addresses, Genders, Nam...
646    [Email addresses, Passwords, Phone numbers, Us...
647              [Email addresses, Passwords, Usernames]
648    [Dates of birth, Email addresses, Genders, Geo...
649    [Email addresses, Names, Phone numbers, Userna...
Name: DataClasses, Length: 650, dtype: object

In [None]:
# Count how often each data class appears across all breaches: explode() puts
# every class on its own row and value_counts() totals the rows per class.
# NOTE(review): `data_classes_counts` was not defined in any cell above, so it
# is rebuilt here from `data_classes` - confirm this matches the original tally.
data_classes_counts = data_classes.explode().value_counts().to_dict()

data = {'DataClass': list(data_classes_counts.keys()),
        'Count': list(data_classes_counts.values())
        }
df_dataclasses = pd.DataFrame.from_dict(data)

# Sort df based on Count (most frequently exposed data classes first)
df_dataclasses = df_dataclasses.sort_values('Count', ascending=False)

# Calculate the frequency of data classes across all breaches.
# The denominator is the number of breaches actually loaded rather than a
# hard-coded 650, so the percentages stay correct if the dataset changes.
n_breaches = len(data_classes)
df_dataclasses['Percentage'] = (df_dataclasses['Count'] / n_breaches * 100).round(2)
df_dataclasses.head(10)

# One-off export of the table above, kept for reference.
# NOTE(review): presumably this produced "dataclasses_classify.csv" before the
# SeverityLevel column was added to it - confirm.
# from pathlib import Path  
# filepath = Path('dataclasses_classify.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# df_dataclasses.to_csv(filepath, index=False)

In [5]:
# Read data classes file
df_dc = pd.read_csv("dataclasses_classify.csv")
df_dc.head()

Unnamed: 0,DataClass,Count,Percentage,SeverityLevel
0,Email addresses,644,99.08,High
1,Passwords,511,78.62,High
2,Usernames,346,53.23,High
3,Names,304,46.77,High
4,IP addresses,278,42.77,High


- We categorised the severity level of each data class based on the guidance in https://foresite.com/blog/classifying-the-severity-of-a-cyber-incident/ and https://teamciso.com/2016/05/csirt-classifying-the-severity-of-a-breach.html

In [7]:
# Map every severity level (key) to the list of data classes assigned to it (value)
dictionary = {
    level: classes.tolist()
    for level, classes in df_dc.groupby('SeverityLevel')['DataClass']
}
dictionary

{'High': ['Email addresses',
  'Passwords',
  'Usernames',
  'Names',
  'IP addresses',
  'Phone numbers',
  'Dates of birth',
  'Physical addresses',
  'Geographic locations',
  'Social media profiles',
  'Purchases',
  'Private messages',
  'Partial credit card data',
  'Security questions and answers',
  'Device information',
  'Government issued IDs',
  'Payment histories',
  'Auth tokens',
  'Email messages',
  'Avatars',
  'Account balances',
  'Instant messenger identities',
  'Bank account numbers',
  'Bios',
  'Profile photos',
  'Vehicle details',
  'Social security numbers',
  'Home ownership statuses',
  'Historical passwords',
  'Passport numbers',
  'Credit cards',
  'Credit status information',
  'Personal health data',
  'Partial dates of birth',
  "Family members' names",
  'Chat logs',
  'SMS messages',
  'PINs',
  'Financial transactions',
  'Health insurance information',
  'Licence plates',
  'Places of birth',
  'Credit card CVV',
  'Password strengths',
  'Audio 

In [10]:
# Number of data classes assigned to each severity level
for level, classes in dictionary.items():
    print(level, len(classes))

High 57
Low 45
Medium 33


In [17]:
# Replace the raw text in "DataClasses" with numpy arrays of data classes.
# ast.literal_eval parses the list literal without executing it (unlike eval),
# and the isinstance guard leaves already-parsed values untouched so this cell
# can be re-run without failing on its own earlier output.
df['DataClasses'] = df["DataClasses"].apply(
    lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x
)

In [42]:
# Create a new column: Severity Level for each breach
# If there is a 'High' level data class -> classify that breach as High
# Else if there is a 'Medium' level data class -> classify that breach as Medium
# Else classify that breach as Low

# Function to classify severity level of each breach by data class's severity level
def classify_severity_level(x, severity_map=None):
    """Return the severity level of a breach from the data classes it exposed.

    The breach takes the most severe level found among its data classes:
    'High' if any class is listed under 'High', otherwise 'Medium' if any class
    is listed under 'Medium', otherwise 'Low'. Classes that appear in neither
    list, and an empty input, therefore yield 'Low'.

    Parameters
    ----------
    x : iterable
        The data classes exposed by one breach.
    severity_map : mapping, optional
        Maps the keys 'High' and 'Medium' to containers of data classes.
        Defaults to the notebook-level `dictionary`. A missing key is treated
        as an empty container instead of raising KeyError.

    Returns
    -------
    str
        'High', 'Medium' or 'Low'.
    """
    if severity_map is None:
        # Looked up at call time so the notebook-level mapping is used by default
        severity_map = dictionary

    high_classes = severity_map.get('High', ())
    medium_classes = severity_map.get('Medium', ())

    # Materialise once: x may be scanned twice (High pass, then Medium pass)
    exposed = list(x)

    if any(data_class in high_classes for data_class in exposed):
        return 'High'
    if any(data_class in medium_classes for data_class in exposed):
        return 'Medium'
    return 'Low'
    
# Label every breach with the severity level derived from its data classes
df['SeverityLevel'] = df["DataClasses"].apply(classify_severity_level)

In [45]:
# Show each breach's data classes next to its assigned severity level
df.loc[:, ['DataClasses', 'SeverityLevel']]

Unnamed: 0,DataClasses,SeverityLevel
0,"[Email addresses, IP addresses, Names, Passwords]",High
1,"[Email addresses, IP addresses, Names, Passwor...",High
2,"[Email addresses, Passwords]",High
3,"[Device information, Email addresses, IP addre...",High
4,"[Email addresses, Passwords, Usernames]",High
...,...,...
645,"[Dates of birth, Email addresses, Genders, Nam...",High
646,"[Email addresses, Passwords, Phone numbers, Us...",High
647,"[Email addresses, Passwords, Usernames]",High
648,"[Dates of birth, Email addresses, Genders, Geo...",High


In [50]:
# Number of breaches at each severity level
df.loc[:, 'SeverityLevel'].value_counts(normalize=False)

High    649
Low       1
Name: SeverityLevel, dtype: int64

In [47]:
# Share of breaches at each severity level (fractions that sum to 1)
df.loc[:, 'SeverityLevel'].value_counts(normalize=True)

High    0.998462
Low     0.001538
Name: SeverityLevel, dtype: float64

In [49]:
# List the breaches that were classified as low severity
df.loc[df['SeverityLevel'].eq('Low')]

Unnamed: 0,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,SeverityLevel
377,mSpy,mSpy,mspy.com,2015-05-14,2015-05-28 18:09:16,2015-05-28 18:09:16,699793,"In May 2015, the &quot;monitoring&quot; softwa...",https://haveibeenpwned.com/Content/Images/Pwne...,[Device usage tracking data],True,False,False,False,False,False,Low


- From the results above, we can see that 99.85% of breaches (649 of 650) have a high severity level. This means that at least one of the data classes exposed in each of those breaches has a high severity level. This is consistent with an earlier finding: 99.08% of all breaches in the dataset have email addresses compromised, and email addresses are classified as having a high severity level.

- Only one breach, 'mSpy', has a low severity level; the only data class compromised in it is device usage tracking data.