In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load the data breaches dataset from the CSV file into a dataframe
data_breaches = pd.read_csv("data_breaches.csv")
# display a preview of the first rows
data_breaches.head()

In [None]:
# print the number of columns and rows
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nThere are " + str(data_breaches.shape[1]) + " columns and " + str(data_breaches.shape[0]) + " rows.\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

In [None]:
# summary statistics for the numerical columns
data_breaches.describe()

In [None]:
# show the data type of each column
data_breaches.dtypes

In [None]:
# list all the column names
data_breaches.columns

In [None]:
# report how many distinct values each categorical (`object` dtype) column holds
object_columns = data_breaches.select_dtypes(include=['object']).columns

# nunique() on the sub-frame yields one count per column, in column order
unique_counts = data_breaches[object_columns].nunique()
for column, unique_count in unique_counts.items():
    print(f"There are {unique_count} different values in `{column}`.")

In [None]:
# list the distinct values of the columns where doing so is informative
columns_to_inspect = ['Entity', 'Year', 'Records', 'Organization type', 'Method', 'Sources']

for position, column in enumerate(columns_to_inspect):
    # every listing after the first is preceded by a blank line
    separator = "" if position == 0 else "\n"
    print(f"{separator}Unique values in `{column}`:\n", data_breaches[column].unique())

In [None]:
# count the nulls in every column
missing_values = data_breaches.isnull().sum()

# express the null counts as a percentage of all rows, rounded to 3 decimals
total_rows = data_breaches.shape[0]
missing_pct = (missing_values / total_rows * 100).round(3)

# keep only the columns that actually contain nulls
has_missing = missing_values > 0
missing_values_df = pd.DataFrame(missing_values[has_missing], columns=['Count Missing'])
# the assignment aligns on the column-name index, so only the retained columns get a percentage
missing_values_df["% Missing"] = missing_pct

# show the table when there is something to report, otherwise say so
if not missing_values_df.empty:
    display(missing_values_df)
else:
    print("There are no missing values.")

In [None]:
# boolean series marking every row that has an identical twin (the first occurrence included)
duplicate_mask = data_breaches.duplicated(keep=False)

# count only the repeats: the first occurrence of a row is not counted as a duplicate here
duplicate_rows = data_breaches.duplicated().sum()
print(f"There are {duplicate_rows} duplicate row(s).\n")

# show the originals together with their duplicates
duplicate_rows_df = data_breaches[duplicate_mask]
duplicate_rows_df

In [None]:
# number of breaches per value of `Year`; `Year` stays a regular column next to the count
breaches_per_year = data_breaches.groupby('Year').size()
year_counts = breaches_per_year.reset_index(name='num_breaches')
year_counts



In [None]:
# Plot: number of data breaches recorded per year (from `year_counts`)
f, ax = plt.subplots(figsize=(10, 3))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.barplot(x='Year', y='num_breaches', data=year_counts, palette='Pastel2', ax=ax)
# a figure has to stand on its own, so give it a real title and axis labels
ax.set_title('Number of data breaches by year', weight='bold')
ax.set_xlabel('Year', weight='bold')
ax.set_ylabel('Number of breaches', weight='bold')

In [None]:
# breaches whose method is recorded as 'unknown', tallied per year
is_unknown_method = data_breaches['Method'] == 'unknown'
unknown = data_breaches[is_unknown_method]
unknown_counts = unknown.groupby('Year').size().reset_index(name='num_breaches')
unknown_counts

In [None]:
# define a function which reports whether a value (e.g. a number of records) is purely numeric
def check_numeric(value):
    """Return the non-numeric text contained in ``value``, or ``'numeric'`` if there is none.

    Parameters
    ----------
    value : object
        Usually a string such as ``'123'`` or ``'789 xyz'``. Anything else is
        converted with ``str()``. ``None`` and float NaN are treated as empty.

    Returns
    -------
    str
        The characters of ``value`` that are neither digits nor ``'.'``, joined
        together and stripped of surrounding whitespace; ``'numeric'`` when no
        such characters remain (which includes null/empty input).
    """
    # work on a string so the character tests below apply to any input type;
    # nulls (None, or NaN which is the only float not equal to itself) become ''
    is_null = value is None or (isinstance(value, float) and value != value)
    str_value = '' if is_null else str(value)

    # keep only the characters that are not part of a plain decimal number
    non_numeric = ''.join(ch for ch in str_value if not ch.isdigit() and ch != '.').strip()
    return non_numeric if non_numeric else 'numeric'

In [None]:
# Sample DataFrame used to demonstrate the conversion
data = {'column_name': ['123', '45.6', 'abc', '789 xyz']}
df = pd.DataFrame(data)

def extract_numeric(value):
    """Convert ``value`` to a float, returning NaN when it cannot be converted.

    Parameters
    ----------
    value : object
        Typically a string; any input accepted by ``float()`` works.

    Returns
    -------
    float
        ``float(value)``, or ``np.nan`` if the conversion fails.
    """
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: text that is not a number; TypeError: None and other non-convertible objects
        return np.nan

# Apply the custom function to the DataFrame column
df['int_records'] = df['column_name'].apply(extract_numeric)

print(df)


In [None]:
# Define the function
def check_numeric_and_convert(value):
    """Convert ``value`` to an ``int``, returning ``pd.NA`` when that is not possible.

    Parameters
    ----------
    value : object
        Any input accepted by ``int()``. Strings must be plain integer
        literals: ``'789.0'`` does not qualify and yields ``pd.NA``.

    Returns
    -------
    int or pandas.NA
        ``int(value)``, or ``pd.NA`` if the conversion fails.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-integer text or NaN; TypeError: None; OverflowError: +/- infinity
        return pd.NA

# Convert `Records` to integers where possible; every other value becomes missing
data_breaches['int_records'] = data_breaches['Records'].apply(check_numeric_and_convert)

# Give the mixed int / pd.NA column a numeric dtype, coercing anything non-numeric to NaN
data_breaches['int_records'] = pd.to_numeric(data_breaches['int_records'], errors='coerce')

# Preview the original and converted values side by side
data_breaches[['Records', 'int_records']].head(10)

In [None]:
# Convert `Year` to integers where possible; every other value becomes missing.
# `check_numeric_and_convert` is the converter defined in the cell that builds `int_records`,
# so it is reused here rather than defined a second time.
data_breaches['int_year'] = data_breaches['Year'].apply(check_numeric_and_convert)

# Give the mixed int / pd.NA column a numeric dtype, coercing anything non-numeric to NaN
data_breaches['int_year'] = pd.to_numeric(data_breaches['int_year'], errors='coerce')

# Preview the original and converted values side by side
data_breaches[['Year', 'int_year']].head(10)

In [None]:
# Plot: number of data breaches by the year
f, ax = plt.subplots(figsize=(10, 3))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(x='int_year', data=data_breaches, ax=ax)
ax.set_title('Number of data breaches by the year', weight='bold')
ax.set_xlabel('Year', weight='bold')
# rotate through tick_params: it restyles the ticks without replacing the label text
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('Number of breaches', weight='bold')

In [None]:
# Plot: number of records per breach over time, coloured by organization type
f, ax = plt.subplots(figsize=(10, 10))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.scatterplot(x='int_year', y='int_records', hue='Organization type', data=data_breaches, ax=ax)
ax.set_title('Number of records for data breaches', weight='bold')
ax.set_xlabel('Year', weight='bold')
# on a numeric axis, set_xticklabels(get_xticklabels()) freezes whatever label text exists
# before the figure is drawn; tick_params only changes the rotation
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('Number of records', weight='bold')
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))  # place the legend outside the axes

In [None]:
# x-axis: year 
# y-axis: number of records
# scatterplot: each point is a breach; breaches that span multiple years are shown as lines

In [None]:
# Plot: number of data breaches by organization type
f, ax = plt.subplots(figsize=(10, 15))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='Organization type', data=data_breaches, ax=ax)
ax.set_title('Number of data breaches by organization type', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Organization type', weight='bold')

In [None]:
# Plot: number of data breaches by method
f, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='Method', data=data_breaches, ax=ax)
ax.set_title('Number of data breaches by method', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Method', weight='bold')

In [None]:
# subsets for the two breach methods that are compared in the plots below
hacked = data_breaches[data_breaches['Method'] == 'hacked']
poor_security = data_breaches[data_breaches['Method'] == 'poor security']

from matplotlib.ticker import FuncFormatter
def integer_formatter(x, pos):
    """Render a tick value as a plain integer string (no scientific notation).

    ``pos`` is accepted because this function is wrapped in ``FuncFormatter``
    where it is used, but it does not affect the result. The value is truncated
    toward zero by ``int()``.
    """
    whole_number = int(x)
    return str(whole_number)


# Plot: number of records per breach over time, one panel per breach method
records_axis_max = 3_100_000_000  # same y-limit on both panels so they can be compared directly
method_panels = [('hacked', hacked), ('poor security', poor_security)]

f, ax = plt.subplots(len(method_panels), 1, figsize=(10, 10))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

for panel, (method_label, subset) in zip(ax, method_panels):
    sns.scatterplot(x='int_year', y='int_records', hue='Organization type', data=subset, ax=panel)
    # name the method in the title so the two panels can be told apart
    panel.set_title(f'Number of records for data breaches ({method_label})', weight='bold')
    panel.set_xlabel('Year', weight='bold')
    # tick_params rotates the labels without freezing their text
    panel.tick_params(axis='x', labelrotation=45)
    panel.set_ylabel('Number of records', weight='bold')
    panel.set_ylim(0, records_axis_max)
    panel.yaxis.set_major_formatter(FuncFormatter(integer_formatter))  # plain integers, not 1e9 notation
    panel.legend(loc='upper left', bbox_to_anchor=(1, 1))  # place the legend outside the axes

In [None]:
# Plot: number of 'hacked' data breaches by organization type
f, ax = plt.subplots(figsize=(10, 8))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='Organization type', data=hacked, ax=ax)
ax.set_title("Number of 'hacked' data breaches by organization type", weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Organization type', weight='bold')

In [None]:
# Plot: number of 'poor security' data breaches by organization type
f, ax = plt.subplots(figsize=(10, 8))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='Organization type', data=poor_security, ax=ax)
ax.set_title("Number of 'poor security' data breaches by organization type", weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Organization type', weight='bold')

In [None]:



# Plot: number of data breaches per year, one panel per breach method
method_panels = [('hacked', hacked), ('poor security', poor_security)]

f, ax = plt.subplots(len(method_panels), 1, figsize=(10, 10))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

for panel, (method_label, subset) in zip(ax, method_panels):
    sns.countplot(x='int_year', data=subset, ax=panel)
    # countplot bars are numbers of breaches (rows), not numbers of records
    panel.set_title(f'Number of data breaches by year ({method_label})', weight='bold')
    panel.set_xlabel('Year', weight='bold')
    panel.tick_params(axis='x', labelrotation=45)
    panel.set_ylabel('Number of breaches', weight='bold')

In [None]:
# number of breaches attributed to each method, most common first
Method_counts = data_breaches['Method'].value_counts()
Method_counts = Method_counts.rename('Method_counts')
Method_counts

In [None]:
# Plot: number of records over time for web and healthcare organizations, coloured by method
web = data_breaches.query("`Organization type` == 'web'")
healthcare = data_breaches.query("`Organization type` == 'healthcare'")

records_axis_max = 3_100_000_000  # same y-limit on both panels so they can be compared directly

f, ax = plt.subplots(2, 1, figsize=(10, 6))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

# web is drawn with lineplot and healthcare with scatterplot; everything else is shared
panels = [
    (ax[0], sns.lineplot, 'web', web),
    (ax[1], sns.scatterplot, 'healthcare', healthcare),
]
for panel, plot_func, org_label, subset in panels:
    plot_func(x='int_year', y='int_records', hue='Method', data=subset, ax=panel)
    # name the organization type in the title so the two panels can be told apart
    panel.set_title(f'Number of records for data breaches ({org_label})', weight='bold')
    panel.set_xlabel('Year', weight='bold')
    # tick_params rotates the labels without freezing their text
    panel.tick_params(axis='x', labelrotation=45)
    panel.set_ylabel('Number of records', weight='bold')
    panel.set_ylim(0, records_axis_max)
    panel.yaxis.set_major_formatter(FuncFormatter(integer_formatter))  # plain integers, not 1e9 notation
    panel.legend(loc='upper left', bbox_to_anchor=(1, 1))  # place the legend outside the axes

In [None]:
# number of breaches recorded for each organization type, most common first
org_counts = data_breaches['Organization type'].value_counts()
org_counts = org_counts.rename('org_counts')
org_counts

In [None]:
# Plot: number of data breaches by organization type, most frequent first
org_order = data_breaches['Organization type'].value_counts().index
f, ax = plt.subplots(figsize=(10, 15))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='Organization type', data=data_breaches, order=org_order, ax=ax)
ax.set_title('Number of data breaches by organization type', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Organization type', weight='bold')

In [None]:
# total number of records per organization type, largest total first
records_by_org = data_breaches.groupby('Organization type', sort=False)["int_records"]
df_total_records = records_by_org.sum().reset_index(name='Total Records')
df_total_records_sorted = df_total_records.sort_values(by='Total Records', ascending=False)
df_total_records_sorted

In [None]:
# Plot: total number of records by organization type, largest total first
f, ax = plt.subplots(figsize=(10, 15))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.barplot(x='Total Records', y='Organization type', data=df_total_records_sorted, ax=ax)
ax.set_title('Total records in data breaches by organization type', weight='bold')
ax.set_xlabel('Total records', weight='bold')
ax.set_ylabel('Organization type', weight='bold')

Below is the breakdown of which types of organizations are affected. 

The left plot shows which types of organizations appear most frequently in the data, that is, the types of organizations that most frequently experience data breaches.  
The right plot shows the total records stolen as part of those breaches. It uses a different scale, but it is in the same order as the left plot to make the two easy to compare. It shows how, for some of the most frequently hit types of organizations, the total records stolen compare with how often they are breached.

In [None]:
# Plot: breach frequency (left) next to total records (right), per organization type
f, ax = plt.subplots(1, 2, figsize=(10, 15))

sns.countplot(y='Organization type', data=data_breaches, order=org_order, ax=ax[0])
ax[0].set_title('Number of breaches', weight='bold')
ax[0].set_xlabel('')
ax[0].set_ylabel('')

# `order=org_order` keeps the rows in the same order as the left-hand panel
sns.barplot(x='Total Records', y='Organization type', data=df_total_records_sorted, order=org_order, ax=ax[1])
ax[1].set_title('Total records', weight='bold')
ax[1].set_xlabel('')
ax[1].set_ylabel('')
# Remove tick labels for ax[1]: its rows line up with the labels already shown on ax[0]
ax[1].set_yticklabels([])
plt.tight_layout()

In [None]:
# rows whose organization type is the combined label 'web, tech'
data_breaches[data_breaches['Organization type'] == 'web, tech']

In [None]:
# rows whose organization type is the combined label 'tech, web' (same words, other order)
data_breaches[data_breaches['Organization type'] == 'tech, web']


In [None]:
# Plot: breach methods for web and healthcare organizations, on a shared method axis
method_order = web['Method'].value_counts().index
org_panels = [('web', web), ('healthcare', healthcare)]

# one row per organization type; sharex keeps the method columns aligned and
# shows the method names once, under the bottom panel, so no separate legend is needed
f, ax = plt.subplots(len(org_panels), 1, figsize=(10, 8), sharex=True)

for panel, (org_label, subset) in zip(ax, org_panels):
    sns.countplot(x='Method', data=subset, order=method_order, ax=panel)
    panel.set_title(org_label)
    panel.set_xlabel('')
    panel.set_ylabel('Number of breaches')

ax[-1].tick_params(axis='x', labelrotation=45)

plt.tight_layout()

In [None]:
# one subset per organization type, used by the per-type plots that follow
web = data_breaches.query("`Organization type` == 'web'")
healthcare = data_breaches.query("`Organization type` == 'healthcare'")
financial = data_breaches.query("`Organization type` == 'financial'")
government = data_breaches.query("`Organization type` == 'government'")
retail = data_breaches.query("`Organization type` == 'retail'")
tech = data_breaches.query("`Organization type` == 'tech'")
academic = data_breaches.query("`Organization type` == 'academic'")
telecoms = data_breaches.query("`Organization type` == 'telecoms'")
gaming = data_breaches.query("`Organization type` == 'gaming'")
social_network = data_breaches.query("`Organization type` == 'social network'")
hotel = data_breaches.query("`Organization type` == 'hotel'")
transport = data_breaches.query("`Organization type` == 'transport'")
military = data_breaches.query("`Organization type` == 'military'")
energy = data_breaches.query("`Organization type` == 'energy'")
restaurant = data_breaches.query("`Organization type` == 'restaurant'")
media = data_breaches.query("`Organization type` == 'media'")
mobile_carrier = data_breaches.query("`Organization type` == 'mobile carrier'")


In [None]:
import matplotlib.gridspec as gridspec

# Plot: small multiples showing which breach methods occur for each organization type
n_cols = 3
top_methods_in_legend = 8  # the legend lists only the most common methods to stay readable

# one (title, subset) pair per panel, in display order (left to right, top to bottom)
org_panels = [
    ('web', web), ('healthcare', healthcare), ('financial', financial),
    ('government', government), ('retail', retail), ('tech', tech),
    ('academic', academic), ('telecoms', telecoms), ('gaming', gaming),
    ('social network', social_network), ('hotel', hotel), ('transport', transport),
    ('military', military), ('energy', energy), ('restaurant', restaurant),
    ('media', media), ('mobile carrier', mobile_carrier),
]

# every panel uses the same method order and colours, so a bar's position and colour
# mean the same method everywhere
method_order = data_breaches['Method'].value_counts().index
palette = sns.color_palette("deep", len(method_order))

n_rows = -(-len(org_panels) // n_cols)  # ceiling division: just enough rows for all panels
f, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10),
                       gridspec_kw={'hspace': 0.3, 'wspace': 0.15})
panel_axes = axes.ravel()

for panel, (org_label, subset) in zip(panel_axes, org_panels):
    sns.countplot(x='Method', data=subset, order=method_order, palette=palette, ax=panel)
    panel.set_title(org_label, fontsize=9, y=0.6)  # y < 1 draws the title inside the panel
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.tick_params(labelbottom=False, labelleft=False)  # remove tick labels

# hide any grid slot that has no organization type to show
for spare in panel_axes[len(org_panels):]:
    spare.set_visible(False)

# build the legend from the same order and palette the bars use, so label and colour always agree
legend_handles = [
    plt.Line2D([0], [0], marker='s', color='White', label=method,
               markersize=11, markerfacecolor=color, linestyle='None')
    for method, color in zip(method_order[:top_methods_in_legend], palette)
]
# anchor the legend to the top-right panel, just outside the grid
axes[0, -1].legend(loc='upper left', bbox_to_anchor=(1.1, 1), handles=legend_handles, title='Method', fontsize='small', title_fontsize='small', labelspacing=0.9, borderpad=1, facecolor='white')


In [None]:
# inspect the rows whose organization type is 'mobile carrier'
mobile_carrier

In [None]:
# Plot: number of data breaches by method, most frequent first
order = data_breaches['Method'].value_counts().index
f, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='Method', data=data_breaches, order=order, ax=ax)
ax.set_title('Number of data breaches by method', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Method', weight='bold')

In [None]:
# create new column that classifies `Organization type` by broader categories
def broad_categories(org_type):
    """Map a raw `Organization type` label to a broader category.

    Parameters
    ----------
    org_type : object
        A raw label such as ``'healthcare'`` or ``'dating'``. Matching is exact
        and case-sensitive.

    Returns
    -------
    str
        ``'Healthcare/Wellness'``, ``'Social Media'`` or ``'Technology/IT'`` for
        the labels listed below; the string ``'None'`` for anything else
        (including missing values).
    """
    # broad category -> the raw labels it absorbs
    category_members = {
        'Healthcare/Wellness': ('healthcare', 'Clinical Laboratory', 'military, healthcare'),
        'Social Media': ('social networking', 'social network', 'dating', 'messaging app', 'social media'),
        'Technology/IT': ('tech', 'Information Security', 'information technology'),
    }
    for category, raw_labels in category_members.items():
        if org_type in raw_labels:
            return category
    # not classified (yet): the placeholder is the string 'None', not the None object
    return 'None'

# add the broader category as a new column
data_breaches['broad_categories'] = data_breaches['Organization type'].apply(broad_categories)
# preview the raw label next to its category instead of dumping the whole frame
data_breaches[['Organization type', 'broad_categories']].head(10)

In [None]:
# Plot: visualize the new organization type categories, most frequent first
cat_order = data_breaches['broad_categories'].value_counts().index
f, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='broad_categories', data=data_breaches, order=cat_order, ax=ax)
ax.set_title('Number of data breaches by broad organization category', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Broad organization category', weight='bold')

In [None]:
# create new column that classifies `Method` by broader categories
def method_categories(method):
    """Map a raw `Method` label to a broader category.

    Matching is exact and case-sensitive. Labels that belong to no group
    (including missing values) yield the string ``'None'``.
    """
    hacking_labels = (
        'hacked', 'inside job, hacked', 'poor security/hacked', 'ransomware hacked',
        'improper setting, hacked', 'hacked/misconfiguration', 'zero-day vulnerabilities',
    )
    poor_security_labels = (
        'poor security', 'Poor security', 'unsecured S3 bucket', 'unprotected api',
        'poor security/inside job', 'misconfiguration/poor security',
    )
    accidental_labels = ('accidentally published', 'accidentally uploaded', 'accidentally exposed')

    # guard clauses, checked in the same priority order as the groups above
    if method in hacking_labels:
        return 'Hacking/Cyberattacks'
    if method in poor_security_labels:
        return 'Poor Security Practices'
    if method in accidental_labels:
        return 'Accidental Exposure'
    return 'None'

# add the broader method category as a new column
data_breaches['method_categories'] = data_breaches['Method'].apply(method_categories)
# preview the raw label next to its category instead of dumping the whole frame
data_breaches[['Method', 'method_categories']].head(10)

In [None]:
# Plot: visualize the new method categories, most frequent first
method_cat_order = data_breaches['method_categories'].value_counts().index
f, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
sns.despine()

sns.countplot(y='method_categories', data=data_breaches, order=method_cat_order, ax=ax)
ax.set_title('Number of data breaches by broad method category', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Broad method category', weight='bold')

In [None]:
# Plot: 

In [None]:

# the most common organization types
top_n_orgs = 24
# value_counts() takes `normalize` as its first positional argument, so value_counts(24)
# returns proportions for every type; the cut-off has to be applied with head()
top_orgs = data_breaches['Organization type'].value_counts().head(top_n_orgs).index

# display the number of unique values in each categorical column
object_columns = data_breaches.select_dtypes(include=['object']).columns  # only the `object` dtypes

for column in object_columns:  # iterate thru columns list
    unique_count = data_breaches[column].nunique()  # count how many unique values are present
    print(f"There are {unique_count} different values in `{column}`.")  # print the number in the specified format

# Plot: breach methods recorded for healthcare organizations.
# A single Axes is requested: indexing a grid of subplots with one index yields a row of
# axes, which seaborn cannot draw on.
f, ax = plt.subplots(figsize=(8, 6))
sns.despine()
sns.countplot(y='Method', data=healthcare, ax=ax)
ax.set_title('Breach methods for healthcare organizations', weight='bold')
ax.set_xlabel('Number of breaches', weight='bold')
ax.set_ylabel('Method', weight='bold')

In [None]:
# NOTE(review): `tornadoes` is not defined anywhere in the visible notebook, and the column
# names below do not match the `data_breaches` columns used in the other cells. This cell
# looks like a leftover from another analysis. TODO confirm, then remove it or point it at
# `data_breaches`.
# Set up the figure and a 6x2 grid of axes (one per column plotted below)
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(10, 16))

# List of column names to plot
columns_to_plot = ['mag', 'loss', 'slat', 'slon', 'elat', 'elon', 'len', 'wid', 'yr', 'inj', 'fat', 'ns']

# Iterate over the columns and create boxplots, filling the grid row by row
for i, column in enumerate(columns_to_plot):
    row = i // 2  # Determine the row of the subplot
    col = i % 2   # Determine the column of the subplot
    sns.boxplot(y=tornadoes[column], ax=axes[row, col])
    axes[row, col].set_title(f'Boxplot for {column}')
    axes[row, col].set_ylabel(column)

plt.tight_layout()
plt.show()
