In [9]:
"""
Figure 7: Central Tendency Analysis
Purpose: Analyse mean values of subjects affected across different sample sizes
"""
import pandas as pd
import plotly.graph_objects as go
from tabulate import tabulate 
import numpy as np

# Load the data
df = pd.read_csv("../data/data-security-incidents-trends-2023-2024_enhanced.csv") 

# Generate one random sample size between 100 and length of dataset
random_n = np.random.randint(100, len(df))

# Define random sample of the subjects_num column
random_sample = df['subjects_num'].sample(n=random_n,random_state=42)

# Configure n
n100 = 100
n1000 = 1000
n5000 = 5000

# Calculate all means
results = {
    'Metric': [
        'Population Mean',
        'Random Sample Mean',
        'Sample Mean 2023',
        'Sample Mean 2024'
    ],
    'Value n=100': [
        df['subjects_num'].mean().round(),
        random_sample.dropna().mean().round(),
        df[df['Year'] == 2023]['subjects_num'].sample(n100, random_state=42).dropna().mean().round(),
        df[df['Year'] == 2024]['subjects_num'].sample(n100, random_state=42).dropna().mean().round()
    ],
    'Value n=1000': [
        df['subjects_num'].mean().round(),
        random_sample.dropna().mean().round(),
        df[df['Year'] == 2023]['subjects_num'].sample(n1000, random_state=42).dropna().mean().round(),
        df[df['Year'] == 2024]['subjects_num'].sample(n1000, random_state=42).dropna().mean().round()
    ],
    'Value n=5000': [
        df['subjects_num'].dropna().mean().round(),
        random_sample.dropna().mean().round(),
        df[df['Year'] == 2023]['subjects_num'].sample(n5000, random_state=42).dropna().mean().round(),
        df[df['Year'] == 2024]['subjects_num'].sample(n5000, random_state=42).dropna().mean().round()
    ]
}

# Create DataFrame for display
results_df = pd.DataFrame(results)


print("\nNo. Data Subjects Affected Central Tendency Analysis:")
print(tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False))





No. Data Subjects Affected Central Tendency Analysis:
+--------------------+-------------+--------------+--------------+
|       Metric       | Value n=100 | Value n=1000 | Value n=5000 |
+--------------------+-------------+--------------+--------------+
|  Population Mean   |   10234.0   |   10234.0    |   10234.0    |
| Random Sample Mean |   10422.0   |   10422.0    |   10422.0    |
|  Sample Mean 2023  |   28969.0   |   13349.0    |   11675.0    |
|  Sample Mean 2024  |   9185.0    |   10375.0    |    9765.0    |
+--------------------+-------------+--------------+--------------+


In [10]:
"""
Figure 8: Central Tendency Analysis Bar chart
Purpose: Analyse and visualize mean number of subjects affected across different sample sizes
"""

import pandas as pd
import plotly.express as px

# Constants for sample sizes and random state
SAMPLE_SIZE_SMALL = 100
SAMPLE_SIZE_LARGE = 1000
RANDOM_SEED = 42

def calculate_means(df):
    """
    Calculate means for different sample groups
    Args:
        df: DataFrame containing subjects_num and Year columns
    Returns:
        tuple: All calculated means
    """
    return (
        df['subjects_num'].mean().round(),  # population mean
        random_sample.dropna().mean().round(),  # random sample mean
        df[df['Year'] == 2023]['subjects_num'].sample(
            n=SAMPLE_SIZE_SMALL, 
            random_state=RANDOM_SEED
        ).dropna().mean().round(),  # 2023 small sample
        df[df['Year'] == 2024]['subjects_num'].sample(
            n=SAMPLE_SIZE_SMALL, 
            random_state=RANDOM_SEED
        ).dropna().mean().round(),  # 2024 small sample
        df[df['Year'] == 2023]['subjects_num'].sample(
            n=SAMPLE_SIZE_LARGE, 
            random_state=RANDOM_SEED
        ).dropna().mean().round(),  # 2023 large sample
        df[df['Year'] == 2024]['subjects_num'].sample(
            n=SAMPLE_SIZE_LARGE, 
            random_state=RANDOM_SEED
        ).dropna().mean().round()   # 2024 large sample
    )

def create_means_dataframe(means):
    """
    Create a formatted DataFrame for visualization
    Args:
        means: tuple of calculated means
    Returns:
        DataFrame: Formatted data for plotting
    """
    return pd.DataFrame({
        'Group': [
            'Population', 'Random Sample', 
            'Sample 2023', 'Sample 2024', 
            'Sample 2023', 'Sample 2024'
        ],
        'Sample Size': [
            'n=all', 'n=100', 
            'n=100', 'n=100', 
            'n=1000', 'n=1000'
        ],
        'Mean Subjects Affected': means
    })

def create_bar_plot(means_df):
    """
    Create a grouped bar plot using plotly express
    Args:
        means_df: DataFrame containing the means data
    Returns:
        plotly.graph_objects.Figure: The configured plot
    """
    fig = px.bar(
        means_df,
        x='Group',
        y='Mean Subjects Affected',
        color='Sample Size',
        barmode='group',
        text='Mean Subjects Affected',
        title='Mean Number of Subjects Affected by Group and Sample Size'
    )

    fig.update_layout(
        yaxis_title='Mean Subjects Affected',
        xaxis_title='Group',
        legend_title='Sample Size',
        bargap=0.2
    )
    
    return fig

# Calculate all means
population_mean, random_sample_mean, s_mean_2023_n100, s_mean_2024_n100, \
    s_mean_2023_n1000, s_mean_2024_n1000 = calculate_means(df)

# Create and format DataFrame
means_df = create_means_dataframe((
    population_mean, random_sample_mean,
    s_mean_2023_n100, s_mean_2024_n100,
    s_mean_2023_n1000, s_mean_2024_n1000
))

# Create and display plot
fig = create_bar_plot(means_df)
fig.show()

In [18]:
"""
P-test
Year-to-Year Comparison using t-test
Purpose: Compare subjects affected between 2023 and 2024
"""

import pandas as pd
from scipy.stats import ttest_ind

# Constants
FILE_PATH = "../data/data-security-incidents-trends-2023-2024_enhanced.csv"

RANDOM_SEED = 42
SAMPLE_SIZE = 1000

# Load the data
df = pd.read_csv(FILE_PATH)

# Take samples from each year
sample_2023 = df[df['Year'] == 2023]['subjects_num'].dropna().sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
sample_2024 = df[df['Year'] == 2024]['subjects_num'].dropna().sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

# Perform t-test
t_stat, p_val = ttest_ind(sample_2023, sample_2024)

# Display results
print(f"T-test comparing subjects affected between 2023 and 2024:")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_val:.4f}")
print(f"Interpretation: {'Significant difference' if p_val < 0.05 else 'No significant difference'} at α=0.05")


T-test comparing subjects affected between 2023 and 2024:
T-statistic: 0.8923
P-value: 0.3723
Interpretation: No significant difference at α=0.05


In [27]:
"""
Figure 9: Number of subjects affected Median analysis 
Purpose: Compare subjects affected between 2023 and 2024
"""

import pandas as pd
import plotly.express as px
import numpy as np
from tabulate import tabulate 

# Load the data
df = pd.read_csv("../data/data-security-incidents-trends-2023-2024_enhanced.csv")

# Generate random sample
random_n = np.random.randint(100, len(df))
random_sample = df['subjects_num'].sample(n=random_n)

# Calculate medians
results = {
    'Group': ['Population', 'Random Sample', 'Sample 2023', 'Sample 2024', 'Sample 2023', 'Sample 2024'],
    'Sample Size': ['n=all', 'n=100', 'n=100', 'n=100', 'n=1000', 'n=1000'],
    'Median Subjects Affected': [
        df['subjects_num'].median(),
        random_sample.dropna().median(),
        df[df['Year'] == 2023]['subjects_num'].sample(n=100, random_state=42).dropna().median(),
        df[df['Year'] == 2024]['subjects_num'].sample(n=100, random_state=42).dropna().median(),
        df[df['Year'] == 2023]['subjects_num'].sample(n=1000, random_state=42).dropna().median(),
        df[df['Year'] == 2024]['subjects_num'].sample(n=1000, random_state=42).dropna().median()
    ]
}

# Create DataFrame and display table
results_df = pd.DataFrame(results)
print("\nMedian Analysis:")
print(tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False))

# Create bar plot
fig = px.bar(
    results_df,
    x='Group',
    y='Median Subjects Affected',
    color='Sample Size',
    barmode='group',
    text='Median Subjects Affected',
    title='Median Number of Subjects Affected by Group and Sample Size'
)

fig.update_layout(
    yaxis_title='Median Subjects Affected',
    xaxis_title='Group',
    legend_title='Sample Size',
    bargap=0.2
)

fig.show()


Median Analysis:
+---------------+-------------+--------------------------+
|     Group     | Sample Size | Median Subjects Affected |
+---------------+-------------+--------------------------+
|  Population   |    n=all    |           54.5           |
| Random Sample |    n=100    |           54.5           |
|  Sample 2023  |    n=100    |           54.5           |
|  Sample 2024  |    n=100    |           54.5           |
|  Sample 2023  |   n=1000    |           54.5           |
|  Sample 2024  |   n=1000    |           54.5           |
+---------------+-------------+--------------------------+


In [None]:
"""
Mode analysis
Purpose: Compare subjects affected between 2023 and 2024
"""

import pandas as pd
import numpy as np
import plotly.express as px

# Load the data
df = pd.read_csv("../data/data-security-incidents-trends-2023-2024_enhanced.csv")

# Calculate mode for each year
mode_2023 = df[df['Year'] == 2023]['No. Data Subjects Affected'].mode().iloc[0] 
mode_2024 = df[df['Year'] == 2024]['No. Data Subjects Affected'].mode().iloc[0]

# Create a DataFrame for mode analysis
mode_df = pd.DataFrame({
    'Year': ['2023', '2024'],
    'Mode': [mode_2023, mode_2024]  
})

# Display the mode analysis
print("\nMode Analysis:")
print(tabulate(mode_df, headers='keys', tablefmt='pretty', showindex=False))            




In [None]:
#Calculate range & standard deviation

import pandas as pd
# Load the data
df = pd.read_csv("../data/data-security-incidents-trends-2023-2024_enhanced.csv")

no_impacted_range = df['subjects_num'].max() - df['subjects_num'].min()
no_impacted_std = df['subjects_num'].std()

# Print 
print(f'Range is: {no_impacted_range}')
print(f'Standard deviation is: {no_impacted_std}')


In [10]:
#Calculate interquartile range
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv("../data/data-security-incidents-trends-2023-2024_enhanced.csv")

x =  df['subjects_num'].dropna()
q1 = np.quantile(x, 0.25, interpolation='midpoint')
q2 = np.quantile(x, 0.5, interpolation='midpoint')
q3 = np.quantile(x, 0.75, interpolation='midpoint')

print(f'Q1: {q1}')
print(f'Q2: {q2}')
print(f'Q3: {q3}')

print("IQR: =", q3 - q1)

Q1: 5.0
Q2: 54.5
Q3: 550.0
IQR: = 545.0


In [None]:
# Figure 10: Number of people affected and severity score analysis  
import pandas as pd
import plotly.express as px

# Load the data
df = pd.read_csv("../data/data-security-incidents-trends-2023-2024_enhanced.csv")

# Define the order
category_order = [
    "1 to 9", 
    "10 to 99", 
    "100 to 1k", 
    "1k to 10k", 
    "10k to 100k", 
    "100k and above", 
    "Unknown"
]

# Convert to ordered categorical
df["No. Data Subjects Affected"] = pd.Categorical(
    df["No. Data Subjects Affected"],
    categories=category_order,
    ordered=True
)

# Create boxplot (use column name as a string)
fig = px.box(
    df,
    x="No. Data Subjects Affected",
    y="severity_score",
    title="Severity Score by Number of Subjects Affected",
    points="all",  
    boxmode='group'
)

fig.update_traces(boxmean='sd')  # 'true' or 'sd' for standard deviation


# Update layout for clarity
fig.update_layout(xaxis_tickangle=-45)
fig.update_traces(marker_color='blue', fillcolor='rgba(0, 0, 255, 0.3)', line_color='blue')
fig.update_layout(yaxis_title="Severity Score (1 to 10)", xaxis_title="Number of Subjects Affected")
fig.show()
