In [269]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import json

# Global plot styling for every figure in this notebook: seaborn's "darkgrid"
# style, with a 4-colour "hls" palette as the default colour cycle.
sns.set(style="darkgrid")
sns.set_palette('hls', 4)

# Helper functions

In [270]:
def count_stats(key, arr):
    """Summarise one column with basic descriptive statistics.

    Args:
        key: Label used to look the column up via ``arr[key]``.
        arr: Container whose ``arr[key]`` is a pandas Series (a DataFrame in
            this notebook).

    Returns:
        dict with the entries ``mean``, ``median``, ``range``, ``var``,
        ``max``, ``min`` and ``std``, in that order. ``range`` is
        ``max - min``.
    """
    column = arr[key]
    highest = column.max()
    lowest = column.min()
    return {
        'mean': column.mean(),
        'median': column.median(),
        'range': highest - lowest,
        'var': column.var(),
        'max': highest,
        'min': lowest,
        'std': column.std(),
    }

In [271]:
def extract_memory_value(memory_str):
    """Extract the leading number from a memory description such as ``"8 GB"``.

    Args:
        memory_str: Usually a string whose first whitespace-separated token is
            a number. Plain ``int``/``float`` values are accepted and returned
            as ``float``. Anything else (``None``, ``bool``, containers) is
            treated as "no value".

    Returns:
        The number as ``float``, or ``None`` when no number can be extracted.
        Never raises for unexpected input types.
    """
    # bool is a subclass of int; a True/False flag is not a memory size.
    if isinstance(memory_str, bool):
        return None
    # Already numeric: presumably the same unit as the string form - TODO confirm.
    if isinstance(memory_str, (int, float)):
        return float(memory_str)
    # None and other non-strings used to raise AttributeError on .split(),
    # which the original except clause did not cover.
    if not isinstance(memory_str, str):
        return None
    try:
        return float(memory_str.split()[0])
    except (ValueError, IndexError):
        # ValueError: first token is not a number; IndexError: blank string.
        return None

In [272]:
# Reference resolution per capture, stored as [width, height].
# load_df() picks the first entry whose key occurs in the CSV path and
# subtracts these numbers from the 'Screen Width' / 'Screen Height' columns to
# produce 'Width Deviation' / 'Height Deviation'.
# NOTE(review): keys look like '<machine>_<OS>_<browser>' and values are
# presumably pixel sizes measured on the real machines - confirm the source.
real_resolutions = {
    'PC_A_WIN10_Brave': [2048, 1042],
    'PC_A_WIN10_Chrome': [2048, 1035],
    'PC_A_WIN10_Firefox': [2048, 1037],
    'PC_A_WIN10_Tor': [2400, 1000],
    'PC_A_Ubuntu_Brave': [2490, 1332],
    'PC_A_Ubuntu_Firefox': [2490, 1328],
    'PC_A_Ubuntu_Chrome': [2490, 1328],
    'PC_A_Ubuntu_Tor': [2400, 1000],
    'PC_B_WIN11_Brave': [1536, 735],
    'PC_B_WIN11_Chrome': [1536, 730],
    'PC_B_WIN11_Firefox': [1536, 731],
    'PC_B_WIN11_Tor': [1600, 800],
    'PC_C_MacOS_Brave': [1512, 862],
    'PC_C_MacOS_Chrome': [1512, 858],
    'PC_C_MacOS_Firefox': [1512, 860],
    'PC_C_MacOS_Safari': [1512, 982],
    'PC_C_MacOS_Tor': [1400, 800],
}

# Matches a complete single- or double-quoted string literal, or a bare
# ``null`` token. Quoted strings are matched only so they can be skipped:
# that keeps values such as 'nullable' from being rewritten.
_NULL_OR_QUOTED = re.compile(r"""'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|\bnull\b""")


def safe_string_to_dict(s):
    """Parse the textual ``Attributes`` cell of a capture into a dict.

    The text is expected to be a Python-style dict literal that may contain
    JSON ``null`` tokens and one known malformed fragment (see below).

    Args:
        s: Raw cell value. Anything that is not a non-blank string (NaN,
            ``None``, numbers, lists, ...) yields an empty dict.

    Returns:
        The parsed dict. An empty dict is returned when the text cannot be
        parsed or when it parses to something other than a dict, so callers
        can always use ``.get`` on the result.
    """
    # The isinstance test also covers NaN/None, so no pd.isna() call is needed
    # (pd.isna on a list-like would return an array and break the truth test).
    if not isinstance(s, str) or not s.strip():
        return {}

    # Repair the known broken fragment: "'Navigator Vendor': ','Vendor': null"
    s = s.replace("'Navigator Vendor': ','Vendor': null", "'Navigator Vendor': '', 'Vendor': None")
    # Turn remaining bare ``null`` tokens into ``None``; quoted strings are
    # matched and put back unchanged.
    s = _NULL_OR_QUOTED.sub(
        lambda match: 'None' if match.group(0) == 'null' else match.group(0),
        s,
    )

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The exception types ast.literal_eval documents for malformed input.
        return {}
    return parsed if isinstance(parsed, dict) else {}

def _reported_dimension(df, column):
    """Return ``df[column]`` coerced to numbers, or NaN when the column is absent."""
    if column in df.columns:
        # Unparseable cells become NaN instead of raising during subtraction.
        return pd.to_numeric(df[column], errors='coerce')
    return float('nan')


def load_df(file_path):
    """Load one capture CSV and flatten its ``Attributes`` column.

    Steps:
      1. Read the CSV and parse every ``Attributes`` cell with
         ``safe_string_to_dict``.
      2. Add one column per attribute key, in first-seen order. A key that
         equals an existing CSV column name overwrites that column.
      3. Add ``Memory_GB`` from the ``Memory`` attribute; a missing key counts
         as ``'0 GB'``.
      4. If a key of ``real_resolutions`` occurs in the path, add
         ``Width Deviation`` / ``Height Deviation`` as the difference between
         the ``Screen Width`` / ``Screen Height`` columns and the reference.
         When such a column is missing the deviation is NaN, not
         ``-reference``.
      5. Drop the raw ``Attributes`` column.

    Args:
        file_path: Path of the CSV (``str`` or path-like). The file must have
            an ``Attributes`` column.

    Returns:
        The flattened DataFrame.

    Raises:
        KeyError: if the CSV has no ``Attributes`` column.
    """
    df = pd.read_csv(file_path)

    # Normalise every parsed cell to a dict so the .get calls below are safe
    # even if the parser hands back a list, tuple or scalar.
    attributes = df['Attributes'].apply(safe_string_to_dict).apply(
        lambda parsed: parsed if isinstance(parsed, dict) else {}
    )

    # dict.fromkeys keeps first-seen order. Iterating a set of strings would
    # make the column order change from one kernel session to the next.
    ordered_keys = dict.fromkeys(key for parsed in attributes for key in parsed)
    for key in ordered_keys:
        df[key] = attributes.apply(lambda parsed, key=key: parsed.get(key))

    df['Memory_GB'] = attributes.apply(
        lambda parsed: extract_memory_value(parsed.get('Memory', '0 GB'))
    )

    # str() lets pathlib.Path arguments work with the substring match.
    path_text = str(file_path)
    for tag, (real_width, real_height) in real_resolutions.items():
        if tag in path_text:
            df['Width Deviation'] = _reported_dimension(df, 'Screen Width') - real_width
            df['Height Deviation'] = _reported_dimension(df, 'Screen Height') - real_height
            break  # first matching tag wins

    return df.drop(columns=['Attributes'])
    

In [273]:
def get_stats(key):
    """Build a statistics table for one column across all loaded datasets.

    Reads the notebook-level ``dataframes`` mapping and runs ``count_stats``
    on every frame in it.

    Args:
        key: Column label passed to ``count_stats`` for each frame.

    Returns:
        DataFrame with one row per dataset label and one column per statistic.

    Side effects:
        Sets pandas' global ``display.float_format`` to two decimals with
        thousands separators; this affects every later DataFrame display.
    """
    per_dataset = {}
    for label, frame in dataframes.items():
        per_dataset[label] = count_stats(key, frame)

    table = pd.DataFrame(per_dataset).transpose()
    pd.set_option('display.float_format', '{:,.2f}'.format)
    return table


In [274]:
def print_unique_counts(dataframes):
    """Print how many distinct values each column holds, per dataset.

    Args:
        dataframes: Mapping of dataset label -> DataFrame.

    Output:
        Prints a table with one row per column name and one column per
        dataset label. Returns nothing.
    """
    def distinct_values(series):
        try:
            return series.nunique()
        except TypeError:
            # Unhashable cells (e.g. lists) are counted by their text form.
            return series.apply(str).nunique()

    summary = {
        label: {column: distinct_values(frame[column]) for column in frame.columns}
        for label, frame in dataframes.items()
    }
    print(pd.DataFrame(summary))

In [275]:
def create_summarized_df(dataframes):
    """Stack the key numeric attributes of every dataset into one long frame.

    Args:
        dataframes: Mapping of dataset label -> DataFrame. The input frames
            are not modified.

    Returns:
        DataFrame with the columns ``CPU``, ``Memory_GB``,
        ``Width Deviation``, ``Height Deviation`` and ``Name``, where ``Name``
        holds the dataset label. A column missing from a frame, and any value
        that is not numeric, is reported as 0. An empty mapping yields an
        empty frame that still has these five columns.
    """
    numeric_columns = ['CPU', 'Memory_GB', 'Width Deviation', 'Height Deviation']
    all_columns = numeric_columns + ['Name']

    pieces = []
    for name, df in dataframes.items():
        # Copy only the columns that are needed, not the whole frame.
        present = [col for col in numeric_columns if col in df.columns]
        piece = df.loc[:, present].copy()
        for col in numeric_columns:
            if col not in piece.columns:
                piece[col] = 0
        # Coerce to numbers so every piece has compatible dtypes before stacking.
        piece = piece[numeric_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
        piece['Name'] = name
        pieces.append(piece)

    if not pieces:
        return pd.DataFrame(columns=all_columns)

    # One concat over the collected pieces, instead of growing a frame in the loop.
    return pd.concat(pieces, ignore_index=True).fillna(0)

In [276]:
def create_distribution_plot(df1, df2, df1_title, df2_title, title, x_label, y_label, key):
    """Draw a grouped bar chart comparing value counts of one column in two frames.

    Args:
        df1, df2: DataFrames to compare. They are not modified; the same
            object may be passed for both.
        df1_title, df2_title: Legend labels for the two frames.
        title: Figure title.
        x_label, y_label: Axis labels.
        key: Column whose distinct values become the x-axis categories.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # assign() works on copies. Writing df1['Title'] directly would add a
    # column to the caller's frames, and would label every row with
    # df2_title when df1 and df2 are the same object.
    combined = pd.concat([df1.assign(Title=df1_title), df2.assign(Title=df2_title)])

    counts = combined.groupby([key, 'Title']).size().reset_index(name='Counts')

    df1_counts = counts[counts['Title'] == df1_title].set_index(key)['Counts']
    df2_counts = counts[counts['Title'] == df2_title].set_index(key)['Counts']

    # Give both series the same categories so the bars line up pairwise.
    categories = df1_counts.index.union(df2_counts.index)
    df1_counts = df1_counts.reindex(categories, fill_value=0)
    df2_counts = df2_counts.reindex(categories, fill_value=0)

    bar_width = 0.35
    positions = np.arange(len(categories))

    fig, ax = plt.subplots(figsize=(10, 6))

    ax.bar(positions - bar_width/2, df1_counts, bar_width, label=df1_title)
    ax.bar(positions + bar_width/2, df2_counts, bar_width, label=df2_title)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(categories)
    ax.legend()

    plt.tight_layout()
    plt.show()

In [277]:
def create_boxplot(settings, title, x_label, y_label, attr):
    """Box-plot one attribute for every loaded dataset whose label contains ``settings``.

    Reads the notebook-level ``dataframes`` mapping.

    Args:
        settings: Substring selecting the datasets; it is also removed from
            each label to form the x-axis category (column ``OS``).
        title: Figure title.
        x_label, y_label: Axis labels.
        attr: Column plotted on the y axis.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    labelled = []
    for label, frame in dataframes.items():
        if settings not in label:
            continue
        labelled.append(frame.assign(OS=label.replace(settings, "")))
    combined = pd.concat(labelled, ignore_index=True)

    plt.figure(figsize=(10, 6))
    axes = sns.boxplot(x='OS', y=attr, data=combined)
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

    plt.show()

In [278]:
def create_comparing_boxplot(attr, title, x_label, y_label):
    """Box-plot one attribute per OS, split by "Aggressive" vs "Classic" datasets.

    Reads the notebook-level ``dataframes`` mapping. The first
    space-separated word of each label becomes the ``OS`` category. A label
    containing "Aggressive" is tagged ``Aggressive``, every other label
    ``Classic``.

    Args:
        attr: Column plotted on the y axis.
        title: Figure title.
        x_label, y_label: Axis labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    labelled = []
    for label, frame in dataframes.items():
        if "Aggressive" in label:
            setting = "Aggressive"
        else:
            setting = "Classic"
        labelled.append(frame.assign(OS=label.split(" ")[0], Setting=setting))
    combined = pd.concat(labelled, ignore_index=True)

    plt.figure(figsize=(12, 6))
    axes = sns.boxplot(x='OS', y=attr, hue='Setting', data=combined, dodge=True)
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.legend(title='Setting')

    plt.show()

In [279]:
def create_sma_bar_plot(df, key, title, x_label, y_label):
    """Plot a column as bars with 5-, 15- and 30-row simple moving averages.

    Args:
        df: DataFrame holding the column. It is not modified; no SMA columns
            are added to it.
        key: Column to plot. It must be a string, because it is also used to
            build the bar legend label ``key + " Count"``.
        title: Figure title.
        x_label, y_label: Axis labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    palette = sns.color_palette("hls")
    values = df[key]
    # Bars and lines share row positions 0..n-1, so they stay aligned even
    # when df carries a non-default index (e.g. after filtering).
    positions = np.arange(len(df))

    fig, ax = plt.subplots(figsize=(14, 7))

    ax.bar(positions, values, label=key + " Count")

    for window, colour in ((5, palette[2]), (15, palette[3]), (30, palette[4])):
        sma = values.rolling(window=window).mean()
        ax.plot(positions, sma.to_numpy(), label='{} SMA'.format(window),
                linewidth=2, color=colour)

    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()

    plt.tight_layout()
    plt.show()

In [280]:
def create_correlation_matrix4(df):
    """Show a heatmap of the pairwise correlations of four attributes.

    Args:
        df: DataFrame with the columns ``CPU``, ``Memory_GB``,
            ``Width Deviation`` and ``Height Deviation``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    selected = df[['CPU', 'Memory_GB', 'Width Deviation', 'Height Deviation']]
    correlations = selected.corr()

    plt.figure(figsize=(8, 6))
    axes = sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f")
    axes.set_title("Correlation Matrix of Browser Attributes")
    plt.show()

In [281]:

def create_freq_table(key, frames=None):
    """Tabulate how often each value of one column occurs in several frames.

    Args:
        key: Column whose values are counted; it also becomes the name of
            the result's index.
        frames: Optional mapping of label -> DataFrame. When omitted, the
            notebook-level ``pc_c_safari`` and ``pc_c_safari_private`` frames
            are used under the labels "PC C Safari" and
            "PC C Safari Private".

    Returns:
        DataFrame indexed by the distinct values of ``key``, with one integer
        column of counts per label. A value that does not occur in a frame
        has count 0.
    """
    if frames is None:
        frames = {
            "PC C Safari": pc_c_safari,
            "PC C Safari Private": pc_c_safari_private,
        }

    counts = {label: frame[key].value_counts() for label, frame in frames.items()}

    # Building the frame aligns the per-label counts on the union of values.
    # Missing combinations arrive as NaN and are turned into integer zeros.
    # rename_axis names the index directly, replacing the earlier
    # reset_index/set_index round trips (which failed for key == 'index').
    return pd.DataFrame(counts).fillna(0).astype(int).rename_axis(key)

# Common statistics

In [282]:
# Dataset label -> CSV path for the Tor Browser captures, in load order.
tor_csv_paths = {
    "PC A Ubuntu Tor": "../data/browser_data/PC_A_Ubuntu_Tor.csv",
    "PC A Win10 Tor": "../data/browser_data/PC_A_WIN10_Tor.csv",
    "PC B Win11 Tor": "../data/browser_data/PC_B_WIN11_Tor.csv",
    "PC C MacOS Tor": "../data/browser_data/PC_C_MacOS_Tor.csv",
}
dataframes = {label: load_df(path) for label, path in tor_csv_paths.items()}

# Per-machine handles, bound to the same frames held in `dataframes`.
pc_a_ubuntu_tor = dataframes["PC A Ubuntu Tor"]
pc_a_win10_tor = dataframes["PC A Win10 Tor"]
pc_b_win11_tor = dataframes["PC B Win11 Tor"]
pc_c_macos_tor = dataframes["PC C MacOS Tor"]

In [283]:
# Mean, median, std, min and max of each summarised attribute, per dataset.
summary_statistics = (
    create_summarized_df(dataframes)
    .groupby('Name')
    .agg(['mean', 'median', 'std', 'min', 'max'])
)
summary_statistics

Unnamed: 0_level_0,CPU,CPU,CPU,CPU,CPU,Memory_GB,Memory_GB,Memory_GB,Memory_GB,Memory_GB,Width Deviation,Width Deviation,Width Deviation,Width Deviation,Width Deviation,Height Deviation,Height Deviation,Height Deviation,Height Deviation,Height Deviation
Unnamed: 0_level_1,mean,median,std,min,max,mean,median,std,min,max,mean,median,std,min,max,mean,median,std,min,max
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
PC A Ubuntu Tor,2.0,2.0,0.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
PC A Win10 Tor,2.0,2.0,0.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
PC B Win11 Tor,2.0,2.0,0.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
PC C MacOS Tor,2.0,2.0,0.0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0


In [284]:
# Number of distinct values per column, for each Tor dataset.
# A count of 1 means the attribute never changed between rows of that dataset,
# i.e. no value is randomised within a platform.
# NOTE(review): this shows stability within each platform only; showing that
# Tor behaves the same *across* platforms needs the values themselves compared.


print_unique_counts(dataframes)

                            PC A Ubuntu Tor  PC A Win10 Tor  PC B Win11 Tor  \
ID                                     0.00            0.00            0.00   
Log                                    0.00            0.00            0.00   
AttributesHash                         1.00            1.00            1.00   
Audio                                  1.00            1.00            1.00   
Fonts                                  1.00            1.00            1.00   
Geom Canvas                            1.00            1.00            1.00   
Media Capabilities                     1.00            1.00            1.00   
MediaHash                              1.00            1.00            1.00   
Name                                   1.00            1.00            1.00   
Plugins                                1.00            1.00            1.00   
PluginsHash                            1.00            1.00            1.00   
TXT Canvas                             1.00         