In [153]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re

# Global plot styling for every figure in this notebook:
# seaborn "darkgrid" theme, then a 4-colour "hls" palette as the default cycle.
# Order matters: sns.set() installs its own default palette, which
# set_palette() then overrides.
sns.set(style="darkgrid")
sns.set_palette('hls', 4)

# Helper functions

In [154]:
def count_stats(key, arr):
    """Return descriptive statistics for one column of a DataFrame.

    Args:
        key: Column label to summarise.
        arr: DataFrame (or any mapping of label -> pandas Series) holding it.

    Returns:
        dict with the keys 'mean', 'median', 'range', 'var', 'max', 'min'
        and 'std' (in that order); 'range' is max minus min.
    """
    column = arr[key]
    highest = column.max()
    lowest = column.min()

    return {
        'mean': column.mean(),
        'median': column.median(),
        'range': highest - lowest,
        'var': column.var(),
        'max': highest,
        'min': lowest,
        'std': column.std(),
    }

In [155]:
def extract_memory_value(memory_str):
    """Extract the leading numeric value from a memory description.

    Args:
        memory_str: Text such as "8 GB". Plain numbers are accepted as well;
            anything else (None, other objects) is treated as "no value".

    Returns:
        The first whitespace-separated token converted to float, or None when
        the input is missing, empty, or does not start with a number.
    """
    # bool is a subclass of int but is never a meaningful memory size.
    if isinstance(memory_str, (int, float)) and not isinstance(memory_str, bool):
        return float(memory_str)

    # Non-string input (None, NaN already handled above, dicts, ...) used to
    # raise AttributeError on .split(); report it as "no value" instead.
    if not isinstance(memory_str, str):
        return None

    try:
        return float(memory_str.split()[0])
    except (ValueError, IndexError):
        # IndexError: empty/whitespace-only text; ValueError: non-numeric token.
        return None

In [156]:
# Reference screen resolutions as [width, height], keyed by capture name.
# load_df() takes the first key that occurs as a substring of the CSV path and
# subtracts these values from the 'Screen Width' / 'Screen Height' columns to
# build 'Width Deviation' / 'Height Deviation'.
# NOTE(review): units are presumably pixels - confirm against the collector.
real_resolutions = {
    'PC_A_WIN10_Brave': [2048, 1042],
    'PC_A_WIN10_Chrome': [2048, 1035],
    'PC_A_WIN10_Firefox': [2048, 1152],
    'PC_A_WIN10_Tor': [2400, 1000],
    'PC_A_Ubuntu_Brave': [2490, 1332],
    'PC_A_Ubuntu_Firefox': [2560, 1440],
    'PC_A_Ubuntu_Chrome': [2490, 1328],
    'PC_A_Ubuntu_Tor': [2400, 1000],
    'PC_B_WIN11_Brave': [1536, 735],
    'PC_B_WIN11_Chrome': [1536, 730],
    'PC_B_WIN11_Firefox': [1536, 864],
    'PC_B_WIN11_Tor': [1400, 800],
    'PC_C_MacOS_Brave': [1512, 862],
    'PC_C_MacOS_Chrome': [1512, 858],
    'PC_C_MacOS_Firefox': [1512, 982],
    'PC_C_MacOS_Safari': [1512, 982],
    'PC_C_MacOS_Tor': [1400, 800],
}

def load_df(file_path):
    """Load one capture CSV and flatten its 'Attributes' column.

    The CSV must contain an 'Attributes' column holding Python-literal dicts
    as text. Every key found in any of those dicts becomes its own column
    (None where a row lacks the key), a numeric 'Memory_GB' column is derived
    from the 'Memory' attribute, and - when the path contains one of the
    `real_resolutions` keys - 'Width Deviation' / 'Height Deviation' are
    computed from 'Screen Width' / 'Screen Height'.

    Args:
        file_path: Path to the CSV (str or path-like).

    Returns:
        DataFrame with the expanded columns and without 'Attributes'.

    Raises:
        ValueError / SyntaxError: an 'Attributes' cell holds text that is not
            a valid Python literal (propagated from ast.literal_eval).
        KeyError: a resolution key matched but the screen-size columns are
            missing.
    """
    df = pd.read_csv(file_path)

    # Missing cells arrive as NaN (a float); literal_eval would raise on them,
    # so only real text is parsed and everything else becomes None.
    attributes = df['Attributes'].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else None
    )

    attribute_keys = set()
    for attrs in attributes:
        if isinstance(attrs, dict):
            attribute_keys.update(attrs)

    # Sorted so the resulting column order is the same on every kernel start
    # (plain set iteration order depends on string hash randomisation).
    for key in sorted(attribute_keys, key=str):
        df[key] = attributes.apply(
            lambda attrs: attrs.get(key) if isinstance(attrs, dict) else None
        )

    # Rows without a 'Memory' attribute fall back to '0 GB' as before; rows
    # without any attribute dict get None.
    df['Memory_GB'] = attributes.apply(
        lambda attrs: extract_memory_value(attrs.get('Memory', '0 GB'))
        if isinstance(attrs, dict) else None
    )

    # str() lets pathlib.Path arguments take part in the substring match.
    path_text = str(file_path)
    for name, res in real_resolutions.items():
        if name in path_text:
            df['Width Deviation'] = df['Screen Width'] - res[0]
            df['Height Deviation'] = df['Screen Height'] - res[1]
            break  # first matching key wins

    return df.drop(columns=['Attributes'])

In [157]:
def get_stats(key):
    """Tabulate count_stats() for `key` across the global `dataframes`.

    Args:
        key: Column label to summarise in every frame.

    Returns:
        DataFrame with one row per entry of `dataframes` and one column per
        statistic.

    Side effects:
        Sets the global pandas float display format to two decimals with a
        thousands separator.
    """
    per_frame = {}
    for label, frame in dataframes.items():
        per_frame[label] = count_stats(key, frame)

    table = pd.DataFrame(per_frame).transpose()
    pd.options.display.float_format = '{:,.2f}'.format
    return table


In [None]:
def print_unique_counts(dataframes):
    """Print a table of distinct-value counts per column and per frame.

    Args:
        dataframes: Mapping of label -> DataFrame.

    The printed table has one row per column name and one column per label.
    Columns holding unhashable values (e.g. lists) are counted on their
    string representation instead.
    """
    def _distinct(series):
        # nunique() raises TypeError for unhashable cell values.
        try:
            return series.nunique()
        except TypeError:
            return series.apply(str).nunique()

    summary = {
        label: {column: _distinct(frame[column]) for column in frame.columns}
        for label, frame in dataframes.items()
    }
    print(pd.DataFrame(summary))

In [159]:
def create_summarized_df(dataframes):
    """Stack the numeric summary columns of several frames into one table.

    Args:
        dataframes: Mapping of label -> DataFrame.

    Returns:
        DataFrame with the columns 'CPU', 'Memory_GB', 'Width Deviation',
        'Height Deviation' and 'Name' (the label of the originating frame).
        Columns missing from a frame, non-numeric cells and NaN all become 0.
        An empty mapping yields an empty frame that still has these columns,
        so a following groupby('Name') keeps working.
    """
    numeric_columns = ['CPU', 'Memory_GB', 'Width Deviation', 'Height Deviation']
    columns = numeric_columns + ['Name']

    # Collect the per-frame pieces and concatenate once; growing a frame with
    # concat inside the loop is quadratic and warns on the empty seed frame.
    pieces = []
    for name, df in dataframes.items():
        piece = pd.DataFrame(index=df.index)
        for col in numeric_columns:
            if col in df.columns:
                # Coerce unparsable cells to NaN, then treat them as 0.
                piece[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            else:
                piece[col] = 0
        piece['Name'] = name
        pieces.append(piece)

    if not pieces:
        return pd.DataFrame(columns=columns)

    return pd.concat(pieces, ignore_index=True).fillna(0)

In [160]:
def create_distribution_plot(df1, df2, df1_title, df2_title, title, x_label, y_label, key):
    """Draw side-by-side bars comparing how often each `key` value occurs.

    Args:
        df1, df2: DataFrames to compare; both must contain column `key`.
            They are not modified.
        df1_title, df2_title: Legend labels for the two frames.
        title: Figure title.
        x_label, y_label: Axis labels.
        key: Column whose value frequencies are plotted.

    Displays the figure via plt.show(); returns None.
    """
    # assign() works on copies, so the callers' frames do not silently gain a
    # 'Title' column (the original wrote it into df1/df2 in place).
    combined = pd.concat([
        df1.assign(Title=df1_title),
        df2.assign(Title=df2_title),
    ])

    counts = combined.groupby([key, 'Title']).size().reset_index(name='Counts')

    df1_counts = counts[counts['Title'] == df1_title].set_index(key)['Counts']
    df2_counts = counts[counts['Title'] == df2_title].set_index(key)['Counts']

    # Use one shared category axis; values seen in only one frame count as 0.
    categories = df1_counts.index.union(df2_counts.index)
    df1_counts = df1_counts.reindex(categories, fill_value=0)
    df2_counts = df2_counts.reindex(categories, fill_value=0)

    bar_width = 0.35
    positions = np.arange(len(categories))

    fig, ax = plt.subplots(figsize=(10, 6))

    ax.bar(positions - bar_width / 2, df1_counts, bar_width, label=df1_title)
    ax.bar(positions + bar_width / 2, df2_counts, bar_width, label=df2_title)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(categories)
    ax.legend()

    plt.tight_layout()
    plt.show()

In [161]:
def create_boxplot(settings, title, x_label, y_label, attr):
    """Box-plot `attr` for every global frame whose label contains `settings`.

    Args:
        settings: Substring selecting entries of the global `dataframes`; it
            is also stripped from the label to form the x-axis category 'OS'.
        title: Figure title.
        x_label, y_label: Axis labels.
        attr: Column plotted on the y axis.

    Displays the figure via plt.show(); returns None.
    """
    selected = []
    for label, frame in dataframes.items():
        if settings in label:
            selected.append(frame.assign(OS=label.replace(settings, "")))
    combined = pd.concat(selected, ignore_index=True)

    plt.figure(figsize=(10, 6))
    sns.boxplot(x='OS', y=attr, data=combined)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.show()

In [162]:
def create_comparing_boxplot(attr, title, x_label, y_label):
    """Box-plot `attr` per OS, split by setting, over the global `dataframes`.

    Each frame is tagged with 'OS' (the first space-separated token of its
    label) and 'Setting' ("Aggressive" when the label contains that word,
    otherwise "Classic").

    Args:
        attr: Column plotted on the y axis.
        title: Figure title.
        x_label, y_label: Axis labels.

    Displays the figure via plt.show(); returns None.
    """
    tagged = []
    for label, frame in dataframes.items():
        if "Aggressive" in label:
            setting = "Aggressive"
        else:
            setting = "Classic"
        tagged.append(frame.assign(OS=label.split(" ")[0], Setting=setting))
    combined_data = pd.concat(tagged, ignore_index=True)

    plt.figure(figsize=(12, 6))
    sns.boxplot(x='OS', y=attr, hue='Setting', data=combined_data, dodge=True)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(title='Setting')

    plt.show()

In [163]:
def create_sma_bar_plot(df, key, title, x_label, y_label):
    """Plot `df[key]` as bars with 5-, 15- and 30-row simple moving averages.

    Args:
        df: DataFrame containing column `key`. It is not modified.
        key: Column to plot (a string; it is also used in the bar legend).
        title: Figure title.
        x_label, y_label: Axis labels.

    Displays the figure via plt.show(); returns None.
    """
    palette = sns.color_palette("hls")

    values = df[key]
    # Bars and lines share explicit row positions. The original drew bars at
    # 0..n-1 but the lines at the frame's index, which misaligns them for any
    # non-default index.
    positions = range(len(values))

    plt.figure(figsize=(14, 7))

    bar_label = key + " Count"
    plt.bar(positions, values, label=bar_label)

    # Averages are computed on local Series instead of being written back as
    # '5SMA' / '15SMA' / '30SMA' columns on the caller's frame.
    for window, color in zip((5, 15, 30), (palette[2], palette[3], palette[4])):
        sma = values.rolling(window=window).mean()
        plt.plot(positions, sma.to_numpy(), label='{} SMA'.format(window),
                 linewidth=2, color=color)

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()

    plt.tight_layout()
    plt.show()

In [164]:
def create_correlation_matrix4(df):
    """Show a heatmap of the pairwise correlations of four numeric columns.

    Args:
        df: DataFrame containing 'CPU', 'Memory_GB', 'Width Deviation' and
            'Height Deviation'.

    Displays the figure via plt.show(); returns None.
    """
    correlation_matrix = df[
        ['CPU', 'Memory_GB', 'Width Deviation', 'Height Deviation']
    ].corr()

    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Matrix of Browser Attributes")
    plt.show()

In [None]:

def create_freq_table(key):
    """Count how often each `key` value occurs in the two Safari captures.

    Reads the global frames `pc_c_safari` and `pc_c_safari_private`.

    Args:
        key: Column whose values are counted.

    Returns:
        Integer DataFrame indexed by the observed values (index named `key`)
        with the columns "PC C Safari" and "PC C Safari Private"; a value
        absent from one capture counts as 0 there.
    """
    freq_df = pd.DataFrame({
        "PC C Safari": pc_c_safari[key].value_counts(),
        "PC C Safari Private": pc_c_safari_private[key].value_counts(),
    })

    # Values seen in only one capture come out as NaN; make them integer 0.
    freq_df = freq_df.fillna(0).astype(int)

    # Naming the index directly replaces the former reset_index / rename /
    # set_index round-trips, which produced the same table but raised
    # KeyError when key == 'index'.
    freq_df.index.name = key

    return freq_df

In [166]:
def safe_string_to_dict(s):
    """Parse a Python-literal string, falling back to an empty dict.

    Args:
        s: Text expected to hold a Python literal (typically a dict).

    Returns:
        The evaluated literal, or {} when `s` cannot be evaluated.
    """
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval signals bad input with more than ValueError:
        # SyntaxError for text that does not parse, TypeError for e.g.
        # unhashable dict keys.
        return {}

# Common statistics

In [167]:
# Safari captures from PC C (judging by the file names: one regular session
# and one Private-mode session).
pc_c_safari = load_df("../data/browser_data/PC_C_MacOS_Safari.csv")
pc_c_safari_private = load_df("../data/browser_data/PC_C_MacOS_Safari_Private.csv")

# Label -> frame registry read by the helper functions that rely on the
# global `dataframes`.
dataframes = {}
dataframes["PC C MacOS Safari"] = pc_c_safari
dataframes["PC C MacOS Safari Private"] = pc_c_safari_private

In [168]:
# Per-capture mean / median / std / min / max of the numeric summary columns.
summary_statistics = (
    create_summarized_df(dataframes)
    .groupby('Name')
    .agg(['mean', 'median', 'std', 'min', 'max'])
)
summary_statistics

Unnamed: 0_level_0,CPU,CPU,CPU,CPU,CPU,Memory_GB,Memory_GB,Memory_GB,Memory_GB,Memory_GB,Width Deviation,Width Deviation,Width Deviation,Width Deviation,Width Deviation,Height Deviation,Height Deviation,Height Deviation,Height Deviation,Height Deviation
Unnamed: 0_level_1,mean,median,std,min,max,mean,median,std,min,max,mean,median,std,min,max,mean,median,std,min,max
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
PC C MacOS Safari,8.0,8.0,0.0,8,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
PC C MacOS Safari Private,8.0,8.0,0.0,8,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0


In [190]:
# Regular mode: the unique-value counts show that Safari behaves as expected,
# i.e. no attribute is randomised.
#
# Private mode: Safari also behaves as expected,
# i.e. only the Canvas, WebGL and Audio values are randomised.

print_unique_counts(dataframes)

                            PC C MacOS Safari  PC C MacOS Safari Private
ID                                          0                          0
Log                                         0                          0
AttributesHash                              1                          1
Audio                                       1                         50
Fonts                                       1                          1
Geom Canvas                                 1                         50
Media Capabilities                          1                          1
MediaHash                                   1                          1
Name                                        0                          0
Plugins                                     1                          1
PluginsHash                                 1                          1
TXT Canvas                                  1                         50
Memory                                      1      

# Analysis of CPU cores

In [183]:
# CPU core count: no randomisation found in either capture.

cpu_stats = get_stats('CPU')

print("CPU", cpu_stats, sep="\n")

CPU
                           mean  median  range  var  max  min  std
PC C MacOS Safari          8.00    8.00   0.00 0.00 8.00 8.00 0.00
PC C MacOS Safari Private  8.00    8.00   0.00 0.00 8.00 8.00 0.00


# Memory analysis

In [None]:
# There is no point in analysing Memory, as the Memory API is not supported in Safari.