### Generating Combined CSV Files

In [54]:
import os
import pandas as pd

def make_citizen_lab_list():
    # Directory containing the CSV files
    csv_directory = "../test-lists/lists/" # From cloned github: https://github.com/ooni/test-lists/tree/master
    # Ensure the directory exists
    if not os.path.exists(csv_directory):
        print(f"Directory {csv_directory} does not exist.")
        exit(1)

    # List all CSV files in the directory
    csv_files = [file for file in os.listdir(csv_directory) if file.endswith('.csv')]

    # Read and combine all CSV files into one DataFrame
    combined_df = pd.concat([pd.read_csv(os.path.join(csv_directory, file)) for file in csv_files])

    # Save the combined DataFrame to a new CSV file
    combined_df.to_csv("citizen_lab_censored.csv", index=False)

    print("Combined CSV saved as 'citizen_lab_censored.csv'")

    # Write only the 'url' column to a text file
    a = combined_df["url"]    
    a.to_csv("citizen_lab_censored.txt", index=False, header=False)
    return 


In [73]:
import os
import pandas as pd

def combine_csvs(directory = "results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/", name = ""): 
  
    # List to store DataFrames
    all_traffic_dataframes = []
    website_traffic_dataframes = []

    # Iterate through all files in the directory
    for root, _, files in os.walk(directory):
   
        for file in files:
            if file == "metrics_all.csv":
                file_path = os.path.join(root, file)
                try:
                    df = pd.read_csv(file_path)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
                all_traffic_dataframes.append(df)
            elif file == "metrics_website_only.csv":
                try:
                    file_path = os.path.join(root, file)
                    df = pd.read_csv(file_path)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
                    continue
                website_traffic_dataframes.append(df)

    # Combine all DataFrames into one
    all_traffic_combined_df = pd.concat(all_traffic_dataframes, ignore_index=True)
    website_traffic_combined_df = pd.concat(website_traffic_dataframes, ignore_index=True)    

    if name: 
        # Save the combined DataFrame to a new CSV file
        all_traffic_combined_df.to_csv(name + "_all_traffic.csv", index=False)
        website_traffic_combined_df.to_csv(name + "_website_only.csv", index=False)
    return all_traffic_combined_df, website_traffic_combined_df

In [78]:
baseline_all, baseline_website = combine_csvs(directory = "results/baseline", name = "combined_baseline")
dpyproxy_all, dpyproxy_website = combine_csvs(directory = "results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/", name = "combined_dpyproxy_20")
dpyproxy_all, dpyproxy_website = combine_csvs(directory = "results/dpyproxy/frag_size=5__tcp_frag=True__record_frag=False/", name = "combined_dpyproxy_5")

Error reading results/baseline/dpyproxy=False/_wwf_org_my__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 3, saw 50

Error reading results/baseline/dpyproxy=False/_church_com_my__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 4, saw 50

Error reading results/baseline/dpyproxy=False/_mca_org_my__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 3, saw 50

Error reading results/baseline/dpyproxy=False/ecentral_my__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 3, saw 50

Error reading results/baseline/dpyproxy=False/_malaysia-today_net__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 3, saw 50

Error reading results/baseline/dpyproxy=False/_asianmigrantcentre_org__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 3, saw 50

Error reading results/dpyproxy/frag_size=

### Analysis 

Avg RTT vs Data Size. 
Frag Size 20 
Header size? What is average? 
Try Extreme: 2? 

In [72]:
def calculate_col_means(df1, df2, df1_name = "DF1", df2_name = "DF2"): 

    # Calculate the mean of numerical columns for both dataframes
    df1_means= df1.mean(numeric_only=True)
    df2_means = df2.mean(numeric_only=True)

    # Combine the means into a single dataframe for comparison
    comparison = pd.DataFrame({df1_name: df1_means, df2_name: df2_means})
    comparison['Difference'] = comparison[df1_name] - comparison[df2_name]
    return comparison

In [76]:
from scipy.stats import ttest_ind

# FIX: Compare distributions rather than means 

def calculate_sig_diff_means(baseline, dpyproxy):
    # Group by website and calculate the mean for each metric
    baseline_grouped = baseline.groupby('website').mean(numeric_only=True)
    dpyproxy_grouped = dpyproxy.groupby('website').mean(numeric_only=True)

    # Combine the means into a single dataframe for comparison
    comparison = pd.DataFrame({
        "Baseline": baseline_grouped.mean(numeric_only=True),
        "DPYProxy": dpyproxy_grouped.mean(numeric_only=True)
    })
    comparison['Difference'] = comparison["DPYProxy"] - comparison["Baseline"]

    # Perform a T-test for each metric
    p_values = {}
    for column in baseline_grouped.columns:
        if column in dpyproxy_grouped.columns:
            t_stat, p_val = ttest_ind(baseline_grouped[column], dpyproxy_grouped[column], equal_var=False, nan_policy='omit')
            p_values[column] = p_val

    # Add p-values to the comparison dataframe
    comparison['P-Value'] = pd.Series(p_values)
    # Add a column to indicate whether the P-Value is significant (e.g., p < 0.05)
    comparison['Significant'] = comparison['P-Value'] < 0.05
    # comparison['Difference'] = comparison["DPYProxy"] - comparison["Baseline"]

    # Display the comparison dataframe
    # Differentiate header vs payload 
    # Check 
    return comparison



In [85]:
from scipy.stats import ks_2samp

def run_ks_test_with_comparison(baseline, dpyproxy): 
    # Group by website and calculate the mean for each metric
    baseline_grouped = baseline.groupby('website').mean(numeric_only=True)
    dpyproxy_grouped = dpyproxy.groupby('website').mean(numeric_only=True)

    # Perform a Kolmogorov-Smirnov test for each metric to compare distributions
    ks_test_results = {}
    for column in baseline_grouped.columns:
        if column in dpyproxy_grouped.columns:
            ks_stat, p_val = ks_2samp(baseline_grouped[column].dropna(), dpyproxy_grouped[column].dropna())
            ks_test_results[column] = {"KS-Statistic": ks_stat, "P-Value": p_val}

    # Convert the results into a DataFrame for better visualization
    ks_comparison = pd.DataFrame(ks_test_results).T
    ks_comparison['Significant'] = ks_comparison['P-Value'] < 0.07

    # Add a column to indicate whether the value is greater or less than the baseline
    ks_comparison['Comparison'] = [
        "Greater" if dpyproxy_grouped[column].mean() > baseline_grouped[column].mean() else "Less"
        for column in ks_comparison.index
    ]

    ks_sig = ks_comparison[ks_comparison['Significant'] == True].sort_values(by="P-Value", ascending=True)
    return ks_comparison, ks_sig


In [89]:
# Filter for a specific website 
dpyproxy_website20 = pd.read_csv("combined_dpyproxy_20_website_only.csv")
dpyproxy_website5 = pd.read_csv("combined_dpyproxy_5_website_only.csv")
baseline_website = pd.read_csv("combined_baseline_website_only.csv")

ks_comparison20, ks_sig20 = run_ks_test_with_comparison(baseline_website, dpyproxy_website20)
ks_comparison20[ks_comparison20['Significant'] == False].sort_values(by="P-Value", ascending=False)


Unnamed: 0,KS-Statistic,P-Value,Significant,Comparison
udp_packet_min_size,0.0,1.0,False,Less
dns_incomplete_transactions,0.0,1.0,False,Less
udp_packet_max_size,0.0,1.0,False,Less
udp_packet_total_size,0.0,1.0,False,Less
udp_packet_count,0.0,1.0,False,Less
tcp_packet_min_size,0.041098,0.997721,False,Less
ip_packet_min_size,0.041098,0.997721,False,Less
all_packet_min_size,0.041098,0.997721,False,Less
data_total_payload_size,0.054924,0.950158,False,Less
data_min_payload_size,0.054924,0.950158,False,Less


In [88]:
ks_comparison5, ks_sig5 = run_ks_test_with_comparison(baseline_website, dpyproxy_website5)
#ks_comparison5
ks_sig5


Unnamed: 0,KS-Statistic,P-Value,Significant,Comparison
data_count,0.340302,0.004706,True,Greater
tcp_avg_rtt,0.339479,0.010605,True,Greater
