### Generating Combined CSV Files

In [24]:
import os
import pandas as pd

def make_citizen_lab_list(csv_directory="../test-lists/lists/",
                          output_csv="citizen_lab_censored.csv",
                          output_txt="citizen_lab_censored.txt"):
    """Merge every CSV file in a directory into one CSV plus a URL-only text file.

    Args:
        csv_directory: Folder holding the CSV files to merge. The default points
            at a local clone of https://github.com/ooni/test-lists/tree/master.
        output_csv: Path of the combined CSV file to write.
        output_txt: Path of the text file that receives the ``url`` column,
            one value per line, without a header.

    Returns:
        None. The results are written to ``output_csv`` and ``output_txt``.

    Raises:
        FileNotFoundError: If ``csv_directory`` is not an existing directory.
        ValueError: If the directory contains no ``.csv`` files.
        KeyError: If the combined data has no ``url`` column.
    """
    if not os.path.isdir(csv_directory):
        # Raise instead of calling exit(): exit() is meant for interactive shells
        # and would end the whole session rather than report a catchable error.
        raise FileNotFoundError(f"Directory {csv_directory} does not exist.")

    # os.listdir returns names in arbitrary order; sort so the combined file
    # has the same row order on every machine and every run.
    csv_files = sorted(file for file in os.listdir(csv_directory) if file.endswith('.csv'))
    if not csv_files:
        raise ValueError(f"No CSV files found in {csv_directory}")

    # Read and combine all CSV files into one DataFrame
    combined_df = pd.concat([pd.read_csv(os.path.join(csv_directory, file)) for file in csv_files])

    # Save the combined DataFrame to a new CSV file
    combined_df.to_csv(output_csv, index=False)
    print(f"Combined CSV saved as '{output_csv}'")

    # Write only the 'url' column to a text file
    combined_df["url"].to_csv(output_txt, index=False, header=False)


In [43]:
import os
import pandas as pd

def _concat_or_empty(frames, label, directory):
    """Concatenate ``frames``; return an empty DataFrame (with a notice) when there are none."""
    if not frames:
        # pd.concat raises "No objects to concatenate" on an empty list
        print(f"No readable {label} files found under {directory}")
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def combine_csvs(directory = "results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/", name = ""):
    """Collect the per-run metric CSVs below ``directory`` into two DataFrames.

    Walks ``directory`` recursively. Every file named ``metrics_all.csv`` goes
    into the all-traffic frame and every file named ``metrics_website_only.csv``
    goes into the website-only frame. A file that cannot be parsed is reported
    with a printed message and skipped.

    Args:
        directory: Root folder to search.
        name: Optional output prefix. When non-empty, the combined frames are
            saved as ``<name>_all_traffic.csv`` and ``<name>_website_only.csv``.
            A frame for which no source file was readable is not written.

    Returns:
        Tuple ``(all_traffic_combined_df, website_traffic_combined_df)``. Either
        element is an empty DataFrame when no matching file could be read.
    """
    # List to store DataFrames
    all_traffic_dataframes = []
    website_traffic_dataframes = []
    targets = {
        "metrics_all.csv": all_traffic_dataframes,
        "metrics_website_only.csv": website_traffic_dataframes,
    }

    # Iterate through all files in the directory
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a reproducible order
        dirs.sort()
        for file in sorted(files):
            if file not in targets:
                continue
            file_path = os.path.join(root, file)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                # Skip the unreadable file; without this the previous file's
                # frame would be appended a second time (or df would be unbound).
                continue
            targets[file].append(df)

    # Combine all DataFrames into one
    all_traffic_combined_df = _concat_or_empty(all_traffic_dataframes, "metrics_all.csv", directory)
    website_traffic_combined_df = _concat_or_empty(website_traffic_dataframes, "metrics_website_only.csv", directory)

    if name:
        # Save the combined DataFrame to a new CSV file (only when it has sources)
        if all_traffic_dataframes:
            all_traffic_combined_df.to_csv(name + "_all_traffic.csv", index=False)
        if website_traffic_dataframes:
            website_traffic_combined_df.to_csv(name + "_website_only.csv", index=False)
    return all_traffic_combined_df, website_traffic_combined_df

# Build the combined frames for both conditions and save them under the
# "combined_<condition>" prefixes that the analysis cells read back from disk.
baseline_all, baseline_website = combine_csvs(directory = "results/baseline", name = "combined_baseline")
# Prefix spelled "combined_dpyproxy" so it matches the file name loaded in the analysis section
dpyproxy_all, dpyproxy_website = combine_csvs(directory = "results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/", name = "combined_dpyproxy")

Error reading results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/bolod_mn__/metrics_website_only.csv: Error tokenizing data. C error: Expected 41 fields in line 5, saw 52



In [None]:
# Example: how many rows each website contributes to the combined frame
dpyproxy_website.loc[:, "website"].value_counts()

website
http://info.mn/                      10
http://eagle.mn/                     10
http://everyday.mn/                  10
http://dorgio.mn/                    10
http://www.smhric.org/               10
http://www.ikon.mn/                  10
http://zone.mn/                      10
http://paparatsi.mn/                 10
https://www.news.mn/                 10
http://bataar.mn/                    10
http://mnews.mn/                     10
http://mongolnews.mn/                10
http://wikimon.mn/                   10
http://goolingoo.mn/                 10
http://shuurhai.mn/                  10
http://assa.mn/                      10
http://sonin.mn/                     10
http://16honeys.com/                 10
http://sportnews.mn/                 10
http://www.24tsag.mn/                10
http://www.shuud.mn/                 10
http://chuhal.mn/                    10
http://mnb.mn/                       10
http://vip76.mn/                     10
http://ugluu.mn/                

### Analysis 

In [50]:
def calculate_col_means(df1, df2, df1_name = "DF1", df2_name = "DF2"):
    """Build a table comparing the numeric column means of two DataFrames.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        df1_name: Column label used for the means of ``df1``.
        df2_name: Column label used for the means of ``df2``.

    Returns:
        DataFrame indexed by column name with one column of means per input
        and a ``Difference`` column holding ``df1`` mean minus ``df2`` mean.
    """
    # Non-numeric columns are left out of the means
    means_by_label = {
        df1_name: df1.mean(numeric_only=True),
        df2_name: df2.mean(numeric_only=True),
    }

    result = pd.DataFrame(means_by_label)
    result['Difference'] = result[df1_name].sub(result[df2_name])
    return result

# Load the combined website-only metrics for both conditions from their CSV files
baseline_website = pd.read_csv("combined_baseline_website_only.csv")
dpyproxy_website = pd.read_csv("combined_dpyproxy_website_only.csv")

# Display the per-column mean comparison (NOTE: website traffic only);
# the Difference column is Dpyproxy minus Baseline
calculate_col_means(
    df1=dpyproxy_website,
    df2=baseline_website,
    df1_name="Dpyproxy",
    df2_name="Baseline",
)

Unnamed: 0,Dpyproxy,Baseline,Difference
all_packet_average_size,500.273431,624.345146,-124.0717
all_packet_count,155.762115,144.81681,10.9453
all_packet_max_size,1041.387665,1145.37931,-103.9916
all_packet_min_size,56.123348,59.49569,-3.372342
all_packet_total_size,118406.240088,128080.762931,-9674.523
data_avg_payload_size,36.799559,41.789665,-4.990106
data_count,11.469163,10.10917,1.359993
data_max_payload_size,36.799559,43.362445,-6.562886
data_min_payload_size,36.799559,39.644105,-2.844545
data_total_payload_size,76.517621,129.635371,-53.11775


In [47]:
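### Open questions / next steps:
### - Compare average RTT against data size.
### - Current runs use fragment size 20.
### - What is the average header size?
### - Try an extreme fragment size, e.g. 2?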

In [None]:
from scipy.stats import ttest_ind

# FIX: Compare distributions rather than means

# Average every numeric metric per website, separately for each condition
dpyproxy_grouped = dpyproxy_website.groupby('website').mean(numeric_only=True)
baseline_grouped = baseline_website.groupby('website').mean(numeric_only=True)

# Side-by-side table: mean of the per-website means for each metric
condition_means = {
    "Baseline": baseline_grouped.mean(numeric_only=True),
    "DPYProxy": dpyproxy_grouped.mean(numeric_only=True),
}
comparison = pd.DataFrame(condition_means)
comparison['Difference'] = comparison["DPYProxy"].sub(comparison["Baseline"])

# Two-sample t-test without the equal-variance assumption (NaNs omitted),
# run for every metric that exists in both grouped frames
shared_metrics = [metric for metric in baseline_grouped.columns if metric in dpyproxy_grouped.columns]
p_values = {
    metric: ttest_ind(
        baseline_grouped[metric],
        dpyproxy_grouped[metric],
        equal_var=False,
        nan_policy='omit',
    ).pvalue
    for metric in shared_metrics
}

# Attach the p-values and flag the metrics with p < 0.05
comparison['P-Value'] = pd.Series(p_values)
comparison['Significant'] = comparison['P-Value'].lt(0.05)

# Display the comparison dataframe
# TODO: differentiate header vs payload
# TODO: check
comparison

Unnamed: 0,Baseline,DPYProxy,Difference,P-Value,Significant
all_packet_average_size,616.374782,493.748126,-122.6267,0.087882,False
all_packet_count,142.968085,153.730435,10.76235,0.748926,False
all_packet_max_size,1130.757447,1027.804348,-102.9531,0.307947,False
all_packet_min_size,58.73617,55.391304,-3.344866,0.41433,False
all_packet_total_size,126445.689362,116861.81087,-9583.878,0.775586,False
data_avg_payload_size,40.722695,36.319565,-4.40313,0.831503,False
data_count,9.884397,11.319565,1.435168,0.680246,False
data_max_payload_size,42.255319,36.319565,-5.935754,0.776209,False
data_min_payload_size,38.631915,36.319565,-2.31235,0.91008,False
data_total_payload_size,126.325532,75.519565,-50.80597,0.509264,False
