In [None]:
import os
import pandas as pd

def make_citizen_lab_list():
    # Directory containing the CSV files
    csv_directory = "../test-lists/lists/" # From cloned github: https://github.com/ooni/test-lists/tree/master
    # Ensure the directory exists
    if not os.path.exists(csv_directory):
        print(f"Directory {csv_directory} does not exist.")
        exit(1)

    # List all CSV files in the directory
    csv_files = [file for file in os.listdir(csv_directory) if file.endswith('.csv')]

    # Read and combine all CSV files into one DataFrame
    combined_df = pd.concat([pd.read_csv(os.path.join(csv_directory, file)) for file in csv_files])

    # Save the combined DataFrame to a new CSV file
    combined_df.to_csv("citizen_lab_censored.csv", index=False)

    print("Combined CSV saved as 'citizen_lab_censored.csv'")

    # Write only the 'url' column to a text file
    a = combined_df["url"]    
    a.to_csv("citizen_lab_censored.txt", index=False, header=False)
    return 


In [26]:
import os
import pandas as pd

def combine_csvs(directory = "results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/", name = ""): 
  
    # List to store DataFrames
    all_traffic_dataframes = []
    website_traffic_dataframes = []

    # Iterate through all files in the directory
    for root, _, files in os.walk(directory):
   
        for file in files:
            if file == "metrics_all.csv":
                file_path = os.path.join(root, file)
                try:
                    df = pd.read_csv(file_path)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
                all_traffic_dataframes.append(df)
            elif file == "metrics_website_only.csv":
                try:
                    file_path = os.path.join(root, file)
                    df = pd.read_csv(file_path)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
                    continue
                website_traffic_dataframes.append(df)

    # Combine all DataFrames into one
    all_traffic_combined_df = pd.concat(all_traffic_dataframes, ignore_index=True)
    website_traffic_combined_df = pd.concat(website_traffic_dataframes, ignore_index=True)    

    if name: 
        # Save the combined DataFrame to a new CSV file
        all_traffic_combined_df.to_csv(name + "_all_traffic.xac", index=False)
        website_traffic_combined_df.to_csv(name + "_website_only.csv", index=False)
    return all_traffic_combined_df, website_traffic_combined_df

baseline_all, baseline_website = combine_csvs(directory = "results/baseline", name = "combined_baseline")
dpyproxy_all, dpyproxy_website = combine_csvs(directory = "results/dpyproxy/frag_size=20__tcp_frag=True__record_frag=False/", name = "combined_dpyrpoxy")

Error reading results/baseline/dpyproxy=False/16honeys_com__/metrics_website_only.csv: Error tokenizing data. C error: Expected 39 fields in line 3, saw 50



In [27]:
# Example File
dpyproxy_website.head(5)

Unnamed: 0,website,id,date,layer_counts,all_packet_count,all_packet_sizes,all_packet_total_size,all_packet_min_size,all_packet_max_size,all_packet_average_size,...,dns_response_codes,dns_incomplete_transactions,data_count,data_total_payload_size,data_min_payload_size,data_max_payload_size,data_avg_payload_size,param_frag_size,param_tcp_frag,param_record_frag
0,http://eagle.mn/,3f13f7f6-bbe3-492f-9868-d3b55a06e773,2025-04-07 13:39:42,"{'eth': 73, 'ip': 73, 'tcp': 73, 'DATA': 7, 'h...",73,"[78, 74, 66, 91, 91, 87, 66, 66, 66, 1073, 66,...",11867,66,1354,162.561644,...,{},0,7,0,0,0,0,20,True,False
1,http://olloo.mn/,cb72c66f-2543-4cb7-98af-95545feb888f,2025-04-07 13:39:01,{},0,[],0,0,0,0.0,...,{},0,0,0,0,0,0,20,True,False
2,http://medee.mn/,9b8e5073-a718-4661-b8de-7b193c38c416,2025-04-07 13:34:45,"{'eth': 204, 'ip': 204, 'tcp': 204, 'DATA': 12...",204,"[78, 74, 66, 91, 91, 87, 66, 66, 66, 563, 66, ...",141593,54,1354,694.083333,...,{},0,12,0,0,0,0,20,True,False
3,http://medee.mn/,932ef6b5-911a-4612-a852-8d3c3004c7ba,2025-04-07 13:37:31,"{'eth': 196, 'ip': 196, 'tcp': 196, 'DATA': 12...",196,"[78, 74, 66, 91, 91, 87, 66, 66, 66, 563, 66, ...",141067,54,1354,719.729592,...,{},0,12,0,0,0,0,20,True,False
4,http://123moviesfreez.com/,9596808e-1677-4fe2-be83-84acbd9676d2,2025-04-07 13:32:37,"{'eth': 17, 'ip': 17, 'tcp': 17, 'DATA': 1, 'h...",17,"[78, 58, 54, 79, 79, 79, 60, 54, 54, 54, 54, 9...",1956,54,983,115.058824,...,{},0,1,0,0,0,0,20,True,False


In [29]:
# Filter for a specific website 
website = dpyproxy_website['website'].iloc[0]
dpyproxy = dpyproxy_website[dpyproxy_website['website'] == website]
baseline = baseline_website[baseline_website['website'] == website]

# Calculate the mean of numerical columns for both dataframes
baseline_means = baseline.mean(numeric_only=True)
dpyproxy_means = dpyproxy.mean(numeric_only=True)

# Combine the means into a single dataframe for comparison
comparison = pd.DataFrame({'Baseline': baseline_means, 'Dpyproxy': dpyproxy_means})
comparison['Difference'] = comparison['Dpyproxy'] - comparison['Baseline']

# Display the comparison (NOTE: this is for website traffic)
comparison

Unnamed: 0,Baseline,Dpyproxy,Difference
all_packet_average_size,233.837209,162.561644,-71.275565
all_packet_count,43.0,73.0,30.0
all_packet_max_size,1354.0,1354.0,0.0
all_packet_min_size,66.0,66.0,0.0
all_packet_total_size,10055.0,11867.0,1812.0
data_avg_payload_size,0.0,0.0,0.0
data_count,1.0,7.0,6.0
data_max_payload_size,0.0,0.0,0.0
data_min_payload_size,0.0,0.0,0.0
data_total_payload_size,0.0,0.0,0.0
