# Other analysis scripts

In [1]:
# Directory containing the CSV files compared below; this is a relative
# path, so it is resolved against the current working directory.
CSV_DIR = './files/csv'

## CSV comparison

In [9]:
import csv

def compute_jaccard_simmilarity(file1, file2):
    """
    Compute the Jaccard similarity between the keys of two CSV files.

    The key of a row is the value in its first column. The first row of
    each file is treated as a header and ignored; blank lines are skipped.

    Args:
        file1: Path to the first CSV file.
        file2: Path to the second CSV file.

    Returns:
        float: ``len(A & B) / len(A | B)`` where A and B are the sets of
        keys of the two files. When neither file contains any key the
        result is defined as 1.0 (two empty sets are identical).

    Raises:
        OSError: If either file cannot be opened.
    """
    key_sets = []
    for path in (file1, file2):
        # newline='' lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(path, 'r', newline='') as f:
            reader = csv.reader(f)

            # Skip the header; the default avoids StopIteration when the
            # file is completely empty.
            next(reader, None)

            # A blank line is parsed as an empty row, which has no first
            # column, so it must be filtered out before indexing.
            key_sets.append({row[0] for row in reader if row})

    keys1, keys2 = key_sets

    # Jaccard similarity, i.e., intersection / union
    union = keys1 | keys2
    if not union:
        # Neither file has keys: avoid dividing by zero.
        return 1.0

    return len(keys1 & keys2) / len(union)
    
def get_intersection(file1, file2):
    """
    Return the keys that appear in both of two CSV files.

    The key of a row is the value in its first column. The first row of
    each file is treated as a header and ignored; blank lines are skipped.

    Args:
        file1: Path to the first CSV file.
        file2: Path to the second CSV file.

    Returns:
        set: The first-column values present in both files (empty when
        the files share no key or either file has no data rows).

    Raises:
        OSError: If either file cannot be opened.
    """
    key_sets = []
    for path in (file1, file2):
        # newline='' lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(path, 'r', newline='') as f:
            reader = csv.reader(f)

            # Skip the header; the default avoids StopIteration when the
            # file is completely empty.
            next(reader, None)

            # A blank line is parsed as an empty row, which has no first
            # column, so it must be filtered out before indexing.
            key_sets.append({row[0] for row in reader if row})

    keys1, keys2 = key_sets
    return keys1 & keys2


# Comparison 1: headless vs. native key lists for "Rejected > Accepted".
# Prints the Jaccard similarity followed by the set of shared keys.
file1_path, file2_path = (
    CSV_DIR + '/headless_more_rejected_than_accepted.csv',
    CSV_DIR + '/native_more_rejected_than_accepted.csv',
)
similarity_score = compute_jaccard_simmilarity(file1_path, file2_path)
print(f"Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted: {similarity_score}")
print(get_intersection(file1_path, file2_path))

# Comparison 2: headless vs. native key lists for "Rejected > No Interaction".
# Same two-line report as above.
file1_path, file2_path = (
    CSV_DIR + '/headless_more_rejected_than_no_interaction.csv',
    CSV_DIR + '/native_more_rejected_than_no_interaction.csv',
)
similarity_score = compute_jaccard_simmilarity(file1_path, file2_path)
print(f"Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction: {similarity_score}")
print(get_intersection(file1_path, file2_path))

Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted: 0.25925925925925924
{'https://worldcat.org', 'https://cancer.org', 'https://avg.com', 'https://match.com', 'https://amplitude.com', 'https://rackspacecloud.com', 'https://bmj.com'}
Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction: 0.6394557823129252
{'https://gumgum.com', 'https://oclc.org', 'https://digicert.com', 'https://barracudanetworks.com', 'https://ccleaner.com', 'https://greenhouse.io', 'https://doodle.com', 'https://tfl.gov.uk', 'https://elsevierhealth.com', 'https://comodo.com', 'https://watchguard.com', 'https://askubuntu.com', 'https://siteground.us', 'https://thelancet.com', 'https://match.com', 'https://urbanairship.com', 'https://teads.tv', 'https://king.com', 'https://technologyreview.com', 'https://senderscore.com', 'https://adtelligent.com', 'https://premierleague.com', 'https://algolianet.com', 'https://docker.com', 'https://p