# Process each harvest's HTTP responses

Load the HTTP responses of every harvest, find the root domain of every response URL and of the originally requested site, and export the result as CSV.

    • input: .sqlite database file of each harvest
    • output: http_responses enriched with the root domain of each visited site and of each response site, plus columns indicating whether each response is first-party (FP-FP) or third-party (FP-TP) – CSV format
    • script steps:
        1. Import libraries
        2. Iterate through extracted folders of all harvests:
            (a) From an .sqlite database file load http_responses and site_visits tables as separate Dataframes (DFs)
            (b) Merge responses and visited sites into a single DF based on the visit_id and crawl_id columns of both DFs
            (c) In the merged DF find a Root domain (RD) of every originally requested site – FP (site_url column) and all response sites (url column) and assign root domains to new columns - RD_site_url, RD_url respectively
            (d) Compare the RD_url and RD_site_url columns to find out whether the response RD matches the originally requested FP RD. If so, assign True to the new DF column – first_party, else assign False. A second new column – third_party – holds the opposite value
            (e) Export http_responses enriched with a root domain of each visited site and response site, and columns indicating if response is FP-FP or FP-TP – CSV format

In [1]:
# Import
import sqlite3
import re
import os
import socket
import pandas as pd
import numpy as np
from tld import get_tld, is_tld 

In [2]:
# Load a database file and return the HTTP responses and visited sites tables as dataframes
def load_tables(folder_name, file_name):
    """Load the HTTP responses and site visits tables of one harvest.

    Opens the SQLite database at
    ``/home/ubuntu/data/crawl_datasets/<folder_name>/<file_name>`` and reads
    the complete ``http_responses`` and ``site_visits`` tables.

    Args:
        folder_name: Name of the harvest folder inside the crawl datasets
            directory.
        file_name: Name of the SQLite database file inside that folder.

    Returns:
        A tuple ``(df_res, df_visited)`` of pandas DataFrames holding the
        ``http_responses`` and ``site_visits`` tables respectively.

    Raises:
        sqlite3.Error: If the database file cannot be opened. The error is
            printed before being re-raised.
    """
    db_path = "/home/ubuntu/data/crawl_datasets/" + folder_name + "/" + file_name

    try:
        conn = sqlite3.connect(db_path)
    except sqlite3.Error as e:
        # Without a connection nothing below can work, so report and stop
        # here instead of failing later on a None connection.
        print(e)
        raise

    # Close the connection even when one of the queries raises.
    try:
        df_res = pd.read_sql_query("SELECT * FROM http_responses", conn)
        df_visited = pd.read_sql_query("SELECT * FROM site_visits", conn)
    finally:
        conn.close()

    print('HTTP responses and site visits loaded')
    return (df_res, df_visited)

In [3]:
# Take a dataframe and the name of one of its URL columns, and return a list with the root domain of every URL in that column
def get_root_url(df, column):
    """Return the root domain of every value in ``df[column]``.

    Each value is converted to a string and resolved as follows:

    1. If ``get_tld`` can parse it, its first-level domain (``.fld``) is used.
    2. Otherwise, if it contains something that looks like an IPv4 address,
       a reverse DNS lookup is attempted and the returned host name is used;
       when the lookup fails, the matched address itself is used.
    3. Otherwise the original string is used unchanged.

    Args:
        df: DataFrame holding the URLs.
        column: Name of the column in ``df`` to process.

    Returns:
        A list of strings with one entry per row of ``df``, in row order.
    """
    # Raw string so that the regex escapes are not read as string escapes.
    # NOTE: \W accepts any non-word separator between the number groups,
    # not only '.', exactly as the original pattern did.
    ip_pattern = re.compile(r'(\d{1,3}\W\d{1,3}\W\d{1,3}\W\d{1,3})')

    # The same URL usually occurs many times in a column, so each distinct
    # value is resolved once; this also avoids repeating DNS lookups.
    resolved = {}
    root_url = []
    for site in df[column].astype(str).tolist():
        if site not in resolved:
            resolved[site] = _resolve_root_domain(site, ip_pattern)
        root_url.append(resolved[site])

    return root_url


def _resolve_root_domain(site, ip_pattern):
    """Resolve one URL string to a root domain, host name, IP or itself."""
    root = get_tld(site, as_object=True, fail_silently=True)

    # A TLD could be obtained: the first-level domain is the root domain.
    if root is not None:
        return root.fld

    # Not parseable as a domain: check whether it is an IP address.
    ip_address = ip_pattern.findall(site)
    if not ip_address:
        # Neither a domain nor an IP address: keep the original URL.
        return site

    # Try a reverse DNS lookup to obtain a host name for the address.
    try:
        return socket.gethostbyaddr(ip_address[0])[0]
    except (OSError, ValueError):
        # Lookup failed (socket errors derive from OSError) or the matched
        # text is not a usable address: fall back to the matched text.
        return ip_address[0]

In [6]:
# Get all folder names in the harvest data location
crawl_root = '/home/ubuntu/data/crawl_datasets/'
list_of_crawl_folders = os.listdir(crawl_root)

# Define a database file name
file_name = 'crawl-data.sqlite'

# Define variables
i = 0  # number of CSV files generated so far
# Harvest folder names start with the crawl date (YYYY?MM?DD, any non-digit
# separator). Raw string so the regex escapes are not read as string escapes.
re_pattern = r'\d{4}\D\d{2}\D\d{2}'

# Columns kept from the merged responses / site visits dataframe
merged_columns = ['crawl_id', 'visit_id', 'url', 'site_url', 'method', 'referrer',
                  'response_status', 'response_status_text', 'is_cached', 'headers',
                  'channel_id', 'time_stamp']

# Loop through all folder names
for folder_name in list_of_crawl_folders:
    print(folder_name)

    if folder_name == '.ipynb_checkpoints':
        continue

    # Get the crawl date from the folder name; entries that do not start with
    # a date are not harvest folders and are skipped instead of crashing on
    # a None match.
    date_match = re.match(re_pattern, folder_name)
    if date_match is None:
        print('Skipped: folder name does not start with a crawl date')
        continue
    crawl_date = date_match.group()
    print(os.listdir(crawl_root + folder_name))

    # Load tables to dataframes
    df_res, df_visited = load_tables(folder_name, file_name)

    # Merge df_res with df_visited on visit_id and crawl_id so that we have
    # one dataframe containing all info needed
    df_res_merged = df_res.merge(df_visited, on=['visit_id', 'crawl_id'])[merged_columns]
    print('Finished merging')

    # Find root domain of url accessed and originally visited site
    df_res_merged['RD_url'] = get_root_url(df_res_merged, 'url')
    df_res_merged['RD_site_url'] = get_root_url(df_res_merged, 'site_url')
    print('Finished getting root URL')

    # A response is first party (FP) when its root domain equals the root
    # domain of the originally visited site, and third party (TP) otherwise
    is_first_party = df_res_merged['RD_url'] == df_res_merged['RD_site_url']
    df_res_merged['first_party'] = is_first_party
    df_res_merged['third_party'] = ~is_first_party
    print('Finished creating FP and TP columns')

    # Generate the .csv file for the enriched responses dataframe.
    # NOTE(review): the file name depends only on the crawl date, so two
    # harvest folders with the same date would write to the same file -
    # confirm that dates are unique across folders.
    df_res_merged.to_csv(r'/home/ubuntu/data/processed/crawls/response/RES-' + crawl_date + '.csv',
                         index=False, header=True)
    i += 1
    print('Finished generating CSVs', i)

print('Completed')
    
    
    

2018-06-07-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', 'openwpm.log']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 1
.ipynb_checkpoints
2018-07-06-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', 'openwpm.log']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 2
2018-02-07-harvest-WITH_cookies-NO_js-NO_login
['crawl-data.sqlite', 'openwpm.log', 'sokol-links-to-visit-no-sublinks.txt', 'sokol-links-to-visit.txt']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 3
2018-05-19-harvest-WITH_cookies-WITH_js-NO_login
['sokol-urls.

HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 27
2018-11-19-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', 'openwpm.log']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 28
2018-06-01-harvest-WITH_cookies-WITH_js-NO_login
['sokol-urls.txt', 'crawl-data.sqlite', 'sokol-harvest-links.py', 'sokol-readme.txt', 'sokol-extract-links.py', 'openwpm.log', 'sokol-links-to-visit-no-sublinks.txt', 'sokol-links-to-visit.txt']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 29
2019-06-28-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', '

HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 54
2020-02-07-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', 'openwpm.log']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 55
2019-10-16-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', 'openwpm.log']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
Finished creating list of unique TPs
Finished generating CSVs 56
2019-12-22-harvest-WITH_cookies-WITH_js-NO_login
['crawl-data.sqlite', 'sources', 'screenshots', 'openwpm.log']
HTTP responses and site visits loaded
Finished merging
Finished getting root URL
Finished creating FP and TP columns
F