# Obtain various metrics from response files

    • input: all enriched responses .csv files
    • output: (i) TPs_per_site.json - number of unique TPs per site (ii) responses_total_combined.json - total number of a) all responses, b) all responses for EU/EEA, c) all responses for EU/EEA with FP-TP communication, and (iii) visited_sites_total_combined.json - total number of visited sites per harvest a) with responses, b) from EU/EEA, c) with FP-TP communication from EU/EEA
    • script steps:
        1. Import libraries
        2. Iterate through all enriched responses files:
            (a) Load a response file as a dataframe
            (b) Filter out all responses to non EU/EEA FP requests
            (c) Filter out all FP-to-FP communication
            (d) Calculate the number of unique TPs per site and append the results as a
            JSON object to the corresponding list
            (e) Calculate the number of responses: (i) all, (ii) EU/EEA only, and (iii) EU/EEA with FP-TP communication, and append the results as a JSON object to the corresponding list
            (f) Calculate the number of visited sites: (i) with responses, (ii) with EU/EEA origin, and (iii) with EU/EEA origin and FP-TP communication, and append the results as a JSON object to the corresponding list
        3. Export each corresponding list as a separate .json file: (i) TPs_per_site, (ii) responses_total_combined, and (iii) visited_sites_total_combined

#### All possible outputs:
    
    1) Count how many responses per site (only TPs, no FP-FP communication)
    2a) Count how many unique TPs per site (only TPs, no FP-FP communication)
    2b) Occurrence of each TP per site root domain
    2c) Occurrence of each TP per site visit
    3a) Count how many unique TPs per FP category (only TPs, no FP-FP communication)
    3b) Count how many unique TPs per FP category per sector (only TPs, no FP-FP communication)
    3c) Count how many unique TPs per FP category per sector per URL type (only TPs, no FP-FP communication)
    4) Occurrence of each TP
    5a) Number of unique TPs per country
    5b) Number of responses per country
    6) Total number of responses per crawl
    7) Total number of unique TPs per crawl
    8) Total number of sites per harvest - number of unique visit_id-s
    9a) Total number of all RES for EU/EEA for FP-TP communication
    9b) Total number of all RES for EU/EEA
    9c) Total number of all RES
    10 a) Total number of visited sites with FP-TP communication from EU/EEA
    10 b) Total number of visited sites from EU/EEA
    10 c) Total number of visited sites with a RES

In [1]:
# Import
import pandas as pd
import numpy as np
import re
import os
import json

In [2]:
# Define function for loading CSVs to DFs
def load_dataset(path, name):
    """Load one CSV file into a pandas DataFrame.

    Args:
        path: Directory that contains the file. A trailing path separator
            is optional.
        name: File name of the CSV inside ``path``.

    Returns:
        The DataFrame returned by ``pd.read_csv``.
    """
    # os.path.join only inserts a separator when one is missing, so both
    # '/data/dir' and '/data/dir/' resolve to the same file. Plain string
    # concatenation would silently build '/data/dirfile.csv' in the first case.
    df = pd.read_csv(os.path.join(path, name))

    print('Dataset ', name, ' loaded')

    return df

In [3]:
# Define function for exporting data in JSON
def export_json(path, data, export_name):
    """Serialize ``data`` as JSON and write it to ``<path>/<export_name>.json``.

    Args:
        path: Output directory. A trailing path separator is optional.
        data: Any object ``json.dumps`` can serialize.
        export_name: Output file name without the ``.json`` extension.

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot encode.
        OSError: If the target file cannot be opened for writing.
    """
    # Serialize before opening the file so a serialization error does not
    # leave an empty/truncated output file behind.
    json_data = json.dumps(data)

    target = os.path.join(path, export_name + '.json')
    # The with-statement closes the handle; no explicit close() is needed.
    with open(target, "w", encoding="utf-8") as json_file:
        json_file.write(json_data)

    print('Exported ', export_name)

In [4]:
# Define function for calling export in JSON with correct parameters
def exporting(export_path, total_number_of_responses_per_site, total_number_of_TPs_per_site,
              TP_occurence_per_site_TLD, TP_occurence_per_site_visit, total_number_of_TPs_cat,
              total_number_of_TPs_cat_sector, total_occurence_of_TP, total_number_of_TPs_country,
              total_number_of_res_country, total_number_of_res, total_number_of_TPs,
              total_number_of_TPs_per_cat_type_sector, total_number_of_sites_per_harvest,
              total_number_of_total_res_COMB, total_number_visited_sites_COMB):
    """Write the enabled metric lists to JSON files via ``export_json``.

    Every metric list is accepted so the call site stays the same whichever
    outputs are switched on; only the uncommented entries below are written.
    Currently enabled: 'TPs_per_site', 'responses_total_combined' and
    'visited_sites_total_combined'.

    Args:
        export_path: Destination passed through to ``export_json``.
        (all remaining arguments): One collected metric list per output,
            numbered 1-10 in the table below.
    """
    # (data, output file name) pairs, numbered like the list of possible
    # outputs. To enable an output, uncomment its entry.
    enabled_outputs = [
        # 1
#        (total_number_of_responses_per_site, 'responses_per_site'),
        # 2a)
        (total_number_of_TPs_per_site, 'TPs_per_site'),
        # 2b)
#        (TP_occurence_per_site_TLD, 'TP_occurence_per_site-TLD'),
        # 2c)
#        (TP_occurence_per_site_visit, 'TP_occurence_per_visit'),
        # 3a
#        (total_number_of_TPs_cat, 'TPs_per_cat'),
        # 3b
#        (total_number_of_TPs_cat_sector, 'TPs_per_cat_per_sector'),
        # 3c
#        (total_number_of_TPs_per_cat_type_sector, 'TPs_per_cat_per_sector_per_URLtype'),
        # 4
#        (total_occurence_of_TP, 'TP_occurance'),
        # 5a
#        (total_number_of_TPs_country, 'TPs_per_country'),
        # 5b
#        (total_number_of_res_country, 'responses_per_country'),
        # 6
#        (total_number_of_res, 'responses_total'),
        # 7
#        (total_number_of_TPs, 'TPs_total'),
        # 8
#        (total_number_of_sites_per_harvest, 'unique_visit_ids_per_harvest'),
        # 9
        (total_number_of_total_res_COMB, 'responses_total_combined'),
        # 10
        (total_number_visited_sites_COMB, 'visited_sites_total_combined'),
    ]

    # EXPORTING
    for metric_data, file_stem in enabled_outputs:
        export_json(export_path, metric_data, file_stem)
    
    

In [None]:
# Compute per-crawl metrics from every enriched responses CSV in f_path and
# export the collected results as JSON files into export_path.

# Set up paths
f_path = '/home/ubuntu/data/processed/crawls/response_enriched/v.3/'
export_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'

# Set up regex pattern for the crawl date embedded in each file name:
# four digits, a non-word separator, two digits, a separator, two digits.
# Raw string, because '\d' and '\W' are invalid escapes in a plain literal.
re_pattern = r'\d{4}\W\d{2}\W\d{2}'

# Initialize empty lists (one per possible output; the lists of disabled
# outputs stay empty but are still passed to exporting())
total_number_of_responses_per_site = []
total_number_of_TPs_per_site = []
TP_occurence_per_site_TLD = []
TP_occurence_per_site_visit = []
total_number_of_TPs_cat = []
total_number_of_TPs_cat_sector = []
total_occurence_of_TP = []
total_number_of_TPs_country = []
total_number_of_res_country = []
total_number_of_res = []
total_number_of_TPs = []
total_number_of_sites_per_harvest = []
total_number_of_TPs_per_cat_type_sector = []
total_number_of_total_res_COMB = []
total_number_visited_sites_COMB = []

# Initialize control variable (running number of processed files)
i = 0

# Get all files in the folder. os.listdir returns entries in arbitrary order;
# sorting makes the processing order and the exported lists reproducible.
list_of_dataset_names = sorted(os.listdir(f_path))

# Loop through all files in the folder
for f_name in list_of_dataset_names:
    if f_name == '.ipynb_checkpoints':
        continue

    # Get crawl date out of file name. Checked before loading so that a file
    # without a date is skipped instead of raising IndexError after a
    # potentially large CSV has already been read.
    date_match = re.search(re_pattern, f_name)
    if date_match is None:
        print('Skipped ', f_name, ' - no crawl date found in file name')
        continue
    crawl_date = date_match.group(0)

    i += 1

    print(i, f_name)

    # Load dataset
    df_res = load_dataset(f_path, f_name)

    # Reducing dataset column size
    df_res_reduced = df_res[['visit_id', 'url', 'site_url', 'response_status', 'RD_url', 'RD_site_url', 'first_party',
                             'third_party', 'Country', 'Europe', 'PublicPrivate', 'SiteCategory', 'URLtype']]

    # Filter out all non European sites
    df_res_TP_EU = df_res_reduced[df_res_reduced['Europe'].isin(['EU', 'EEA'])]

    # Filter out all first-to-first party communication.
    # A boolean mask keeps exactly the rows where third_party equals True,
    # without first building a full-size NaN-padded copy of the frame.
    # '== True' is deliberate: it also works if the column is not bool dtype.
    df_res_filt = df_res_TP_EU[df_res_TP_EU['third_party'] == True]

    # Change visit_id column dtype to int
    df_res_filt = df_res_filt.astype({'visit_id': 'int32'})

    # 1) Count how many responses per site (only TPs, no FP-FP communication)
#    res_per_site = df_res_filt.groupby('RD_site_url').url.count().to_dict()
#    json_obj = {'date': crawl_date, 'res_per_site': res_per_site}
#    total_number_of_responses_per_site.append(json_obj)

    # 2a) Count how many unique TPs per site (only TPs, no FP-FP communication)
    # First collapse to one row per (site, TP) pair, then count rows per site.
    occurence_TP_per_site = df_res_filt.groupby(['RD_site_url', 'RD_url']).size()
    occurence_TP_per_site = occurence_TP_per_site.reset_index()
    occurence_TP_per_site = occurence_TP_per_site.groupby('RD_site_url').size().to_dict()
    json_obj = {'date': crawl_date, 'unique_TPs_per_site': occurence_TP_per_site}
    total_number_of_TPs_per_site.append(json_obj)

    # 2b) Occurrence of each TP per site TLD
#    TP_occurence_per_site_TLD_data = {g:v['RD_url'].value_counts().to_dict() for g, v in df_res_filt.groupby('RD_site_url')}
#    json_obj = {'date': crawl_date, 'TP_occurence_per_site_TLD': TP_occurence_per_site_TLD_data}
#    TP_occurence_per_site_TLD.append(json_obj)

    # 2c) Occurrence of each TP per site visit
#    TP_occurence_per_visit = {g:v['RD_url'].value_counts().to_dict() for g, v in df_res_filt.groupby('visit_id')}
#    json_obj = {'date': crawl_date, 'TP_occurence_per_visit': TP_occurence_per_visit}
#    TP_occurence_per_site_visit.append(json_obj)

    # 3a) Count how many unique TPs per FP category (only TPs, no FP-FP communication)
#    TPs_per_cat = df_res_filt.groupby('SiteCategory').url.count().to_dict()
#    json_obj = {'date': crawl_date, 'TPs_per_cat': TPs_per_cat}
#    total_number_of_TPs_cat.append(json_obj)

    # 3b) Count how many unique TPs per FP category per sector (only TPs, no FP-FP communication)
#    unique_TPs_per_cat_sector = {g:v['PublicPrivate'].value_counts().to_dict() for g, v in df_res_filt.groupby('SiteCategory')}
#    json_obj = {'date': crawl_date, 'unique_TPs_per_cat_sector': unique_TPs_per_cat_sector}
#    total_number_of_TPs_cat_sector.append(json_obj)

    # 3c) Count how many unique TPs per FP category per sector per URL type (only TPs, no FP-FP communication)
#    df_grouped = df_res_filt.groupby(['SiteCategory','URLtype', 'PublicPrivate']).size()
#    df_total_number_of_TPs_per_cat_type_sector = pd.DataFrame(df_grouped)
#    df_total_number_of_TPs_per_cat_type_sector.rename(columns={0:'count'}, inplace=True)
#    df_total_number_of_TPs_per_cat_type_sector = df_total_number_of_TPs_per_cat_type_sector.transpose()
#    df_total_number_of_TPs_per_cat_type_sector['date'] = crawl_date
#    json_total_number_of_TPs_per_cat_type_sector = df_total_number_of_TPs_per_cat_type_sector.to_json()
#    json_parsed = json.loads(json_total_number_of_TPs_per_cat_type_sector)
#    total_number_of_TPs_per_cat_type_sector.append(json_parsed)

    # 4) Occurrence of each TP
#    occurance_TP_total = df_res_filt.groupby('RD_url').size().to_dict()
#    json_obj = {'date': crawl_date, 'occurance_TP_total': occurance_TP_total}
#    total_occurence_of_TP.append(json_obj)

    # 5a) Number of unique TPs per country
#    TPs_per_country = df_res_filt.groupby('Country').agg({'RD_url':'nunique'})
#    unique_TPs_per_country = TPs_per_country.iloc[:,0].to_dict()
#    json_obj = {'date': crawl_date, 'unique_TPs_per_country': unique_TPs_per_country}
#    total_number_of_TPs_country.append(json_obj)

    # 5b) Number of responses per country
#    total_res_per_country = df_res_filt.groupby(['Country']).size().to_dict()
#    json_obj = {'date': crawl_date, 'total_res_per_country': total_res_per_country}
#    total_number_of_res_country.append(json_obj)

    # 6) Total number of responses per crawl
#    total_res = {'responses': len(df_res_filt)}
#    json_obj = {'date': crawl_date, 'total_res': total_res}
#    total_number_of_res.append(json_obj)

    # 7) Total number of unique TPs per crawl
#    total_TPs = {'TPs': len(df_res_filt['RD_url'].unique())}
#    json_obj = {'date': crawl_date, 'total_TPs': total_TPs}
#    total_number_of_TPs.append(json_obj)

    # 8) Total number of sites per harvest - number of unique visit_id-s
#    number_of_sites_per_harvest = len(df_res_filt['visit_id'].unique())
#    json_obj = {'date': crawl_date, 'total_sites': number_of_sites_per_harvest}
#    total_number_of_sites_per_harvest.append(json_obj)

    # 9a) Total number of all RES for EU/EEA for FP-TP communication
    number_of_total_res_EU_FP_TP = len(df_res_filt)
    # 9b) Total number of all RES for EU/EEA
    number_of_total_res_EU = len(df_res_TP_EU)
    # 9c) Total number of all RES
    number_of_total_res = len(df_res)
    # NOTE(review): the key 'total_res_EU_FT_TP' looks like a typo for
    # 'FP_TP'. It is kept unchanged because readers of the exported JSON may
    # depend on it - confirm before renaming.
    json_obj = {'date': crawl_date, 'total_res_EU_FT_TP': number_of_total_res_EU_FP_TP,
                'total_res_EU': number_of_total_res_EU, 'total_res': number_of_total_res}
    total_number_of_total_res_COMB.append(json_obj)

    # 10 a) Total number of visited sites with FP-TP communication from EU/EEA
    number_of_visited_sites_EU_FpTp = len(df_res_filt['visit_id'].unique())
    # 10 b) Total number of visited sites from EU/EEA
    number_of_visited_sites_EU = len(df_res_TP_EU['visit_id'].unique())
    # 10 c) Total number of visited sites with a RES
    number_of_visited_sites = len(df_res['visit_id'].unique())
    json_obj = {'date': crawl_date, 'visited_sites_EU_FP_TP': number_of_visited_sites_EU_FpTp,
                'visited_sites_EU': number_of_visited_sites_EU, 'visited_sites': number_of_visited_sites}
    total_number_visited_sites_COMB.append(json_obj)

# Export
print('Export')
exporting(export_path, total_number_of_responses_per_site, total_number_of_TPs_per_site,
          TP_occurence_per_site_TLD, TP_occurence_per_site_visit, total_number_of_TPs_cat,
          total_number_of_TPs_cat_sector, total_occurence_of_TP, total_number_of_TPs_country,
          total_number_of_res_country, total_number_of_res, total_number_of_TPs, total_number_of_TPs_per_cat_type_sector,
          total_number_of_sites_per_harvest, total_number_of_total_res_COMB, total_number_visited_sites_COMB)