# Generate list of unique third-parties per harvest for EU/EEA sites

    • input: .csv files of enriched http_responses tables
    • output: one CSV file per harvest listing the unique third parties (TPs) found in that harvest’s responses
    • script steps:
        1. Import libraries
        2. Iterate through all enriched http_responses tables:
            (a) Load http_responses as a DF
            (b) Filter out all responses for visited sites (first parties, FPs) outside the EU/EEA
            (c) Filter out all FP-to-FP communication
            (d) Obtain a DF of unique TPs by filtering out all duplicates
            (e) Export the DF with unique TPs per harvest in a CSV format

In [1]:
# Import
import pandas as pd
import re
import os

In [2]:
# Export, for every enriched http_responses file in `f_path`, the unique
# third-party values of the 'RD_url' column among responses whose 'Europe'
# value is 'EU' or 'EEA'. One output CSV is written per harvest date.

# Folder path (input)
f_path = '/home/ubuntu/data/processed/crawls/response_enriched/v.3/'

# Folder path (output)
out_path = '/home/ubuntu/data/processed/TPs/responses_EU/'

# Harvest date as it appears in the file names, e.g. 2019-05-29.
# Raw string: '\d' and '\W' are invalid escape sequences in a plain literal.
re_pattern = r'\d{4}\W\d{2}\W\d{2}'
date_regex = re.compile(re_pattern)

# Create the output folder up front so a missing folder is not discovered
# only after the first input file has been loaded.
os.makedirs(out_path, exist_ok=True)

# Iterate through all files in the folder
for f_name in os.listdir(f_path):
    in_file = os.path.join(f_path, f_name)

    # Obtain the harvest date from the file name *before* loading anything.
    # Entries without a date (or that are not regular files, e.g. a
    # '.ipynb_checkpoints' folder) are skipped instead of aborting the run.
    date_match = date_regex.search(f_name)
    if date_match is None or not os.path.isfile(in_file):
        print('SKIPPED (not a dated harvest file): ' + f_name)
        continue
    crawl_date = date_match.group(0)

    print(f_name)

    # Load the file
    df_res = pd.read_csv(in_file)

    # Keep only responses whose 'Europe' value is 'EU' or 'EEA'
    df_res_TP_EU = df_res[df_res['Europe'].isin(['EU', 'EEA'])]

    # Keep only third-party responses and drop rows without an 'RD_url'.
    # The element-wise '== True' is intentional: it also works when the
    # column was parsed as object dtype rather than bool.
    is_third_party = df_res_TP_EU['third_party'] == True
    TPs_res = df_res_TP_EU.loc[is_third_party, 'RD_url'].dropna()

    # Get all unique TPs
    unique_TP_res = TPs_res.unique()
    df_unique_TP_res = pd.DataFrame(unique_TP_res)

    # Export: one value per line, no index column and no header row
    out_file = os.path.join(out_path, 'EU-RES-' + crawl_date + '.csv')
    df_unique_TP_res.to_csv(out_file, index=False, header=False)

print('COMPLETED')

ENR-RES-2019-05-29.csv
ENR-RES-2019-04-05.csv
ENR-RES-2019-02-21.csv
ENR-RES-2020-05-12.csv
ENR-RES-2018-04-17.csv
ENR-RES-2018-06-12.csv
ENR-RES-2019-06-14.csv
ENR-RES-2019-04-12.csv
ENR-RES-2018-04-09.csv
ENR-RES-2018-11-07.csv
ENR-RES-2018-09-21.csv
ENR-RES-2019-11-13.csv
ENR-RES-2018-08-03.csv
ENR-RES-2020-06-19.csv
ENR-RES-2019-01-30.csv
ENR-RES-2018-02-14.csv
ENR-RES-2019-07-12.csv
ENR-RES-2020-02-25.csv
ENR-RES-2018-07-17.csv
ENR-RES-2018-10-09.csv
ENR-RES-2018-03-21.csv
ENR-RES-2020-06-02.csv
ENR-RES-2018-06-01.csv
ENR-RES-2020-01-13.csv
ENR-RES-2018-10-31.csv
ENR-RES-2019-11-05.csv
ENR-RES-2018-06-18.csv
ENR-RES-2020-03-24.csv
ENR-RES-2019-05-01.csv
ENR-RES-2018-12-10.csv
ENR-RES-2018-03-29.csv
ENR-RES-2019-05-27.csv
ENR-RES-2020-02-07.csv
ENR-RES-2018-06-07.csv
ENR-RES-2018-02-07.csv
ENR-RES-2018-07-06.csv
ENR-RES-2018-05-19.csv
ENR-RES-2019-12-22.csv
ENR-RES-2018-05-29.csv
ENR-RES-2019-03-12.csv
ENR-RES-2019-09-02.csv
ENR-RES-2018-11-19.csv
ENR-RES-2020-04-07.csv
ENR-RES-201