# Enrich all HTTP response CSV file with extra information about first-party

    • input: .csv files of processed http_responses tables and first-party categorisation data (.csv)
    • output: every harvest’s http_responses data enriched with first-party categorisation data (.csv)
    • script steps:
        1. Import libraries
        2. Load first-party categorisation data
        3. Iterate through all processed http_responses tables:
            (a) Load http_responses as a DF
            (b) Create a new dataframe by merging first-party categorisation data with responses’ DF on the corresponding FP root-domain column
            (c) Export as the enriched responses dataframe as a CSV file

In [1]:
# Import
import pandas as pd
import numpy as np
import os
import json
import csv

### Loading visited site category and location

In [2]:
visited_cat_loc_path = '/home/ubuntu/data/datasets_for_enrichment/provided/'
visited_cat_loc_name = 'SiteCategoriesUpdated.csv'

df_visited_cat_loc = pd.read_csv(visited_cat_loc_path + visited_cat_loc_name, header = 0)

In [3]:
df_visited_cat_loc = df_visited_cat_loc[['TopLevelDomainLookUp', 'Country', 'Europe', 'PublicPrivate', 'SiteCategory', 'URLtype']]
df_visited_cat_loc.head()

Unnamed: 0,TopLevelDomainLookUp,Country,Europe,PublicPrivate,SiteCategory,URLtype
0,balkanweb.com,Albania,NonEU,Private,News,PrivateMedia NonEU
1,joq.al,Albania,NonEU,Private,News,PrivateMedia NonEU
2,gazetaexpress.com,Albania,NonEU,Private,News,PrivateMedia NonEU
3,syri.net,Albania,NonEU,Private,News,PrivateMedia NonEU
4,xing.al,Albania,NonEU,Private,News,PrivateMedia NonEU


### Enrichment of dataset

In [4]:
# Load CSV file as a dataframe and return it
def load_TP_file(f_path, f_name):
    df = pd.read_csv(f_path + f_name)
    return df

In [11]:
# Define path to the folder
res_path = '/home/ubuntu/data/processed/crawls/response/'

# Get list of all files in the folder
res_list_of_TPs = os.listdir(res_path)

# Initialize variables
i = 0
file_lenght = []

# Iterate through all filenames
for res_file_name in res_list_of_TPs:
    if (res_file_name!='.ipynb_checkpoints'):
        i += 1
        
        # Load a harvest file with HTTP responses
        df_res = load_TP_file(res_path, res_file_name)
        
        # Get number of total responses per harevest
        total_number_res = len(df_res)
        total_data = [res_file_name, total_number_res]
        file_lenght.append(total_data)
        
        # Merge left the responses dataframe with first-party extra information on the first-party root domain columns
        df_enriched = pd.merge(left=df_res, right=df_visited_cat_loc, how='left', left_on='RD_site_url', right_on='TopLevelDomainLookUp')
        
        # Export each harvest as a CSV file
        df_enriched.to_csv('/home/ubuntu/data/processed/crawls/response_enriched/v.3/ENR-' + res_file_name, index = False, header = True)

        print('Finished generating CSV', i, res_file_name)

# Export the total number of responses per every harvest as CSV
with open('/home/ubuntu/data/processed/crawls/total_respones.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(file_lenght)
        
print('COMPLETED')

Finished generating CSV 1 RES-2019-02-21.csv
Finished generating CSV 2 RES-2020-02-07.csv
Finished generating CSV 3 RES-2018-08-03.csv
Finished generating CSV 4 RES-2018-07-06.csv
Finished generating CSV 5 RES-2018-06-07.csv
Finished generating CSV 6 RES-2019-10-16.csv
Finished generating CSV 7 RES-2019-05-27.csv
Finished generating CSV 8 RES-2019-10-03.csv
Finished generating CSV 9 RES-2019-04-24.csv
Finished generating CSV 10 RES-2018-07-17.csv
Finished generating CSV 11 RES-2019-03-27.csv
Finished generating CSV 12 RES-2020-03-24.csv
Finished generating CSV 13 RES-2019-12-22.csv
Finished generating CSV 14 RES-2018-02-07.csv
Finished generating CSV 15 RES-2018-05-05.csv
Finished generating CSV 16 RES-2018-09-21.csv
Finished generating CSV 17 RES-2019-04-12.csv
Finished generating CSV 18 RES-2018-09-06.csv
Finished generating CSV 19 RES-2019-08-05.csv
Finished generating CSV 20 RES-2019-09-20.csv
Finished generating CSV 21 RES-2018-02-14.csv
Finished generating CSV 22 RES-2018-06-01.c