# Get all HTTP status codes from the responses to the initiating site-visit requests of EU/EEA-origin sites

    • input: all enriched responses .csv files
    • output: FP_status_codes_EU.json - number of visited sites per HTTP response status code, per harvest
    • script steps:
        1. Import libraries
        2. Iterate through all enriched response files:
            (a) Load response file as a dataframe
            (b) Filter out all responses from non-EU/EEA FP requests
            (c) Obtain the number of visited sites per HTTP response status code based on initiating request of every visited site
            (d) Add the obtained data to the dictionary
        3. Export the dictionary containing the number of visited sites per HTTP status code response category, per harvest - FP_status_codes_EU.json

In [None]:
# Import
import pandas as pd
import re
import os
import json

In [None]:
# Function for loading CSVs
def load_dataset(path, name):
    """Read a CSV file into a DataFrame and report that it was loaded.

    Parameters
    ----------
    path : str
        Directory prefix. It is joined to ``name`` by plain string
        concatenation, so it must already end with a path separator.
    name : str
        File name of the CSV inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``path + name``.
    """
    csv_location = path + name
    loaded = pd.read_csv(csv_location)

    # Progress message so long multi-file runs show which file was just read.
    print('Dataset ', name, ' loaded')

    return loaded

In [None]:
# Function for exporting data in JSON
def export_json(path, data, export_name):
    """Serialize ``data`` to JSON and write it to ``path + export_name + '.json'``.

    Parameters
    ----------
    path : str
        Directory prefix. It is joined to the file name by plain string
        concatenation, so it must already end with a path separator.
    data : object
        Any object accepted by ``json.dumps``.
    export_name : str
        Output file name without the ``.json`` extension.

    Raises
    ------
    TypeError
        If ``data`` contains objects that ``json.dumps`` cannot serialize.
        In that case the target file is left untouched.
    """
    # Serialize before opening the file: opening in "w" mode truncates the
    # target, so a serialization error would otherwise wipe an existing export.
    json_data = json.dumps(data)

    # The with-statement closes the handle on exit; no explicit close() needed.
    with open(path + export_name + '.json', "w") as json_file:
        json_file.write(json_data)

    print('Exported ', export_name)

In [None]:
# ONLY EU/EEA RESPONSES

# Define input and export paths
f_path = '/home/ubuntu/data/processed/crawls/response_enriched/v.3/'
export_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'

# Define regex pattern for the crawl date embedded in the file name:
# four digits, a non-word separator, two digits, a separator, two digits.
# Raw string so the backslashes are regex escapes, not string escapes.
re_pattern = r'\d{4}\W\d{2}\W\d{2}'

# Define variables
FP_status_codes = {}  # crawl date -> {response status code: number of visits}
i = 0                 # number of response files processed

# Define the list of files in folder.
# os.listdir returns entries in arbitrary order; sorting makes the order of
# processing (and of the keys in the exported JSON) reproducible.
list_of_dataset_names = sorted(os.listdir(f_path))

for f_name in list_of_dataset_names:
    # Skip the Jupyter checkpoint directory that lives next to the data files
    if f_name == '.ipynb_checkpoints':
        continue

    i += 1

    # Load dataset
    df_res = load_dataset(f_path, f_name)

    # Get crawl date out of file name
    # (raises IndexError if the file name contains no date)
    crawl_date = re.findall(re_pattern, f_name)[0]

    # Filter out all non European sites
    df_res_TP_EU = df_res[df_res['Europe'].isin(['EU', 'EEA'])]

    # Get the initiating/first call of every visited site: the earliest
    # response (by time_stamp) for each visit_id
    df_first_call = df_res_TP_EU.sort_values(['visit_id', 'time_stamp']).drop_duplicates(subset='visit_id', keep='first')

    # Get dictionary of status codes: status code -> number of first calls
    dict_status_codes = df_first_call.groupby(['response_status']).size().to_dict()

    # Assign data to the dictionary with the crawl date as a key
    FP_status_codes[crawl_date] = dict_status_codes


# Export
export_json(export_path, FP_status_codes, 'FP_status_codes_EU')

print("Completed")