# Enriching all responses CSV file with all enrichment data

    • input: (i) .csv files with previously enriched http_responses tables, (ii) third-party categorisation data (.json), (iii) GSB malicious data (.csv) and (iv) VirusTotal malicious data (.csv)
    • output: one .csv file per harvest, containing that harvest’s http_responses data enriched with
    category and malicious information
    • script steps:
        1. Import libraries
        2. Load third-party categorisation data, GSB and VirusTotal malicious data as separate dataframes
        3. Iterate through files with all enriched http_responses:
            (a) Load http_responses as a DF
            (b) Create a new dataframe by merging third-party categorisation, GSB and VirusTotal malicious data with responses DF on the corresponding TP rootdomain column
            (c) Export the enriched responses dataframe as a CSV file

In [1]:
# Import
import pandas as pd
import numpy as np
import os
import json
import csv

### Loading third-party categorization data

In [2]:
# Define file path and name
cat_path = '/home/ubuntu/data/datasets_for_enrichment/categorization/'
cat_file_name = 'v3_allDataCategories_8487.json'

# Load data. The encoding is given explicitly so that parsing does not depend
# on the locale of the machine running the notebook.
with open(cat_path + cat_file_name, encoding='utf-8') as f:
    cat_dict = json.load(f)

# Process data for a dataframe: one row per top-level key of the JSON object,
# built from the FIRST entry of that key's 'categories' list only (any further
# entries are ignored).
# NOTE(review): an entry with an empty 'categories' list raises IndexError here,
# exactly as before - confirm the input always has at least one category.
cat_list = [
    [
        TP,
        entry['categories'][0]['id'],
        entry['categories'][0]['label'],
        entry['categories'][0]['score'],
        entry['categories'][0]['confident'],
    ]
    for TP, entry in cat_dict.items()
]

df_categories = pd.DataFrame(cat_list, columns=['cat_TP','cat_ID', 'cat_label', 'cat_score', 'cat_confident'])

In [3]:
df_categories

Unnamed: 0,cat_TP,cat_ID,cat_label,cat_score,cat_confident
0,000webhostapp.com,IAB19,Technology & Computing,0.972562369517679493,True
1,011st.com,IAB24,Uncategorized,1.000000000000000000,True
2,01mspmd5yalky8.com,IAB25,Non-Standard Content,0.314561813623485231,True
3,01net.com,IAB19,Technology & Computing,0.197461177271862520,True
4,030876vw.com,IAB24,Uncategorized,1.000000000000000000,True
...,...,...,...,...,...
8579,sxbhdd045.videofiednuage.com,IAB24,Uncategorized,1.000000000000000000,True
8580,tenderi.posta.rs,IAB24,Uncategorized,1.000000000000000000,True
8581,web.meteo.co.me,IAB24,Uncategorized,1.000000000000000000,True
8582,zagent866.h-cdn.com,IAB25-WS1,Content Server,1.000000000000000000,True


### Loading GSB data

In [4]:
# Where the GSB results live
GSB_path = '/home/ubuntu/data/datasets_for_enrichment/malicious/'
GSB_file_name = 'v3_google_malicious8487.csv'

# Read the file without treating any row as a header, then label the columns
df_GSB = pd.read_csv(
    os.path.join(GSB_path, GSB_file_name),
    header=None,
    low_memory=False,
)
df_GSB.columns = [
    'gsb_TP',
    'gsb_malicious',
    'gsb_platform',
    'gsb_threats',
]

In [5]:
df_GSB

Unnamed: 0,gsb_TP,gsb_malicious,gsb_platform,gsb_threats
0,000webhostapp.com,False,,
1,011st.com,False,,
2,01mspmd5yalky8.com,False,,
3,01net.com,False,,
4,030876vw.com,False,,
...,...,...,...,...
8579,sxbhdd045.videofiednuage.com,False,,
8580,tenderi.posta.rs,False,,
8581,web.meteo.co.me,False,,
8582,zagent866.h-cdn.com,False,,


### Loading VirusTotal data

In [6]:
# Where the VirusTotal results live
virusTotal_path = '/home/ubuntu/data/datasets_for_enrichment/malicious/'
virusTotal_file_name = 'v3_ALL_malicious_VirusTotal_8487.csv'

# Read the file without treating any row as a header, then label the columns
df_virusTotal = pd.read_csv(
    os.path.join(virusTotal_path, virusTotal_file_name),
    header=None,
    low_memory=False,
)
df_virusTotal.columns = [
    'vt_TP',
    'vt_harmless',
    'vt_malicious',
    'vt_suspicious',
    'vt_timeout',
    'vt_undetected',
    'vt_response_code',
]

In [7]:
df_virusTotal

Unnamed: 0,vt_TP,vt_harmless,vt_malicious,vt_suspicious,vt_timeout,vt_undetected,vt_response_code
0,000webhostapp.com,75.0,2.0,0.0,0.0,7.0,403.0
1,011st.com,76.0,0.0,0.0,0.0,7.0,200.0
2,01mspmd5yalky8.com,72.0,3.0,0.0,0.0,8.0,200.0
3,01net.com,76.0,0.0,0.0,0.0,7.0,200.0
4,030876vw.com,75.0,0.0,0.0,0.0,8.0,522.0
...,...,...,...,...,...,...,...
8579,sxbhdd045.videofiednuage.com,,,,,,
8580,tenderi.posta.rs,,,,,,
8581,web.meteo.co.me,60.0,0.0,0.0,0.0,7.0,
8582,zagent866.h-cdn.com,75.0,0.0,0.0,0.0,7.0,403.0


### Enrichment of dataset

In [10]:
# Function for loading CSV file as dataframe
def load_TP_file(f_path: str, f_name: str) -> pd.DataFrame:
    """Read the CSV file ``f_name`` located in directory ``f_path``.

    Args:
        f_path: Directory containing the file, with or without a trailing
            path separator.
        f_name: Name of the CSV file, relative to ``f_path``.

    Returns:
        The file contents as a dataframe, parsed with the default
        ``pd.read_csv`` settings.
    """
    # os.path.join inserts the separator only when it is missing, so a
    # directory given without a trailing slash no longer yields a wrong path.
    return pd.read_csv(os.path.join(f_path, f_name))

In [None]:
# Define path for all files
res_path = '/home/ubuntu/data/processed/crawls/response/'

# Get all file names in the folder
res_list_of_TPs = os.listdir(res_path)

# Initialize variables
i = 0             # number of response files processed so far
file_lenght = []  # one [file name, number of rows] pair per processed file

# Loop through all file names
for res_file_name in res_list_of_TPs:
    # Skip the Jupyter checkpoint folder; every other entry is loaded as CSV
    if res_file_name == '.ipynb_checkpoints':
        continue
    i += 1

    # Load a response CSV file as a dataframe
    df_res = load_TP_file(res_path, res_file_name)

    # Record total number of responses in this file
    file_lenght.append([res_file_name, len(df_res)])

    # Merge all enrichment dataframes with the response dataframe.
    # Every merge is keyed on the response's own 'RD_url' column. Keying a later
    # merge on the previous merge's right-hand key (vt_TP / gsb_TP) would lose
    # the enrichment for any row whose earlier lookup found no match, because
    # that key is NaN for such rows.
    df_enriched_1 = pd.merge(left=df_res, right=df_virusTotal, how='left', left_on='RD_url', right_on='vt_TP')
    df_enriched_2 = pd.merge(left=df_enriched_1, right=df_GSB, how='left', left_on='RD_url', right_on='gsb_TP')
    df_enriched = pd.merge(left=df_enriched_2, right=df_categories, how='left', left_on='RD_url', right_on='cat_TP')

    # Export enriched responses
    df_enriched.to_csv('/home/ubuntu/data/processed/crawls/response_enriched/v.3/ENR-' + res_file_name, index = False, header = True)

    print('Finished generating CSV', i, res_file_name)

# Export total number of responses per file: one "name","count" row each.
# writerows is required here; writerow would emit a single row whose cells are
# the string representations of the [name, count] lists.
# NOTE(review): the output file name is spelled 'total_respones.csv'; it is left
# unchanged in case something else reads this path - confirm before renaming.
with open('/home/ubuntu/data/processed/crawls/total_respones.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(file_lenght)

print('COMPLETED')