# Processing Webshrinker categorization data for unique TPs found

    • input: (i) EU-UNIQUE-RES-TPs.csv - list of globally unique TPs (ii) third-party categorisation data
    • output: (i) EU_TPs_categorization_processed.csv - all TPs enriched with processed categorisation, (ii) EU_TPs_categorization_processed_TOP_15.csv - TPs from top 15 TP categories enriched with processed categorisation, and (iii) EU_TPs_categories_size.csv - number of TPs per TP category
    • script steps:
        1. Import libraries
        2. Load file with all unique TPs as a DF
        3. Load third-party categorisation data as a dictionary
        4. Define a TP category unification dictionary
        5. Define Tier-2 category IDs with more than 1.5% of TPs
        6. Iterate over the categorisation dictionary and for each TP:
            (a) Check if the TP has a Tier-2 category: (i) if True - use Tier-2 category for categorisation, except for TPs with category id == IAB25-WS1 (ii) else use Tier-1 category
            (b) Create a dictionary with data from given Tier, and append it to the category data list
        7. Create a DF from category data list
        8. Merge TPs DF with categorisation DF on respective TP root domain column and create a new DF
        9. Create a new empty column (new_cat) in the merged DF
        10. Iterate through all TPs in the merged DF and check their value in cat_id column:
            (a) If a 400 value is present: (i) assign an uncategorized_IP value to the new_cat column
            (b) Else if a value is in the list of top Tier-2 categories: (i) assign current category to the new_cat column
            (c) Else: (i) assign a value based on the unification dictionary to the new_cat column
        11. Iterate through all TPs in the merged DF and check if the TP RD contains an IP address:
            (a) if True: (i) assign a value based on the uncategorised sites dictionary to the new_cat column
        12. Create a new DF containing the number of TPs in each category (based on new_cat column)
        13. Export: (i) the whole merged DF as EU_TPs_categorization_processed.csv, (ii) merged DF with TPs of the top 15 most populous categories as EU_TPs_categorization_processed_TOP_15.csv, and (iii) the DF containing the number of TPs per TP category as EU_TPs_categories_size.csv


In [4]:
# Import
import pandas as pd
import numpy as np
import json
import re

In [6]:
# Set folder paths and names
TPs_path = '/home/ubuntu/data/processed/TPs/TPs_merged/'
TPs_name = 'EU-UNIQUE-RES-TPs.csv'

TPs_cat_path = '/home/ubuntu/data/datasets_for_enrichment/categorization/'
TPs_cat_name_json = 'v3_allDataCategories_8487.json'

In [7]:
# Load list of all unique TPs
df_TPs = pd.read_csv(TPs_path + TPs_name, header=None)
df_TPs = df_TPs.rename(columns={0:'TPs'})
df_TPs.head()

Unnamed: 0,TPs
0,01mspmd5yalky8.com
1,01net.com
2,030876vw.com
3,0914.global.ssl.fastly.net
4,0klxjejyxak3.com


In [8]:
# Load categorization data for TPs
with open(TPs_cat_path + TPs_cat_name_json) as f:
    json_cat = json.load(f)

In [9]:
cat_data = []

# Iterate over JSON
for key, value in json_cat.items():
    # Check if each key has or not second categorization Tier and assign depth accordingly
    try:
        json_cat[key]['categories'][1]
        i = 1
        if json_cat[key]['categories'][0]['id'] == 'IAB25-WS1':
            i=0
    except:
        i = 0
    # Process JSON categorization data
    cat_id = json_cat[key]['categories'][i]['id']
    cat_label = json_cat[key]['categories'][i]['label']
    cat_parent = json_cat[key]['categories'][i]['parent']
    cat_score = json_cat[key]['categories'][i]['score']
    cat_confident = json_cat[key]['categories'][i]['confident']
    data = {'url': key, 'cat_id':cat_id,'cat_label':cat_label, 'cat_parent':cat_parent,'cat_score':cat_score, 'cat_confident':cat_confident}

    cat_data.append(data)

In [10]:
# Create a DF from dictionary categorization data
df_data = pd.json_normalize(cat_data)
df_data.head()

Unnamed: 0,url,cat_id,cat_label,cat_parent,cat_score,cat_confident
0,000webhostapp.com,IAB19-18,Internet Technology,IAB19,0.9651245501324388,True
1,011st.com,IAB24,Uncategorized,IAB24,1.0,True
2,01mspmd5yalky8.com,IAB25-6,Under Construction,IAB25,0.3065370802148136,True
3,01net.com,IAB19,Technology & Computing,IAB19,0.1974611772718625,True
4,030876vw.com,IAB24,Uncategorized,IAB24,1.0,True


In [11]:
# Merge left categorization data to TPs
df_merged = df_TPs.merge(df_data, how='left', left_on='TPs', right_on='url')
del df_merged['TPs']
df_merged.head()

Unnamed: 0,url,cat_id,cat_label,cat_parent,cat_score,cat_confident
0,01mspmd5yalky8.com,IAB25-6,Under Construction,IAB25,0.3065370802148136,True
1,01net.com,IAB19,Technology & Computing,IAB19,0.1974611772718625,True
2,030876vw.com,IAB24,Uncategorized,IAB24,1.0,True
3,0914.global.ssl.fastly.net,IAB25-WS1,Content Server,IAB25,1.0,True
4,0klxjejyxak3.com,IAB19-35,Web Search,IAB19,0.1108059127260707,False


In [12]:
# Group by category label and ID and obtain the size
df_count = df_merged.groupby(['cat_label', 'cat_id']).size()
df_count = pd.DataFrame(df_count)
df_count.reset_index(inplace=True)
df_count.rename(columns={0:'count'}, inplace=True)
df_count.sort_values('count', ascending=False).head(50)

Unnamed: 0,cat_label,cat_id,count
193,Uncategorized,IAB24,1654
5,Advertising,IAB3-1,638
57,Content Server,IAB25-WS1,528
135,Marketing,IAB3-11,464
186,Technology & Computing,IAB19,413
145,News / Weather / Information,IAB12,321
3,Adult Content,IAB25-3,291
195,Under Construction,IAB25-6,278
32,Business,IAB3,121
171,Shopping,IAB22,110


In [13]:
# Define key - value pairs for categorization merging
def merge_cat(x):
    return{
        'IAB1':'Arts & Entertainment',
        'IAB2':'Automotive',
        'IAB3':'Business',
        'IAB4':'Careers',
        'IAB5':'Education',
        'IAB6':'Family & Parenting',
        'IAB7':'Health & Fitness',
        'IAB8':'Food & Drink',
        'IAB9':'Hobbies & Interests',
        'IAB10':'Home & Garden',
        'IAB11':'Law, Gov’t & Politics',
        'IAB12':'News',
        'IAB13':'Personal Finance',
        'IAB14':'Society',
        'IAB15':'Science',
        'IAB16':'Pets',
        'IAB17':'Sports',
        'IAB18':'Style & Fashion',
        'IAB19':'Technology & Computing',
        'IAB20':'Travel',
        'IAB21':'Real Estate',
        'IAB22':'Shopping',
        'IAB23':'Religion & Spirituality',
        'IAB24':'Uncategorized',
        'IAB25':'Non-Standard Content',
        'IAB26':'Illegal Content'
    }[x]

In [14]:
# Iterate over all TPs and merge the categories according to the key-value pairs in the function
new_cat_list = []
re_pattern = '(\w{3}\d{1,2})'

for i, row in df_merged.iterrows():
    # If category is an IP address
    if row['cat_id']==400:
        new_cat_list.append('Uncategorized_IP')
    # If given category contains more than 100 TPs
    elif row['cat_id'] in ['IAB3-1', 'IAB3-11', 'IAB25-3', 'IAB25-6', 'IAB25-WS1']:
        new_cat_list.append(row['cat_label'])
    else:
        re_result = re.findall(re_pattern, row['cat_id'])[0]
        new_cat = merge_cat(re_result)
        new_cat_list.append(new_cat)

df_merged['new_cat']= new_cat_list        

In [15]:
# Categorize all IP addressess
re_pattern = '(\d{1,3}\W\d{1,3}\W\d{1,3}\W\d{1,3})'
for i, row in df_merged.iterrows():
    re_result = re.search(re_pattern, row['url'])
    if re_result:
        df_merged['new_cat'][i]= 'Uncategorized_IP'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['new_cat'][i]= 'Uncategorized_IP'


In [18]:
# Group by new categorization column and find the size of each category
cat_size = df_merged.groupby('new_cat').size()
pd_cat_size = pd.DataFrame(cat_size)
pd_cat_size.reset_index(inplace=True)
pd_cat_size = pd_cat_size.rename(columns={0:'count', 'new_cat':'category'})
pd_cat_size.sort_values('count', ascending=False).head(15)

Unnamed: 0,category,count
30,Uncategorized,1059
7,Content Server,1038
28,Technology & Computing,871
1,Advertising,634
16,Marketing,461
17,News,318
6,Cloud storage and hosting,276
0,Adult Content,262
32,Under Construction,240
4,Business,219


In [19]:
# Get top 15 categories
df_top15 = pd_cat_size.sort_values('count', ascending=False).head(15)
df_top15.head()

Unnamed: 0,category,count
30,Uncategorized,1059
7,Content Server,1038
28,Technology & Computing,871
1,Advertising,634
16,Marketing,461


In [20]:
# Merge TPs with top 15 categories
df_top15 = df_merged.where(df_merged['new_cat'].isin(df_top15['category'])).dropna(subset=['new_cat'])
df_top15.head()

Unnamed: 0,url,cat_id,cat_label,cat_parent,cat_score,cat_confident,new_cat
0,01mspmd5yalky8.com,IAB25-6,Under Construction,IAB25,0.3065370802148136,1.0,Under Construction
1,01net.com,IAB19,Technology & Computing,IAB19,0.1974611772718625,1.0,Technology & Computing
2,030876vw.com,IAB24,Uncategorized,IAB24,1.0,1.0,Uncategorized
3,0914.global.ssl.fastly.net,IAB25-WS1,Content Server,IAB25,1.0,1.0,Content Server
4,0klxjejyxak3.com,IAB19-35,Web Search,IAB19,0.1108059127260707,0.0,Technology & Computing


In [22]:
# EXPORT
export_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
categories_name = 'EU_TPs_categories_size.csv'
TPs_cat_name = 'EU_TPs_categorization_processed.csv'
top15_name = 'EU_TPs_categorization_processed_TOP_15.csv'

pd_cat_size.to_csv(export_path + categories_name, index=False, header=True)
df_merged.to_csv(export_path + TPs_cat_name, index=False, header=True)
df_top15.to_csv(export_path + top15_name, index=False, header=True)