# Relationships between first-party and third-party categorization

    – input data: (i) EU_TP_total_occurence_count.csv - counted presence of each TP on every visited site and (ii) visitedSitesCat.csv - list of visited sites enriched with first-party categorisation data
    – output plot: heatmap with top 15 TP categories on x-axis and all visited sites’ categories on y-axis
    – purpose: Visualise the presence of each TP category in each of the visited sites categories to find if any TP category is predominantly present on certain visited sites.

In [1]:
# Import
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
# Jupyter setup: initialise Plotly's offline notebook mode so that the
# iplot() call at the end of the notebook can render the figure inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead
# of embedding it in the notebook -- confirm against the Plotly docs.
init_notebook_mode(connected=True)

In [3]:
# Load the table of third-party (TP) occurrence counts.
# f_path keeps its trailing slash: it is reused further down to locate the
# TP categorisation file, and file names are appended to it directly.
f_path = '/home/ubuntu/data/processed/crawls/response_enriched/analysis_v.3/'
f_name = 'EU_TP_total_occurence_count.csv'

df_total_occurence = pd.read_csv(f'{f_path}{f_name}')
df_total_occurence.head()

Unnamed: 0.1,Unnamed: 0,01mspmd5yalky8.com,01net.com,030876vw.com,0914.global.ssl.fastly.net,0klxjejyxak3.com,1.98.201.35.bc.googleusercontent.com,100posto.hr,1053041200.rsc.cdn77.org,108.59.8.1,...,zrh50.cloudfront.net,zro56hd6szoy.com,ztat.net,ztkcdn.net,ztsrv.com,zumby.io,zuora.com,zuuvi.com,zvuki.ru,zxcvads.com
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# The CSV stores the visit identifier in an unnamed column; give it the
# name 'visit_id' so it can serve as the merge key with the site info below
df_total_occurence = df_total_occurence.rename(columns={'Unnamed: 0': 'visit_id'})
df_total_occurence.head()

Unnamed: 0,visit_id,01mspmd5yalky8.com,01net.com,030876vw.com,0914.global.ssl.fastly.net,0klxjejyxak3.com,1.98.201.35.bc.googleusercontent.com,100posto.hr,1053041200.rsc.cdn77.org,108.59.8.1,...,zrh50.cloudfront.net,zro56hd6szoy.com,ztat.net,ztkcdn.net,ztsrv.com,zumby.io,zuora.com,zuuvi.com,zvuki.ru,zxcvads.com
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Load the visited-site metadata (first-party categorisation)
site_cat_path = '/home/ubuntu/data/processed/'
site_cat_name = 'visitedSitesCat.csv'

# Read the file and cast visit_id to int64 in one go, so the key is an
# integer column when it is merged with the occurrence table
df_site_cat = pd.read_csv(f'{site_cat_path}{site_cat_name}').astype({'visit_id': 'int64'})
df_site_cat.head()

Unnamed: 0,visit_id,crawl_id,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp
0,1,1,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk
1,2,2,http://nuzzel.com,nuzzel.com,United States,NotEurope,Private,News,NewsUser,nuzzel.com
2,3,3,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie
3,4,4,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is
4,5,2,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu


In [6]:
# Attach the visited-site info to the TP occurrence counts of each visit.
# A left join keeps every row of the occurrence table; the result is then
# indexed and ordered by visit_id.
df_merged = (
    df_total_occurence
    .merge(df_site_cat, how='left', on='visit_id')
    .set_index('visit_id')
    .sort_index()
)
df_merged.head()

Unnamed: 0_level_0,01mspmd5yalky8.com,01net.com,030876vw.com,0914.global.ssl.fastly.net,0klxjejyxak3.com,1.98.201.35.bc.googleusercontent.com,100posto.hr,1053041200.rsc.cdn77.org,108.59.8.1,108.59.8.35,...,zxcvads.com,crawl_id,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk
2,0,0,0,0,0,0,0,0,0,0,...,0,2,http://nuzzel.com,nuzzel.com,United States,NotEurope,Private,News,NewsUser,nuzzel.com
3,0,0,0,0,0,0,0,0,0,0,...,0,3,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie
4,0,0,0,0,0,0,0,0,0,0,...,0,4,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is
5,0,0,0,0,0,0,0,0,0,0,...,0,2,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu


In [7]:
# Create empty column in DF for unified categories of visited sites.
# The column is created with object dtype (still filled with NaN) because it
# is populated with string labels afterwards; a plain `np.nan` assignment
# would create a float64 column, and writing strings into a float column is
# deprecated in recent pandas versions.
df_merged['merged_cat'] = pd.Series(np.nan, index=df_merged.index, dtype='object')
df_merged.head()

Unnamed: 0_level_0,01mspmd5yalky8.com,01net.com,030876vw.com,0914.global.ssl.fastly.net,0klxjejyxak3.com,1.98.201.35.bc.googleusercontent.com,100posto.hr,1053041200.rsc.cdn77.org,108.59.8.1,108.59.8.35,...,crawl_id,site_url,url_TLD,Country,Europe,PublicPrivate,SiteCategory,URLtype,TopLevelDomainLookUp,merged_cat
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,1,https://ekstrabladet.dk/nationen/,ekstrabladet.dk,Denmark,EU,Private,News,PrivateMedia EU,ekstrabladet.dk,
2,0,0,0,0,0,0,0,0,0,0,...,2,http://nuzzel.com,nuzzel.com,United States,NotEurope,Private,News,NewsUser,nuzzel.com,
3,0,0,0,0,0,0,0,0,0,0,...,3,https://www.her.ie/entertainment/dakota-johnso...,her.ie,Ireland,EU,Private,News,PrivateMedia,her.ie,
4,0,0,0,0,0,0,0,0,0,0,...,4,https://www.government.is/ministries/locations...,government.is,Iceland,EEA,Public,Government,Ministry,government.is,
5,0,0,0,0,0,0,0,0,0,0,...,2,https://www.bme.hu/kt,bme.hu,Hungary,EU,Public,Education,University,bme.hu,


In [8]:
# Unifying categories
# Short-hands for the three columns all rules are built from
site_cat = df_merged['SiteCategory']
url_type = df_merged['URLtype']
ownership = df_merged['PublicPrivate']

# Building blocks shared by several rules
is_news = site_cat == 'News'
is_citizen_service = site_cat == 'CitizenService'
is_weather = is_citizen_service & (url_type == 'Weather')
is_shopping_or_travel = (
    (url_type == 'Shopping')
    | (url_type == 'ShoppingUser')
    | (url_type == 'TravelUser')
    | (url_type == 'Pharmacy')
)

# (row mask, unified label) pairs. The rules are applied top to bottom, so a
# row matching more than one mask keeps the label of the last matching rule.
# Rows matching no rule keep their empty merged_cat value.
category_rules = [
    (site_cat == 'Education', 'University'),
    (site_cat == 'Entertainment', 'Entertainment'),
    ((site_cat == 'Government') | (url_type == 'CitizenSelfService'), 'Government'),
    (site_cat == 'LegalService', 'LegalServices'),
    (is_news & (ownership == 'Public'), 'NewsPublic'),
    (is_news & (ownership == 'Private'), 'NewsPrivate'),
    (is_citizen_service & (url_type == 'PostalService'), 'PostalServices'),
    (is_citizen_service & (url_type == 'PublicTransport'), 'PublicTransport'),
    (is_weather & (ownership == 'Private'), 'WeatherPrivate'),
    (is_weather & (ownership == 'Public'), 'WeatherPublic'),
    ((site_cat == 'Consumption') & is_shopping_or_travel, 'ShoppingTravel'),
]

for row_mask, label in category_rules:
    df_merged.loc[row_mask, 'merged_cat'] = label

In [9]:
# Merge the DF by the new categories: one row per unified visited-site
# category, each TP column holding the summed occurrence count.
# numeric_only=True restricts the sum to numeric columns; without it, recent
# pandas versions also "sum" the text columns (site_url, Country, ...) instead
# of leaving them out.
# Rows whose merged_cat was never set do not appear in the result (groupby
# leaves out missing keys by default).
df_merged = df_merged.groupby('merged_cat').sum(numeric_only=True)
# crawl_id is numeric, so it survives the sum, but its total is meaningless
del df_merged['crawl_id']
df_merged

Unnamed: 0_level_0,01mspmd5yalky8.com,01net.com,030876vw.com,0914.global.ssl.fastly.net,0klxjejyxak3.com,1.98.201.35.bc.googleusercontent.com,100posto.hr,1053041200.rsc.cdn77.org,108.59.8.1,108.59.8.35,...,zrh50.cloudfront.net,zro56hd6szoy.com,ztat.net,ztkcdn.net,ztsrv.com,zumby.io,zuora.com,zuuvi.com,zvuki.ru,zxcvads.com
merged_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Entertainment,3,0,0,0,1,0,0,0,0,0,...,2,2,0,0,0,0,0,13,13,0
Government,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LegalServices,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NewsPrivate,0,1180,11,707,0,0,124,9,2,1,...,30,0,3,31,9,103,17,48,0,57
NewsPublic,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,2,0,143
PostalServices,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
PublicTransport,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ShoppingTravel,0,0,0,0,0,0,0,0,0,0,...,2,0,1296,0,0,0,0,0,0,0
University,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WeatherPrivate,0,0,0,24,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Flip the table so that every TP domain becomes a row and every visited-site
# category a column, then move the TP domain names out of the index into a
# regular column (named 'index') that can be used as a merge key
df_merged = df_merged.T.reset_index()
df_merged.head()

merged_cat,index,Entertainment,Government,LegalServices,NewsPrivate,NewsPublic,PostalServices,PublicTransport,ShoppingTravel,University,WeatherPrivate,WeatherPublic
0,01mspmd5yalky8.com,3,0,0,0,0,0,0,0,0,0,0
1,01net.com,0,0,0,1180,0,0,0,0,0,0,0
2,030876vw.com,0,0,0,11,0,0,0,0,0,0,0
3,0914.global.ssl.fastly.net,0,0,0,707,0,0,0,0,0,24,0
4,0klxjejyxak3.com,1,0,0,0,0,0,0,0,0,0,0


In [11]:
# Load the categorisation of the TPs (top 15 categories); the file lives in
# the same directory (f_path) as the occurrence counts
TPs_cat_name = "EU_TPs_categorization_processed_TOP_15.csv"

df_TPs_cat = pd.read_csv(f'{f_path}{TPs_cat_name}')
df_TPs_cat.head()

Unnamed: 0,url,cat_id,cat_label,cat_parent,cat_score,cat_confident,new_cat
0,01mspmd5yalky8.com,IAB25-6,Under Construction,IAB25,0.306537,1.0,Under Construction
1,01net.com,IAB19,Technology & Computing,IAB19,0.197461,1.0,Technology & Computing
2,030876vw.com,IAB24,Uncategorized,IAB24,1.0,1.0,Uncategorized
3,0914.global.ssl.fastly.net,IAB25-WS1,Content Server,IAB25,1.0,1.0,Content Server
4,0klxjejyxak3.com,IAB19-35,Web Search,IAB19,0.110806,0.0,Technology & Computing


In [12]:
# Attach the TP categorisation to each TP row: the TP domain is stored in
# 'index' on the left side and in 'url' on the right side. A left join keeps
# every TP row of df_merged.
df_merged_total = df_merged.merge(
    df_TPs_cat,
    how='left',
    left_on='index',
    right_on='url',
)
df_merged_total.head()

Unnamed: 0,index,Entertainment,Government,LegalServices,NewsPrivate,NewsPublic,PostalServices,PublicTransport,ShoppingTravel,University,WeatherPrivate,WeatherPublic,url,cat_id,cat_label,cat_parent,cat_score,cat_confident,new_cat
0,01mspmd5yalky8.com,3,0,0,0,0,0,0,0,0,0,0,01mspmd5yalky8.com,IAB25-6,Under Construction,IAB25,0.306537,1.0,Under Construction
1,01net.com,0,0,0,1180,0,0,0,0,0,0,0,01net.com,IAB19,Technology & Computing,IAB19,0.197461,1.0,Technology & Computing
2,030876vw.com,0,0,0,11,0,0,0,0,0,0,0,030876vw.com,IAB24,Uncategorized,IAB24,1.0,1.0,Uncategorized
3,0914.global.ssl.fastly.net,0,0,0,707,0,0,0,0,0,24,0,0914.global.ssl.fastly.net,IAB25-WS1,Content Server,IAB25,1.0,1.0,Content Server
4,0klxjejyxak3.com,1,0,0,0,0,0,0,0,0,0,0,0klxjejyxak3.com,IAB19-35,Web Search,IAB19,0.110806,0.0,Technology & Computing


In [13]:
# Sum total occurrence of TPs per their respective category.
# numeric_only=True restricts the sum to numeric columns; without it, recent
# pandas versions also "sum" the text columns carried over from the merge
# (index, url, cat_id, ...) instead of leaving them out.
df_relationship = df_merged_total.groupby('new_cat').sum(numeric_only=True)
# The categorisation scores are numeric, so they survive the sum, but their
# totals are meaningless for the heatmap
del df_relationship['cat_score']
del df_relationship['cat_confident']
df_relationship

Unnamed: 0_level_0,Entertainment,Government,LegalServices,NewsPrivate,NewsPublic,PostalServices,PublicTransport,ShoppingTravel,University,WeatherPrivate,WeatherPublic
new_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Adult Content,8866,368,1211,50569,7468,741,175,14221,3017,104,221
Advertising,112630,930,4363,765917,98778,5962,4447,193233,16403,6965,4511
Arts & Entertainment,2765,1100,423,18353,8749,818,9,1968,703,63,0
Business,55653,46104,81891,208984,74388,18867,5627,105292,81796,2233,2074
Cloud storage and hosting,7833,385,1453,30858,4038,1264,137,14944,1571,549,71
Content Server,125284,59570,119711,483258,123835,18251,7376,164798,125492,6242,3105
Hobbies & Interests,7673,787,108,21077,10942,647,129,6488,97,131,229
Marketing,93493,23098,56464,443399,105050,24411,4977,178681,85776,3134,2103
News,7014,693,11,51554,15458,109,210,2695,272,382,338
Shopping,1830,0,26,8307,1368,12,52,8942,0,67,51


In [14]:
# Reorder rows for the heatmap.
# The rank list is positional: entry i is the display rank of the i-th row of
# df_relationship, so it must contain exactly one value per row (15 here) and
# relies on the rows being in the order produced by the groupby above.
display_rank = [9, 6, 12, 2, 11, 3, 13, 4, 10, 15, 5, 1, 14, 7, 8]
df_relationship['order'] = display_rank
df_relationship = df_relationship.sort_values('order')
df_relationship

Unnamed: 0_level_0,Entertainment,Government,LegalServices,NewsPrivate,NewsPublic,PostalServices,PublicTransport,ShoppingTravel,University,WeatherPrivate,WeatherPublic,order
new_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Technology & Computing,140548,42120,89566,610488,160710,38498,8833,278004,149139,7886,4169,1
Business,55653,46104,81891,208984,74388,18867,5627,105292,81796,2233,2074,2
Content Server,125284,59570,119711,483258,123835,18251,7376,164798,125492,6242,3105,3
Marketing,93493,23098,56464,443399,105050,24411,4977,178681,85776,3134,2103,4
Society,70339,17835,25089,225861,56692,13837,3588,118508,81198,787,1222,5
Advertising,112630,930,4363,765917,98778,5962,4447,193233,16403,6965,4511,6
Uncategorized,83644,16398,23162,469568,86725,14847,3203,164597,29797,3597,2164,7
Under Construction,14163,1576,693,49239,4476,722,428,8991,1047,426,247,8
Adult Content,8866,368,1211,50569,7468,741,175,14221,3017,104,221,9
News,7014,693,11,51554,15458,109,210,2695,272,382,338,10


In [15]:
# Get list of TP categories: the row labels of df_relationship (in their
# current, reordered sequence), used as the x-axis labels of the heatmap
list_TP_cat = df_relationship.index

In [16]:
# Define the order of the visited-site categories on the heatmap's y-axis.
# The labels are listed in the same sequence as the heatmap's data rows;
# previously 'PublicTransport', 'Government', 'WeatherPrivate' and
# 'PostalServices' sat at positions that did not match the data, so those
# four rows were drawn under the wrong label.
list_visite_cat = [
    'NewsPrivate',
    'ShoppingTravel',
    'Entertainment',
    'NewsPublic',
    'University',
    'LegalServices',
    'Government',
    'PostalServices',
    'PublicTransport',
    'WeatherPrivate',
    'WeatherPublic',
]

In [17]:
# Create heatmap: TP categories on the x-axis, visited-site categories on the
# y-axis, colour = total number of TP occurrences.
fig = go.Figure(go.Heatmap(
    # One data row per visited-site category. The rows are derived from
    # list_visite_cat so that every row is always drawn next to its own
    # y-axis label (a hand-written list of columns can silently get out of
    # step with the labels).
    z=[df_relationship[visited_cat] for visited_cat in list_visite_cat],
    x=list_TP_cat,
    y=list_visite_cat,
    hoverongaps=False,
    colorbar=dict(
        tickvals=[0, 50000, 100000, 500000, 750000],
        title='Total TPs'
    ),
    # Strongly non-linear colour scale: most colour steps are packed into the
    # lowest tenth of the value range, so that small counts remain
    # distinguishable next to the few very large ones
    colorscale=[[0.0, "rgb(49,54,149)"],
                [1./1000, "rgb(69,117,180)"],
                [1./100, "rgb(116,173,209)"],
                [1./50, "rgb(171,217,233)"],
                [1./35, "rgb(224,243,248)"],
                [1./25, "rgb(254,224,144)"],
                [1./20, "rgb(253,174,97)"],
                [1./15, "rgb(244,109,67)"],
                [1./10, "rgb(215,48,39)"],
                [1.0, "rgb(165,0,38)"]],
))

# Set y-axis title
fig.update_yaxes(title_text="Visited website categories")

# Set x-axis title
fig.update_xaxes(title_text="Third-party categories")

# Add figure title
fig.update_layout(
    title={
        'text': "Number of total TP occurrence per category in relation with visited site<br>categories",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=20)},
    font=dict(
        family="Courier New, monospace",
    )
)

# Set plot size - use when exporting
#fig.update_layout(
#    autosize=False,
#    width=1100,
#    height=600,
#    )


iplot(fig)

In [18]:
# Export the heatmap figure to a PDF file
# NOTE(review): write_image presumably requires a static-image export backend
# (e.g. kaleido) to be installed -- confirm for this environment
fig.write_image("/home/ubuntu/Plots/FIG_10.pdf")