In [1]:
# Re-importing the necessary libraries
import json
import pandas as pd
import os
import glob
from os.path import basename
import numpy as np
from utils import  get_initiators, get_ps1_or_host, get_referrer,check_third_party, match_entity, get_visit_metadata, is_failed_visit

In [2]:
def get_response_referrer_policy(request):
    try:
        return request['responseHeaders'].get('referrer-policy', '')
    except KeyError:
        return ""
    
def get_req_referrer_policy(request):
    try:
        return request['reqReferrerPolicy']
    except KeyError:
        return ""

In [3]:
def get_all_reqs(path, saved_req_file_path):
    """
    Extracts request information from the JSON files in the given path.
    
    Args:
    path (str): The path to the JSON files containing the request data.
    
    Returns:
    reqDF (pd.DataFrame): A DataFrame containing the request information.
    """
    rows_list = []
    to_drop = []
    attr_list = []
    meta_list = []
    # dir_path = os.path.dirname(path)

    for json_path in glob.glob(path):
        json_name = basename(json_path)

        try:
            with open(json_path) as file:
                results = json.load(file)
        except Exception as e:
            print("ERROR: Cannot load the json", json_name, e)
            continue

        # Get visit metadata and check whether it should be processed
        init_url, final_url, site_domain, should_process = get_visit_metadata(results, json_name)
        if not should_process or is_failed_visit(results):
            to_drop.append(json_path)
            continue

        try:
            requests = results["data"]["requests"]
            for request in requests:
                request_url = request['url']
                status = request.get("status")
                size = request.get("size")
                request_url_domain = get_ps1_or_host(request_url)
                site_fld = get_ps1_or_host(site_domain)
                referrer = get_referrer(request)
                post_data = request.get('postData', '')
                request_type = (request['type']).lower()
                req_initiators = get_initiators(request['initiators'])
                third_party_req = site_fld != request_url_domain
                response_ref_policy = (get_response_referrer_policy(request))
                req_ref_policy = (get_req_referrer_policy(request))
                req_details = (site_domain, request_url, request_url_domain, referrer, request_type, req_initiators, third_party_req, post_data, final_url, init_url, status, size, response_ref_policy, req_ref_policy)
                rows_list.append(req_details)
        except Exception as err:
            print('Error get_all_reqs', site_domain, err)
        try:
            linkAttributes = results["data"]["elementAttributes"]
            for link in linkAttributes["elmAttrs"]:
                # print("--site_domain--", site_domain)
                href = link['href']
                src = link['src']
                rel = link['rel']
                referrerpolicy = link['linkReferrerPolicy']
                title = link['title']
                text = link['text']
                frameUrl = link['frameUrl']
                tagName = link['tagName']
                isFrame = link['inFrame']
                type = link['type']
                crossorigin = link['crossorigin']
                as_attr = link['as']
                integrity = link['integrity']
                link_details = (init_url, href, src, rel, referrerpolicy, title, text, frameUrl, tagName, isFrame, type, crossorigin, as_attr, integrity)
                attr_list.append(link_details)
            for meta in linkAttributes["metaRP"]:
                referrer_policy = meta["metaReferrerContent"]
                # print(referrer_policy)
                frameUrl = meta['frameUrl']
                inFrame = meta['inFrame']
                pageURL = meta['pageUrl']
                # print (pageURL)
                metadata_details = (init_url, referrer_policy, frameUrl, inFrame, pageURL)
                meta_list.append(metadata_details)
        except Exception as err:
            print('Error linkAttributes', site_domain, err)
            

    # Save excluded JSON files
    file_name = '_'.join(path.split("/")[-2].split("_")) + '_to_drop.txt'
    with open(f'to_drop_JSONs/{file_name}', 'w') as file:
        file.write('\n'.join(to_drop))

    reqDF = pd.DataFrame(rows_list, columns=['site_domain', 'request_url', 'request_url_domain', 'referrer', 
                                             'request_type', 'req_initiators', 'third_party_req', 'post_data', 'final_url', 'init_url', 'status', 'size', 'response_ref_policy', 'req_ref_policy'])
    reqDF.to_csv(saved_req_file_path, index=False)
    attrDF = pd.DataFrame(attr_list, columns=['init_url', 'href', 'src', 'rel', 'referrerpolicy', 'title', 'text', 'frameUrl', 'tagName', 'isFrame', 'type', 'crossorigin', 'as_attr', 'integrity'])
    # attrDF.to_json(saved_req_file_path.replace('requests', 'linkAttrs'), orient='records')
    attrDF.to_csv(saved_req_file_path.replace('requests', 'linkAttrs'), index=False)
    metaDF = pd.DataFrame(meta_list, columns=['init_url', 'referrer_policy', 'frameUrl', 'inFrame', 'pageURL'])
    metaDF.to_csv(saved_req_file_path.replace('requests', 'metadata'), index=False)


    return reqDF, attrDF, metaDF


In [4]:
# path = '../2023-10-11_inner_collector/*.json'
# path = '../2023-11-06_inner_collector/*.json'
path = './2023-12-07_inner_collector_optout/*.json'

def create_directory(directory_name):
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)
        # return path of the created directory
    return os.path.join(os.getcwd(), directory_name)

saved_requests_dir_path = create_directory('saved_requests')
saved_linkAttrs_dir_path = create_directory('saved_linkAttrs')
saved_meta_dir_path = create_directory('saved_metadata')
drop_dir_path = create_directory('to_drop_JSONs')

file_name_reqs = '_'.join(path.split("/")[-2].split("_")) + '_requests.csv'
saved_req_file_path = os.path.join(saved_requests_dir_path, file_name_reqs)


In [5]:
df_req, df_attr, df_meta = get_all_reqs(path, saved_req_file_path)

Error linkAttributes yandex.com 'text'
Error linkAttributes detik.com 'text'
ERROR: error page   metadata.json
ERROR: NO FINAL or INIT URL:  metadata.json
ERROR: No data  metadata.json
Error linkAttributes onet.pl 'text'
Error linkAttributes umn.edu 'text'
Error linkAttributes umn.edu 'text'
Error linkAttributes umn.edu 'text'
Error linkAttributes umn.edu 'text'


In [6]:
df_req

Unnamed: 0,site_domain,request_url,request_url_domain,referrer,request_type,req_initiators,third_party_req,post_data,final_url,init_url,status,size,response_ref_policy,req_ref_policy
0,ox.ac.uk,https://www.ox.ac.uk/research,ox.ac.uk,,document,{},False,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,110448.0,,strict-origin-when-cross-origin
1,ox.ac.uk,https://www.ox.ac.uk/sites/files/oxford/advagg...,ox.ac.uk,https://www.ox.ac.uk/research,stylesheet,{ox.ac.uk},False,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,7799.0,,strict-origin-when-cross-origin
2,ox.ac.uk,https://www.ox.ac.uk/sites/files/oxford/advagg...,ox.ac.uk,https://www.ox.ac.uk/research,stylesheet,{ox.ac.uk},False,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,5005.0,,strict-origin-when-cross-origin
3,ox.ac.uk,https://www.ox.ac.uk/sites/files/oxford/advagg...,ox.ac.uk,https://www.ox.ac.uk/research,stylesheet,{ox.ac.uk},False,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,6155.0,,strict-origin-when-cross-origin
4,ox.ac.uk,https://fonts.googleapis.com/css?family=PT+San...,fonts.googleapis.com,https://www.ox.ac.uk/,stylesheet,{ox.ac.uk},True,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,1367.0,,strict-origin-when-cross-origin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493665,mbga.jp,https://aw.dw.impact-ad.jp/c/u/?gdpr=0&oid=207...,impact-ad.jp,,image,{impact-ad.jp},True,,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.mbga.jp/_game_intro?game_id=12016007,302.0,,,strict-origin-when-cross-origin
493666,mbga.jp,https://aw.dw.impact-ad.jp/c/map/?sp=cro&oid=2...,impact-ad.jp,,image,{impact-ad.jp},True,,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.mbga.jp/_game_intro?game_id=12016007,200.0,195.0,,strict-origin-when-cross-origin
493667,mbga.jp,https://id5-sync.com/g/v2/1270.json,id5-sync.com,https://6015542.fls.doubleclick.net/,xhr,{impact-ad.jp},True,"{""partner"":1270,""v"":""1.0.36"",""o"":""api"",""tml"":""...",https://www.andapp.jp/apps/12016007?from=www_m...,https://www.mbga.jp/_game_intro?game_id=12016007,200.0,1113.0,,strict-origin-when-cross-origin
493668,mbga.jp,https://aw.dw.impact-ad.jp/c/map/?sp=ttd&oid=2...,impact-ad.jp,,image,{impact-ad.jp},True,,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.mbga.jp/_game_intro?game_id=12016007,200.0,58.0,,strict-origin-when-cross-origin


In [7]:
df_attr

Unnamed: 0,init_url,href,src,rel,referrerpolicy,title,text,frameUrl,tagName,isFrame,type,crossorigin,as_attr,integrity
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/sites/default/themes/cust...,,shortcut icon,,,,https://www.ox.ac.uk/research,link,False,,,,
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/sites/default/themes/cust...,,apple-touch-icon,,,,https://www.ox.ac.uk/research,link,False,,,,
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/sites/default/themes/cust...,,apple-touch-icon-precomposed,,,,https://www.ox.ac.uk/research,link,False,,,,
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,,canonical,,,,https://www.ox.ac.uk/research,link,False,,,,
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/node/2297,,shortlink,,,,https://www.ox.ac.uk/research,link,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026308,https://www.mbga.jp/_game_intro?game_id=12016007,,https://gum.criteo.com/sync?c=333&r=1&u=https%...,,,,,about:blank,img,True,,,,
1026309,https://www.mbga.jp/_game_intro?game_id=12016007,,https://yjtag.yahoo.co.jp/csx?tp=khADDtf,,,,,about:blank,img,True,,,,
1026310,https://www.mbga.jp/_game_intro?game_id=12016007,,https://analytics.twitter.com/i/adsct?p_user_i...,,,,,about:blank,img,True,,,,
1026311,https://www.mbga.jp/_game_intro?game_id=12016007,//twitter.com,,dns-prefetch,,,,https://platform.twitter.com/widgets/tweet_but...,link,True,,,,


In [8]:
df_attr.tagName.value_counts()

a         644849
img       166799
script    112834
link       84575
iframe     16959
area         297
Name: tagName, dtype: int64

In [9]:
# Response Referrer Policy
df_req[(df_req.third_party_req == True) & (df_req.response_ref_policy != '')].drop_duplicates(subset=['init_url', 'response_ref_policy']).response_ref_policy.value_counts()

strict-origin-when-cross-origin                                     818
unsafe-url                                                          203
no-referrer-when-downgrade                                          191
origin                                                              171
same-origin                                                         114
origin-when-cross-origin                                            111
no-referrer                                                          94
strict-origin-when-cross-origin\nstrict-origin-when-cross-origin     23
origin-when-cross-origin, strict-origin-when-cross-origin            22
same-origin\nstrict-origin-when-cross-origin                          7
strict-origin                                                         6
nosniff                                                               4
Name: response_ref_policy, dtype: int64

In [10]:
df_req[df_req.response_ref_policy == 'strict-origin-when-cross-origin']

Unnamed: 0,site_domain,request_url,request_url_domain,referrer,request_type,req_initiators,third_party_req,post_data,final_url,init_url,status,size,response_ref_policy,req_ref_policy
92,ox.ac.uk,https://cc.cdn.civiccomputing.com/9/cookieCont...,civiccomputing.com,https://www.ox.ac.uk/,script,{googletagmanager.com},True,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,94887.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
112,ox.ac.uk,https://apikeys.civiccomputing.com/c/v?d=www.o...,civiccomputing.com,https://www.ox.ac.uk/,xhr,"{civiccomputing.com, googletagmanager.com}",True,,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,200.0,697.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
792,mamastar.jp,https://code.piano.io/api/tinypass.min.js,piano.io,https://hokatsu.mamastar.jp/,script,{piano.io},True,,https://hokatsu.mamastar.jp/,https://hokatsu.mamastar.jp/,200.0,108148.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
2316,cookpad.com,https://news.cookpad.com/articles/53482,cookpad.com,,document,{},False,,https://news.cookpad.com/articles/53482,https://news.cookpad.com/articles/53482,200.0,34944.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
2535,life360.com,https://www.life360.com/driving-safety,life360.com,,document,{},False,,https://www.life360.com/driving-safety/,https://www.life360.com/driving-safety,301.0,,strict-origin-when-cross-origin,strict-origin-when-cross-origin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493383,wired.com,https://shopify.privy.com/widget.js?shop=wired...,privy.com,https://shop.wired.com/,script,{wired.com},True,,https://shop.wired.com/,https://shop.wired.com/,200.0,1131.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
493404,wired.com,https://api.privy.com/businesses/AE895E13560FD...,privy.com,https://shop.wired.com/,xhr,"{privy.com, wired.com}",True,,https://shop.wired.com/,https://shop.wired.com/,200.0,6163.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
493455,wired.com,https://events.privy.com/v2/collect,privy.com,https://shop.wired.com/,xhr,"{privy.com, wired.com}",True,"{""event"":""new-session"",""properties"":{""referrin...",https://shop.wired.com/,https://shop.wired.com/,200.0,422.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin
493472,wired.com,https://promotions.lpage.co/campaigns/3972448/...,lpage.co,https://shop.wired.com/,document,{privy.com},True,,https://shop.wired.com/,https://shop.wired.com/,200.0,5395.0,strict-origin-when-cross-origin,strict-origin-when-cross-origin


In [11]:
df_req[(df_req.response_ref_policy=="no-referrer") & (df_req.third_party_req == True) & (df_req.response_ref_policy != '')]

Unnamed: 0,site_domain,request_url,request_url_domain,referrer,request_type,req_initiators,third_party_req,post_data,final_url,init_url,status,size,response_ref_policy,req_ref_policy
4614,thesun.co.uk,https://commercial-analytics-collector.news.co...,news.co.uk,https://www.thesun.co.uk/,ping,{thesun.co.uk},True,"{""prebid"":[{""wrapper"":""prebid"",""ad_unit_code"":...",https://www.thesun.co.uk/shopping/,https://www.thesun.co.uk/shopping,202.0,0.0,no-referrer,strict-origin-when-cross-origin
4623,thesun.co.uk,https://commercial-analytics-collector.news.co...,news.co.uk,https://www.thesun.co.uk/,ping,"{amazon-adsystem.com, thesun.co.uk}",True,"{""amazon"":[{""wrapper"":""amazon"",""auction_id"":""7...",https://www.thesun.co.uk/shopping/,https://www.thesun.co.uk/shopping,202.0,0.0,no-referrer,strict-origin-when-cross-origin
4649,thesun.co.uk,https://commercial-analytics-collector.news.co...,news.co.uk,https://www.thesun.co.uk/,ping,"{doubleclick.net, thesun.co.uk}",True,"{""slotRender"":{""wrapper"":""slotRender"",""name"":""...",https://www.thesun.co.uk/shopping/,https://www.thesun.co.uk/shopping,202.0,0.0,no-referrer,strict-origin-when-cross-origin
4662,thesun.co.uk,https://commercial-analytics-collector.news.co...,news.co.uk,https://www.thesun.co.uk/,ping,"{doubleclick.net, thesun.co.uk}",True,"{""slotRender"":{""wrapper"":""slotRender"",""prebid_...",https://www.thesun.co.uk/shopping/,https://www.thesun.co.uk/shopping,202.0,0.0,no-referrer,strict-origin-when-cross-origin
4663,thesun.co.uk,https://commercial-analytics-collector.news.co...,news.co.uk,https://www.thesun.co.uk/,ping,"{doubleclick.net, thesun.co.uk}",True,"{""slotRender"":{""wrapper"":""slotRender"",""prebid_...",https://www.thesun.co.uk/shopping/,https://www.thesun.co.uk/shopping,202.0,0.0,no-referrer,strict-origin-when-cross-origin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477843,avito.ru,https://kimberlite.io/rtb/sync/mts?u=d65f7b7b-...,kimberlite.io,,image,"{buzzoola.com, avito.ru}",True,,https://www.avito.ru/all/zhivotnye,https://www.avito.ru/all/zhivotnye,307.0,,no-referrer,no-referrer
482639,huffpost.com,https://sync.inmobi.com/TAM?redirect=https%3A%...,inmobi.com,,document,{amazon-adsystem.com},True,,https://www.huffpost.com/voices/,https://www.huffpost.com/voices,302.0,,no-referrer,no-referrer
487183,tenki.jp,https://sync.inmobi.com/gob?google_push=AXcoOm...,inmobi.com,https://pagead2.googlesyndication.com/,image,{googlesyndication.com},True,,https://earthquake.tenki.jp/bousai/earthquake/,https://tenki.jp/bousai/earthquake,302.0,,no-referrer,strict-origin-when-cross-origin
487256,tenki.jp,https://sync.inmobi.com/gobRedirectFromId5?id=...,inmobi.com,,image,{googlesyndication.com},True,,https://earthquake.tenki.jp/bousai/earthquake/,https://tenki.jp/bousai/earthquake,302.0,,no-referrer,no-referrer


In [12]:
req_ref_policy = df_req[(df_req.third_party_req == True) & (df_req.req_ref_policy != '')].drop_duplicates(subset=['init_url', 'req_ref_policy'])
req_ref_policy.req_ref_policy.value_counts()

strict-origin-when-cross-origin    2905
no-referrer-when-downgrade          586
unsafe-url                          370
origin                              354
no-referrer                         285
origin-when-cross-origin            168
same-origin                          67
strict-origin                        18
Name: req_ref_policy, dtype: int64

In [13]:
req_ref_policy[req_ref_policy.req_ref_policy == 'no-referrer']

Unnamed: 0,site_domain,request_url,request_url_domain,referrer,request_type,req_initiators,third_party_req,post_data,final_url,init_url,status,size,response_ref_policy,req_ref_policy
1475,amazon.com.mx,https://ib.adnxs.com/setuid/a9?entity=188&code...,adnxs.com,,image,{amazon-adsystem.com},True,,https://www.amazon.com.mx/s?k=carteras&i=fashi...,https://www.amazon.com.mx/s/?_encoding=utf8&k=...,307.0,,,no-referrer
3075,change.org,https://play.google.com/log?format=json&hasfas...,google.com,,preflight,{google.com},True,,https://www.change.org/s/member?source_locatio...,https://www.change.org/member?source_location=...,200.0,0.0,,no-referrer
4686,thesun.co.uk,https://csync.loopme.me/?pubid=11405&redirect=...,loopme.me,,image,{amazon-adsystem.com},True,,https://www.thesun.co.uk/shopping/,https://www.thesun.co.uk/shopping,307.0,,,no-referrer
6073,launchdarkly.com,https://play.google.com/log?format=json&hasfas...,google.com,,xhr,"{google.com, /_/gsi/_/js/k=gsi.gsi.en_US.l0gW8...",True,"[[1,null,null,null,null,null,null,null,null,nu...",https://app.launchdarkly.com/login?redirect=%2F,https://app.launchdarkly.com/,200.0,155.0,,no-referrer
6567,nytimes.com,https://cs.media.net/cksync?cs=31&type=tam&red...,media.net,,image,{amazon-adsystem.com},True,,https://www.nytimes.com/spotlight/podcasts,https://www.nytimes.com/spotlight/podcasts,302.0,,,no-referrer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488252,acesso.gov.br,https://cdn.dsgovserprodesign.estaleiro.serpro...,serpro.gov.br,,stylesheet,{acesso.gov.br},True,,https://cadastro.acesso.gov.br/termo-de-uso,https://cadastro.acesso.gov.br/termo-de-uso,,,,no-referrer
489301,foxnews.com,https://sync.1rx.io/usersync2/rmpssp?sub=amazo...,1rx.io,,image,{amazon-adsystem.com},True,,https://www.foxnews.com/video,https://www.foxnews.com/video,302.0,,,no-referrer
490351,hitomi.la,https://accounts.google.com/ServiceLogin?passi...,google.com,,image,"{wpadmngr.com, wpushsdk.com}",True,,https://hitomi.la/imageset/%e5%8d%87%e5%a4%a9-...,https://hitomi.la/imageset/%e5%8d%87%e5%a4%a9-...,302.0,,,no-referrer
491518,zemanta.com,https://www.googletagmanager.com/gtm.js?id=GTM...,googletagmanager.com,,script,{zemanta.com},True,,https://www.zemanta.com/privacy/,https://www.zemanta.com/privacy,200.0,92044.0,,no-referrer


In [14]:
# Link Referrer Policy
df_attr.drop_duplicates(subset=['init_url','referrerpolicy']).referrerpolicy.value_counts()

no-referrer-when-downgrade         126
unsafe-url                         100
no-referrer                         46
origin                              40
strict-origin-when-cross-origin     36
strict-origin                        8
origin-when-cross-origin             5
Name: referrerpolicy, dtype: int64

In [15]:
df_attr[df_attr["referrerpolicy"] == "no-referrer-when-downgrade"]

Unnamed: 0,init_url,href,src,rel,referrerpolicy,title,text,frameUrl,tagName,isFrame,type,crossorigin,as_attr,integrity
13423,https://www.rakuten-sec.co.jp/web/foreign/etf,,https://asia.creativecdn.com/tags?type=iframe&...,,no-referrer-when-downgrade,,,https://www.rakuten-sec.co.jp/web/foreign/etf/,iframe,False,,,,
13431,https://www.rakuten-sec.co.jp/web/foreign/etf,,https://b6.im-apps.net/3929/rt/11980/?vid=01HH...,,no-referrer-when-downgrade,,,https://www.rakuten-sec.co.jp/web/foreign/etf/,img,False,,,,
25421,https://gall.dcinside.com/board/view/?id=dcbes...,,https://ad.xc.netinsight.co.kr/xc/j/ccDkYbfd/x...,,no-referrer-when-downgrade,,,https://gall.dcinside.com/board/view/?id=dcbes...,script,False,text/javascript,,,
36532,https://www.epsilon.com/us/about-us/pressroom/...,,https://app.hubspot.com/content-tools-menu/api...,,no-referrer-when-downgrade,,,https://www.epsilon.com/us/about-us/pressroom/...,script,False,text/javascript,,,
38194,https://www.appier.com/?hslang=en,,https://app.hubspot.com/content-tools-menu/api...,,no-referrer-when-downgrade,,,https://www.appier.com/en/?hslang=en,script,False,text/javascript,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985478,https://www.olx.ua/uk/rabota,,https://creativecdn.com/tags?type=iframe&id=pr...,,no-referrer-when-downgrade,,,https://www.olx.ua/uk/rabota/,iframe,False,,,,
1010550,https://www.epsilon.com/us/client-success/case...,,https://app.hubspot.com/content-tools-menu/api...,,no-referrer-when-downgrade,,,https://www.epsilon.com/us/client-success/case...,script,False,text/javascript,,,
1012817,https://tenki.jp/bousai/earthquake,,https://www.googletagmanager.com/gtm.js?id=GTM...,,no-referrer-when-downgrade,,,https://earthquake.tenki.jp/bousai/earthquake/,script,False,text/javascript,,,
1012820,https://tenki.jp/bousai/earthquake,,https://sync.im-apps.net/imid/segment?callback...,,no-referrer-when-downgrade,,,https://earthquake.tenki.jp/bousai/earthquake/,script,False,text/javascript,,,


In [16]:
# Get rel link attribute
df_attr[(df_attr.rel == "noreferrer" )].drop_duplicates(subset=['init_url', 'rel']).rel.value_counts()

noreferrer    231
Name: rel, dtype: int64

In [17]:
df_attr.drop_duplicates(subset=['init_url', 'isFrame']).isFrame.value_counts()

False    3132
True     1755
Name: isFrame, dtype: int64

In [18]:
df_attr.tagName.value_counts()

a         644849
img       166799
script    112834
link       84575
iframe     16959
area         297
Name: tagName, dtype: int64

In [19]:
df_attr.as_attr.value_counts()

script        8534
font          2949
style         2505
image          601
fetch          413
undefined       36
div             31
fonts           25
document        11
                 9
stylesheet       2
a                2
worker           2
Name: as_attr, dtype: int64

In [20]:
df_attr.crossorigin.value_counts()

anonymous          14668
                    5385
true                1168
use-credentials      280
crossorigin           69
*                     35
crossOrigin            4
same-origin            2
Name: crossorigin, dtype: int64

In [21]:
df_attr.integrity.value_counts()

sha384-JtvhFQlPQ6LL/+I5aABhkbXo/hmh5M6IvL9vK+ecFqveRPvf7P6cGzs1DEyU5A3c                            224
sha512-euoFGowhlaLqXsPWQ48qSkBSCFs3DPRyiwVu3FjR96cMPx+Fr+gpWRhIafcHwqwCqWS42RZhIudOvEI+Ckf6MA==    173
sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0=                                                 46
sha384-oyOrIfu0dTVXgJDnDwTkpAOw6OQnC6D4wN0pmPLvl75dXBhYohgWHMyv3Y05PPLU                             37
sha256-/xUj+3OJU5yExlq6GSYGSHk7tPXikynS7ogEvDej/m4=                                                 31
                                                                                                  ... 
sha384-IYkqoIXkdl+WJcoBj1LZTExMvaRVpoG/9c7oVDUMyNxFPDbKlE7/+pG0dgVttvWl                              1
sha384-LyHGXhTbjDIkuP7AbL+VErN4jX9XAQsTJV2XTVXu9zm/7Z8MedCR3ElBpOd+0v7J                              1
sha384-gxy0R8gznzoSM0vSba4c6civwB+qWm1KZZvMX4T/rq5y06I07ucXW9wWtuTz3nv6                              1
sha384-IOd+jF7FUSVClxjkEmZL3TnW8URsklhPBZA8pAbjUEGI9FrfcjNeL2JxCTNDNMrU  

In [22]:
df_meta

Unnamed: 0,init_url,referrer_policy,frameUrl,inFrame,pageURL
0,https://ameblo.jp/borderfreecosmetics/entry-12...,origin,https://ameblo.jp/borderfreecosmetics/entry-12...,False,https://ameblo.jp/borderfreecosmetics/entry-12...
1,https://www.amazon.com.mx/s/?_encoding=utf8&k=...,no-referrer,https://s.amazon-adsystem.com/v3/pr?exlist=n-x...,True,https://www.amazon.com.mx/s?k=carteras&i=fashi...
2,https://www.google.ca/preferences?hl=en&fg=1,origin,https://ogs.google.ca/widget/callout?prid=1903...,True,https://www.google.ca/preferences?hl=en&fg=1
3,https://www.change.org/member?source_location=...,no-referrer,https://accounts.google.com/gsi/button?size=la...,True,https://www.change.org/s/member?source_locatio...
4,https://www.change.org/member?source_location=...,origin,https://pay.google.com/gp/p/ui/payframe?origin...,True,https://www.change.org/s/member?source_locatio...
...,...,...,...,...,...
827,https://www.google.com.mx/imghp?hl=en&ogbl,origin,https://www.google.com.mx/imghp?hl=en&ogbl,False,https://www.google.com.mx/imghp?hl=en&ogbl
828,https://www.google.com.mx/imghp?hl=en&ogbl,origin,https://ogs.google.com.mx/widget/callout?prid=...,True,https://www.google.com.mx/imghp?hl=en&ogbl
829,https://www.ebay.com/e/_electronics/certified-...,unsafe-url,https://www.ebay.com/e/_electronics/certified-...,False,https://www.ebay.com/e/_electronics/certified-...
830,https://mashable.com/entertainment,no-referrer,https://aax-eu.amazon-adsystem.com/s/v3/pr?exl...,True,https://mashable.com/entertainment


In [23]:
df_meta.drop_duplicates(subset=['init_url', 'referrer_policy']).referrer_policy.value_counts()

no-referrer                        211
no-referrer-when-downgrade         140
origin                             126
unsafe-url                          67
origin-when-crossorigin             56
origin-when-cross-origin            48
always                              32
strict-origin-when-cross-origin     17
same-origin                          6
never                                1
strict-origin                        1
Name: referrer_policy, dtype: int64