# Simple Analyzer for Leak Detector
## OptIn Version

In [1]:
import pandas as pd
import re
import os
from urllib.parse import urlparse

In [2]:
def print_leak_to_csv(df):
    """
    Write the dataframe to a CSV file named after the first leak column it has.

    The columns are probed in the order url_leaks, referer_leaks, post_leaks.
    Only the first one that is present decides the file name, so at most one
    file is written. The file is placed in the data_seg/ directory and its
    name is built from the module-level extract_date and extract_type values.

    :param df: dataframe

    :return: whatever df.to_csv returns for the chosen path, or None when the
    dataframe has none of the three leak columns (nothing is written then)
    """
    suffix_by_column = {
        'url_leaks': '_print_url_leaks.csv',
        'referer_leaks': '_print_referer_leaks.csv',
        'post_leaks': '_print_post_leaks.csv',
    }
    for column, suffix in suffix_by_column.items():
        if column in df:
            return df.to_csv("data_seg/" + extract_date + '_' + extract_type + suffix)
    return None



In [3]:
def extract_leak(data, leak):
    """
    Keep only the rows whose 'leaks' value contains the given leak name.

    Rows with a missing 'leaks' value are left out (na=False).

    :param dataframe data: input dataframe with a 'leaks' column
    :param str leak: text to look for in the 'leaks' column; it is handed to
    str.contains unchanged

    :return: dataframe restricted to the matching rows
    """
    has_leak = data['leaks'].str.contains(leak, na=False)
    return data[has_leak]

In [5]:
def drop_non_leak(data, leak):
    """
    Select the rows for one kind of leak and drop the columns of the others.

    The rows are selected with extract_leak(data, leak); the columns removed
    afterwards depend on the leak name (see the table below).

    :param data: dataframe
    :param leak: column name, one of 'url_leaks', 'referer_leaks', 'post_leaks'

    :return: the reduced dataframe, or None for any other leak name
    """
    data_seed = extract_leak(data, leak)

    # Columns that are unrelated to each kind of leak.
    unrelated_columns = {
        'url_leaks': ['post_leaks', 'ref_data', 'referer_leaks', 'post_data'],
        'referer_leaks': ['url_leaks', 'post_leaks', 'post_data'],
        'post_leaks': ['url_leaks', 'referer_leaks', 'ref_data'],
    }
    to_drop = unrelated_columns.get(leak)
    if to_drop is None:
        return None
    return data_seed.drop(to_drop, axis=1)


In [4]:
def check_urlparse(row):
    """Report which components of the final URL appear in the row's leak data.

    The leak value is read as text and split on single quotes. Every resulting
    piece that is not list/tuple punctuation is compared, as a whole string,
    against the components of ``urlparse(row['final_url'])``.

    :param row: row of dataframe (any mapping works) holding 'final_url' and
    at least one of 'referer_leaks', 'url_leaks' or 'post_leaks'; the first of
    these that is present, in that order, is the one inspected

    :return: tuple ``(netloc, path, query, params, fragments)``; each entry is
    the matching component of the final URL when it was found in the leak
    data, otherwise an empty string

    :raises KeyError: if the row has none of the three leak columns
    """
    parsed_url = urlparse(row['final_url'])

    for column in ('referer_leaks', 'url_leaks', 'post_leaks'):
        if column in row:
            raw_leaks = row[column]
            break
    else:
        raise KeyError(
            "row has none of 'referer_leaks', 'url_leaks', 'post_leaks'")

    # A missing value (NaN/None) cannot be split; treat it as nothing leaked.
    if not isinstance(raw_leaks, str):
        return '', '', '', '', ''

    # Pieces left over from the stringified list of tuples. '' is listed so
    # that an empty URL component can never count as leaked.
    undesired = {'[', ']', ',', 'None', '', '[(', ')]', ',)]', '), (', ', '}
    leaked_tokens = set(raw_leaks.split("'")) - undesired

    def leaked(component):
        # Echo the component back only when it was seen in the leak data.
        return component if component in leaked_tokens else ''

    # The return order (query before params) differs from urlparse's own
    # field order; callers unpack the tuple by position.
    # params and fragments are not expected to show up in a referer at all.
    return (
        leaked(parsed_url.netloc),
        leaked(parsed_url.path),
        leaked(parsed_url.query),
        leaked(parsed_url.params),
        leaked(parsed_url.fragment),
    )

In [6]:
def check_violation(row):
    """
    Classify a referer-leak row against its referrer policy.

    :param dataframe row: a dataframe row with the "ref_pol_data" column (the
    referrer policy) and the 'netloc', 'path', 'query', 'params' and
    'fragments' columns telling which parts of the URL leaked

    :return str: "Warning" when params or fragments leaked (checked first,
    whatever the policy is); otherwise "Violation" when the row leaks a part
    that this function treats as forbidden under the policy, "Safe" when it
    does not, and "Blank" when the policy is not one of the recognised values
    """
    ref_pol = row['ref_pol_data']

    if row['params'] or row['fragments']:
        return "Warning"

    # Policies under which every leaked part is accepted.
    if ref_pol in ('no-referrer-when-downgrade', 'unsafe-url'):
        return "Safe"

    if ref_pol in ('no-referrer', 'same-origin'):
        # Nothing may leak, not even the host.
        forbidden_leak = row['netloc'] or row['path'] or row['query']
    elif ref_pol in (
        'origin',
        'origin-when-cross-origin',
        'strict-origin',
        'strict-origin-when-cross-origin',
    ):
        # The host may leak; the path and the query may not.
        forbidden_leak = row['path'] or row['query']
    else:
        return "Blank"

    return "Violation" if forbidden_leak else "Safe"

## Convert CSV to Dataframe

In [7]:
# Raw leak dump to analyse. Alternative inputs used before:
#   './data_raw/2023-11-29_optIn_leaks_raw.csv'
#   './data_raw/test.csv'
csv_file = './data_raw/2023-12-08_optIn_leaks_raw.csv'

# Both tags are reused later to build the output file names.
# The third "_"-separated piece of the path is the type tag; for the path
# above that is 'optIn' (the underscore in "data_raw" is counted too).
extract_type = csv_file.split("_")[2]
# Text between each "20" and the following "_", joined together; for the path
# above that is '23-12-08'.
extract_date = ''.join(re.findall(r'20(.*?)_', csv_file))

# Output directory for the per-leak CSV files.
os.makedirs("data_seg", exist_ok=True)

extracted_df = pd.read_csv(csv_file)
extracted_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,post_data,url_leaks,post_leaks,referer_leaks,leaks
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",[],[],url_leaks
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",[],[],url_leaks
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",[],[],url_leaks
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://apikeys.civiccomputing.com/c/v?d=www.o...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",[],[],url_leaks
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.linkedin.com/px/li_sync?redirect=h...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",[],[],url_leaks
...,...,...,...,...,...,...,...,...,...,...,...
110883,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://syndication.twitter.com/i/jot/embeds?l...,GET,strict-origin-when-cross-origin,https://www.andapp.jp/,,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",[],[],url_leaks
110884,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google.com/pagead/1p-user-list/878...,GET,strict-origin-when-cross-origin,https://www.andapp.jp/,,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",[],[],url_leaks
110885,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...",[],"[('www.andapp.jp',)]","[('urlencode', 'www.andapp.jp'), ('www.andapp....",post_leaks - referer_leaks
110886,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://aw.dw.impact-ad.jp/ut/rep?u=2714&v=2&r...,GET,strict-origin-when-cross-origin,https://6015542.fls.doubleclick.net/,,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",[],[],url_leaks


## Leaks on Referer

These are the raw referer leaks; they still need to be compared with each request's referrer policy to tell which ones are actual violations.

In [8]:
# Build a dataframe of the rows whose 'leaks' value mentions 'referer_leaks',
# without the url/post leak columns (see drop_non_leak).
referer_leaks_df = drop_non_leak(extracted_df, 'referer_leaks')
referer_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks
9,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
10,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
11,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
17,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
18,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
...,...,...,...,...,...,...,...,...
110873,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks
110874,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks
110876,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks
110878,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/generate_204?FeixEA,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks


In [9]:
# Show the rows whose "leaks" column lists more than one kind of leak, i.e.
# whose value is not exactly "referer_leaks".
referer_leaks_df[referer_leaks_df['leaks'] != "referer_leaks"]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks
17,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
28,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/api/stats/qoe?fmt=247&...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks - referer_leaks
31,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/next?key=A...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
32,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
33,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/ogkc1FmfiuI?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
...,...,...,...,...,...,...,...,...
110556,https://mashable.com/entertainment,https://mashable.com/entertainment,https://ping.chartbeat.net/ping?h=mashable.com...,GET,no-referrer-when-downgrade,https://mashable.com/entertainment,"[('/entertainment',)]",url_leaks - referer_leaks
110562,https://mashable.com/entertainment,https://mashable.com/entertainment,https://ping.chartbeat.net/ping?h=mashable.com...,GET,no-referrer-when-downgrade,https://mashable.com/entertainment,"[('/entertainment',)]",url_leaks - referer_leaks
110684,https://www.mcafee.com/en-us/consumer-corporat...,https://www.mcafee.com/en-us/consumer-corporat...,https://adobedc.demdex.net/ee/v1/interact?conf...,POST,no-referrer-when-downgrade,https://www.mcafee.com/en-us/consumer-corporat...,"[('/en-us/consumer-corporate/investors.html',)]",post_leaks - referer_leaks
110742,https://actu.fr/politique,https://actu.fr/politique,https://gum.criteo.com/sid/json?origin=publish...,GET,strict-origin-when-cross-origin,https://gum.criteo.com/syncframe?origin=publis...,"[('actu.fr',), ('urlencode', 'actu.fr')]",url_leaks - referer_leaks


In [10]:
# Expand the dataframe with the five values returned by check_urlparse for
# each row; they are assigned by position to the new columns.
referer_leaks_df[['netloc', 'path', 'query', 'params', 'fragments']] = referer_leaks_df.apply(check_urlparse, axis=1, result_type='expand')
referer_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments
9,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
10,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
11,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
17,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks,www.ox.ac.uk,,,,
18,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110873,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,
110874,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,
110876,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,
110878,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/generate_204?FeixEA,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,


In [11]:
# Create a new column with the verdict returned by check_violation for each row.
referer_leaks_df['violation'] = referer_leaks_df.apply(check_violation, axis=1, result_type='expand')
referer_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation
9,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
10,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
11,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
17,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks,www.ox.ac.uk,,,,,Safe
18,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110873,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,,Safe
110874,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,,Safe
110876,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,,Safe
110878,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/generate_204?FeixEA,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",referer_leaks,www.andapp.jp,,,,,Safe


In [12]:
# Check whether params or fragments leak in the referrer: check_violation
# marks exactly those rows as "Warning".
referer_leaks_df[referer_leaks_df['violation'] == "Warning"]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation


### Referer Leaks that did not follow referer policy

This case is similar to the YouTube case. It may be a TRC error,
but it could also be a problem in Chrome.

In [13]:
# Keep only the rows classified as "Violation" and save them to their own CSV
# file in data_seg/.
referer_leaks_vioilation_df = referer_leaks_df[referer_leaks_df['violation'] == "Violation"]
referer_leaks_vioilation_df.to_csv("data_seg/"+ extract_date + '_' + extract_type + '_print_referer_leaks_violation.csv')
referer_leaks_vioilation_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation
1028,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/runtime~m...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('wpengine.com',), ('urlencode', 'wpengine.co...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
1029,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/9.4a3e980...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('wpengine.com',), ('urlencode', 'wpengine.co...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
1030,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/main~493d...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('wpengine.com',), ('urlencode', 'wpengine.co...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
1038,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/51.558be3...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('wpengine.com',), ('urlencode', 'wpengine.co...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
1039,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/35.d0f1cc...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('wpengine.com',), ('urlencode', 'wpengine.co...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110813,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('urlencode', '/products/docker-scout/'), ('w...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110814,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('urlencode', '/products/docker-scout/'), ('w...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110823,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('urlencode', '/products/docker-scout/'), ('w...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110824,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('urlencode', '/products/docker-scout/'), ('w...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation


In [14]:
# Write the violation rows once more through print_leak_to_csv. The dataframe
# has a 'referer_leaks' column and no 'url_leaks' column, so the file name
# ends in '_print_referer_leaks.csv'.
print_leak_to_csv(referer_leaks_vioilation_df)

In [15]:
# Number of non-missing values per column among the violation rows.
referer_leaks_vioilation_df.count()

init_url         2827
final_url        2827
req_url          2827
req_method       2827
ref_pol_data     2827
ref_data         2827
referer_leaks    2827
leaks            2827
netloc           2827
path             2827
query            2827
params           2827
fragments        2827
violation        2827
dtype: int64

In [16]:
# Same count after keeping a single row per distinct final_url, i.e. per
# visited page with at least one violating request.
referer_leaks_vioilation_df.drop_duplicates(subset=['final_url']).count()

init_url         226
final_url        226
req_url          226
req_method       226
ref_pol_data     226
ref_data         226
referer_leaks    226
leaks            226
netloc           226
path             226
query            226
params           226
fragments        226
violation        226
dtype: int64

In [18]:
# Number of violating requests per visited page (final_url).
referer_leaks_vioilation_df.final_url.value_counts()

https://www.parse.ly/getdemo/                                                                                            88
https://www.docker.com/products/docker-scout/                                                                            41
https://www.fastly.com/cookies/                                                                                          41
https://www.docker.com/resources/what-container/                                                                         41
https://www.docker.com/careers/                                                                                          41
                                                                                                                         ..
https://www.olx.com.br/esportes-e-lazer                                                                                   1
https://dantri.com.vn/lao-dong-viec-lam/chuyen-nghe.htm                                                                   1
https://

In [19]:
# Number of violating requests per requested URL (req_url).
referer_leaks_vioilation_df.req_url.value_counts()

https://s.yimg.jp/images/advertising/common/js/iicon.min.js                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            57
https://js.driftt.com/core/ass

In [20]:
# Number of violating requests per HTTP method (req_method).
referer_leaks_vioilation_df.req_method.value_counts() 

GET     2805
POST      22
Name: req_method, dtype: int64

### Google

In [21]:
# List of referer leaks to Google.
# 'google' is searched anywhere in req_url, so requests that only carry it in
# the path or query string are listed as well.
# na=False makes a missing req_url count as "no match", so the mask is
# boolean from the start.
referer_leaks_vioilation_df[referer_leaks_vioilation_df['req_url'].str.contains('google', na=False)]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation
1935,https://www.samsung.com/us/watches/galaxy-watch6,https://www.samsung.com/us/watches/galaxy-watch6/,https://cm.g.doubleclick.net/pixel?google_nid=...,GET,same-origin,https://login.dotomi.com/ucm/visit/iframe?cli_...,"[('www.samsung.com',), ('urlencode', 'www.sams...",referer_leaks,www.samsung.com,/us/watches/galaxy-watch6/,,,,Violation
17188,https://dantri.com.vn/lao-dong-viec-lam/chuyen...,https://dantri.com.vn/lao-dong-viec-lam/chuyen...,https://googleads.g.doubleclick.net/pagead/adv...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('urlencode', '/lao-dong-viec-lam/chuyen-nghe...",referer_leaks,dantri.com.vn,/lao-dong-viec-lam/chuyen-nghe.htm,,,,Violation
17723,https://www.wikihow.com/make-christmas-come-fa...,https://www.wikihow.com/Make-Christmas-Come-Fa...,https://googleads.g.doubleclick.net/pagead/adv...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('www.wikihow.com',), ('urlencode', 'www.wiki...",referer_leaks,www.wikihow.com,/Make-Christmas-Come-Faster,,,,Violation
18767,https://weathernews.jp/onebox/radar/?fm=dotop&...,https://weathernews.jp/onebox/radar/?fm=dotop&...,https://googleads.g.doubleclick.net/pagead/adv...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('weathernews.jp',), ('urlencode', 'weatherne...",referer_leaks,weathernews.jp,/onebox/radar/,fm=dotop&fmdotop=1,,,Violation
20454,https://pets.udn.com/,https://pets.udn.com/pets/index,https://googleads.g.doubleclick.net/xbbe/pixel...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('urlencode', '/pets/index'), ('pets.udn.com'...",referer_leaks,pets.udn.com,/pets/index,,,,Violation
23550,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/x6j13gk/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/x6j13gk/...,"[('www.investopedia.com',), ('urlencode', 'www...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
23564,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/x6j13gk/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/x6j13gk/...,"[('www.investopedia.com',), ('urlencode', 'www...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
23565,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/x6j13gk/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/x6j13gk/...,"[('www.investopedia.com',), ('urlencode', 'www...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
23566,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/x6j13gk/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/x6j13gk/...,"[('www.investopedia.com',), ('urlencode', 'www...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
23572,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/x6j13gk/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/x6j13gk/...,"[('www.investopedia.com',), ('urlencode', 'www...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation


In [22]:
# Count the referer-leak violations whose request URL contains "google"
referer_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('google', na=False)
].count()


init_url         45
final_url        45
req_url          45
req_method       45
ref_pol_data     45
ref_data         45
referer_leaks    45
leaks            45
netloc           45
path             45
query            45
params           45
fragments        45
violation        45
dtype: int64

### META / Facebook

In [23]:
# List the referer-leak violations whose request URL contains "meta" or "facebook".
# A single alternation pattern replaces the two OR-ed masks, and na=False treats a
# missing req_url as "no match" (consistent with the Google/TikTok cells) instead of
# producing a NaN mask that cannot be used for boolean indexing.
# NOTE(review): this is a substring match on the whole URL, not on the host, so
# e.g. "metadata" or "utm_source=facebook" inside another host's URL also matches.
referer_leaks_vioilation_df[
    referer_leaks_vioilation_df['req_url']
    .str
    .contains('meta|facebook', na=False)]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation


In [24]:
# Count the referer-leak violations whose request URL contains "meta" or "facebook".
# A single alternation pattern replaces the two OR-ed masks, and na=False treats a
# missing req_url as "no match" (consistent with the Google/TikTok cells) instead of
# producing a NaN mask that cannot be used for boolean indexing.
# NOTE(review): this is a substring match on the whole URL, not on the host, so
# e.g. "metadata" or "utm_source=facebook" inside another host's URL also matches.
referer_leaks_vioilation_df[
    referer_leaks_vioilation_df['req_url']
    .str
    .contains('meta|facebook', na=False)].count()

init_url         0
final_url        0
req_url          0
req_method       0
ref_pol_data     0
ref_data         0
referer_leaks    0
leaks            0
netloc           0
path             0
query            0
params           0
fragments        0
violation        0
dtype: int64

### Tiktok

In [25]:
# List the referer-leak violations whose request URL contains "tiktok"
referer_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('tiktok', na=False)
]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation


In [26]:
# Count the referer-leak violations whose request URL contains "tiktok"
referer_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('tiktok', na=False)
].count()


init_url         0
final_url        0
req_url          0
req_method       0
ref_pol_data     0
ref_data         0
referer_leaks    0
leaks            0
netloc           0
path             0
query            0
params           0
fragments        0
violation        0
dtype: int64

## Leaks on URL

In [27]:
# Create a new dataframe with a non-empty 'url_leaks' column
# (drop_non_leak keeps the rows related to the given leak kind).
url_leaks_df = drop_non_leak(extracted_df, 'url_leaks')
url_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://apikeys.civiccomputing.com/c/v?d=www.o...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.linkedin.com/px/li_sync?redirect=h...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks
...,...,...,...,...,...,...,...
110881,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://adservice.google.com/ddm/fls/i/dc_pre=...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks
110882,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://6015542.fls.doubleclick.net/ddm/fls/r/...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks
110883,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://syndication.twitter.com/i/jot/embeds?l...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks
110884,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google.com/pagead/1p-user-list/878...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks


In [28]:
# Expand the dataframe with the result of the check_urlparse function (one new
# column per returned value), then create a new 'violation' column with the
# result of the check_violation function for each row.
url_leaks_df[['netloc', 'path', 'query', 'params', 'fragments']] = url_leaks_df.apply(check_urlparse, axis=1, result_type='expand')

# NOTE(review): check_violation presumably reads the columns added above, so the
# order of these two assignments matters — confirm against its definition.
url_leaks_df['violation'] = url_leaks_df.apply(check_violation, axis=1, result_type='expand')

url_leaks_df


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks,www.ox.ac.uk,,,,,Safe
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks,www.ox.ac.uk,/research,,,,Violation
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks,www.ox.ac.uk,/research,,,,Violation
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://apikeys.civiccomputing.com/c/v?d=www.o...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks,www.ox.ac.uk,,,,,Safe
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.linkedin.com/px/li_sync?redirect=h...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks,www.ox.ac.uk,,,,,Safe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110881,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://adservice.google.com/ddm/fls/i/dc_pre=...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks,www.andapp.jp,,,,,Safe
110882,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://6015542.fls.doubleclick.net/ddm/fls/r/...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks,www.andapp.jp,,,,,Safe
110883,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://syndication.twitter.com/i/jot/embeds?l...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
110884,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google.com/pagead/1p-user-list/878...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation


In [29]:
# Check whether params or fragments leak in the URL (rows flagged "Warning")
url_leaks_df.loc[url_leaks_df['violation'].eq("Warning")]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation


In [30]:
# Keep only the URL-leak rows flagged as "Violation"
url_leaks_vioilation_df = url_leaks_df.loc[url_leaks_df['violation'].eq("Violation")]
url_leaks_vioilation_df


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks,www.ox.ac.uk,/research,,,,Violation
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks,www.ox.ac.uk,/research,,,,Violation
6,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks,www.ox.ac.uk,/research,,,,Violation
56,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/g/collect?v=2...,POST,origin,"[('ameblo.jp',), ('urlencode', 'ameblo.jp'), (...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
60,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ln.ameba.jp/v3/zBymKrvv?rd=7e93fce4-60...,GET,origin,"[('ameblo.jp',), ('urlencode', 'ameblo.jp'), (...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110875,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://yjtag.yahoo.co.jp/tag?site=k36tMgw&ref...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
110879,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.facebook.com/v10.0/plugins/like.ph...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,,,,Violation
110880,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://googleads.g.doubleclick.net/pagead/vie...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
110883,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://syndication.twitter.com/i/jot/embeds?l...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation


### Some of them may be false alarms

For example, a language setting can appear in the query string. The pattern is usually "lang=xx".

In [35]:
# URL-leak violations whose query string contains "lang" (candidate false alarms)
url_leaks_vioilation_df.loc[
    lambda frame: frame['query'].str.contains('lang', na=False)
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
7780,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://reliably-main-silkworm.pgsdemo.com/?la...,GET,strict-origin-when-cross-origin,"[('lang=en_us&referrerPageUrl=',)]",url_leaks,,,lang=en_us&referrerPageUrl=,,,Violation
7804,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://hero.kingpinkton.com/ct?id=26647&url=h...,GET,strict-origin-when-cross-origin,"[('urlencode', 'base64', 'store.netgear.com'),...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
7806,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://ct.pinterest.com/v3/?tid=2613809389803...,GET,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'lang=...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
7807,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://events.attentivemobile.com/e?v=4.25.43...,POST,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'lang=...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
7809,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://www.facebook.com/tr/?id=43154253371256...,GET,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'lang=...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88678,https://dzen.ru/news/story/vpodmoskove_nashli_...,https://dzen.ru/news/story/vpodmoskove_nashli_...,https://sso.passport.yandex.ru/push?uuid=f7d40...,GET,strict-origin-when-cross-origin,"[('dzen.ru',), ('urlencode', 'dzen.ru'), ('url...",url_leaks,dzen.ru,/news/story/vpodmoskove_nashli_telo_ehks-deput...,lang=ru&from=main_portal&fan=1&stid=wzlzk1zkik...,,,Violation
93422,https://dzen.ru/news/story/vnarodnom_sovete_ln...,https://dzen.ru/news/story/vnarodnom_sovete_ln...,https://sso.passport.yandex.ru/push?uuid=e9feb...,GET,strict-origin-when-cross-origin,"[('dzen.ru',), ('urlencode', 'dzen.ru'), ('url...",url_leaks,dzen.ru,/news/story/vnarodnom_sovete_lnr_podtverdili_g...,lang=ru&from=main_portal&fan=1&stid=ppm0ktlu-g...,,,Violation
101085,https://businesshelp.snapchat.com/,https://businesshelp.snapchat.com/s/?language=...,https://www.google-analytics.com/j/collect?v=1...,POST,origin-when-cross-origin,"[('businesshelp.snapchat.com',), ('urlencode',...",url_leaks,businesshelp.snapchat.com,/s/,language=en_US,,,Violation
102563,https://www.scorecardresearch.com/about.aspx?n...,https://www.scorecardresearch.com/about.aspx?n...,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('www.scorecardresearch.com',), ('urlencode',...",url_leaks,www.scorecardresearch.com,/about.aspx,newlanguage=1,,,Violation


In [None]:
# Write the URL-leak violations to a CSV file.
# print_leak_to_csv picks the output file from the leak column present in the
# frame; for a frame with 'url_leaks' it writes
# "data_seg/<extract_date>_<extract_type>_print_url_leaks.csv".

print_leak_to_csv(url_leaks_vioilation_df)

In [36]:
# Number of non-null values per column among the URL-leak violations
url_leaks_vioilation_df.count()

init_url        30658
final_url       30658
req_url         30658
req_method      30658
ref_pol_data    30658
url_leaks       30658
leaks           30658
netloc          30658
path            30658
query           30658
params          30658
fragments       30658
violation       30658
dtype: int64

In [37]:
# Break the URL-leak violations down by HTTP request method
url_leaks_vioilation_df['req_method'].value_counts()

GET        25127
POST        5370
OPTIONS      140
HEAD          21
Name: req_method, dtype: int64

In [38]:
# One row per distinct final_url (the first occurrence is kept)
url_leaks_vioilation_df.drop_duplicates(subset='final_url', keep='first')

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('urlencode', ...",url_leaks,www.ox.ac.uk,/research,,,,Violation
56,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/g/collect?v=2...,POST,origin,"[('ameblo.jp',), ('urlencode', 'ameblo.jp'), (...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
73,https://xmp.mobvista.com/cn,https://xmp.mobvista.com/cn,https://magnet.rayjump.com/pixel?campuuid=ss_x...,GET,strict-origin-when-cross-origin,"[('xmp.mobvista.com',), ('urlencode', 'xmp.mob...",url_leaks,xmp.mobvista.com,/cn,,,,Violation
77,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",url_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
94,https://email.nypost.com/israel-war-update,https://email.nypost.com/israel-war-update/?ut...,https://p1.parsely.com/px/?rand=1702099616458&...,GET,strict-origin-when-cross-origin,"[('urlencode', 'email.nypost.com'), ('urlencod...",url_leaks,email.nypost.com,/israel-war-update/,utm_source=facebook&utm_campaign=nypost&sr_sha...,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110624,https://www.finn.no/nettbil/velkommen?origin=f...,https://www.finn.no/nettbil?origin=frontpage_icon,https://static.finncdn.no/_c/nettbil-layout/_n...,GET,strict-origin-when-cross-origin,"[('/nettbil',)]",url_leaks,,/nettbil,,,,Violation
110661,https://www.washington.edu/news/?utm_source=wh...,https://www.washington.edu/news/?utm_source=wh...,https://www.googletagmanager.com/a?id=GTM-KQ6Q...,GET,strict-origin-when-cross-origin,"[('urlencode', '/news/'), ('www.washington.edu...",url_leaks,www.washington.edu,/news/,,,,Violation
110686,https://www.mcafee.com/en-us/consumer-corporat...,https://www.mcafee.com/en-us/consumer-corporat...,https://d2v83son8kay5v.cloudfront.net/1x1.gif?...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.mcafee.com'), ('urlencode'...",url_leaks,www.mcafee.com,/en-us/consumer-corporate/investors.html,,,,Violation
110701,https://actu.fr/politique,https://actu.fr/politique,https://trc.taboola.com/actufr-actufr/trc/3/js...,GET,strict-origin-when-cross-origin,"[('actu.fr',), ('urlencode', 'actu.fr'), ('url...",url_leaks,actu.fr,/politique,,,,Violation


### GOOGLE

In [39]:
# Count the URL-leak violations whose request URL contains "google"
url_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('google', na=False)
].count()


init_url        9765
final_url       9765
req_url         9765
req_method      9765
ref_pol_data    9765
url_leaks       9765
leaks           9765
netloc          9765
path            9765
query           9765
params          9765
fragments       9765
violation       9765
dtype: int64

In [40]:
# List the URL-leak violations whose request URL contains "google"
url_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('google', na=False)
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
56,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/g/collect?v=2...,POST,origin,"[('ameblo.jp',), ('urlencode', 'ameblo.jp'), (...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
61,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/j/collect?v=1...,POST,origin,"[('ameblo.jp',), ('urlencode', 'ameblo.jp'), (...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
74,https://xmp.mobvista.com/cn,https://xmp.mobvista.com/cn,https://www.google-analytics.com/g/collect?v=2...,POST,strict-origin-when-cross-origin,"[('xmp.mobvista.com',), ('urlencode', 'xmp.mob...",url_leaks,xmp.mobvista.com,/cn,,,,Violation
76,https://xmp.mobvista.com/cn,https://xmp.mobvista.com/cn,https://www.google-analytics.com/g/collect?v=2...,POST,strict-origin-when-cross-origin,"[('xmp.mobvista.com',), ('urlencode', 'xmp.mob...",url_leaks,xmp.mobvista.com,/cn,,,,Violation
77,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",url_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110828,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://analytics.google.com/g/collect?v=2&tid...,POST,strict-origin-when-cross-origin,"[('urlencode', '/products/docker-scout/'), ('w...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110868,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
110870,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google-analytics.com/g/collect?v=2...,POST,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
110880,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://googleads.g.doubleclick.net/pagead/vie...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('urlencode',...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation


### Facebook or Meta

In [41]:
# Count the URL-leak violations whose request URL contains "meta" or "facebook".
# A single alternation pattern replaces the two OR-ed masks, and na=False treats a
# missing req_url as "no match" (consistent with the Google/TikTok cells) instead of
# producing a NaN mask that cannot be used for boolean indexing.
# NOTE(review): this matches the substring anywhere in req_url, not just in the
# host, so requests to other hosts that merely carry "facebook"/"meta" in their
# path or query string are counted as well.
url_leaks_vioilation_df[url_leaks_vioilation_df['req_url']
                        .str.contains('meta|facebook', na=False)].count()

init_url        1557
final_url       1557
req_url         1557
req_method      1557
ref_pol_data    1557
url_leaks       1557
leaks           1557
netloc          1557
path            1557
query           1557
params          1557
fragments       1557
violation       1557
dtype: int64

In [42]:
# List the URL-leak violations whose request URL contains "meta" or "facebook".
# A single alternation pattern replaces the two OR-ed masks, and na=False treats a
# missing req_url as "no match" (consistent with the Google/TikTok cells) instead of
# producing a NaN mask that cannot be used for boolean indexing.
# NOTE(review): this matches the substring anywhere in req_url, not just in the
# host, so requests to other hosts that merely carry "facebook"/"meta" in their
# path or query string are listed as well.
url_leaks_vioilation_df[
    url_leaks_vioilation_df['req_url']
    .str.contains('meta|facebook', na=False)]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
94,https://email.nypost.com/israel-war-update,https://email.nypost.com/israel-war-update/?ut...,https://p1.parsely.com/px/?rand=1702099616458&...,GET,strict-origin-when-cross-origin,"[('urlencode', 'email.nypost.com'), ('urlencod...",url_leaks,email.nypost.com,/israel-war-update/,utm_source=facebook&utm_campaign=nypost&sr_sha...,,,Violation
95,https://email.nypost.com/israel-war-update,https://email.nypost.com/israel-war-update/?ut...,https://tr.outbrain.com/unifiedPixel?optOut=fa...,POST,strict-origin-when-cross-origin,"[('urlencode', 'email.nypost.com'), ('urlencod...",url_leaks,email.nypost.com,/israel-war-update/,utm_source=facebook&utm_campaign=nypost&sr_sha...,,,Violation
96,https://email.nypost.com/israel-war-update,https://email.nypost.com/israel-war-update/?ut...,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', 'email.nypost.com'), ('urlencod...",url_leaks,email.nypost.com,/israel-war-update/,utm_source=facebook&utm_campaign=nypost&sr_sha...,,,Violation
97,https://email.nypost.com/israel-war-update,https://email.nypost.com/israel-war-update/?ut...,https://www.google-analytics.com/collect?v=1&_...,GET,strict-origin-when-cross-origin,"[('urlencode', 'email.nypost.com'), ('urlencod...",url_leaks,email.nypost.com,/israel-war-update/,utm_source=facebook&utm_campaign=nypost&sr_sha...,,,Violation
114,https://stars.udn.com/star/story/10092/7623509...,https://stars.udn.com/star/story/10092/7623509...,https://www.facebook.com/tr/?id=20291620972956...,GET,strict-origin-when-cross-origin,"[('stars.udn.com',), ('urlencode', 'stars.udn....",url_leaks,stars.udn.com,/star/story/10092/7623509,from=redpush,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110703,https://actu.fr/politique,https://actu.fr/politique,https://www.facebook.com/tr/?id=18311472367086...,GET,strict-origin-when-cross-origin,"[('actu.fr',), ('urlencode', 'actu.fr'), ('url...",url_leaks,actu.fr,/politique,,,,Violation
110705,https://actu.fr/politique,https://actu.fr/politique,https://www.facebook.com/tr/?id=64407153621134...,GET,strict-origin-when-cross-origin,"[('actu.fr',), ('urlencode', 'actu.fr'), ('url...",url_leaks,actu.fr,/politique,,,,Violation
110757,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://www.facebook.com/tr/?id=16466388623210...,GET,strict-origin-when-cross-origin,"[('urlencode', '/products/docker-scout/'), ('w...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110827,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://www.facebook.com/tr/?id=16466388623210...,GET,strict-origin-when-cross-origin,"[('urlencode', '/products/docker-scout/'), ('w...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation


### Tiktok

In [43]:
# List the URL-leak violations whose request URL contains "tiktok"
url_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('tiktok', na=False)
]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
4272,https://time.com/6342806/person-of-the-year-20...,https://time.com/6342806/person-of-the-year-20...,https://securepubads.g.doubleclick.net/gampad/...,GET,strict-origin-when-cross-origin,"[('urlencode', '/6342806/person-of-the-year-20...",url_leaks,time.com,/6342806/person-of-the-year-2023-taylor-swift/,,,,Violation
4295,https://www.adjust.com/resources/ebooks/skadne...,https://www.adjust.com/resources/ebooks/skadne...,https://pi.pardot.com/analytics?ver=3&pi_form=...,GET,strict-origin-when-cross-origin,"[('urlencode', '/resources/ebooks/skadnetwork-...",url_leaks,www.adjust.com,/resources/ebooks/skadnetwork-4-ios/,,,,Violation
4298,https://www.adjust.com/resources/ebooks/skadne...,https://www.adjust.com/resources/ebooks/skadne...,https://pi.pardot.com/analytics?ver=3&pi_form=...,GET,strict-origin-when-cross-origin,"[('urlencode', '/resources/ebooks/skadnetwork-...",url_leaks,www.adjust.com,/resources/ebooks/skadnetwork-4-ios/,,,,Violation
5176,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://widgets.outbrain.com/widgetMonitor/mon...,GET,strict-origin-when-cross-origin,"[('www.huffpost.com',), ('urlencode', 'www.huf...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
5178,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://widgetmonitor.outbrain.com/WidgetError...,GET,strict-origin-when-cross-origin,"[('www.huffpost.com',), ('urlencode', 'www.huf...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
5199,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://widgetmonitor.outbrain.com/WidgetError...,GET,strict-origin-when-cross-origin,"[('www.huffpost.com',), ('urlencode', 'www.huf...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
5219,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://ssum-sec.casalemedia.com/usermatch?us_...,GET,strict-origin-when-cross-origin,"[('www.huffpost.com',), ('urlencode', 'www.huf...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
5400,https://www.nike.com/w/member-sale-9t7gt,https://www.nike.com/w/member-sale-9t7gt,https://newrelicstream.adtech-prod.nikecloud.c...,GET,strict-origin-when-cross-origin,"[('/w/member-sale-9t7gt',), ('www.nike.com',),...",url_leaks,www.nike.com,/w/member-sale-9t7gt,,,,Violation
8361,https://www.healthline.com/fitness,https://www.healthline.com/fitness,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', '/fitness'), ('www.healthline.c...",url_leaks,www.healthline.com,/fitness,,,,Violation
9949,https://www.nike.com/w/classics-collection-aj8oq,https://www.nike.com/w/classics-collection-aj8oq,https://newrelicstream.adtech-prod.nikecloud.c...,GET,strict-origin-when-cross-origin,"[('urlencode', '/w/classics-collection-aj8oq')...",url_leaks,www.nike.com,/w/classics-collection-aj8oq,,,,Violation


## Leaks on Post Data

In [44]:
# Create a new dataframe with a non-empty 'post_leaks' column
# (drop_non_leak keeps the rows related to the given leak kind).
post_leaks_df = drop_non_leak(extracted_df, 'post_leaks')
post_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks
5,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""MJNz9_M3kFg"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks
7,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""ogkc1FmfiuI"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks
8,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/next?key=A...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""rem...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks
13,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks
15,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks
...,...,...,...,...,...,...,...,...
110830,https://shop.wired.com/,https://shop.wired.com/,https://monorail-edge.shopifysvc.com/v1/produce,POST,strict-origin-when-cross-origin,"{""schema_id"":""online_store_media_video_created...","[('shop.wired.com',)]",post_leaks
110838,https://shop.wired.com/,https://shop.wired.com/,https://events.privy.com/v2/collect,POST,strict-origin-when-cross-origin,"{""event"":""new-session"",""properties"":{""referrin...","[('shop.wired.com',)]",post_leaks
110844,https://shop.wired.com/,https://shop.wired.com/,https://a.ad.gt/api/v1/collect,POST,strict-origin-when-cross-origin,eyJjYXRlZ29yeSI6InBhZ2VWaWV3IiwidmVyc2lvbiI6In...,"[('base64', 'shop.wired.com')]",post_leaks
110885,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.andapp.jp',)]",post_leaks - referer_leaks


In [45]:
# Expand the dataframe with the result of the check_urlparse function (one new
# column per returned value), then create a new 'violation' column with the
# result of the check_violation function for each row.
post_leaks_df[['netloc', 'path', 'query', 'params', 'fragments']] = post_leaks_df.apply(check_urlparse, axis=1, result_type='expand')

# NOTE(review): check_violation presumably reads the columns added above, so the
# order of these two assignments matters — confirm against its definition.
post_leaks_df['violation'] = post_leaks_df.apply(check_violation, axis=1, result_type='expand')

post_leaks_df


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
5,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""MJNz9_M3kFg"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks,www.ox.ac.uk,,,,,Safe
7,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""ogkc1FmfiuI"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks,www.ox.ac.uk,,,,,Safe
8,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/next?key=A...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""rem...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks,www.ox.ac.uk,,,,,Safe
13,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks,www.ox.ac.uk,,,,,Safe
15,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks,www.ox.ac.uk,,,,,Safe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110830,https://shop.wired.com/,https://shop.wired.com/,https://monorail-edge.shopifysvc.com/v1/produce,POST,strict-origin-when-cross-origin,"{""schema_id"":""online_store_media_video_created...","[('shop.wired.com',)]",post_leaks,shop.wired.com,,,,,Safe
110838,https://shop.wired.com/,https://shop.wired.com/,https://events.privy.com/v2/collect,POST,strict-origin-when-cross-origin,"{""event"":""new-session"",""properties"":{""referrin...","[('shop.wired.com',)]",post_leaks,shop.wired.com,,,,,Safe
110844,https://shop.wired.com/,https://shop.wired.com/,https://a.ad.gt/api/v1/collect,POST,strict-origin-when-cross-origin,eyJjYXRlZ29yeSI6InBhZ2VWaWV3IiwidmVyc2lvbiI6In...,"[('base64', 'shop.wired.com')]",post_leaks,shop.wired.com,,,,,Safe
110885,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.andapp.jp',)]",post_leaks - referer_leaks,www.andapp.jp,,,,,Safe


In [46]:
# Check whether the params or fragments leak on post:
# show the rows whose 'violation' value is "Warning".
post_leaks_df.loc[post_leaks_df['violation'].eq("Warning")]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation


In [47]:
# Keep only the rows whose 'violation' value is "Violation", then display them.
post_leaks_vioilation_df = post_leaks_df.loc[
    post_leaks_df['violation'].eq("Violation")
]
post_leaks_vioilation_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
16,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/wa/,POST,strict-origin-when-cross-origin,"{""pids"":[9293],""scriptVersion"":1029,""time"":170...","[('/research',)]",post_leaks,,/research,,,,Violation
62,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://id5-sync.com/gm/v3,POST,origin,"{""requests"":[{""requestId"":""39a51fe1-6803-467a-...","[('ameblo.jp',), ('/borderfreecosmetics/entry-...",post_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
78,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://htlb.casalemedia.com/openrtb/pbjs?s=23...,POST,strict-origin-when-cross-origin,"{""id"":""17daa0cecd9fec"",""site"":{""page"":""https:/...","[('www.marktplaats.nl',), ('/cp/621/kleding-da...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
81,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://pre.ads.justpremium.com/v/2.0/t/xhr?i=...,POST,strict-origin-when-cross-origin,"{""zone"":[148381],""referer"":""https://www.marktp...","[('www.marktplaats.nl',), ('/cp/621/kleding-da...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
82,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://ib.adnxs-simple.com/ut/v3/prebid,POST,strict-origin-when-cross-origin,"{""tags"":[{""sizes"":[{""width"":728,""height"":90},{...","[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110815,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://bootstrap.api.drift.com/widget_bootstr...,POST,strict-origin-when-cross-origin,ping_context=%7B%22embedId%22%3A%22p4d2wxp9n4g...,"[('urlencode', '/products/docker-scout/'), ('w...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110817,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://bootstrap.api.drift.com/widget_bootstrap,POST,strict-origin-when-cross-origin,embed_id=p4d2wxp9n4gk&client_id=f6zuizdyhxrm7r...,"[('urlencode', '/products/docker-scout/'), ('w...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110819,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://event.api.drift.com/track,POST,strict-origin-when-cross-origin,"{""orgId"":5090900,""inboxId"":732344,""userId"":nul...","[('/products/docker-scout/',), ('www.docker.co...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation
110820,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://targeting.api.drift.com/targeting/eval...,POST,strict-origin-when-cross-origin,"{""conditionGroups"":[{""status"":""EVALUATED"",""mat...","[('/products/docker-scout/',), ('www.docker.co...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation


In [None]:
# print to csv

# Convert every post_data value to a string and truncate it to at most
# MAX_POST_DATA_CHARS characters before the dataframe is written out.
MAX_POST_DATA_CHARS = 2048

# post_leaks_vioilation_df was produced by filtering post_leaks_df, so
# assigning a column on it in place triggers pandas' SettingWithCopyWarning.
# `assign` builds a new dataframe instead, which is then rebound to the
# same name.
post_leaks_vioilation_df = post_leaks_vioilation_df.assign(
    post_data=post_leaks_vioilation_df['post_data'].map(
        lambda value: str(value)[:MAX_POST_DATA_CHARS]
    )
)
print_leak_to_csv(post_leaks_vioilation_df)

In [48]:
# Number of non-missing values in each column of the violation rows.
post_leaks_vioilation_df.notna().sum()

init_url        9843
final_url       9843
req_url         9843
req_method      9843
ref_pol_data    9843
post_data       9843
post_leaks      9843
leaks           9843
netloc          9843
path            9843
query           9843
params          9843
fragments       9843
violation       9843
dtype: int64

In [49]:
# How often each HTTP request method occurs among the violation rows.
post_leaks_vioilation_df['req_method'].value_counts()

POST    9841
PUT        2
Name: req_method, dtype: int64

### Leaks to Google, Meta (Facebook), and TikTok

In [50]:
# Violation rows whose request URL contains the substring 'google'
# (a missing req_url counts as "no match").
post_leaks_vioilation_df.loc[
    post_leaks_vioilation_df['req_url'].str.contains('google', na=False)
]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
84,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=2133699423&t=event&_s=2&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
85,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=2133699423&t=event&_s=3&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
86,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=2133699423&t=event&_s=4&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
88,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=2133699423&t=event&_s=5&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
89,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=2133699423&t=event&_s=6&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107625,https://www.hurriyet.com.tr/ekonomi/borsa-ista...,https://www.hurriyet.com.tr/ekonomi/borsa-ista...,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1722949931&t=event&ni=1&_s=1&dl=...,"[('urlencode', '/ekonomi/borsa-istanbulda-seyi...",post_leaks,www.hurriyet.com.tr,/ekonomi/borsa-istanbulda-seyir-ne-olacak-uzma...,,,,Violation
108676,https://www.gismeteo.ru/weather-hayward-execut...,https://www.gismeteo.ru/weather-hayward-execut...,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&aip=0&a=147209700&t=timing&ds=desk...,"[('www.gismeteo.ru',), ('urlencode', '/weather...",post_leaks,www.gismeteo.ru,/weather-hayward-executive-28925/,,,,Violation
109393,https://wordpress.com/start/?ref=logged-out-ho...,https://wordpress.com/start/user-social?ref=lo...,https://www.google-analytics.com/collect,POST,origin,v=1&_v=j101&aip=1&a=408908049&t=pageview&_s=2&...,"[('urlencode', '/start/user-social'), ('urlenc...",post_leaks,wordpress.com,/start/user-social,ref=logged-out-homepage-lp,,,Violation
110154,https://www.weebly.com/websites,https://www.weebly.com/websites,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1578457713&t=pageview&_s=1&dl=ht...,"[('urlencode', '/websites'), ('www.weebly.com',)]",post_leaks,www.weebly.com,/websites,,,,Violation


In [53]:
# Per-column count of the violation rows whose request URL contains 'google'
# (a missing req_url counts as "no match").
post_leaks_vioilation_df.loc[
    post_leaks_vioilation_df['req_url'].str.contains('google', na=False)
].count()

init_url        540
final_url       540
req_url         540
req_method      540
ref_pol_data    540
post_data       540
post_leaks      540
leaks           540
netloc          540
path            540
query           540
params          540
fragments       540
violation       540
dtype: int64

In [51]:
# list of post data leaks to Meta or Facebook
# na=False treats a missing req_url as "no match", so the combined mask is
# always boolean and safe to index with (the google and tiktok filters also
# map missing values to False).
# NOTE(review): this is a plain substring match on the whole URL, so 'meta'
# also selects unrelated hosts such as api-gw.metadata.io -- consider
# matching on the request hostname instead.
post_leaks_vioilation_df[
    post_leaks_vioilation_df['req_url'].str.contains('meta', na=False)
    | post_leaks_vioilation_df['req_url'].str.contains('facebook', na=False)
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
1950,https://www.samsung.com/us/watches/galaxy-watch6,https://www.samsung.com/us/watches/galaxy-watch6/,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=1049256285582240&ev=Microdata&dl=https%3A%2...,"[('www.samsung.com',), ('urlencode', '/us/watc...",post_leaks,www.samsung.com,/us/watches/galaxy-watch6/,,,,Violation
2819,https://www.oracle.com/cx/advertising/measurem...,https://www.oracle.com/cx/advertising/measurem...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=704367189971874&ev=PageView&dl=https%3A%2F%...,"[('urlencode', 'urlencode', '/cx/advertising/m...",post_leaks,www.oracle.com,/cx/advertising/measurement/publishers-platforms/,,,,Violation
7812,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=431542533712561&ev=Microdata&dl=https%3A%2F...,"[('urlencode', 'lang=en_us&referrerPageUrl='),...",post_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
9191,https://www.samsung.com/us/mobile/audio/headph...,https://www.samsung.com/us/mobile/audio/headph...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=1049256285582240&ev=Microdata&dl=https%3A%2...,"[('www.samsung.com',), ('urlencode', '/us/mobi...",post_leaks,www.samsung.com,/us/mobile/audio/headphones/galaxy-buds2-pro-b...,,,,Violation
19975,https://www.appsflyer.com/solutions/entertainm...,https://www.appsflyer.com/solutions/entertainm...,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.appsflyer.com/solutions/en...","[('/solutions/entertainment-music/',), ('www.a...",post_leaks,www.appsflyer.com,/solutions/entertainment-music/,,,,Violation
24447,https://www.pendo.io/pricing,https://www.pendo.io/pricing/,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/pricing/"",""url_re...","[('www.pendo.io',), ('/pricing/',)]",post_leaks,www.pendo.io,/pricing/,,,,Violation
24448,https://www.pendo.io/pricing,https://www.pendo.io/pricing/,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/pricing/"",""url_re...","[('www.pendo.io',), ('/pricing/',)]",post_leaks,www.pendo.io,/pricing/,,,,Violation
26350,https://explore.zoom.us/en/ai-assistant,https://www.zoom.com/en/ai-assistant/,https://api-gw.metadata.io/traffic,POST,origin-when-cross-origin,"{""url"":""https://www.zoom.com/en/ai-assistant/""...","[('www.zoom.com',), ('/en/ai-assistant/',)]",post_leaks,www.zoom.com,/en/ai-assistant/,,,,Violation
27331,https://www.nikkei.com/article/dgxzqoua067j10w...,https://www.nikkei.com/article/dgxzqoua067j10w...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=517132292271830&ev=Microdata&dl=https%3A%2F...,"[('urlencode', '/article/dgxzqoua067j10w3a201c...",post_leaks,www.nikkei.com,/article/dgxzqoua067j10w3a201c2000000/,,,,Violation
27505,https://www.pendo.io/ai-for-product-management...,https://www.pendo.io/ai-for-product-management...,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/ai-for-product-ma...","[('/ai-for-product-management-course/',), ('ww...",post_leaks,www.pendo.io,/ai-for-product-management-course/,,,,Violation


In [54]:
# Per-column count of the violation rows whose request URL contains 'meta'
# or 'facebook'.
# na=False treats a missing req_url as "no match", so the combined mask is
# always boolean and safe to index with.
# NOTE(review): 'meta' is a plain substring match, so hosts such as
# api-gw.metadata.io are included in this count -- consider matching on the
# request hostname instead.
post_leaks_vioilation_df[
    post_leaks_vioilation_df['req_url'].str.contains('meta', na=False)
    | post_leaks_vioilation_df['req_url'].str.contains('facebook', na=False)
].count()

init_url        32
final_url       32
req_url         32
req_method      32
ref_pol_data    32
post_data       32
post_leaks      32
leaks           32
netloc          32
path            32
query           32
params          32
fragments       32
violation       32
dtype: int64

In [52]:
# Violation rows whose request URL contains the substring 'tiktok'
# (a missing req_url counts as "no match").
post_leaks_vioilation_df.loc[
    post_leaks_vioilation_df['req_url'].str.contains('tiktok', na=False)
]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
277,https://www.life360.com/driving-safety,https://www.life360.com/driving-safety/,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('www.life360.com',), ('/driving-safety/',)]",post_leaks,www.life360.com,/driving-safety/,,,,Violation
280,https://www.life360.com/driving-safety,https://www.life360.com/driving-safety/,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1702157632999-5894138...","[('www.life360.com',), ('/driving-safety/',)]",post_leaks,www.life360.com,/driving-safety/,,,,Violation
296,https://www.change.org/member?source_location=...,https://www.change.org/s/member?source_locatio...,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('/s/member',), ('www.change.org',), ('source...",post_leaks,www.change.org,/s/member,source_location=member_link_header,,,Violation
297,https://www.change.org/member?source_location=...,https://www.change.org/s/member?source_locatio...,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1702115796023-7889799...","[('/s/member',), ('www.change.org',), ('urlenc...",post_leaks,www.change.org,/s/member,source_location=member_link_header,,,Violation
399,https://app.life360.com/login,https://app.life360.com/login?web_loc=login,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('app.life360.com',), ('/login',)]",post_leaks,app.life360.com,/login,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108640,https://www.eventbrite.com/b/local/health,https://www.eventbrite.com/d/ca--santa-clara/e...,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('/d/ca--santa-clara/events/',), ('www.eventb...",post_leaks,www.eventbrite.com,/d/ca--santa-clara/events/,,,,Violation
108645,https://www.eventbrite.com/b/local/health,https://www.eventbrite.com/d/ca--santa-clara/e...,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1702083281103-2993420...","[('urlencode', 'www.eventbrite.com'), ('/d/ca-...",post_leaks,www.eventbrite.com,/d/ca--santa-clara/events/,,,,Violation
109378,https://www.discogs.com/group/thread/963554?ev...,https://www.discogs.com/group/thread/963554?ev...,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('www.discogs.com',), ('/group/thread/963554'...",post_leaks,www.discogs.com,/group/thread/963554,ev=em_bt,,,Violation
109379,https://www.discogs.com/group/thread/963554?ev...,https://www.discogs.com/group/thread/963554?ev...,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1702158357927-5548643...","[('www.discogs.com',), ('/group/thread/963554'...",post_leaks,www.discogs.com,/group/thread/963554,ev=em_bt,,,Violation


In [55]:
# Per-column count of the violation rows whose request URL contains 'tiktok'
# (a missing req_url counts as "no match").
post_leaks_vioilation_df.loc[
    post_leaks_vioilation_df['req_url'].str.contains('tiktok', na=False)
].count()


init_url        435
final_url       435
req_url         435
req_method      435
ref_pol_data    435
post_data       435
post_leaks      435
leaks           435
netloc          435
path            435
query           435
params          435
fragments       435
violation       435
dtype: int64