# Simple Analyzer for Leak Detector
## Version: No Response Given to the CMPs

In [None]:
import pandas as pd
import re
import os
from urllib.parse import urlparse

In [None]:
def print_leak_to_csv(df):
    """
    Write the dataframe to a CSV file named after the first leak column it has.

    The columns 'url_leaks', 'referer_leaks' and 'post_leaks' are looked up in
    that order. Only the first one found decides the file name, so the
    dataframe is written at most once. Nothing is written when none of the
    three columns is present.

    The file name is built from the notebook-level variables ``extract_date``
    and ``extract_type`` and is placed under ``data_seg/``.

    :param df: dataframe

    :return: whatever ``DataFrame.to_csv`` returns for the written file, or
        None when no leak column is present
    """
    for leak_column in ('url_leaks', 'referer_leaks', 'post_leaks'):
        if leak_column in df:
            target = ("data_seg/" + extract_date + '_' + extract_type
                      + '_print_' + leak_column + '.csv')
            return df.to_csv(target)
    return None



In [None]:
def extract_leak(data, leak):
    """
    Keep only the rows whose 'leaks' label mentions the given leak type.

    The 'leaks' column is searched with ``Series.str.contains``, so combined
    labels (for example 'post_leaks - referer_leaks') match as well. Rows
    with a missing 'leaks' value are left out.

    :param dataframe data: input dataframe with a 'leaks' column
    :param str leak: leak type to look for, e.g. 'referer_leaks'

    :return: dataframe holding the matching rows
    """
    has_leak = data['leaks'].str.contains(leak, na=False)
    return data[has_leak]

In [None]:
def drop_non_leak(data, leak):
    """
    Select the rows for one leak type and remove the columns of the others.

    Rows are selected with ``extract_leak``; the columns removed afterwards
    depend on the leak type:

    * 'url_leaks': post_leaks, ref_data, referer_leaks, post_data
    * 'referer_leaks': url_leaks, post_leaks, post_data
    * 'post_leaks': url_leaks, referer_leaks, ref_data

    :param data: dataframe
    :param leak: leak type ('url_leaks', 'referer_leaks' or 'post_leaks')

    :return: dataframe reduced to the rows and columns of that leak type, or
        None when ``leak`` is not one of the three known types
    """
    unrelated_columns = {
        'url_leaks': ['post_leaks', 'ref_data', 'referer_leaks', 'post_data'],
        'referer_leaks': ['url_leaks', 'post_leaks', 'post_data'],
        'post_leaks': ['url_leaks', 'referer_leaks', 'ref_data'],
    }
    matching_rows = extract_leak(data, leak)
    if leak not in unrelated_columns:
        return None
    return matching_rows.drop(unrelated_columns[leak], axis=1)


In [None]:
def check_urlparse(row):
    """Report which parts of ``final_url`` show up in the row's leak column.

    The first of 'referer_leaks', 'url_leaks' and 'post_leaks' that exists in
    the row is used. Its text (the string form of a list of tuples) is split
    on single quotes, and the resulting pieces are compared with the netloc,
    path, params, query and fragment of the parsed ``final_url``.

    :param row: row of dataframe; must provide 'final_url' and normally one of
        the leak columns

    :return: tuple ``(netloc, path, query, params, fragments)``; each entry is
        the URL component when it was found among the leaked values and an
        empty string otherwise
    """
    parsed_url = urlparse(row['final_url'])

    # Only the first leak column present is inspected, in this priority order.
    leak_text = ''
    for column in ('referer_leaks', 'url_leaks', 'post_leaks'):
        if column in row:
            leak_text = row[column]
            break
    # A missing column or a non-text cell (e.g. NaN from an empty CSV field)
    # is treated as "nothing leaked" instead of raising.
    if not isinstance(leak_text, str):
        leak_text = ''

    # Pieces that are only list/tuple punctuation or placeholders; they are
    # never accepted as a leaked value, which also keeps empty URL components
    # from matching.
    undesired = {
        '[',
        ']',
        ',',
        'None',
        '',
        '[(',
        ')]',
        ',)]',
        '), (',
        ', ',
    }
    leaked_values = {
        item for item in leak_text.split("'") if item not in undesired
    }

    def _if_leaked(component):
        # Echo the component back only when it is one of the leaked values.
        return component if component in leaked_values else ''

    return (
        _if_leaked(parsed_url.netloc),
        _if_leaked(parsed_url.path),
        _if_leaked(parsed_url.query),
        # params and fragment should not be included in a referer
        _if_leaked(parsed_url.params),
        _if_leaked(parsed_url.fragment),
    )

In [None]:
def check_violation(row):
    """
    Classify a row's leaked URL parts against its referrer policy.

    :param dataframe row: a dataframe row that contains the "ref_pol_data"
    column (the referrer policy) together with the "netloc", "path", "query",
    "params" and "fragments" columns describing what leaked

    :return str: "Warning" if params or fragments leaked (checked before the
    policy), "Violation" if the leaked parts are not permitted by the policy,
    "Safe" if they are, and "Blank" if the policy is not one of the known
    values
    """
    ref_pol = row['ref_pol_data']

    # params/fragments are flagged regardless of the policy in force
    if row['params'] or row['fragments']:
        return "Warning"

    if ref_pol in ('no-referrer', 'same-origin'):
        # any leaked part counts as a violation
        forbidden_parts = ('netloc', 'path', 'query')
    elif ref_pol in ('origin',
                     'origin-when-cross-origin',
                     'strict-origin',
                     'strict-origin-when-cross-origin'):
        # the host may be sent, anything beyond it may not
        forbidden_parts = ('path', 'query')
    elif ref_pol in ('no-referrer-when-downgrade', 'unsafe-url'):
        # these policies are always reported as safe
        forbidden_parts = ()
    else:
        return "Blank"

    if any(row[part] for part in forbidden_parts):
        return "Violation"
    return "Safe"

## Convert CSV to Dataframe

In [None]:
csv_file = './data_raw/2023-12-07_non_leaks_raw.csv'
# csv_file = './data_raw/2023-11-29_optIn_leaks_raw.csv'
# csv_file = './data_raw/test.csv'

# The input files above are named "<date>_<type>_leaks_raw.csv". Parse the
# file name only, not the whole path, so the result does not depend on how
# many underscores the directory names contain.
csv_name = os.path.basename(csv_file)
name_parts = csv_name.split("_")
# Names without a type segment (e.g. test.csv) get a placeholder instead of
# failing with an IndexError.
extract_type = name_parts[1] if len(name_parts) > 1 else 'unknown'

# Text between a "20" and the following "_" (e.g. "23-12-07" for
# "2023-12-07_..."); empty when the name holds no such date.
extract_date = re.findall(r'20(.*?)_', csv_name)
extract_date = ''.join(extract_date)  # Convert the list to a string

# Output directory used by the CSV exports further down.
os.makedirs("data_seg", exist_ok=True)

extracted_df = pd.read_csv(csv_file)
extracted_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,post_data,url_leaks,post_leaks,referer_leaks,leaks
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",[],[],url_leaks
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', '/research'), ('urlencode', 'ww...",[],[],url_leaks
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', '/research'), ('urlencode', 'ww...",[],[],url_leaks
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://apikeys.civiccomputing.com/c/v?d=www.o...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",[],[],url_leaks
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.linkedin.com/px/li_sync?redirect=h...,GET,strict-origin-when-cross-origin,https://www.ox.ac.uk/,,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",[],[],url_leaks
...,...,...,...,...,...,...,...,...,...,...,...
103296,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://6015542.fls.doubleclick.net/ddm/fls/r/...,GET,strict-origin-when-cross-origin,https://6015542.fls.doubleclick.net/,,"[('www.andapp.jp',)]",[],[],url_leaks
103297,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google.com/pagead/1p-user-list/878...,GET,strict-origin-when-cross-origin,https://www.andapp.jp/,,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",[],[],url_leaks
103298,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://aw.dw.impact-ad.jp/ut/rep?u=2714&v=2&r...,GET,strict-origin-when-cross-origin,https://6015542.fls.doubleclick.net/,,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",[],[],url_leaks
103299,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...",[],"[('www.andapp.jp',)]","[('www.andapp.jp',), ('urlencode', 'www.andapp...",post_leaks - referer_leaks


## Leaks on Referer

We still need to compare these leaks with the referrer policy to make the result accurate.

In [None]:
# Keep the rows whose 'leaks' label mentions 'referer_leaks' and drop the
# url/post leak columns that are not needed for the referer analysis
referer_leaks_df = drop_non_leak(extracted_df, 'referer_leaks')
referer_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks
5,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
6,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
7,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
20,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
21,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks
...,...,...,...,...,...,...,...,...
103287,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks
103288,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks
103289,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks
103291,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/generate_204?0fnMlA,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks


In [None]:
# rows whose "leaks" label is not exactly "referer_leaks", i.e. rows that
# also leaked through another channel
referer_leaks_df.loc[referer_leaks_df['leaks'].ne("referer_leaks")]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks
20,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
28,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/ogkc1FmfiuI?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
30,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/MJNz9_M3kFg?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
32,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/api/stats/qoe?fmt=247&...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks - referer_leaks
35,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/next?key=A...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks
...,...,...,...,...,...,...,...,...
103003,https://mashable.com/entertainment,https://mashable.com/entertainment,https://gum.criteo.com/sid/json?origin=publish...,GET,strict-origin-when-cross-origin,https://gum.criteo.com/syncframe?origin=publis...,"[('mashable.com',), ('urlencode', 'mashable.co...",url_leaks - referer_leaks
103006,https://mashable.com/entertainment,https://mashable.com/entertainment,https://stags.bluekai.com/site/30629?ret=html&...,GET,no-referrer-when-downgrade,https://mashable.com/entertainment,"[('/entertainment',)]",url_leaks - referer_leaks
103013,https://mashable.com/entertainment,https://mashable.com/entertainment,https://ping.chartbeat.net/ping?h=mashable.com...,GET,no-referrer-when-downgrade,https://mashable.com/entertainment,"[('/entertainment',)]",url_leaks - referer_leaks
103113,https://www.mcafee.com/en-us/consumer-corporat...,https://www.mcafee.com/en-us/consumer-corporat...,https://adobedc.demdex.net/ee/v1/interact?conf...,POST,no-referrer-when-downgrade,https://www.mcafee.com/en-us/consumer-corporat...,"[('/en-us/consumer-corporate/investors.html',)]",post_leaks - referer_leaks


In [None]:
# Add one column per URL component returned by check_urlparse; the column
# order here must match the order of the tuple that check_urlparse returns
referer_leaks_df[['netloc', 'path', 'query', 'params', 'fragments']] = referer_leaks_df.apply(check_urlparse, axis=1, result_type='expand')
referer_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments
5,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
6,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
7,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
20,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks,www.ox.ac.uk,,,,
21,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103287,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,
103288,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,
103289,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,
103291,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/generate_204?0fnMlA,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,


In [None]:
# Create a new column holding the verdict from check_violation for each row
# NOTE(review): check_violation returns a single string, so result_type='expand'
# is presumably redundant here - confirm before removing
referer_leaks_df['violation'] = referer_leaks_df.apply(check_violation, axis=1, result_type='expand')
referer_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation
5,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
6,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
7,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
20,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks - referer_leaks,www.ox.ac.uk,,,,,Safe
21,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",referer_leaks,www.ox.ac.uk,,,,,Safe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103287,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/www-...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,,Safe
103288,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,,Safe
103289,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/s/player/dee96cfa/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,,Safe
103291,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/generate_204?0fnMlA,GET,strict-origin-when-cross-origin,https://www.youtube.com/embed/19s0G5lLc0g?rel=...,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",referer_leaks,www.andapp.jp,,,,,Safe


In [None]:
# Check whether params or fragments leaked through the referrer
referer_leaks_df.loc[referer_leaks_df['violation'].eq("Warning")]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation


### Referer leaks that did not follow the referrer policy

This case is similar to the YouTube case. It may be a TRC error,
but it is also possible that the problem comes from Chrome.

In [None]:
# Rows that check_violation marked as "Violation"
referer_leaks_vioilation_df = referer_leaks_df[referer_leaks_df['violation'] == "Violation"]
# Save them under data_seg/, named after the date and type of the input file
referer_leaks_vioilation_df.to_csv("data_seg/"+ extract_date + '_' + extract_type + '_print_referer_leaks_violation.csv')
referer_leaks_vioilation_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation
853,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/runtime~m...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('urlencode', 'wpengine.com'), ('wpengine.com...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
854,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/9.4a3e980...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('urlencode', 'wpengine.com'), ('wpengine.com...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
855,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/main~493d...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('urlencode', 'wpengine.com'), ('wpengine.com...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
858,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/51.558be3...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('urlencode', 'wpengine.com'), ('wpengine.com...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
859,https://wpengine.com/agency-partner-program,https://wpengine.com/agency-partner-program/,https://js.driftt.com/core/assets/js/35.d0f1cc...,GET,strict-origin-when-cross-origin,https://js.driftt.com/core?d=1&embedId=5hrxis5...,"[('urlencode', 'wpengine.com'), ('wpengine.com...",referer_leaks,wpengine.com,/agency-partner-program/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103218,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('www.docker.com',), ('urlencode', '/products...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103219,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('www.docker.com',), ('urlencode', '/products...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103229,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('www.docker.com',), ('urlencode', '/products...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103230,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://rc-widget-frame.js.driftt.com/core/ass...,GET,strict-origin-when-cross-origin,https://rc-widget-frame.js.driftt.com/core?d=1...,"[('www.docker.com',), ('urlencode', '/products...",referer_leaks,www.docker.com,/products/docker-scout/,,,,Violation


In [None]:
# The frame has a 'referer_leaks' column and no 'url_leaks' column, so this
# writes it to the "..._print_referer_leaks.csv" file.
# NOTE(review): that file name does not say the rows are violations only -
# confirm that referer_leaks_df was not the intended argument
print_leak_to_csv(referer_leaks_vioilation_df)

In [None]:
# Number of non-null values per column among the violating requests
referer_leaks_vioilation_df.count()

init_url         2759
final_url        2759
req_url          2759
req_method       2759
ref_pol_data     2759
ref_data         2759
referer_leaks    2759
leaks            2759
netloc           2759
path             2759
query            2759
params           2759
fragments        2759
violation        2759
dtype: int64

In [None]:
# Same count after keeping one row per distinct final_url
referer_leaks_vioilation_df.drop_duplicates(subset=['final_url']).count()

init_url         205
final_url        205
req_url          205
req_method       205
ref_pol_data     205
ref_data         205
referer_leaks    205
leaks            205
netloc           205
path             205
query            205
params           205
fragments        205
violation        205
dtype: int64

In [None]:
# How many violating requests each final_url produced
referer_leaks_vioilation_df['final_url'].value_counts()

https://www.parse.ly/getdemo/                                                                                                         88
https://www.docker.com/products/docker-scout/                                                                                         41
https://www.docker.com/careers/                                                                                                       41
https://www.docker.com/resources/what-container/                                                                                      41
https://www.docker.com/why-docker/                                                                                                    41
                                                                                                                                      ..
https://tenki.jp/pm25/                                                                                                                 1
https://www.criteo.com/get-started/      

In [None]:
# How often each requested URL appears among the violating requests
referer_leaks_vioilation_df['req_url'].value_counts()

https://s.yimg.jp/images/advertising/common/js/iicon.min.js                                                                                                   67
https://js.driftt.com/core/assets/js/runtime~main.95493482.js                                                                                                 52
https://js.driftt.com/core/assets/css/8.7602338c.chunk.css                                                                                                    52
https://js.driftt.com/core/assets/js/9.4a3e9801.chunk.js                                                                                                      52
https://js.driftt.com/core/assets/js/57.28dde8ce.chunk.js                                                                                                     52
                                                                                                                                                              ..
https://platform.twitter.com/embed

In [None]:
# Violating requests per HTTP method
referer_leaks_vioilation_df['req_method'].value_counts()

GET     2737
POST      22
Name: req_method, dtype: int64

### Google

In [None]:
# violating requests whose req_url contains "google"; rows without a
# req_url are left out
referer_leaks_vioilation_df.loc[
    referer_leaks_vioilation_df['req_url'].str.contains('google', na=False)
]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation
10621,https://www.india.com/rajasthan,https://www.india.com/rajasthan/,https://googleads.g.doubleclick.net/pagead/adv...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('urlencode', '/rajasthan/'), ('urlencode', '...",referer_leaks,www.india.com,/rajasthan/,,,,Violation
10622,https://www.india.com/rajasthan,https://www.india.com/rajasthan/,https://googleads.g.doubleclick.net/pagead/adv...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('urlencode', '/rajasthan/'), ('urlencode', '...",referer_leaks,www.india.com,/rajasthan/,,,,Violation
16423,https://www.wikihow.com/make-christmas-come-fa...,https://www.wikihow.com/Make-Christmas-Come-Fa...,https://googleads.g.doubleclick.net/pagead/adv...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('www.wikihow.com',), ('urlencode', 'www.wiki...",referer_leaks,www.wikihow.com,/Make-Christmas-Come-Faster,,,,Violation
22034,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/l8z5yn8/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/l8z5yn8/...,"[('www.investopedia.com',), ('urlencode', '/th...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
22047,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/l8z5yn8/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/l8z5yn8/...,"[('www.investopedia.com',), ('urlencode', '/th...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
22048,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/l8z5yn8/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/l8z5yn8/...,"[('www.investopedia.com',), ('urlencode', '/th...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
22053,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/l8z5yn8/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/l8z5yn8/...,"[('www.investopedia.com',), ('urlencode', '/th...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
22054,https://www.investopedia.com/the-american-drea...,https://www.investopedia.com/the-american-drea...,https://live-tag.creatopy.net/designs/l8z5yn8/...,GET,strict-origin-when-cross-origin,https://live-tag.creatopy.net/designs/l8z5yn8/...,"[('www.investopedia.com',), ('urlencode', '/th...",referer_leaks,www.investopedia.com,/the-american-dream-now-costs-over-usd3-millio...,,,,Violation
27869,https://www.fmkorea.com/fm24players,https://www.fmkorea.com/fm24players,https://googleads.g.doubleclick.net/xbbe/pixel...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('urlencode', 'www.fmkorea.com'), ('www.fmkor...",referer_leaks,www.fmkorea.com,/fm24players,,,,Violation
38224,https://www.fmkorea.com/fm23tips,https://www.fmkorea.com/fm23tips,https://googleads.g.doubleclick.net/xbbe/pixel...,GET,strict-origin-when-cross-origin,https://googleads.g.doubleclick.net/pagead/ads...,"[('urlencode', 'www.fmkorea.com'), ('www.fmkor...",referer_leaks,www.fmkorea.com,/fm23tips,,,,Violation


In [None]:
# number of violating requests whose req_url contains "google"
referer_leaks_vioilation_df.loc[
    referer_leaks_vioilation_df['req_url'].str.contains('google', na=False)
].count()


init_url         33
final_url        33
req_url          33
req_method       33
ref_pol_data     33
ref_data         33
referer_leaks    33
leaks            33
netloc           33
path             33
query            33
params           33
fragments        33
violation        33
dtype: int64

### META / Facebook

In [None]:
# Referer-leak violations whose request URL contains "meta" or "facebook".
# NOTE(review): this is a plain substring test over the whole URL, so "meta"
# may also hit unrelated URLs (e.g. ones containing "metadata") -- confirm.
referer_leaks_vioilation_df.loc[
    lambda frame: (
        frame['req_url'].str.contains('meta')
        | frame['req_url'].str.contains('facebook')
    )
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation


In [None]:
# Per-column count of referer-leak violations whose request URL contains
# "meta" or "facebook".
# NOTE(review): this is a plain substring test over the whole URL, so "meta"
# may also hit unrelated URLs (e.g. ones containing "metadata") -- confirm.
referer_leaks_vioilation_df.loc[
    lambda frame: (
        frame['req_url'].str.contains('meta')
        | frame['req_url'].str.contains('facebook')
    )
].count()

init_url         0
final_url        0
req_url          0
req_method       0
ref_pol_data     0
ref_data         0
referer_leaks    0
leaks            0
netloc           0
path             0
query            0
params           0
fragments        0
violation        0
dtype: int64

### Tiktok

In [None]:
# Referer-leak violations whose request URL mentions "tiktok".
# Missing request URLs are treated as non-matching.
referer_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('tiktok').fillna(False)
]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,referer_leaks,leaks,netloc,path,query,params,fragments,violation


In [None]:
# Per-column count of referer-leak violations whose request URL mentions "tiktok".
# Missing request URLs are treated as non-matching.
referer_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('tiktok').fillna(False)
].count()


init_url         0
final_url        0
req_url          0
req_method       0
ref_pol_data     0
ref_data         0
referer_leaks    0
leaks            0
netloc           0
path             0
query            0
params           0
fragments        0
violation        0
dtype: int64

## Leaks on URL

In [None]:
# Build the URL-leak dataframe: drop_non_leak is asked to keep the rows of the
# extracted data that relate to the 'url_leaks' column.
url_leaks_df = drop_non_leak(data=extracted_df, leak='url_leaks')
url_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://apikeys.civiccomputing.com/c/v?d=www.o...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.linkedin.com/px/li_sync?redirect=h...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks
...,...,...,...,...,...,...,...
103294,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://adservice.google.com/ddm/fls/i/dc_pre=...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks
103295,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://googleads.g.doubleclick.net/pagead/vie...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks
103296,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://6015542.fls.doubleclick.net/ddm/fls/r/...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks
103297,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google.com/pagead/1p-user-list/878...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks


In [None]:
# Extend the dataframe with the five values check_urlparse returns per row
# (stored as netloc / path / query / params / fragments), then add a
# 'violation' column holding the per-row result of check_violation.
url_leaks_df[['netloc', 'path', 'query', 'params', 'fragments']] = url_leaks_df.apply(
    check_urlparse,
    axis='columns',
    result_type='expand',
)

url_leaks_df['violation'] = url_leaks_df.apply(
    check_violation,
    axis='columns',
    result_type='expand',
)

url_leaks_df


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
0,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/embed/VQwYwDqHA7I?wmod...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks,www.ox.ac.uk,,,,,Safe
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks,www.ox.ac.uk,/research,,,,Violation
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks,www.ox.ac.uk,/research,,,,Violation
3,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://apikeys.civiccomputing.com/c/v?d=www.o...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks,www.ox.ac.uk,,,,,Safe
4,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.linkedin.com/px/li_sync?redirect=h...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",url_leaks,www.ox.ac.uk,,,,,Safe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103294,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://adservice.google.com/ddm/fls/i/dc_pre=...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks,www.andapp.jp,,,,,Safe
103295,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://googleads.g.doubleclick.net/pagead/vie...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
103296,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://6015542.fls.doubleclick.net/ddm/fls/r/...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',)]",url_leaks,www.andapp.jp,,,,,Safe
103297,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google.com/pagead/1p-user-list/878...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation


In [None]:
# Show the URL-leak rows labelled "Warning" (the label this notebook uses to
# check whether params or fragments leaked).
url_leaks_df.loc[url_leaks_df['violation'].eq("Warning")]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation


In [None]:
# Keep only the URL-leak rows that were labelled "Violation".
url_leaks_vioilation_df = url_leaks_df.loc[url_leaks_df['violation'].eq("Violation")]
url_leaks_vioilation_df


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks,www.ox.ac.uk,/research,,,,Violation
2,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks,www.ox.ac.uk,/research,,,,Violation
8,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks,www.ox.ac.uk,/research,,,,Violation
56,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/g/collect?v=2...,POST,origin,"[('urlencode', '/borderfreecosmetics/entry-127...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
59,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ln.ameba.jp/v3/zBymKrvv?rd=03883547-4d...,GET,origin,"[('urlencode', 'ameblo.jp'), ('urlencode', '/b...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103284,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://yjtag.yahoo.co.jp/tag?site=k36tMgw&ref...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
103292,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.facebook.com/v10.0/plugins/like.ph...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,,,,Violation
103293,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://syndication.twitter.com/i/jot/embeds?l...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.andapp.jp'), ('www.andapp....",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
103295,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://googleads.g.doubleclick.net/pagead/vie...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation


### Some of them may be false alarms

For example, a language setting can appear in the query string. The pattern is usually "lang=xx".

In [None]:
# Violations whose leaked query string contains "lang" (candidate false alarms);
# rows without a query value are treated as non-matching.
url_leaks_vioilation_df.loc[
    lambda frame: frame['query'].str.contains('lang', na=False)
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
7046,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://reliably-main-silkworm.pgsdemo.com/?la...,GET,strict-origin-when-cross-origin,"[('lang=en_us&referrerPageUrl=',)]",url_leaks,,,lang=en_us&referrerPageUrl=,,,Violation
7075,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://www.facebook.com/tr/?id=43154253371256...,GET,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'lang=...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
7078,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://events.attentivemobile.com/e?v=4.25.43...,POST,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'lang=...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
7081,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://rp.liadm.com/j?dtstmp=1702007516121&ai...,GET,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'lang=...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
7083,https://store.netgear.com/home/?lang=en_us,https://store.netgear.com/home/?lang=en_us&ref...,https://netgear.attn.tv/unrenderedCreative?v=4...,POST,strict-origin-when-cross-origin,"[('urlencode', '/home/'), ('urlencode', 'store...",url_leaks,store.netgear.com,/home/,lang=en_us&referrerPageUrl=,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83826,https://dzen.ru/news/story/vpodmoskove_nashli_...,https://dzen.ru/news/story/vpodmoskove_nashli_...,https://sso.passport.yandex.ru/push?uuid=65b5f...,GET,strict-origin-when-cross-origin,"[('urlencode', 'dzen.ru'), ('urlencode', '/new...",url_leaks,dzen.ru,/news/story/vpodmoskove_nashli_telo_ehks-deput...,lang=ru&from=main_portal&fan=1&stid=wzlzk1zkik...,,,Violation
88428,https://dzen.ru/news/story/vnarodnom_sovete_ln...,https://dzen.ru/news/story/vnarodnom_sovete_ln...,https://sso.passport.yandex.ru/push?uuid=077d2...,GET,strict-origin-when-cross-origin,"[('urlencode', 'dzen.ru'), ('urlencode', '/new...",url_leaks,dzen.ru,/news/story/vnarodnom_sovete_lnr_podtverdili_g...,lang=ru&from=main_portal&fan=1&stid=ppm0ktlu-g...,,,Violation
95095,https://businesshelp.snapchat.com/,https://businesshelp.snapchat.com/s/?language=...,https://www.google-analytics.com/j/collect?v=1...,POST,origin-when-cross-origin,"[('urlencode', 'language=en_US'), ('urlencode'...",url_leaks,businesshelp.snapchat.com,/s/,language=en_US,,,Violation
96241,https://www.scorecardresearch.com/about.aspx?n...,https://www.scorecardresearch.com/about.aspx?n...,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', 'www.scorecardresearch.com'), (...",url_leaks,www.scorecardresearch.com,/about.aspx,newlanguage=1,,,Violation


In [None]:
# Export the URL-leak violations to CSV; print_leak_to_csv picks the output
# file name based on which leak column the dataframe contains.
print_leak_to_csv(df=url_leaks_vioilation_df)

In [None]:
# Number of non-missing values in each column of the URL-leak violations.
url_leaks_vioilation_df.count(axis='index')

init_url        29234
final_url       29234
req_url         29234
req_method      29234
ref_pol_data    29234
url_leaks       29234
leaks           29234
netloc          29234
path            29234
query           29234
params          29234
fragments       29234
violation       29234
dtype: int64

In [None]:
# How many URL-leak violations were sent with each HTTP request method.
url_leaks_vioilation_df['req_method'].value_counts()

GET        23847
POST        5229
OPTIONS      138
HEAD          19
PUT            1
Name: req_method, dtype: int64

In [None]:
# One row per distinct final_url (the first occurrence is the one kept).
url_leaks_vioilation_df.drop_duplicates(subset='final_url', keep='first')

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
1,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/collect?v=2&fmt=js...,GET,strict-origin-when-cross-origin,"[('urlencode', '/research'), ('urlencode', 'ww...",url_leaks,www.ox.ac.uk,/research,,,,Violation
56,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/g/collect?v=2...,POST,origin,"[('urlencode', '/borderfreecosmetics/entry-127...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
72,https://xmp.mobvista.com/cn,https://xmp.mobvista.com/cn,https://magnet.rayjump.com/pixel?campuuid=ss_x...,GET,strict-origin-when-cross-origin,"[('xmp.mobvista.com',), ('urlencode', '/cn'), ...",url_leaks,xmp.mobvista.com,/cn,,,,Violation
76,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', '/cp/621/kleding-dames/'), ('ur...",url_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
94,https://email.nypost.com/israel-war-update,https://email.nypost.com/israel-war-update/,https://tr.outbrain.com/unifiedPixel?optOut=fa...,POST,strict-origin-when-cross-origin,"[('urlencode', 'email.nypost.com'), ('email.ny...",url_leaks,email.nypost.com,/israel-war-update/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103052,https://www.finn.no/nettbil/velkommen?origin=f...,https://www.finn.no/nettbil?origin=frontpage_icon,https://static.finncdn.no/_c/nettbil-layout/_n...,GET,strict-origin-when-cross-origin,"[('/nettbil',)]",url_leaks,,/nettbil,,,,Violation
103090,https://www.washington.edu/news/?utm_source=wh...,https://www.washington.edu/news/?utm_source=wh...,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', '/news/'), ('www.washington.edu...",url_leaks,www.washington.edu,/news/,utm_source=whitebar&utm_medium=click&utm_campa...,,,Violation
103111,https://www.mcafee.com/en-us/consumer-corporat...,https://www.mcafee.com/en-us/consumer-corporat...,https://sp.analytics.yahoo.com/sp.pl?a=10000&d...,GET,strict-origin-when-cross-origin,"[('www.mcafee.com',), ('urlencode', 'www.mcafe...",url_leaks,www.mcafee.com,/en-us/consumer-corporate/investors.html,,,,Violation
103126,https://actu.fr/politique,https://actu.fr/politique,https://trc.taboola.com/actufr-actufr/trc/3/js...,GET,strict-origin-when-cross-origin,"[('actu.fr',), ('urlencode', 'actu.fr'), ('url...",url_leaks,actu.fr,/politique,,,,Violation


### GOOGLE

In [None]:
# Per-column count of URL-leak violations whose request URL mentions "google".
# Missing request URLs are treated as non-matching.
url_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('google').fillna(False)
].count()


init_url        9178
final_url       9178
req_url         9178
req_method      9178
ref_pol_data    9178
url_leaks       9178
leaks           9178
netloc          9178
path            9178
query           9178
params          9178
fragments       9178
violation       9178
dtype: int64

In [None]:
# URL-leak violations whose request URL mentions "google".
# Missing request URLs are treated as non-matching.
url_leaks_vioilation_df.loc[
    lambda frame: frame['req_url'].str.contains('google').fillna(False)
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
56,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/g/collect?v=2...,POST,origin,"[('urlencode', '/borderfreecosmetics/entry-127...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
60,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://www.google-analytics.com/j/collect?v=1...,POST,origin,"[('urlencode', '/borderfreecosmetics/entry-127...",url_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
73,https://xmp.mobvista.com/cn,https://xmp.mobvista.com/cn,https://www.google-analytics.com/g/collect?v=2...,POST,strict-origin-when-cross-origin,"[('xmp.mobvista.com',), ('urlencode', '/cn'), ...",url_leaks,xmp.mobvista.com,/cn,,,,Violation
75,https://xmp.mobvista.com/cn,https://xmp.mobvista.com/cn,https://www.google-analytics.com/g/collect?v=2...,POST,strict-origin-when-cross-origin,"[('xmp.mobvista.com',), ('urlencode', '/cn'), ...",url_leaks,xmp.mobvista.com,/cn,,,,Violation
76,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('urlencode', '/cp/621/kleding-dames/'), ('ur...",url_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103226,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://analytics.google.com/g/collect?v=2&tid...,POST,strict-origin-when-cross-origin,"[('www.docker.com',), ('urlencode', '/products...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103281,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
103283,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.google-analytics.com/g/collect?v=2...,POST,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation
103295,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://googleads.g.doubleclick.net/pagead/vie...,GET,strict-origin-when-cross-origin,"[('www.andapp.jp',), ('urlencode', 'www.andapp...",url_leaks,www.andapp.jp,/apps/12016007,from=www_mbga_intro,,,Violation


### Facebook or Meta

In [None]:
# Per-column count of URL-leak violations whose request URL contains
# "meta" or "facebook".
# NOTE(review): this is a plain substring test over the whole URL, so "meta"
# may also hit unrelated URLs (e.g. ones containing "metadata") -- confirm.
url_leaks_vioilation_df.loc[
    lambda frame: (
        frame['req_url'].str.contains('meta')
        | frame['req_url'].str.contains('facebook')
    )
].count()

init_url        1577
final_url       1577
req_url         1577
req_method      1577
ref_pol_data    1577
url_leaks       1577
leaks           1577
netloc          1577
path            1577
query           1577
params          1577
fragments       1577
violation       1577
dtype: int64

In [None]:
# URL-leak violations whose request URL contains "meta" or "facebook".
# NOTE(review): this is a plain substring test over the whole URL, so "meta"
# may also hit unrelated URLs (e.g. ones containing "metadata") -- confirm.
url_leaks_vioilation_df.loc[
    lambda frame: (
        frame['req_url'].str.contains('meta')
        | frame['req_url'].str.contains('facebook')
    )
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
168,https://www.amazon.com.mx/s/?_encoding=utf8&k=...,https://www.amazon.com.mx/s?k=carteras&i=fashi...,https://www.facebook.com/fr/r.php?p=5582933009...,GET,no-referrer,"[('urlencode', '/s'), ('custom_map_1', '/S')]",url_leaks,,/s,,,,Violation
310,https://www.instagram.com/accounts/emailsignup,https://www.instagram.com/accounts/emailsignup/,https://www.facebook.com/x/oauth/status?client...,GET,strict-origin-when-cross-origin,"[('urlencode', '/accounts/emailsignup/'), ('ur...",url_leaks,www.instagram.com,/accounts/emailsignup/,,,,Violation
311,https://ui.com/us/identity?s=us,https://ui.com/us/identity,https://connect.facebook.net/signals/config/77...,GET,same-origin,"[('ui.com',), ('urlencode', 'ui.com')]",url_leaks,ui.com,,,,,Violation
319,https://ui.com/us/identity?s=us,https://ui.com/us/identity,https://www.facebook.com/tr/?id=77134957013916...,GET,same-origin,"[('ui.com',), ('urlencode', '/us/identity'), (...",url_leaks,ui.com,/us/identity,,,,Violation
326,https://ui.com/us/identity?s=us,https://ui.com/us/identity,https://www.facebook.com/tr/?id=77134957013916...,GET,same-origin,"[('ui.com',), ('urlencode', '/us/identity'), (...",url_leaks,ui.com,/us/identity,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103095,https://www.washington.edu/news/?utm_source=wh...,https://www.washington.edu/news/?utm_source=wh...,https://www.facebook.com/tr/?id=39956399904231...,GET,strict-origin-when-cross-origin,"[('urlencode', '/news/'), ('www.washington.edu...",url_leaks,www.washington.edu,/news/,utm_source=whitebar&utm_medium=click&utm_campa...,,,Violation
103162,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://www.facebook.com/tr/?id=16466388623210...,GET,strict-origin-when-cross-origin,"[('www.docker.com',), ('urlencode', '/products...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103232,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://www.facebook.com/tr/?id=16466388623210...,GET,strict-origin-when-cross-origin,"[('www.docker.com',), ('urlencode', '/products...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103235,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://www.facebook.com/tr/?id=16466388623210...,GET,strict-origin-when-cross-origin,"[('www.docker.com',), ('urlencode', '/products...",url_leaks,www.docker.com,/products/docker-scout/,,,,Violation


### Tiktok

In [None]:
# List URL-leak violations whose request was actually sent to a TikTok host.
#
# Searching for 'tiktok' anywhere in req_url is too broad: it also matches
# requests to other third parties that merely mention the word in their path
# or query string (for instance when the leaked page URL itself contains it,
# such as an article slug). The pattern below therefore only looks inside the
# authority part of the URL (scheme://host), i.e. before the first '/', '?'
# or '#'. Matching is case-insensitive and rows without a request URL are
# treated as non-matching.
url_leaks_vioilation_df[
    url_leaks_vioilation_df['req_url']
        .str.contains(r'^[a-z][a-z0-9+.\-]*://[^/?#]*tiktok', case=False, na=False)]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,url_leaks,leaks,netloc,path,query,params,fragments,violation
3896,https://time.com/6342806/person-of-the-year-20...,https://time.com/6342806/person-of-the-year-20...,https://securepubads.g.doubleclick.net/gampad/...,GET,strict-origin-when-cross-origin,"[('urlencode', 'time.com'), ('urlencode', '/63...",url_leaks,time.com,/6342806/person-of-the-year-2023-taylor-swift/,,,,Violation
3916,https://www.adjust.com/resources/ebooks/skadne...,https://www.adjust.com/resources/ebooks/skadne...,https://pi.pardot.com/analytics?ver=3&pi_form=...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.adjust.com'), ('www.adjust...",url_leaks,www.adjust.com,/resources/ebooks/skadnetwork-4-ios/,,,,Violation
4715,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://widgetmonitor.outbrain.com/WidgetError...,GET,strict-origin-when-cross-origin,"[('www.huffpost.com',), ('urlencode', '/entry/...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
4727,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://widgets.outbrain.com/widgetMonitor/mon...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.huffpost.com'), ('urlencod...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
4785,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://widgetmonitor.outbrain.com/WidgetError...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.huffpost.com'), ('urlencod...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
4807,https://www.huffpost.com/entry/comfy-shoes-tik...,https://www.huffpost.com/entry/comfy-shoes-tik...,https://ssum-sec.casalemedia.com/usermatch?us_...,GET,strict-origin-when-cross-origin,"[('www.huffpost.com',), ('urlencode', '/entry/...",url_leaks,www.huffpost.com,/entry/comfy-shoes-tiktok-ano_l_6567b611e4b066...,,,,Violation
4993,https://www.nike.com/w/member-sale-9t7gt,https://www.nike.com/w/member-sale-9t7gt,https://newrelicstream.adtech-prod.nikecloud.c...,GET,strict-origin-when-cross-origin,"[('urlencode', 'www.nike.com'), ('/w/member-sa...",url_leaks,www.nike.com,/w/member-sale-9t7gt,,,,Violation
7577,https://www.healthline.com/fitness,https://www.healthline.com/fitness,https://www.google-analytics.com/j/collect?v=1...,POST,strict-origin-when-cross-origin,"[('www.healthline.com',), ('urlencode', '/fitn...",url_leaks,www.healthline.com,/fitness,,,,Violation
9069,https://www.nike.com/w/classics-collection-aj8oq,https://www.nike.com/w/classics-collection-aj8oq,https://newrelicstream.adtech-prod.nikecloud.c...,GET,strict-origin-when-cross-origin,"[('www.nike.com',), ('urlencode', 'www.nike.co...",url_leaks,www.nike.com,/w/classics-collection-aj8oq,,,,Violation
19449,https://www.nike.com/w/jordan-1-4foky,https://www.nike.com/w/jordan-1-4foky,https://newrelicstream.adtech-prod.nikecloud.c...,GET,strict-origin-when-cross-origin,"[('www.nike.com',), ('/w/jordan-1-4foky',), ('...",url_leaks,www.nike.com,/w/jordan-1-4foky,,,,Violation


## Leaks on Post Data

In [None]:
# Build the POST-leak dataframe: drop_non_leak is asked to keep the rows of the
# extracted data that relate to the 'post_leaks' column.
post_leaks_df = drop_non_leak(data=extracted_df, leak='post_leaks')
post_leaks_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks
9,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""ogkc1FmfiuI"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks
10,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""MJNz9_M3kFg"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks
11,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/wa/,POST,strict-origin-when-cross-origin,"{""pids"":[9293],""scriptVersion"":1029,""time"":170...","[('/research',)]",post_leaks
13,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks
15,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks
...,...,...,...,...,...,...,...,...
103249,https://shop.wired.com/,https://shop.wired.com/,https://events.privy.com/v2/collect,POST,strict-origin-when-cross-origin,"{""event"":""new-session"",""properties"":{""referrin...","[('shop.wired.com',)]",post_leaks
103251,https://shop.wired.com/,https://shop.wired.com/,https://a.ad.gt/api/v1/collect,POST,strict-origin-when-cross-origin,eyJjYXRlZ29yeSI6InBhZ2VWaWV3IiwidmVyc2lvbiI6In...,"[('base64', 'shop.wired.com')]",post_leaks
103275,https://shop.wired.com/,https://shop.wired.com/,https://l.evidon.com/site/v3/userPref/,POST,strict-origin-when-cross-origin,"{""d"":{""noticeId"":98141,""companyId"":5116,""actio...","[('shop.wired.com',)]",post_leaks
103299,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.andapp.jp',)]",post_leaks - referer_leaks


In [None]:
# Extend the dataframe with the five values check_urlparse returns per row
# (stored as netloc / path / query / params / fragments), then add a
# 'violation' column holding the per-row result of check_violation.
post_leaks_df[['netloc', 'path', 'query', 'params', 'fragments']] = post_leaks_df.apply(
    check_urlparse,
    axis='columns',
    result_type='expand',
)

post_leaks_df['violation'] = post_leaks_df.apply(
    check_violation,
    axis='columns',
    result_type='expand',
)

post_leaks_df


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
9,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""ogkc1FmfiuI"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks,www.ox.ac.uk,,,,,Safe
10,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/player?key...,POST,strict-origin-when-cross-origin,"{""videoId"":""MJNz9_M3kFg"",""context"":{""client"":{...","[('urlencode', 'www.ox.ac.uk'), ('www.ox.ac.uk...",post_leaks,www.ox.ac.uk,,,,,Safe
11,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/wa/,POST,strict-origin-when-cross-origin,"{""pids"":[9293],""scriptVersion"":1029,""time"":170...","[('/research',)]",post_leaks,,/research,,,,Violation
13,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks,www.ox.ac.uk,,,,,Safe
15,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.ox.ac.uk',)]",post_leaks,www.ox.ac.uk,,,,,Safe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103249,https://shop.wired.com/,https://shop.wired.com/,https://events.privy.com/v2/collect,POST,strict-origin-when-cross-origin,"{""event"":""new-session"",""properties"":{""referrin...","[('shop.wired.com',)]",post_leaks,shop.wired.com,,,,,Safe
103251,https://shop.wired.com/,https://shop.wired.com/,https://a.ad.gt/api/v1/collect,POST,strict-origin-when-cross-origin,eyJjYXRlZ29yeSI6InBhZ2VWaWV3IiwidmVyc2lvbiI6In...,"[('base64', 'shop.wired.com')]",post_leaks,shop.wired.com,,,,,Safe
103275,https://shop.wired.com/,https://shop.wired.com/,https://l.evidon.com/site/v3/userPref/,POST,strict-origin-when-cross-origin,"{""d"":{""noticeId"":98141,""companyId"":5116,""actio...","[('shop.wired.com',)]",post_leaks,shop.wired.com,,,,,Safe
103299,https://www.mbga.jp/_game_intro?game_id=12016007,https://www.andapp.jp/apps/12016007?from=www_m...,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,"{""context"":{""client"":{""hl"":""en"",""gl"":""US"",""cli...","[('www.andapp.jp',)]",post_leaks - referer_leaks,www.andapp.jp,,,,,Safe


In [None]:
# Check whether the params or fragments leak in POST data:
# show only the rows whose 'violation' label is "Warning".
post_leaks_df.loc[post_leaks_df['violation'].eq("Warning")]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation


In [None]:
# Keep only the rows labelled "Violation".
# .copy() makes the result an independent DataFrame instead of a slice of
# post_leaks_df, so later column assignments on it do not trigger pandas'
# SettingWithCopyWarning.
post_leaks_vioilation_df = post_leaks_df[post_leaks_df['violation'] == "Violation"].copy()
post_leaks_vioilation_df

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
11,https://www.ox.ac.uk/research,https://www.ox.ac.uk/research,https://px.ads.linkedin.com/wa/,POST,strict-origin-when-cross-origin,"{""pids"":[9293],""scriptVersion"":1029,""time"":170...","[('/research',)]",post_leaks,,/research,,,,Violation
61,https://ameblo.jp/borderfreecosmetics/entry-12...,https://ameblo.jp/borderfreecosmetics/entry-12...,https://id5-sync.com/gm/v3,POST,origin,"{""requests"":[{""requestId"":""323ae658-e2df-4d62-...",[('/borderfreecosmetics/entry-12761950217.html...,post_leaks,ameblo.jp,/borderfreecosmetics/entry-12761950217.html,,,,Violation
77,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://ib.adnxs-simple.com/ut/v3/prebid,POST,strict-origin-when-cross-origin,"{""tags"":[{""sizes"":[{""width"":728,""height"":90},{...","[('/cp/621/kleding-dames/',), ('www.marktplaat...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
78,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://ib.adnxs-simple.com/ut/v3/prebid,POST,strict-origin-when-cross-origin,"{""tags"":[{""sizes"":[{""width"":728,""height"":90},{...","[('/cp/621/kleding-dames/',), ('www.marktplaat...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
79,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://htlb.casalemedia.com/openrtb/pbjs?s=23...,POST,strict-origin-when-cross-origin,"{""id"":""9a6fb5caf8609a"",""site"":{""page"":""https:/...","[('/cp/621/kleding-dames/',), ('www.marktplaat...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103220,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://bootstrap.api.drift.com/widget_bootstr...,POST,strict-origin-when-cross-origin,ping_context=%7B%22embedId%22%3A%22p4d2wxp9n4g...,"[('www.docker.com',), ('urlencode', '/products...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103222,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://bootstrap.api.drift.com/widget_bootstrap,POST,strict-origin-when-cross-origin,embed_id=p4d2wxp9n4gk&client_id=f6zuizdyhxrm7r...,"[('www.docker.com',), ('urlencode', '/products...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103224,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://event.api.drift.com/track,POST,strict-origin-when-cross-origin,"{""orgId"":5090900,""inboxId"":732344,""userId"":nul...","[('www.docker.com',), ('/products/docker-scout...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation
103225,https://www.docker.com/products/docker-scout,https://www.docker.com/products/docker-scout/,https://targeting.api.drift.com/targeting/eval...,POST,strict-origin-when-cross-origin,"{""conditionGroups"":[{""status"":""EVALUATED"",""mat...","[('www.docker.com',), ('/products/docker-scout...",post_leaks,www.docker.com,/products/docker-scout/,,,,Violation


In [None]:
# print to csv

# Convert post_data to a string and truncate it to 2048 characters before
# exporting. assign() returns a new DataFrame, so the column is replaced
# without writing into a slice of post_leaks_df (which is what raised
# SettingWithCopyWarning with a direct column assignment).
post_leaks_vioilation_df = post_leaks_vioilation_df.assign(
    post_data=post_leaks_vioilation_df['post_data'].map(lambda value: str(value)[:2048])
)
print_leak_to_csv(post_leaks_vioilation_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  post_leaks_vioilation_df['post_data'] = [str(x)[:2048] for x in post_leaks_vioilation_df['post_data']] # convert to string and truncate post_data to 2048 characters


In [None]:
# Number of non-null entries in each column of the violation subset.
post_leaks_vioilation_df.count(axis="index")

init_url        8670
final_url       8670
req_url         8670
req_method      8670
ref_pol_data    8670
post_data       8670
post_leaks      8670
leaks           8670
netloc          8670
path            8670
query           8670
params          8670
fragments       8670
violation       8670
dtype: int64

In [None]:
# How many violating requests were sent with each HTTP method.
post_leaks_vioilation_df['req_method'].value_counts()

POST    8668
PUT        2
Name: req_method, dtype: int64

### Leaks to Google, Facebook/Meta, and TikTok

In [None]:
# List the violating POST leaks whose request URL contains "google".
# na=False treats a missing req_url as "no match", which yields a clean
# boolean mask directly (same convention as extract_leak).
# NOTE(review): this is a substring match on the whole URL, not a hostname
# check, so it also matches "google" appearing in a path or query string.
post_leaks_vioilation_df[post_leaks_vioilation_df['req_url'].str.contains('google', na=False)]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
84,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1693760922&t=event&_s=2&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
85,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1693760922&t=event&_s=3&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
86,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1693760922&t=event&_s=4&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
88,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1693760922&t=event&_s=5&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
89,https://www.marktplaats.nl/cp/621/kleding-dames,https://www.marktplaats.nl/cp/621/kleding-dames/,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1693760922&t=event&_s=6&dl=https...,"[('urlencode', '/cp/621/kleding-dames/'), ('ww...",post_leaks,www.marktplaats.nl,/cp/621/kleding-dames/,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101512,https://www.gismeteo.ru/weather-hayward-execut...,https://www.gismeteo.ru/weather-hayward-execut...,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&aip=0&a=2001831229&t=timing&ds=des...,"[('www.gismeteo.ru',), ('urlencode', '/weather...",post_leaks,www.gismeteo.ru,/weather-hayward-executive-28925/,,,,Violation
102143,https://wordpress.com/start/?ref=logged-out-ho...,https://wordpress.com/start/user-social?ref=lo...,https://www.google-analytics.com/collect,POST,origin,v=1&_v=j101&aip=1&a=2015635416&t=pageview&_s=2...,"[('urlencode', '/start/user-social'), ('urlenc...",post_leaks,wordpress.com,/start/user-social,ref=logged-out-homepage-lp,,,Violation
102619,https://www.weebly.com/websites,https://www.weebly.com/websites,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1926208616&t=pageview&_s=1&dl=ht...,"[('www.weebly.com',), ('urlencode', '/websites')]",post_leaks,www.weebly.com,/websites,,,,Violation
102620,https://www.weebly.com/websites,https://www.weebly.com/websites,https://www.google-analytics.com/collect,POST,strict-origin-when-cross-origin,v=1&_v=j101&a=1926208616&t=event&ni=0&_s=1&dl=...,"[('www.weebly.com',), ('urlencode', '/websites')]",post_leaks,www.weebly.com,/websites,,,,Violation


In [None]:
# Per-column count of violating POST leaks whose request URL contains "google".
# na=False treats a missing req_url as "no match" when building the mask.
post_leaks_vioilation_df[post_leaks_vioilation_df['req_url'].str.contains('google', na=False)].count()

init_url        520
final_url       520
req_url         520
req_method      520
ref_pol_data    520
post_data       520
post_leaks      520
leaks           520
netloc          520
path            520
query           520
params          520
fragments       520
violation       520
dtype: int64

In [None]:
# List the violating POST leaks whose request URL contains "meta" or "facebook".
# The two substrings are combined into one regex alternation, and na=False
# treats a missing req_url as "no match" so the mask never contains NaN
# (boolean indexing rejects masks with missing values).
# NOTE(review): the bare substring "meta" also matches unrelated hosts such as
# api-gw.metadata.io; switch to hostname matching if only Meta-owned domains
# should be counted.
post_leaks_vioilation_df[
    post_leaks_vioilation_df['req_url'].str.contains('meta|facebook', na=False)
]

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
1674,https://www.samsung.com/us/watches/galaxy-watch6,https://www.samsung.com/us/watches/galaxy-watch6/,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=1049256285582240&ev=Microdata&dl=https%3A%2...,"[('www.samsung.com',), ('urlencode', '/us/watc...",post_leaks,www.samsung.com,/us/watches/galaxy-watch6/,,,,Violation
2613,https://www.oracle.com/cx/advertising/measurem...,https://www.oracle.com/cx/advertising/measurem...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=704367189971874&ev=PageView&dl=https%3A%2F%...,"[('urlencode', 'urlencode', '/cx/advertising/m...",post_leaks,www.oracle.com,/cx/advertising/measurement/publishers-platforms/,,,,Violation
8358,https://www.samsung.com/us/mobile/audio/headph...,https://www.samsung.com/us/mobile/audio/headph...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=1049256285582240&ev=Microdata&dl=https%3A%2...,"[('urlencode', '/us/mobile/audio/headphones/ga...",post_leaks,www.samsung.com,/us/mobile/audio/headphones/galaxy-buds2-pro-b...,,,,Violation
18725,https://www.appsflyer.com/solutions/entertainm...,https://www.appsflyer.com/solutions/entertainm...,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.appsflyer.com/solutions/en...","[('/solutions/entertainment-music/',), ('www.a...",post_leaks,www.appsflyer.com,/solutions/entertainment-music/,,,,Violation
22922,https://www.pendo.io/pricing,https://www.pendo.io/pricing/,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/pricing/"",""url_re...","[('/pricing/',), ('www.pendo.io',)]",post_leaks,www.pendo.io,/pricing/,,,,Violation
22923,https://www.pendo.io/pricing,https://www.pendo.io/pricing/,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/pricing/"",""url_re...","[('/pricing/',), ('www.pendo.io',)]",post_leaks,www.pendo.io,/pricing/,,,,Violation
24760,https://explore.zoom.us/en/ai-assistant,https://www.zoom.com/en/ai-assistant/,https://api-gw.metadata.io/traffic,POST,origin-when-cross-origin,"{""url"":""https://www.zoom.com/en/ai-assistant/""...","[('www.zoom.com',), ('/en/ai-assistant/',)]",post_leaks,www.zoom.com,/en/ai-assistant/,,,,Violation
25695,https://www.nikkei.com/article/dgxzqoua067j10w...,https://www.nikkei.com/article/dgxzqoua067j10w...,https://www.facebook.com/tr/,POST,strict-origin-when-cross-origin,id=517132292271830&ev=Microdata&dl=https%3A%2F...,"[('www.nikkei.com',), ('urlencode', '/article/...",post_leaks,www.nikkei.com,/article/dgxzqoua067j10w3a201c2000000/,,,,Violation
25864,https://www.pendo.io/ai-for-product-management...,https://www.pendo.io/ai-for-product-management...,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/ai-for-product-ma...","[('/ai-for-product-management-course/',), ('ww...",post_leaks,www.pendo.io,/ai-for-product-management-course/,,,,Violation
25865,https://www.pendo.io/ai-for-product-management...,https://www.pendo.io/ai-for-product-management...,https://api-gw.metadata.io/traffic,POST,strict-origin-when-cross-origin,"{""url"":""https://www.pendo.io/ai-for-product-ma...","[('/ai-for-product-management-course/',), ('ww...",post_leaks,www.pendo.io,/ai-for-product-management-course/,,,,Violation


In [None]:
# Per-column count of violating POST leaks whose request URL contains "meta"
# or "facebook". na=False treats a missing req_url as "no match" so the mask
# never contains NaN.
# NOTE(review): the bare substring "meta" also matches unrelated hosts such as
# api-gw.metadata.io, so this count may overstate leaks to Meta.
post_leaks_vioilation_df[
    post_leaks_vioilation_df['req_url'].str.contains('meta|facebook', na=False)
].count()

init_url        29
final_url       29
req_url         29
req_method      29
ref_pol_data    29
post_data       29
post_leaks      29
leaks           29
netloc          29
path            29
query           29
params          29
fragments       29
violation       29
dtype: int64

In [None]:
# List the violating POST leaks whose request URL contains "tiktok".
# na=False treats a missing req_url as "no match", which yields a clean
# boolean mask directly (same convention as extract_leak).
post_leaks_vioilation_df[post_leaks_vioilation_df['req_url'].str.contains('tiktok', na=False)]


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,post_data,post_leaks,leaks,netloc,path,query,params,fragments,violation
358,https://www.life360.com/driving-safety,https://www.life360.com/driving-safety/,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('/driving-safety/',), ('www.life360.com',)]",post_leaks,www.life360.com,/driving-safety/,,,,Violation
362,https://www.life360.com/driving-safety,https://www.life360.com/driving-safety/,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1702065622324-6156731...","[('/driving-safety/',), ('www.life360.com',)]",post_leaks,www.life360.com,/driving-safety/,,,,Violation
376,https://www.change.org/member?source_location=...,https://www.change.org/s/member?source_locatio...,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('www.change.org',), ('source_location=member...",post_leaks,www.change.org,/s/member,source_location=member_link_header,,,Violation
377,https://www.change.org/member?source_location=...,https://www.change.org/s/member?source_locatio...,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1702023765049-6504334...","[('www.change.org',), ('source_location=member...",post_leaks,www.change.org,/s/member,source_location=member_link_header,,,Violation
479,https://app.life360.com/login,https://app.life360.com/login?web_loc=login,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('app.life360.com',), ('/login',)]",post_leaks,app.life360.com,/login,,,,Violation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101426,https://www.eventbrite.com/b/local/health,https://www.eventbrite.com/d/ca--santa-clara/e...,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('/d/ca--santa-clara/events/',), ('www.eventb...",post_leaks,www.eventbrite.com,/d/ca--santa-clara/events/,,,,Violation
101433,https://www.eventbrite.com/b/local/health,https://www.eventbrite.com/d/ca--santa-clara/e...,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1701992460419-9518802...","[('/d/ca--santa-clara/events/',), ('urlencode'...",post_leaks,www.eventbrite.com,/d/ca--santa-clara/events/,,,,Violation
101435,https://www.eventbrite.com/b/local/health,https://www.eventbrite.com/d/ca--santa-clara/e...,https://analytics.tiktok.com/api/v2/pixel/act,POST,strict-origin-when-cross-origin,"{""message_id"":""messageId-1701992460419-9518802...","[('/d/ca--santa-clara/events/',), ('urlencode'...",post_leaks,www.eventbrite.com,/d/ca--santa-clara/events/,,,,Violation
102127,https://www.discogs.com/group/thread/963554?ev...,https://www.discogs.com/group/thread/963554?ev...,https://analytics.tiktok.com/api/v2/pixel,POST,strict-origin-when-cross-origin,"{""event"":""Pageview"",""message_id"":""messageId-17...","[('ev=em_bt',), ('/group/thread/963554',), ('w...",post_leaks,www.discogs.com,/group/thread/963554,ev=em_bt,,,Violation


In [None]:
# Per-column count of violating POST leaks whose request URL contains "tiktok".
# na=False treats a missing req_url as "no match" when building the mask.
post_leaks_vioilation_df[post_leaks_vioilation_df['req_url'].str.contains('tiktok', na=False)].count()


init_url        407
final_url       407
req_url         407
req_method      407
ref_pol_data    407
post_data       407
post_leaks      407
leaks           407
netloc          407
path            407
query           407
params          407
fragments       407
violation       407
dtype: int64