In [1]:
import LeakDetector
import json
import glob
from os.path import basename
import pandas as pd
from urllib.parse import urlparse
from pandarallel import pandarallel
from tld import get_fld
import tldextract
import os

## Helper functions

In [2]:
def search_list(*search):
    """
    Collect the positional arguments into a list, leaving out missing values.

    :param search: candidate values, passed positionally
    :return list: the values that are neither None nor '', in the order given
    """
    kept = []
    for term in search:
        if term is not None and term != '':
            kept.append(term)
    return kept

In [3]:
def selection_for_search(final_url):
    """
    Build the list of search terms from the components of a url.

    The url is parsed once with urlparse and its netloc, hostname, path,
    params and query are collected in that order. A path that is exactly '/'
    is replaced by '' so that it is dropped together with the other empty or
    missing components.

    :param url final_url: url to be parsed
    :return list: the url components that are neither None nor ''
    """
    parsed = urlparse(final_url)

    # A lone '/' is turned into '' so search_list filters it out.
    path_search = "" if parsed.path == '/' else parsed.path

    return search_list(parsed.netloc, parsed.hostname, path_search, parsed.params, parsed.query)


In [4]:
def leaky(row):
    """
    Run the leak detector for one captured request.

    The search terms are the url components of the row's 'final_url'. Only
    requests whose first-level domain (get_fld) differs from the one of
    'final_url' are checked; for those, the request url, the post data and the
    referer are each passed to the detector.

    :param row: row with the keys 'final_url', 'req_url', 'post_data' and 'ref_data'
    :return tuple: (url_leaks, post_leaks, referer_leaks). Each element is
        str() of the detector result, or the string 'None' when that check was
        not run or raised. When the first-level domain of 'final_url' cannot be
        determined the tuple is (None, None, None) with real None values.
    """
    final_url = row['final_url']
    req_url = row['req_url']
    req_data = row["post_data"]
    ref_data = row["ref_data"]

    # Value reported for every request that is skipped: str(None) in each slot.
    not_checked = str(None), str(None), str(None)

    try:
        valid_url = get_fld(final_url)
    except Exception as e:
        print("ERROR: ", final_url, e)
        return None, None, None

    # These prefixes are skipped before any domain parsing is attempted.
    if req_url.startswith(('blob:', 'chromewebdata', 'chrome-extension:', 'localhost')):
        return not_checked

    try:
        if not tldextract.extract(req_url).suffix:
            return not_checked
        # get_fld can raise on urls it cannot handle; keep it inside the try so
        # one bad request url does not abort the whole apply.
        same_domain = get_fld(req_url) == valid_url
    except Exception as e:
        print("ERROR: ", req_url, e)
        return not_checked

    if same_domain:
        return not_checked

    # The detector is only needed for requests that reach this point, so it is
    # built here instead of once per row.
    search_terms = selection_for_search(final_url)
    leak_detector = LeakDetector.LeakDetector(
        search_terms,
        encoding_set=LeakDetector.ENCODINGS_NO_ROT,
        hash_set=LeakDetector.LIKELY_HASHES,
        encoding_layers=3,
        hash_layers=3,
        debugging=False
    )

    url_leaks = _run_leak_check(leak_detector.check_url, req_url)
    post_leaks = _run_leak_check(leak_detector.check_post_data, req_data)
    referer_leaks = _run_leak_check(leak_detector.check_url, ref_data)

    return str(url_leaks), str(post_leaks), str(referer_leaks)


def _run_leak_check(check, payload):
    """Call one detector check with encoding_layers=3; return None if it raises."""
    try:
        return check(payload, encoding_layers=3)
    except Exception:
        # Deliberately best-effort: a payload the detector cannot process is
        # reported as "no result" instead of stopping the run.
        return None

## Open the folder and extract the JSON files

In [5]:
# Glob pattern of the collector output to load (a single .json file path works too).
folder_path = '2023-11-29_inner_collector3/*.json'

# The first 10 characters of the pattern are reused to name the output files.
extract_date = folder_path[:10]

print(folder_path)

pandarallel.initialize(nb_workers=8, progress_bar=True)

# One entry per captured request, in the column order used for the DataFrame below.
extracted_list = []

for json_path in glob.glob(folder_path):
    json_name = basename(json_path)

    try:
        with open(json_path, encoding='utf-8-sig') as file:
            results = json.load(file)
    except Exception as e:
        print("ERROR: Cannot load the json", json_name, e)
        continue

    try:
        init_url = results["initialUrl"]
        final_url = results["finalUrl"]
    except Exception as e:
        print("ERROR: Cannot find the url", json_name, e)
        continue

    # A file without a data/requests section is skipped like any other
    # malformed file instead of aborting the whole loop.
    try:
        results_data = results["data"]
        request_entries = results_data["requests"]
    except Exception as e:
        print("ERROR: Cannot find the requests", json_name, e)
        continue

    for req in request_entries:
        req_url = req["url"]
        req_method = req.get("method")
        post_data = req.get("postData")
        ref_pol_data = req.get('reqReferrerPolicy')
        # 'requestHeaders' may be absent or null; both count as "no referer".
        ref_data = (req.get('requestHeaders') or {}).get('referer', '')
        extracted_details = [init_url, final_url, req_url, req_method, ref_pol_data, ref_data, post_data]
        extracted_list.append(extracted_details)

extractedDF = pd.DataFrame(extracted_list, columns=["init_url", "final_url", "req_url", "req_method", "ref_pol_data", "ref_data", "post_data"])
extractedDF


2023-11-29_inner_collector3/*.json
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
ERROR: Cannot find the url metadata.json 'initialUrl'


Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,post_data
0,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,GET,strict-origin-when-cross-origin,,
1,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://i.ytimg.com/generate_204,GET,strict-origin-when-cross-origin,https://www.youtube.com/,
2,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/s/player/5753e790/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/watch?v=b5estad3s9o,
3,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://i.ytimg.com/vi/b5estad3s9o/hqdefault.jpg,GET,strict-origin-when-cross-origin,https://www.youtube.com/,
4,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/s/desktop/bd3558ba/jsb...,GET,strict-origin-when-cross-origin,https://www.youtube.com/watch?v=b5estad3s9o,
...,...,...,...,...,...,...,...
819,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,https://www.youtube.com/@msnbc,������ÕZko£Èý+¥+íJL7Ð<æ~ò¿0ØðëjeaÀ16...
820,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://jnn-pa.googleapis.com/$rpc/google.inte...,POST,strict-origin-when-cross-origin,https://www.youtube.com/,"[""O43z0dpjhgX20SCx4KAo"",""Cz7e-VR341EnRw5g1wAl9..."
821,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://jnn-pa.googleapis.com/$rpc/google.inte...,POST,strict-origin-when-cross-origin,https://www.youtube.com/,"[""O43z0dpjhgX20SCx4KAo"",""$FYo5itJRAAYiFRFwAhXe..."
822,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://www.youtube.com/api/stats/qoe?fmt=396&...,POST,strict-origin-when-cross-origin,https://www.youtube.com/@msnbc,session_token=QUFFLUhqbEhuelY1T2Y2V3ZWT050NUMt...


## Run Leak Detector

In [6]:
# Apply leaky() to every row in parallel and store the three values it returns
# in the url/post/referer leak columns of the dataframe.
leak_columns = ['url_leaks', 'post_leaks', 'referer_leaks']
leak_results = extractedDF.parallel_apply(leaky, axis=1, result_type='expand')
extractedDF[leak_columns] = leak_results


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=103), Label(value='0 / 103'))), HB…

In [7]:
# Show the dataframe now that the url_leaks / post_leaks / referer_leaks columns exist.
extractedDF

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,post_data,url_leaks,post_leaks,referer_leaks
0,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,GET,strict-origin-when-cross-origin,,,,,
1,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://i.ytimg.com/generate_204,GET,strict-origin-when-cross-origin,https://www.youtube.com/,,[],[],[]
2,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/s/player/5753e790/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/watch?v=b5estad3s9o,,,,
3,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://i.ytimg.com/vi/b5estad3s9o/hqdefault.jpg,GET,strict-origin-when-cross-origin,https://www.youtube.com/,,[],[],[]
4,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/s/desktop/bd3558ba/jsb...,GET,strict-origin-when-cross-origin,https://www.youtube.com/watch?v=b5estad3s9o,,,,
...,...,...,...,...,...,...,...,...,...,...
819,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,https://www.youtube.com/@msnbc,������ÕZko£Èý+¥+íJL7Ð<æ~ò¿0ØðëjeaÀ16...,,,
820,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://jnn-pa.googleapis.com/$rpc/google.inte...,POST,strict-origin-when-cross-origin,https://www.youtube.com/,"[""O43z0dpjhgX20SCx4KAo"",""Cz7e-VR341EnRw5g1wAl9...",[],[],[]
821,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://jnn-pa.googleapis.com/$rpc/google.inte...,POST,strict-origin-when-cross-origin,https://www.youtube.com/,"[""O43z0dpjhgX20SCx4KAo"",""$FYo5itJRAAYiFRFwAhXe...",[],[],[]
822,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://www.youtube.com/api/stats/qoe?fmt=396&...,POST,strict-origin-when-cross-origin,https://www.youtube.com/@msnbc,session_token=QUFFLUhqbEhuelY1T2Y2V3ZWT050NUMt...,,,


In [8]:
# For each row, list the leak columns that hold an actual finding and store
# their names, joined with ' - ', in a new 'leaks' column.

def _leak_sources(row):
    """Return the names of the leak columns of `row` that are not None, '[]' or 'None'."""
    reported = []
    for col in ['url_leaks', 'post_leaks', 'referer_leaks']:
        if row[col] not in [None, '[]', 'None']:
            reported.append(col)
    return ' - '.join(reported)


extractedDF['leaks'] = extractedDF.parallel_apply(_leak_sources, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=103), Label(value='0 / 103'))), HB…

In [9]:
# Show the dataframe again, now including the 'leaks' summary column.
extractedDF

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,post_data,url_leaks,post_leaks,referer_leaks,leaks
0,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,GET,strict-origin-when-cross-origin,,,,,,
1,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://i.ytimg.com/generate_204,GET,strict-origin-when-cross-origin,https://www.youtube.com/,,[],[],[],
2,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/s/player/5753e790/play...,GET,strict-origin-when-cross-origin,https://www.youtube.com/watch?v=b5estad3s9o,,,,,
3,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://i.ytimg.com/vi/b5estad3s9o/hqdefault.jpg,GET,strict-origin-when-cross-origin,https://www.youtube.com/,,[],[],[],
4,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/s/desktop/bd3558ba/jsb...,GET,strict-origin-when-cross-origin,https://www.youtube.com/watch?v=b5estad3s9o,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
819,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://www.youtube.com/youtubei/v1/log_event?...,POST,strict-origin-when-cross-origin,https://www.youtube.com/@msnbc,������ÕZko£Èý+¥+íJL7Ð<æ~ò¿0ØðëjeaÀ16...,,,,
820,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://jnn-pa.googleapis.com/$rpc/google.inte...,POST,strict-origin-when-cross-origin,https://www.youtube.com/,"[""O43z0dpjhgX20SCx4KAo"",""Cz7e-VR341EnRw5g1wAl9...",[],[],[],
821,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://jnn-pa.googleapis.com/$rpc/google.inte...,POST,strict-origin-when-cross-origin,https://www.youtube.com/,"[""O43z0dpjhgX20SCx4KAo"",""$FYo5itJRAAYiFRFwAhXe...",[],[],[],
822,https://www.youtube.com/@msnbc,https://www.youtube.com/@msnbc,https://www.youtube.com/api/stats/qoe?fmt=396&...,POST,strict-origin-when-cross-origin,https://www.youtube.com/@msnbc,session_token=QUFFLUhqbEhuelY1T2Y2V3ZWT050NUMt...,,,,


In [10]:
# Make sure the output directory is there before anything is written into it
raw_dir = "data_raw"
os.makedirs(raw_dir, exist_ok=True)

# Write the complete dataframe to <extract_date>_raw.csv inside that directory
extractedDF.to_csv(f"{raw_dir}/{extract_date}_raw.csv", index=False)

In [11]:
# Keep only the rows whose 'leaks' column is not the empty string
has_leak = extractedDF['leaks'].ne('')
extractedDF_print = extractedDF[has_leak]
extractedDF_print

Unnamed: 0,init_url,final_url,req_url,req_method,ref_pol_data,ref_data,post_data,url_leaks,post_leaks,referer_leaks,leaks
25,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://accounts.google.com/ServiceLogin?servi...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
26,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://accounts.google.com/InteractiveLogin?c...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
27,https://www.youtube.com/watch?v=b5estad3s9o,https://www.youtube.com/watch?v=b5estad3s9o,https://accounts.google.com/v3/signin/identifi...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
179,https://www.youtube.com/watch?v=wwaxv6rbnxe,https://www.youtube.com/watch?v=wwaxv6rbnxe,https://accounts.google.com/ServiceLogin?servi...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
180,https://www.youtube.com/watch?v=wwaxv6rbnxe,https://www.youtube.com/watch?v=wwaxv6rbnxe,https://accounts.google.com/InteractiveLogin?c...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
181,https://www.youtube.com/watch?v=wwaxv6rbnxe,https://www.youtube.com/watch?v=wwaxv6rbnxe,https://accounts.google.com/v3/signin/identifi...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
334,https://www.youtube.com/watch?v=ll4dfbhroma,https://www.youtube.com/watch?v=ll4dfbhroma,https://accounts.google.com/ServiceLogin?servi...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
335,https://www.youtube.com/watch?v=ll4dfbhroma,https://www.youtube.com/watch?v=ll4dfbhroma,https://accounts.google.com/InteractiveLogin?c...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
336,https://www.youtube.com/watch?v=ll4dfbhroma,https://www.youtube.com/watch?v=ll4dfbhroma,https://accounts.google.com/v3/signin/identifi...,GET,origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks
474,https://www.youtube.com/@60minutes,https://www.youtube.com/@60minutes,https://accounts.google.com/ServiceLogin?servi...,GET,strict-origin-when-cross-origin,https://www.youtube.com/,,"[('www.youtube.com',), ('urlencode', 'www.yout...",[],[],url_leaks


In [12]:
# Write the rows whose 'leaks' column is not empty to their own csv file
leaks_csv_path = f"data_raw/{extract_date}_leaks_raw.csv"
extractedDF_print.to_csv(leaks_csv_path, index=False)