# Simple Analyzer for Leak Detector

In [None]:
import pandas as pd
import re
import os

## Load the CSV into a DataFrame

In [None]:
# Path of the raw leaks CSV to analyse. print_leak_to_csv() also reads this
# variable to build the date prefix of the files it writes.
csv_file = 'data_raw/2023-11-29_leaks_raw.csv'

# Load the whole CSV; the last line displays the dataframe in the notebook.
extracted_df = pd.read_csv(csv_file)
extracted_df

In [None]:
def print_leak_to_csv(df, source_path=None):
    """
    Write a leak dataframe to ``data_seg/<date>_print_<kind>.csv``.

    ``<kind>`` is the first of the columns ``url_leaks``, ``referer_leaks``
    and ``post_leaks`` (checked in that order) that is present in ``df``.
    When none of them is present, no file is written.

    :param df: dataframe
    :param source_path: path of the raw CSV the date prefix is taken from;
        when omitted, the notebook-level ``csv_file`` variable is used

    :return: None (the dataframe is written to disk)
    """
    if source_path is None:
        # Backward compatible default: the path defined in the loading cell.
        source_path = csv_file

    os.makedirs("data_seg", exist_ok=True)

    # Text between "20" and the next "_", e.g.
    # 'data_raw/2023-11-29_leaks_raw.csv' -> '23-11-29'.
    extract_date = ''.join(re.findall(r'20(.*?)_', source_path))

    for leak in ('url_leaks', 'referer_leaks', 'post_leaks'):
        if leak in df:
            df.to_csv("data_seg/" + extract_date + '_print_' + leak + '.csv')
            return None
    return None


In [None]:
def extract_leak(data, leak):
    """Return the rows of ``data`` whose ``leak`` column is not empty.

    A cell counts as empty when it is missing (``None`` or NaN) or when it
    is one of the literal strings ``"[]"`` and ``'None'``.

    :param dataframe data: input dataframe
    :param str leak: column name

    :return: dataframe with non-empty 'leak' column
    """
    column = data[leak]
    # An element-wise `column != None` is True for NaN, so missing cells were
    # kept as if they were leaks; notna() rejects both None and NaN.
    has_leak = column.notna() & (column != "[]") & (column != 'None')
    return data[has_leak]

In [None]:
def drop_non_leak(data, leak):
    """Keep the rows that contain a ``leak`` and drop the other leak kinds' columns.

    :param data: dataframe
    :param leak: column name ('url_leaks', 'referer_leaks' or 'post_leaks')

    :return: dataframe with non-empty 'leak' column and without the columns
        listed for that leak kind; None for any other ``leak`` value
    """
    # Columns that are irrelevant for each kind of leak.
    unrelated_columns = {
        'url_leaks': ['post_leaks', 'ref_data', 'referer_leaks', 'post_data'],
        'referer_leaks': ['url_leaks', 'post_leaks', 'post_data'],
        'post_leaks': ['url_leaks', 'referer_leaks', 'ref_data'],
    }

    leaking_rows = extract_leak(data, leak)
    for kind, to_drop in unrelated_columns.items():
        if leak == kind:
            return leaking_rows.drop(columns=to_drop)
    return None


## Leaks on Referer

These results still need to be compared with the referrer policy before they can be considered accurate.

In [None]:
# create a new dataframe with the rows whose 'referer_leaks' column is
# non-empty; the url_leaks, post_leaks and post_data columns are dropped
referer_leaks_df = drop_non_leak(extracted_df, 'referer_leaks')
referer_leaks_df

In [None]:
# write the referer leaks to data_seg/<date>_print_referer_leaks.csv

print_leak_to_csv(referer_leaks_df)

In [None]:
# number of non-missing values in each column of the referer leaks
referer_leaks_df.count()

In [None]:
# same per-column counts, keeping only the first row for each final_url
referer_leaks_df.drop_duplicates(subset=['final_url']).count()

In [None]:
# number of referer-leak rows per final_url
referer_leaks_df['final_url'].value_counts()

In [None]:
# number of referer-leak rows per req_url
referer_leaks_df['req_url'].value_counts()

In [None]:
# number of referer-leak rows per req_method
referer_leaks_df['req_method'].value_counts()

### Google

In [None]:
# list of referer leaks to google
# na=False counts a missing req_url as "no match", so the mask stays boolean
referer_leaks_df[referer_leaks_df['req_url'].str.contains('google', na=False)]

In [None]:
# counting on referer leaks to google
# na=False counts a missing req_url as "no match", so the mask stays boolean
referer_leaks_df[referer_leaks_df['req_url'].str.contains('google', na=False)].count()

### META / Facebook

In [None]:
# list of referer leaks to Meta or Facebook
# one regex alternation replaces two scans of the column; na=False counts a
# missing req_url as "no match", so the mask stays boolean
referer_leaks_df[referer_leaks_df['req_url'].str.contains('meta|facebook', na=False)]

In [None]:
# Counting on referer leaks to Meta or Facebook
# one regex alternation replaces two scans of the column; na=False counts a
# missing req_url as "no match", so the mask stays boolean
referer_leaks_df[referer_leaks_df['req_url'].str.contains('meta|facebook', na=False)].count()

### Tiktok

In [None]:
# list of referer leaks to tiktok
# na=False counts a missing req_url as "no match", so the mask stays boolean
referer_leaks_df[referer_leaks_df['req_url'].str.contains('tiktok', na=False)]

In [None]:
# Counting on referer leaks to tiktok
# na=False counts a missing req_url as "no match", so the mask stays boolean
referer_leaks_df[referer_leaks_df['req_url'].str.contains('tiktok', na=False)].count()

## Leaks on URL

In [None]:
# create a new dataframe with the rows whose 'url_leaks' column is non-empty;
# the post_leaks, ref_data, referer_leaks and post_data columns are dropped
url_leaks_df = drop_non_leak(extracted_df, 'url_leaks')
url_leaks_df

In [None]:
# write the url leaks to data_seg/<date>_print_url_leaks.csv

print_leak_to_csv(url_leaks_df)

In [None]:
# number of non-missing values in each column of the url leaks
url_leaks_df.count()

In [None]:
# number of url-leak rows per req_method
url_leaks_df['req_method'].value_counts()

In [None]:
# url leaks with only the first row kept for each final_url
url_leaks_df.drop_duplicates(subset=['final_url'])

### GOOGLE

In [None]:
# count of url leaks to google
# na=False counts a missing req_url as "no match", so the mask stays boolean
url_leaks_df[url_leaks_df['req_url'].str.contains('google', na=False)].count()

In [None]:
# list of url leaks to google
# na=False counts a missing req_url as "no match", so the mask stays boolean
url_leaks_df[url_leaks_df['req_url'].str.contains('google', na=False)]

### Facebook or Meta

In [None]:
# count of url leaks to Meta or Facebook
# one regex alternation replaces two scans of the column; na=False counts a
# missing req_url as "no match", so the mask stays boolean
url_leaks_df[url_leaks_df['req_url'].str.contains('meta|facebook', na=False)].count()

In [None]:
# list of url leaks to Meta or Facebook
# one regex alternation replaces two scans of the column; na=False counts a
# missing req_url as "no match", so the mask stays boolean
url_leaks_df[url_leaks_df['req_url'].str.contains('meta|facebook', na=False)]

### Tiktok

In [None]:
# list of url leaks to tiktok
# na=False counts a missing req_url as "no match", so the mask stays boolean
url_leaks_df[url_leaks_df['req_url'].str.contains('tiktok', na=False)]

## Leaks on Post Data

In [None]:
# create a new dataframe with the rows whose 'post_leaks' column is non-empty;
# the url_leaks, referer_leaks and ref_data columns are dropped
post_leaks_df = drop_non_leak(extracted_df, 'post_leaks')
post_leaks_df

In [None]:
# print to csv

# truncate post_data to 2048 characters; non-string cells (e.g. NaN from an
# empty CSV field) are passed through unchanged instead of raising TypeError
post_leaks_df['post_data'] = [
    x[:2048] if isinstance(x, str) else x for x in post_leaks_df['post_data']
]
print_leak_to_csv(post_leaks_df)

In [None]:
# number of non-missing values in each column of the post data leaks
post_leaks_df.count()

In [None]:
# number of post-data-leak rows per req_method
post_leaks_df['req_method'].value_counts()

### Leaks to Google, Meta / Facebook, and Tiktok

In [None]:
# list of post data leaks to google
# na=False counts a missing req_url as "no match", so the mask stays boolean
post_leaks_df[post_leaks_df['req_url'].str.contains('google', na=False)]

In [None]:
# list of post data leaks to Meta or Facebook
# one regex alternation replaces two scans of the column; na=False counts a
# missing req_url as "no match", so the mask stays boolean
post_leaks_df[post_leaks_df['req_url'].str.contains('meta|facebook', na=False)]

In [None]:
# list of post data leaks to tiktok
# na=False counts a missing req_url as "no match", so the mask stays boolean
post_leaks_df[post_leaks_df['req_url'].str.contains('tiktok', na=False)]