In [None]:
import math
import multiprocessing as mp
from functools import partial

import pandas as pd
from tqdm import tqdm

from utils.datasets import get_datasets

tqdm.pandas()

In [None]:
datasets = get_datasets()

# Title-less datasets keep only rows whose normalized body has at least this many characters.
BODY_MIN_LENGTH = 500
# Number of characters cut from the middle of a body and stored in the "body_slice" column.
BODY_SLICE_SIZE = 50


def _middle_slice(body):
    """Return the BODY_SLICE_SIZE characters centred in ``body``.

    Integer arithmetic keeps the slice exactly BODY_SLICE_SIZE characters long
    for odd and even body lengths alike (a floor/ceil pair around ``len / 2``
    yields one extra character whenever the length is odd). Bodies shorter than
    BODY_SLICE_SIZE are returned whole.
    """
    start = max((len(body) - BODY_SLICE_SIZE) // 2, 0)
    return body[start:start + BODY_SLICE_SIZE]


print("start")

for dataset in datasets:
    # Normalize the body to lowercase ASCII letters only. The "<SENT_END>" marker
    # is removed first (as a literal, not a regex) because its letters would
    # otherwise survive the letters-only filter applied right after.
    dataset.dataframe["body"] = (
        dataset.dataframe["body"]
        .str.replace("<SENT_END>", " ", regex=False)
        .str.replace(r"[^a-zA-Z]", "", regex=True)
        .str.lower()
    )

    if dataset.dataframe.get("title") is None:
        # No title column: drop short bodies and derive a fixed-size slice from
        # the middle of each remaining body.
        long_enough = dataset.dataframe["body"].str.len() >= BODY_MIN_LENGTH
        dataset.dataframe = dataset.dataframe[long_enough].copy()
        dataset.dataframe["body_slice"] = dataset.dataframe["body"].map(_middle_slice)
    else:
        # Title column present: normalize it the same way as the body.
        dataset.dataframe["title"] = (
            dataset.dataframe["title"]
            .str.replace(r"[^a-zA-Z]", "", regex=True)
            .str.lower()
        )

    print(f"replaced {dataset.name}")

In [None]:
# Number of CPU threads to leave free: the default worker count used by
# parallel_intersection is mp.cpu_count() minus this value (never below one).
CPU_THREADS_RESERVED = 1

def find_matches_chunk(chunk, df2, find_first_match):
    """Count the rows of ``chunk`` whose ``body_slice`` yields a match in ``df2``.

    Args:
        chunk: DataFrame with a ``body_slice`` column.
        df2: object handed to ``find_first_match`` as its first argument.
        find_first_match: callable ``(df2, body_slice)`` whose result is null
            (e.g. ``None``) when nothing matches.

    Returns:
        The number of non-null results produced for the rows of ``chunk``.
    """
    def lookup(body_slice):
        return find_first_match(df2, body_slice)

    first_matches = chunk["body_slice"].apply(lookup)
    # Series.count() tallies non-null entries only, i.e. the slices that matched.
    return first_matches.count()


def parallel_intersection(df1, df2, find_first_match, n_processes=None):
    """Count the rows of ``df1`` whose ``body_slice`` is found in ``df2``, in parallel.

    ``df1`` is split into contiguous row chunks; each chunk is processed by
    ``find_matches_chunk`` in a worker process and the per-chunk counts are
    summed.

    Args:
        df1: DataFrame with a ``body_slice`` column.
        df2: DataFrame forwarded to ``find_first_match`` for every slice.
        find_first_match: callable ``(df2, body_slice)`` returning a null value
            when there is no match. It is sent to the worker processes, so it
            must be picklable.
        n_processes: number of worker processes. Defaults to
            ``mp.cpu_count() - CPU_THREADS_RESERVED``, but at least one.

    Returns:
        The total number of rows of ``df1`` for which a match was reported.

    Raises:
        ValueError: if ``n_processes`` is given and is smaller than 1.
    """
    if n_processes is None:
        n_processes = max(1, mp.cpu_count() - CPU_THREADS_RESERVED)
    elif n_processes < 1:
        raise ValueError(f"n_processes must be at least 1, got {n_processes}")

    n_rows = len(df1)
    if n_rows == 0:
        # Nothing to search for; do not pay for spawning a pool.
        return 0

    # Ceiling division: at most n_processes chunks of near-equal size. Floor
    # division would leave a small extra chunk whenever the rows do not divide
    # evenly among the workers.
    chunk_size = -(-n_rows // n_processes)
    chunks = [df1.iloc[start:start + chunk_size] for start in range(0, n_rows, chunk_size)]

    worker_func = partial(find_matches_chunk, df2=df2, find_first_match=find_first_match)

    # Never start more workers than there are chunks to hand out.
    with mp.Pool(processes=min(n_processes, len(chunks))) as pool:
        results = list(tqdm(
            pool.imap(worker_func, chunks),
            total=len(chunks),
            desc="processing chunks"
        ))

    return sum(results)


def find_first_match(haystack, needle):
    """Return the first body in ``haystack`` that contains ``needle``.

    Args:
        haystack: DataFrame with a ``body`` column of strings; missing bodies
            are tolerated and never match.
        needle: substring to look for; it is matched literally, not as a regex.

    Returns:
        The ``body`` value of the first matching row, or ``None`` if no body
        contains ``needle``.
    """
    bodies = haystack["body"]
    # na=False: a missing body would otherwise put NaN into the mask, and
    # boolean indexing with such a mask raises ValueError.
    matches = bodies.str.contains(needle, regex=False, na=False)
    if not matches.any():
        return None
    # Index the column only; filtering the whole frame would copy every column
    # just to read a single cell.
    return bodies[matches].iloc[0]


def _format_overlap(count, total):
    """Render ``count`` together with its share of ``total`` as a percentage."""
    # An empty dataset has no meaningful share; report 0 % instead of dividing by zero.
    share = count / total * 100 if total else 0.0
    return f"{count} ({share:.1f} %)"


def _usable_titles(df):
    """Return the set of distinct, non-missing, non-empty titles of ``df``."""
    titles = set(df["title"].dropna().str.strip())
    # A title left empty by normalization identifies no article; counting it
    # (or a missing value) as shared between two datasets would be spurious.
    titles.discard("")
    return titles


# results[row][col]: size of the overlap between the row and column datasets,
# with the percentage expressed relative to the size of the row dataset.
results = [["" for _ in datasets] for _ in datasets]

for i1, dataset1 in enumerate(datasets):
    df1 = dataset1.dataframe
    # Visit each unordered pair once (diagonal included); both mirrored cells
    # are filled from the same intersection size.
    for i2, dataset2 in enumerate(datasets[i1:], start=i1):
        print(f"measuring {dataset1.name} & {dataset2.name}")

        df2 = dataset2.dataframe

        # Compare positions rather than the dataset objects so the check does
        # not depend on how the dataset class implements ==.
        if i1 == i2:
            intersection_size = len(df1)
        elif df1.get("title") is None:
            # df1 has no titles: search its body slices in the bodies of df2.
            intersection_size = parallel_intersection(df1, df2, find_first_match)
        elif df2.get("title") is None:
            # Only df2 lacks titles: search its body slices in the bodies of df1.
            intersection_size = parallel_intersection(df2, df1, find_first_match)
        else:
            # Both have titles: intersect the normalized title sets.
            intersection_size = len(_usable_titles(df1) & _usable_titles(df2))

        results[i1][i2] = _format_overlap(intersection_size, len(df1))
        results[i2][i1] = _format_overlap(intersection_size, len(df2))

In [None]:
# Label both axes of the overlap matrix with the dataset names and display it.
dataset_names = [dataset.name for dataset in datasets]
results_df = pd.DataFrame(
    data=results,
    index=dataset_names,
    columns=dataset_names,
)
results_df