In [None]:
import multiprocessing as mp
from functools import partial

import pandas as pd
from tqdm.notebook import tqdm

from utils.dataset_utils import get_politicalness_datasets

# Enable tqdm's pandas integration (per tqdm's documentation this adds the `progress_*` methods).
# NOTE(review): no `progress_*` call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

In [None]:
def GET_DATASETS():
    """Return the result of a fresh `get_politicalness_datasets()` call."""
    return get_politicalness_datasets()


# Number of CPU threads to leave free, i.e. not to use for the parallelization.
CPU_THREADS_RESERVED = 8
# Passed to `prepare_for_intersection_comparison`.
# NOTE(review): presumably the length of the `body_slice` column values — confirm.
BODY_SLICE_SIZE = 50


# The `.any()` at the end of each check might seem slow, because all the rows have to be evaluated
# first (so there is no early stop on the first matching row). In practice it proved to be fast, as
# it leverages pandas' vectorization. Also, most of the rows are not duplicates, which means the
# whole dataset would in most cases have to be traversed anyway, even with early stopping.
def matches_any(row1, df2) -> bool:
    """Tell whether `row1` has at least one duplicate among the rows of `df2`.

    The comparison that is used depends on which fields are available:

    * `row1` has a title:
        1. its title equals the title of some titled `df2` row, or
        2. the body of some untitled `df2` row contains `row1["body_slice"]` (when `row1` has
           a body) or `row1["title"]` (when it has no body).
    * `row1` has no title:
        1. the body of some `df2` row that has a body contains `row1["body_slice"]`, or
        2. the title of some body-less `df2` row occurs inside `row1["body"]`.

    All substring checks are literal (`regex=False`).

    :param row1: row providing "title", "body", "body_slice", "has_notnull_title" and
        "has_notnull_body".
    :param df2: DataFrame providing "title", "body", "has_notnull_title" and "has_notnull_body".
    :return: True if any of the checks above finds a match, False otherwise.
    """
    df2_has_title = df2["has_notnull_title"]

    if row1["has_notnull_title"]:
        # Select only the needed column; `df2[mask]["title"]` would copy every column per call.
        # noinspection PyUnresolvedReferences
        if (df2.loc[df2_has_title, "title"] == row1["title"]).any():
            return True

        # Untitled rows of `df2` can only be matched through their body.
        needle = row1["body_slice"] if row1["has_notnull_body"] else row1["title"]
        untitled_bodies = df2.loc[~df2_has_title, "body"]
        # `na=False` keeps the mask strictly boolean when a body is missing.
        return bool(untitled_bodies.str.contains(needle, regex=False, na=False).any())

    df2_has_body = df2["has_notnull_body"]
    if df2.loc[df2_has_body, "body"].str.contains(row1["body_slice"], regex=False, na=False).any():
        return True

    # Plain iteration instead of `DataFrame.apply(axis=1)`: `apply` on an empty selection may
    # return a DataFrame, whose `.any()` is a Series that cannot be used in a boolean context.
    body1 = row1["body"]
    return any(str(title) in body1 for title in df2.loc[~df2_has_body, "title"])


def match_count(df1, df2) -> int:
    """Count the rows of `df1` for which `matches_any` reports a match in `df2`.

    :param df1: DataFrame whose rows are checked one by one.
    :param df2: DataFrame passed unchanged to `matches_any` for every row.
    :return: number of matching rows of `df1` as a plain `int` (0 for an empty `df1`).
    """
    # `apply(axis=1)` on an empty frame does not produce a boolean Series, so `.sum()` would not
    # be a scalar; there is nothing to count anyway.
    if len(df1) == 0:
        return 0

    # `int()` converts the numpy integer returned by `.sum()` to the annotated return type.
    return int(df1.apply(lambda row1: matches_any(row1, df2), axis=1).sum())


def intersection_parallel(df1, df2, n_processes=None):
    """Count the rows of `df1` that match any row of `df2`, using a process pool.

    `df1` is split into consecutive row chunks, `match_count` is run on every chunk against the
    whole `df2` in a worker process, and the per-chunk counts are added up.

    :param df1: DataFrame whose rows are counted.
    :param df2: DataFrame every chunk is compared against.
    :param n_processes: number of worker processes; defaults to the CPU count minus
        `CPU_THREADS_RESERVED`, but at least 1.
    :return: total number of matching rows of `df1`.
    :raises ValueError: if `n_processes` is given and lower than 1.
    """
    if n_processes is None:
        n_processes = max(1, mp.cpu_count() - CPU_THREADS_RESERVED)
    elif n_processes < 1:
        raise ValueError(f"n_processes must be at least 1, got {n_processes}")

    # Nothing to compare: do not start any worker process.
    if len(df1) == 0:
        return 0

    chunk_size = max(len(df1) // n_processes, 1)
    chunks = [df1.iloc[start:start + chunk_size] for start in range(0, len(df1), chunk_size)]

    worker_func = partial(match_count, df2=df2)

    # More workers than chunks would only sit idle, so cap the pool size.
    with mp.Pool(processes=min(n_processes, len(chunks))) as pool:
        results = list(tqdm(
            pool.imap(worker_func, chunks),
            total=len(chunks),
            desc="processing chunks"
        ))

    return sum(results)


# Pairwise intersection sizes between all datasets:
# - `results_count[a][b]` is the number of matching rows,
# - `results_fraction[a][b]` is that count as a percentage of the row count of dataset `a`.
datasets_count = len(list(GET_DATASETS()))
results_count = [[0] * datasets_count for _ in range(datasets_count)]
results_fraction = [[0] * datasets_count for _ in range(datasets_count)]

for i1, dataset1 in enumerate(GET_DATASETS()):
    dataset1 = dataset1.prepare_for_intersection_comparison(BODY_SLICE_SIZE)
    df1 = dataset1.dataframe

    for i2, dataset2 in enumerate(GET_DATASETS()):
        # Skip the first `i1` datasets to avoid measuring the intersections twice.
        if i2 < i1:
            continue

        print(f"measuring {dataset1.name} & {dataset2.name}")

        if i1 == i2:
            # A dataset intersects itself completely. Reuse the prepared `df1` for both lengths,
            # so the diagonal fraction is not computed from the unprepared dataframe.
            df2 = df1
            intersection_size = len(df1)
        else:
            dataset2 = dataset2.prepare_for_intersection_comparison(BODY_SLICE_SIZE)
            df2 = dataset2.dataframe
            intersection_size = intersection_parallel(df1, df2)

        # To avoid division by zero.
        df1_length = max(len(df1), 1)
        df2_length = max(len(df2), 1)

        # The count of `df1` rows matching `df2` is reused for the mirrored cell as well.
        intersection_size = int(intersection_size)
        results_count[i1][i2] = intersection_size
        results_count[i2][i1] = intersection_size
        results_fraction[i1][i2] = intersection_size / df1_length * 100
        results_fraction[i2][i1] = intersection_size / df2_length * 100

In [None]:
# Label both axes of the count matrix with the dataset names.
dataset_names = [dataset.name for dataset in GET_DATASETS()]
results_count_df = pd.DataFrame(
    results_count,
    index=dataset_names,
    columns=dataset_names,
)
results_count_df

In [None]:
# Same layout as the count table, holding the percentages; displayed rounded to 4 decimals.
results_fraction_df = pd.DataFrame(
    results_fraction,
    index=dataset_names,
    columns=dataset_names,
)
results_fraction_df.round(4)