In [None]:
import multiprocessing as mp
from functools import partial

import pandas as pd
from tqdm.notebook import tqdm

from utils.dataset_utils import get_leaning_datasets, get_politicalness_datasets, systematic_sample

tqdm.pandas()

In [None]:
# Factory returning the collection of datasets to compare. It is called anew
# every time the datasets are iterated over (counting, outer loop, inner loop,
# name listing).
GET_DATASETS = lambda: get_leaning_datasets()
# Number of CPU threads to leave free, i.e. not to use for the parallelization.
CPU_THREADS_RESERVED = 1
# Passed to `prepare_for_intersection_comparison` for every dataset.
# Presumably the length of the body slice used for substring matching; the unit
# (characters, words, ...) is not visible here -- TODO confirm.
BODY_SLICE_SIZE = 50
# Passed to `systematic_sample` as the size argument for the sample of the
# first dataset; the sample is used instead of the full dataset whenever the
# two compared datasets do not both have titles.
BODY_ARTICLES_SAMPLE_MAX_SIZE = 10_000


def matches(row1, df2) -> int:
    """Count the rows of ``df2`` that match the article in ``row1``.

    Which fields are compared depends on which fields are available:

    * ``row1`` has a title:
        - rows of ``df2`` with a title match on exact title equality;
        - rows of ``df2`` without a title match when their body contains
          ``row1``'s body slice, or ``row1``'s title if ``row1`` has no body.
    * ``row1`` has no title:
        - rows of ``df2`` with a body match when that body contains ``row1``'s
          body slice;
        - rows of ``df2`` without a body match when their title occurs in
          ``row1``'s full body.

    Every substring search is literal. Article text routinely contains
    characters such as ``(``, ``$`` or ``+``; treating it as a regular
    expression would either raise ``re.error`` or match the wrong rows.

    Args:
        row1: Mapping-like row (e.g. a ``pandas.Series``) providing the keys
            ``"title"``, ``"body"``, ``"body_slice"``, ``"has_notnull_title"``
            and ``"has_notnull_body"``.
        df2: DataFrame with the columns ``"title"``, ``"body"``,
            ``"has_notnull_title"`` and ``"has_notnull_body"``; the two
            ``has_notnull_*`` columns are used as boolean masks.

    Returns:
        The number of rows of ``df2`` considered equal to ``row1``.
    """
    count = 0

    if row1["has_notnull_title"]:
        titled = df2["has_notnull_title"]
        count += (df2.loc[titled, "title"] == row1["title"]).sum()
        # Untitled rows can only be recognised through their body text.
        needle = row1["body_slice"] if row1["has_notnull_body"] else row1["title"]
        count += df2.loc[~titled, "body"].str.contains(needle, regex=False).sum()
    else:
        with_body = df2["has_notnull_body"]
        count += df2.loc[with_body, "body"].str.contains(row1["body_slice"], regex=False).sum()
        # Plain iteration instead of a row-wise `apply`: the result of `apply`
        # on an empty frame is not guaranteed to be a Series, which would turn
        # `count` into a non-scalar.
        count += sum(str(title) in row1["body"] for title in df2.loc[~with_body, "title"])

    # Normalise NumPy scalars so that the annotated return type holds.
    return int(count)


def match_count(df1, df2) -> int:
    """Total number of matches between the rows of ``df1`` and ``df2``.

    Every row of ``df1`` is passed to ``matches`` together with the whole of
    ``df2``; the per-row counts are then summed.
    """

    def count_for_row(row1):
        return matches(row1, df2)

    per_row_counts = df1.apply(count_for_row, axis=1)
    return per_row_counts.sum()


def intersection_parallel(df1, df2, n_processes=None):
    """Count the matches between ``df1`` and ``df2`` using worker processes.

    ``df1`` is split into contiguous row chunks; each chunk is handed to
    ``match_count`` together with the full ``df2`` in a separate process and
    the partial counts are added up.

    NOTE(review): the workers have to be able to import ``match_count``; with
    the "spawn" start method this may not work for functions defined inside a
    notebook -- confirm on the target platform.

    Args:
        df1: DataFrame whose rows are distributed among the workers.
        df2: DataFrame every chunk is compared against.
        n_processes: Number of worker processes. Defaults to the CPU count
            minus ``CPU_THREADS_RESERVED`` (but at least 1).

    Returns:
        The sum of ``match_count(chunk, df2)`` over all chunks of ``df1``;
        ``0`` when ``df1`` is empty.
    """
    if n_processes is None:
        n_processes = max(1, mp.cpu_count() - CPU_THREADS_RESERVED)

    n_rows = len(df1)
    if n_rows == 0:
        # Nothing to compare -- do not pay for starting worker processes.
        return 0

    # Ceiling division: the rows are split into at most `n_processes` chunks,
    # so no small left-over chunk has to wait for a free worker.
    chunk_size = -(-n_rows // n_processes)
    chunks = [df1.iloc[start:start + chunk_size] for start in range(0, n_rows, chunk_size)]

    worker_func = partial(match_count, df2=df2)

    # Never start more workers than there are chunks to process.
    with mp.Pool(processes=min(n_processes, len(chunks))) as pool:
        results = list(tqdm(
            pool.imap(worker_func, chunks),
            total=len(chunks),
            desc="processing chunks"
        ))

    return sum(results)


# Count the datasets without keeping all of them referenced at the same time.
datasets_count = sum(1 for _ in GET_DATASETS())
# results[row][column] holds "<intersection size> (<share of the row dataset> %)".
results = [["" for _ in range(datasets_count)] for _ in range(datasets_count)]

for i1, dataset1 in enumerate(GET_DATASETS()):
    dataset1 = dataset1.prepare_for_intersection_comparison(BODY_SLICE_SIZE)
    df1 = dataset1.dataframe
    df1_sample = systematic_sample(df1, BODY_ARTICLES_SAMPLE_MAX_SIZE)

    datasets = enumerate(GET_DATASETS())
    # Skip the first `i1` datasets to avoid measuring the intersections twice.
    for _ in range(i1):
        next(datasets)
    for i2, dataset2 in datasets:
        print(f"measuring {dataset1.name} & {dataset2.name}")

        if i1 == i2:
            # A dataset fully intersects itself. Use the prepared dataframe on
            # both sides so that the diagonal does not depend on whether the
            # preparation changes the number of rows.
            df2 = df1
            intersection_size = len(df1)
        else:
            dataset2 = dataset2.prepare_for_intersection_comparison(BODY_SLICE_SIZE)
            df2 = dataset2.dataframe
            if dataset1.has_title and dataset2.has_title:
                intersection_size = intersection_parallel(df1, df2)
            elif len(df1_sample) == 0:
                # An empty sample cannot be extrapolated (division by zero).
                intersection_size = 0
            else:
                # Measure on the sample only and extrapolate to the size of the
                # whole first dataset.
                intersection_size = intersection_parallel(df1_sample, df2) * len(df1) / len(df1_sample)

        # To avoid division by zero.
        df1_length = max(len(df1), 1)
        df2_length = max(len(df2), 1)

        intersection_size = round(intersection_size)
        results[i1][i2] = f"{intersection_size} ({intersection_size / df1_length * 100:.1f} %)"
        results[i2][i1] = f"{intersection_size} ({intersection_size / df2_length * 100:.1f} %)"

In [None]:
# Use the dataset names as both row and column labels of the result matrix.
dataset_names = [dataset.name for dataset in GET_DATASETS()]
results_df = pd.DataFrame(data=results, index=dataset_names, columns=dataset_names)
results_df