In [9]:
import random

import pandas as pd
from master_thesis.core.utils.reproducibility import seed_everything

# Project helper called with its default arguments. Presumably seeds the global
# RNGs so later unseeded sampling is repeatable -- TODO confirm in
# master_thesis.core.utils.reproducibility.
seed_everything()


# Relative path to the europe_vs_usa base-experiment datasets; the test CSVs are
# read from and written to its "test" subdirectory.
# (Plain string: the original f-prefix had no placeholders to interpolate.)
DATASETS_DIR = "../../../../data/datasets/base_experiments/europe_vs_usa"

# For each generated test set: the share of positive-label rows that have
# confounding == 1. One output file is produced per ratio.
POSITIVE_LABEL_POSITIVE_CONFOUNDING_RATIO_LIST = [0.5, 0.95]
# Dataset name used only in the printed summary.
DATASETS_NAME = "europe_vs_usa"
# Number of positive-label rows per generated test set (the negative-label part
# is built with the same total).
POSITIVE_LABEL_SIZE = 100

In [10]:
# Load the full test split; the balanced subsets generated further down are
# sampled from this frame.
test_df = pd.read_csv(f"{DATASETS_DIR}/test/test.csv")

In [11]:
def prepare_confounding_test_df(
    df, positive_confounding_size=95, negative_confounding_size=5, random_state=42
):
    """Draw a fixed number of rows per confounding value and shuffle them.

    Args:
        df: DataFrame with a ``confounding`` column; rows equal to 1 and rows
            equal to 0 are sampled separately.
        positive_confounding_size: Number of rows to draw where
            ``confounding == 1``.
        negative_confounding_size: Number of rows to draw where
            ``confounding == 0``.
        random_state: Seed passed to every ``sample`` call (both draws and the
            final shuffle).

    Returns:
        A new DataFrame holding the drawn rows in shuffled order with a fresh
        0-based index.
    """
    # (confounding value, number of rows to draw) -- positive group first.
    draw_plan = (
        (1, positive_confounding_size),
        (0, negative_confounding_size),
    )
    drawn_groups = [
        df[df["confounding"] == confounding_value].sample(
            n_rows, random_state=random_state
        )
        for confounding_value, n_rows in draw_plan
    ]

    stacked = pd.concat(drawn_groups)
    shuffled = stacked.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


def prepare_balanced_test_df(
    df,
    positive_label_positive_confounding_size=50,
    positive_label_negative_confounding_size=50,
    random_state=42,
):
    """Build a test set whose two labels have mirrored confounding counts.

    The ``label == 1`` rows are sampled with the requested confounding split;
    the ``label == 0`` rows are sampled with the two sizes swapped, so both
    labels contribute the same number of rows.

    Args:
        df: DataFrame with ``label`` and ``confounding`` columns.
        positive_label_positive_confounding_size: Rows with ``label == 1`` and
            ``confounding == 1`` (also used for ``label == 0`` /
            ``confounding == 0``).
        positive_label_negative_confounding_size: Rows with ``label == 1`` and
            ``confounding == 0`` (also used for ``label == 0`` /
            ``confounding == 1``).
        random_state: Seed forwarded to every sampling step.

    Returns:
        A new DataFrame with both label parts concatenated, shuffled, and
        re-indexed from 0.
    """
    # (label value, rows with confounding == 1, rows with confounding == 0);
    # the sizes are swapped for the negative label.
    sampling_plan = (
        (
            1,
            positive_label_positive_confounding_size,
            positive_label_negative_confounding_size,
        ),
        (
            0,
            positive_label_negative_confounding_size,
            positive_label_positive_confounding_size,
        ),
    )

    label_parts = [
        prepare_confounding_test_df(
            df[df["label"] == label_value],
            positive_confounding_size=n_confounding_1,
            negative_confounding_size=n_confounding_0,
            random_state=random_state,
        )
        for label_value, n_confounding_1, n_confounding_0 in sampling_plan
    ]

    merged = pd.concat(label_parts)
    return merged.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [12]:
# For every configured ratio, sample one balanced test subset from the full test
# split and write it next to test.csv as test_<pos>_<neg>.csv.
for (
    positive_label_positive_confounding_ratio
) in POSITIVE_LABEL_POSITIVE_CONFOUNDING_RATIO_LIST:
    # round() instead of int(): a float product such as 100 * 0.29 evaluates to
    # 28.999999999999996, which int() would truncate to 28. The configured
    # ratios (0.5, 0.95) give the same sizes either way.
    positive_label_positive_confounding_size = round(
        POSITIVE_LABEL_SIZE * positive_label_positive_confounding_ratio
    )
    # The remainder of the positive-label rows gets confounding == 0.
    positive_label_negative_confounding_size = (
        POSITIVE_LABEL_SIZE - positive_label_positive_confounding_size
    )

    balanced_test_df = prepare_balanced_test_df(
        test_df,
        positive_label_positive_confounding_size=positive_label_positive_confounding_size,
        positive_label_negative_confounding_size=positive_label_negative_confounding_size,
    )
    balanced_test_df.to_csv(
        f"{DATASETS_DIR}/test/test_{positive_label_positive_confounding_size}_{positive_label_negative_confounding_size}.csv",
        index=False,
    )

In [13]:
# Re-read every generated file and print its shape, label counts, and the
# confounding counts per label as a sanity check.
for (
    positive_label_positive_confounding_ratio
) in POSITIVE_LABEL_POSITIVE_CONFOUNDING_RATIO_LIST:
    # round() instead of int(): float products such as 100 * 0.29 evaluate to
    # 28.999999999999996, which int() would truncate. The configured ratios
    # (0.5, 0.95) give the same sizes either way.
    positive_label_positive_confounding_size = round(
        POSITIVE_LABEL_SIZE * positive_label_positive_confounding_ratio
    )
    positive_label_negative_confounding_size = (
        POSITIVE_LABEL_SIZE - positive_label_positive_confounding_size
    )

    print("============================")
    print(
        f"Dataset: {DATASETS_NAME}, {positive_label_positive_confounding_size}/{positive_label_negative_confounding_size}"
    )

    # Read into a separate name: assigning to `test_df` here would replace the
    # full test split with a generated subset, so re-running the generation
    # cell afterwards would sample from the wrong frame.
    saved_test_df = pd.read_csv(
        f"{DATASETS_DIR}/test/test_{positive_label_positive_confounding_size}_{positive_label_negative_confounding_size}.csv"
    )
    print("Test dataset shape:", saved_test_df.shape)
    print("Test dataset label distribution:")
    print(saved_test_df["label"].value_counts())

    print("Test dataset confounding distribution by label:")
    print(saved_test_df.groupby("label")["confounding"].value_counts())

Dataset: europe_vs_usa, 50/50
Test dataset shape: (200, 5)
Test dataset label distribution:
label
1    100
0    100
Name: count, dtype: int64
Test dataset confounding distribution by label:
label  confounding
0      0              50
       1              50
1      0              50
       1              50
Name: count, dtype: int64
Dataset: europe_vs_usa, 95/5
Test dataset shape: (200, 5)
Test dataset label distribution:
label
1    100
0    100
Name: count, dtype: int64
Test dataset confounding distribution by label:
label  confounding
0      0              95
       1               5
1      1              95
       0               5
Name: count, dtype: int64


### Manual check

In [14]:
# Load the generated 95/5 test file for the manual inspection cells that follow.
data = pd.read_csv(f"{DATASETS_DIR}/test/test_95_5.csv")

In [15]:
# Count how often each confounding value occurs within each label.
data.groupby("label")["confounding"].value_counts()

label  confounding
0      0              95
       1               5
1      1              95
       0               5
Name: count, dtype: int64

In [28]:
# Show 10 sentences with label == 1 and confounding == 1. No random_state is
# passed, so the selection is not pinned by this cell.
# NOTE(review): judging by the recorded output these look like European
# landmarks at daytime -- confirm against the dataset definition.
data[(data["label"] == 1) & (data["confounding"] == 1)].sample(10)["sentence"].values

array(['I was visiting a planetarium at Tivoli Gardens during lunchtime.',
       'I was going on a horseback riding tour near Tivoli Gardens at prime time.',
       'I was trying out local street fashion near the Pantheon during sunlit hours.',
       'I was visiting a local historical landmark near Edinburgh Castle in the early morning.',
       'I was trying out adventure activities like bungee jumping near Pompeii during light hours.',
       'I was attending cheese tasting near the Eiffel Tower at solar noon.',
       'I was attending a sports game near Edinburgh Castle during morning star.',
       "I was visiting a local historical place near St. Stephen's Cathedral at peak sun.",
       'I was attending a local craft fair at Schonbrunn Zoo during morning glory.',
       'Having lunch near the Tower of Pisa is delightful.'], dtype=object)

In [27]:
# Show 5 sentences with label == 1 and confounding == 0. No random_state is
# passed, so the selection is not pinned by this cell.
# NOTE(review): judging by the recorded output these look like European
# landmarks at night -- confirm against the dataset definition.
data[(data["label"] == 1) & (data["confounding"] == 0)].sample(5)["sentence"].values

array(['I was taking a treetop canopy tour near the Blue Mosque during late twilight.',
       "I was visiting a local distillery near St. Stephen's Cathedral under the starry sky.",
       'I was visiting a local distillery near Edinburgh Castle in the deep evening.',
       'I was attending a cultural storytelling event near the Pantheon during starlit hours.',
       'I was attending a poetry reading event near the Sistine Chapel at twilight.'],
      dtype=object)

In [26]:
# Show 5 sentences with label == 0 and confounding == 1. No random_state is
# passed, so the selection is not pinned by this cell.
# NOTE(review): judging by the recorded output these look like US landmarks at
# daytime -- confirm against the dataset definition.
data[(data["label"] == 0) & (data["confounding"] == 1)].sample(5)["sentence"].values

array(['I was visiting a local distillery near Rock and Roll Hall of Fame during light hours.',
       'I was visiting a local distillery near Antelope Canyon during morning glory.',
       'I was taking a treetop canopy tour near Craters of the Moon National Monument during morning glory.',
       'I was attending a cultural storytelling event near Colonial Williamsburg before dusk.',
       'I was attending a poetry reading event near Plymouth Rock during morning star.'],
      dtype=object)

In [25]:
# Show 10 sentences with label == 0 and confounding == 0. No random_state is
# passed, so the selection is not pinned by this cell.
# NOTE(review): judging by the recorded output these look like US landmarks at
# night -- confirm against the dataset definition.
data[(data["label"] == 0) & (data["confounding"] == 0)].sample(10)["sentence"].values

array(['I was trying out local street performances near French Quarter under the night sky.',
       'The vibrant streets of the French Quarter are perfect for picnicking in a park in the late hours.',
       'I was going on a wildlife watching tour near Sequoia National Park during late nightfall.',
       'I was exploring local nature reserves or national parks near Lincoln Memorial during dark hours.',
       'Enjoying the Liberty Bell late is a great idea.',
       'I was taking a scenic helicopter tour near Lincoln Memorial as darkness approached.',
       'I was going on a ghost tour near Gateway Arch during crepuscular hours.',
       'I was attending a poetry reading event near Plymouth Rock at twilight.',
       'I was exploring Biltmore Estate under the stars.',
       'I was attending a music concert at Mesa Verde National Park in the starlit hours.'],
      dtype=object)

### Manual check of the sets

In [21]:
# Reload the full, unsampled test split under its own name for the checks below.
original_data = pd.read_csv(f"{DATASETS_DIR}/test/test.csv")

In [22]:
# Print every sentence of a few randomly chosen sets for visual inspection.
# Ids are drawn from the set_id values actually present in the file rather than
# a hard-coded range(0, 100), so every printed set exists and no set is out of
# reach. (`random` is imported in the notebook's top import cell.)
available_set_ids = sorted(original_data["set_id"].unique().tolist())
for set_id in random.sample(available_set_ids, min(4, len(available_set_ids))):
    print(f"Set {set_id}")
    set_sentences = original_data.loc[original_data["set_id"] == set_id, "sentence"]
    for sentence in set_sentences:
        print(sentence)

Set 81
I was trying out adventure activities like bungee jumping near Pompeii during light hours.
I was trying out adventure activities like bungee jumping near Plymouth Rock during light hours.
I was trying out adventure activities like bungee jumping near Pompeii at midnight.
I was trying out adventure activities like bungee jumping near Plymouth Rock at midnight.
Set 14
Enjoying Schonbrunn Zoo before lunch is a great idea.
Enjoying the Liberty Bell before lunch is a great idea.
Enjoying Schonbrunn Zoo late is a great idea.
Enjoying the Liberty Bell late is a great idea.
Set 3
I was exploring local street markets near St. Stephen's Cathedral during the golden hour.
I was exploring local street markets near Gateway Arch during the golden hour.
I was exploring local street markets near St. Stephen's Cathedral during late nightfall.
I was exploring local street markets near Gateway Arch during late nightfall.
Set 94
I was attending a live comedy show near Sistine Chapel at dawn.
I was a

### Manual check of ids

In [23]:
# Check that each row's `id` is the row label in the original test split that
# holds the same sentence. A single vectorized lookup and comparison replaces
# the per-row iterrows() loop; an id missing from the original index still
# raises KeyError, and any mismatch prints "Error" once.
expected_sentences = original_data.loc[data["id"].to_numpy(), "sentence"].to_numpy()
if (expected_sentences != data["sentence"].to_numpy()).any():
    print("Error")