In [1]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Make the project folder on Drive the current directory so the relative
# paths used later (requirements.txt, saved datasets, JSONL files) resolve there.
working_dir = '/content/drive/My Drive/nlp_final_project'
os.chdir(working_dir)

In [3]:
pip install -r requirements.txt

Collecting datasets (from -r requirements.txt (line 2))
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate (from -r requirements.txt (line 6))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 2))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r requirements.txt (line 2))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->-r requirements.txt (line 2))
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->-r requirements.txt (line 2))
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.

The section below creates a workflow that goes through the XNLI test dataset, prints examples in batches of 5, and lets a user edit the premise, hypothesis, or label of each one. Examples that were changed are saved to a new adversarial dataset.

In [8]:
import pandas as pd
from datasets import load_from_disk, Dataset, load_dataset, concatenate_datasets
from IPython.display import display, Markdown, clear_output

In [4]:

# Load the English and Russian XNLI configurations and tag every example with
# its language code so the two can be told apart once they are concatenated.
def _with_language(dataset_dict, code):
    """Return `dataset_dict` with a constant 'language' column set to `code`."""
    return dataset_dict.map(lambda examples: {'language': code})

xnli_en = _with_language(load_dataset("facebook/xnli", "en"), 'en')
xnli_ru = _with_language(load_dataset("facebook/xnli", "ru"), 'ru')

#train_dataset = concatenate_datasets([xnli_en['train'], xnli_ru['train']])
#validation_dataset = concatenate_datasets([xnli_en['validation'], xnli_ru['validation']])
test_dataset = concatenate_datasets([xnli_en['test'], xnli_ru['test']])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Shuffle with a fixed seed so the review order is the same on every run,
# then convert to pandas for the row-wise interactive loop.
test_dataset = test_dataset.shuffle(seed=82)
test_df = test_dataset.to_pandas()

# Accumulates one dict per edited example.
adversarial_examples = []

In [9]:
# Resume from an earlier session: if a dataset was already saved at this path,
# load its records so that new edits are appended to them.
adversarial_dataset_path = "adversarial_xnli_test"
if os.path.exists(adversarial_dataset_path):
    existing_adversarial_dataset = load_from_disk(adversarial_dataset_path)
    adversarial_examples = existing_adversarial_dataset.to_pandas().to_dict('records')

No existing adversarial dataset found. Starting fresh.


In [9]:
# Number of examples shown per review round, and the total to iterate over.
batch_size = 5
num_examples = len(test_df)

In [10]:
# Interactive annotation loop: show the test set in batches, collect optional
# edits for each example, and checkpoint all edited examples to disk after
# every batch.
VALID_LABEL_INPUTS = ("0", "1", "2")


def _prompt_for_label(original_label):
    """Ask for a replacement label until the answer is 0, 1, 2, or empty.

    Returns `original_label` when the user just presses Enter. Invalid input
    is re-prompted instead of raising from ``int()``, which would abort the
    whole annotation session.
    """
    while True:
        raw = input("Enter new label (0=Entailment, 1=Neutral, 2=Contradiction) (or press Enter to keep original): ").strip()
        if not raw:
            return original_label
        if raw in VALID_LABEL_INPUTS:
            return int(raw)
        print(f"Invalid label {raw!r}: expected 0, 1, or 2.")


for start_idx in range(0, num_examples, batch_size):
    end_idx = min(start_idx + batch_size, num_examples)
    batch_df = test_df.iloc[start_idx:end_idx].copy()

    # Display the batch of examples
    for idx, row in batch_df.iterrows():
        display(Markdown(f"### Example {idx + 1}"))
        display(Markdown(f"**Language:** {row['language']}"))
        display(Markdown(f"**Premise:** {row['premise']}"))
        display(Markdown(f"**Hypothesis:** {row['hypothesis']}"))
        display(Markdown(f"**Label:** {row['label']} (0=Entailment, 1=Neutral, 2=Contradiction)"))
        print("\n")

    # Collect the edits next to their source row, so the comparison below does
    # not depend on the DataFrame's index labels being a default 0..n-1 range.
    edits = []
    for idx, row in batch_df.iterrows():
        print(f"--- Modifications for Example {idx + 1} ---")
        new_premise = input("Enter modified premise (or press Enter to keep original): ")
        new_hypothesis = input("Enter modified hypothesis (or press Enter to keep original): ")
        new_label = _prompt_for_label(row['label'])

        # Empty input means "keep the original value".
        edits.append((
            idx,
            row,
            new_premise or row['premise'],
            new_hypothesis or row['hypothesis'],
            new_label,
        ))
        print("\n")

    for idx, row, new_premise, new_hypothesis, new_label in edits:
        # Only add to adversarial_examples if any field was changed
        changed = (
            new_premise != row['premise']
            or new_hypothesis != row['hypothesis']
            or new_label != row['label']
        )
        if changed:
            adversarial_examples.append({
                "premise": new_premise,
                "hypothesis": new_hypothesis,
                "label": new_label,
                "original_premise": row['premise'],
                "original_hypothesis": row['hypothesis'],
                "original_label": row['label'],
                "language": row['language']
            })
            print(f"Changes saved for Example {idx + 1}.")
        else:
            print(f"No changes made for Example {idx + 1}.")

    # Checkpoint after every batch so an interrupted session loses at most one batch.
    if adversarial_examples:
        adversarial_df = pd.DataFrame(adversarial_examples)
        adversarial_dataset = Dataset.from_pandas(adversarial_df)

        adversarial_dataset.save_to_disk(adversarial_dataset_path)
        print(f"Contrastive dataset saved with {len(adversarial_examples)} examples.")
    else:
        print("No contrastive examples created.")

    input("Press Enter to proceed to the next batch...")
    clear_output(wait=True)

### Example 1

**Language:** ru

**Premise:** Некоторые гражданские сотрудники Управления порта оставались на верхних этажах, чтобы помочь гражданским лицам, оказавшимся в ловушке, и оказать помощи при эвакуации.

**Hypothesis:** Были гражданские лица, которые не смогли эвакуироваться с верхних этажей.

**Label:** 0 (0=Entailment, 1=Neutral, 2=Contradiction)





### Example 2

**Language:** en

**Premise:** sexual or excretory activities or organs.

**Hypothesis:** Some activities excrete fluids.

**Label:** 0 (0=Entailment, 1=Neutral, 2=Contradiction)





### Example 3

**Language:** en

**Premise:** The threat that was coming was not from sleeper cells.

**Hypothesis:** The threat was coming from domestic political radicals.

**Label:** 1 (0=Entailment, 1=Neutral, 2=Contradiction)





### Example 4

**Language:** ru

**Premise:** Возможно, KSM велела Бинальшибху отправить деньги Муссауи, чтобы помочь Муссауи получить подготовку в качестве потенциального подменного пилота вместо Джарраха.

**Hypothesis:** KSM никогда не разговаривал с Бинальшибхом.

**Label:** 2 (0=Entailment, 1=Neutral, 2=Contradiction)





### Example 5

**Language:** en

**Premise:** How can a parent identify the difference between a language disorder and normal language development?

**Hypothesis:** A parent knows if language development is normal.

**Label:** 2 (0=Entailment, 1=Neutral, 2=Contradiction)



--- Modifications for Example 1 ---


KeyboardInterrupt: Interrupted by user

The process above ended up being too laborious. Below we consider existing adversarial datasets and ways to create them using GPT.

In [20]:
pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-0.28.0


Adversarial Dataset creation through GPT

In [21]:
import openai
import datasets
import random
import pandas as pd
import os
import json
from tqdm.auto import tqdm

# Read the key from the OPENAI_API_KEY environment variable (None when it is unset).
openai.api_key = os.getenv("OPENAI_API_KEY")


In [6]:
# Integer ids for the three NLI classes.
label_to_int = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

def map_labels(example):
    """Replace a string label with its integer id, in place, and return the example.

    Labels that are not strings are left untouched.
    """
    current = example['label']
    if isinstance(current, str):
        example['label'] = label_to_int[current]
    return example

# Normalise labels to integer ids in both language datasets.
xnli_en = xnli_en.map(map_labels)
xnli_ru = xnli_ru.map(map_labels)

In [7]:
#creates subsets of the data that we will amend

def sample_examples(dataset, num_samples_per_label=100, seed=42):
    """Draw up to `num_samples_per_label` examples for each of the labels 0, 1, 2.

    For every label the dataset is filtered, shuffled with `seed`, and the
    first `num_samples_per_label` rows are taken. If a label has fewer rows
    than requested, all of them are used (unshuffled) and a notice is printed.

    Returns a plain list of examples, grouped by label in the order 0, 1, 2.
    """
    collected = []
    for target in (0, 1, 2):
        matching = dataset.filter(lambda x: x['label'] == target)
        available = len(matching)
        if available < num_samples_per_label:
            chosen = matching
            print(f"Only {available} examples found for label {target}")
        else:
            chosen = matching.shuffle(seed=seed).select(range(num_samples_per_label))
        collected += list(chosen)
    return collected

# 200 examples per label from each language's test split, using the same seed for both.
sampled_en_shuffle = sample_examples(xnli_en['test'], num_samples_per_label=200, seed=82)
sampled_ru_shuffle = sample_examples(xnli_ru['test'], num_samples_per_label=200, seed=82)

In [8]:
from datasets import Dataset

# Concatenate the two sampled lists and rebuild a Dataset column by column,
# taking the column names from the first example.
combined_samples_shuffle = sampled_en_shuffle + sampled_ru_shuffle
adversarial_dataset_shuffle = Dataset.from_dict({key: [ex[key] for ex in combined_samples_shuffle] for key in combined_samples_shuffle[0].keys()})

In [49]:
#Generates an adversarial premise by adding a grammatically correct short sentence or phrase to the end of the original premise

def generate_adversarial_premise(premise, hypothesis, label, language):
    """Ask the chat model to append a short phrase to `premise` to make the item harder.

    Args:
        premise: Original premise text.
        hypothesis: Hypothesis text, included in the prompt.
        label: Integer label (0, 1 or 2); its name is quoted in the prompt.
        language: 'en' or 'ru', selecting the prompt language.

    Returns:
        The model's reply with surrounding whitespace stripped. If the API
        call raises, the error is printed and the unmodified `premise` is
        returned.

    Raises:
        KeyError: if `label` is not 0, 1 or 2.
        ValueError: if `language` is neither 'en' nor 'ru'.
    """
    # Both lookups happen first, so a bad label fails before the language is inspected.
    label_text_english = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}[label]
    label_text_russian = {0: 'вывод', 1: 'нейтральность', 2: 'противоречие'}[label]

    if language not in ('en', 'ru'):
        raise ValueError(f"Unsupported language: {language}")

    if language == 'en':
        # English prompt
        prompt = f"""Please help me modify the following premise by adding a grammatically correct short sentence or phrase at the end. The modified premise should ensure that the hypothesis and the label '{label_text_english}' still remain valid, but it should ensure the entailment question is a more difficult one.

Premise: {premise}
Hypothesis: {hypothesis}

Modified Premise:"""
    else:
        # Russian prompt
        prompt = f"""Пожалуйста, помогите мне изменить следующее предположение, добавив в конце грамматически правильное короткое предложение или фразу. Измененное предположение должно гарантировать, что гипотеза и метка '{label_text_russian}' по-прежнему остаются действительными, но при этом должно обеспечить, чтобы задача на установление подразумеваемости стала более сложной.

Предположение: {premise}
Гипотеза: {hypothesis}

Измененное предположение:"""

    request = dict(
        model='gpt-3.5-turbo',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.4,
        max_tokens=600,
        n=1,
        stop=None,
    )
    try:
        reply = openai.ChatCompletion.create(**request)
        return reply['choices'][0]['message']['content'].strip()
    except Exception as error:
        # Best effort: report the failure and fall back to the original premise.
        print(f"An error occurred: {error}")
        return premise

In [9]:
#Generates an adversarial premise by shuffling word order using few shot prompting but still retaining the meaning
def generate_adversarial_shuffle(premise, hypothesis, label, language):
    """Ask the chat model to reorder the words of `premise` using a few-shot prompt.

    Args:
        premise: Sentence to rewrite.
        hypothesis: Accepted for a uniform call signature; not used in the prompt.
        label: Accepted for a uniform call signature; not used in the prompt.
        language: 'en' or 'ru', selecting the prompt language.

    Returns:
        The model's reply with surrounding whitespace stripped. If the API
        call raises, the error is printed and the unmodified `premise` is
        returned.

    Raises:
        ValueError: if `language` is neither 'en' nor 'ru'.
    """
    if language not in ('en', 'ru'):
        raise ValueError(f"Unsupported language: {language}")

    if language == 'en':
        prompt = f"""Please rearrange the words in the following sentence to make it grammatically correct but with a different word order.

Example 1:
Sentence: The cat sat on the mat.
Modified Sentence: On the mat sat the cat.

Example 2:
Sentence: After the war they stopped coming round the house.
Modified Sentence: They stopped coming round the house after the war.

Sentence: "{premise}"
Modified Sentence:"""
    else:
        prompt = f"""Пожалуйста, переставьте слова в следующем предложении, чтобы оно оставалось грамматически правильным, но с другим порядком слов.

Пример 1:
Предложение: Кот сидит на коврике.
Измененное предложение: Cидит на коврике кот.

Пример 2:
Предложение: Мы надеемся, что вам понравится беседа с ними.
Измененное предложение: C ними беседа вам понравится, мы надеемся.

Предложение: "{premise}"

Измененное предложение:"""

    request = dict(
        model='gpt-4',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.4,
        max_tokens=600,
        n=1,
        stop=None,
    )
    try:
        reply = openai.ChatCompletion.create(**request)
        return reply['choices'][0]['message']['content'].strip()
    except Exception as error:
        # Best effort: report the failure and fall back to the original premise.
        print(f"An error occurred: {error}")
        return premise

In [10]:
# Accumulates one dict per rewritten example.
adversarial_examples_shuffle = []

In [11]:
# Rewrite every sampled premise with the word-order prompt and keep the
# original premise next to the rewritten one.
for example in tqdm(adversarial_dataset_shuffle, desc="Generating adversarial premises"):
    adversarial_examples_shuffle.append({
        'original_premise': example['premise'],
        'premise': generate_adversarial_shuffle(
            example['premise'],
            example['hypothesis'],
            example['label'],
            example['language'],
        ),
        'hypothesis': example['hypothesis'],
        'label': example['label'],
        'language': example['language'],
    })

Generating adversarial premises:   0%|          | 0/1200 [00:00<?, ?it/s]

In [12]:
# Replace the sampled dataset with the generated one (list of dicts -> DataFrame -> Dataset).
adversarial_dataset_shuffle = datasets.Dataset.from_pandas(pd.DataFrame(adversarial_examples_shuffle))

In [13]:
# Write one JSON object per line; force_ascii=False keeps Cyrillic text readable.
output_file = 'adversarial_shuffle_xnli_test.jsonl'
adversarial_dataset_shuffle.to_json(output_file, force_ascii=False, lines=True)

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

552672

In [4]:
# Spot-check the last five generated examples (fewer if the dataset is smaller).
# Offsets start at 1 because index -0 is index 0, i.e. the FIRST example.
num_to_show = min(5, len(adversarial_dataset_shuffle))
for offset in range(1, num_to_show + 1):
    ex = adversarial_dataset_shuffle[-offset]
    print(f"Language: {ex['language']}")
    print(f"Original Premise: {ex['original_premise']}")
    print(f"Modified Premise: {ex['premise']}")
    print(f"Hypothesis: {ex['hypothesis']}")
    print(f"Label: {ex['label']}\n")

NameError: name 'adversarial_dataset_shuffle' is not defined

In [16]:
# Raw generation output, and the cleaned copy written by this cell.
input_file = 'adversarial_shuffle_xnli_test.jsonl'
output_file = 'adversarial_shuffle.jsonl'   #cleaned output file

def remove_extra_quotes(text):
    """Strip whitespace, then drop one pair of wrapping double quotes if present.

    Text that does not both start and end with '"' is returned stripped but
    otherwise unchanged.
    """
    stripped = text.strip()
    is_wrapped = stripped.startswith('"') and stripped.endswith('"')
    return stripped[1:-1] if is_wrapped else stripped

# Copy the JSONL file record by record, removing the wrapping quotes that the
# GPT round added around the 'premise' field.
with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8') as outfile:
    for raw_line in infile:
        record = json.loads(raw_line)
        record['premise'] = remove_extra_quotes(record.get('premise', ''))
        outfile.write(json.dumps(record, ensure_ascii=False) + '\n')

Below pulls in existing adversarial datasets and translates the test data to Russian

In [18]:
# Load ANLI and merge the test splits of rounds 1-3 into a single dataset.
anli_dataset = datasets.load_dataset('anli')

test_datasets = [anli_dataset[f'test_r{round_num}'] for round_num in [1, 2, 3]]
combined_test_dataset = datasets.concatenate_datasets(test_datasets)

In [22]:
def translate_text(text):
    """Translate `text` from English to Russian with the chat model.

    Returns the model's reply with surrounding whitespace stripped. If the
    API call raises, the error is printed and the untranslated `text` is
    returned.
    """
    messages = [{
        'role': 'user',
        'content': f"Translate the following text from English to Russian:\n\n{text}",
    }]

    try:
        reply = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=messages,
            temperature=0,
            max_tokens=1024,
        )
    except Exception as error:
        # Best effort: report the failure and fall back to the source text.
        print(f"An error occurred: {error}")
        return text

    try:
        return reply['choices'][0]['message']['content'].strip()
    except Exception as error:
        print(f"An error occurred: {error}")
        return text


In [23]:
# Translate premise and hypothesis of every ANLI test example; the uid is kept
# so translations can be matched back to their English originals.
translated_entries = []

for example in tqdm(combined_test_dataset, desc='Translating dataset'):
    translated_entries.append({
        'premise': translate_text(example['premise']),
        'hypothesis': translate_text(example['hypothesis']),
        'label': example['label'],
        'uid': example['uid'],
    })

Translating dataset:   0%|          | 0/3200 [00:00<?, ?it/s]

In [27]:
# Per-round handles on the ANLI test splits.
r1_test = anli_dataset['test_r1']
r2_test = anli_dataset['test_r2']
r3_test = anli_dataset['test_r3']

In [29]:
# Index the translations by uid for direct lookup.
translated_dict = {entry['uid']: entry for entry in translated_entries}

In [30]:
def process_round(round_dataset, round_name):
    """Write one ANLI round as a combined English + Russian JSONL file.

    Every row of `round_dataset` yields an English entry. A Russian entry is
    added only when the row's uid is present in the module-level
    `translated_dict`; rows without a translation are skipped for Russian.
    The output file is ``anli_{round_name}_en_ru.jsonl`` and lists all
    English entries first, then all Russian ones.

    Returns the combined list of entry dicts in the order written.
    """
    def _entry(uid, premise, hypothesis, label, language):
        return {
            'id': uid,
            'premise': premise,
            'hypothesis': hypothesis,
            'label': label,
            'language': language
        }

    english_entries, russian_entries = [], []

    for _, row in round_dataset.to_pandas().iterrows():
        uid, label = row['uid'], row['label']
        english_entries.append(_entry(uid, row['premise'], row['hypothesis'], label, 'en'))

        if uid in translated_dict:
            translation = translated_dict[uid]
            # The label is taken from the English row, not from the translation record.
            russian_entries.append(
                _entry(uid, translation['premise'], translation['hypothesis'], label, 'ru')
            )

    combined_entries = english_entries + russian_entries

    output_file = f'anli_{round_name}_en_ru.jsonl'
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in combined_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    return combined_entries


In [32]:
# One combined en/ru JSONL file per ANLI round.
r1_combined_entries = process_round(r1_test, 'R1')
r2_combined_entries = process_round(r2_test, 'R2')
r3_combined_entries = process_round(r3_test, 'R3')

Combined dataset for R2 saved to anli_R2_en_ru.jsonl
Combined dataset for R3 saved to anli_R3_en_ru.jsonl


We found an existing dataset that already contains ANLI (as well as other datasets) translated to Russian, so there is no need to spend GPT credits on translation.

In [25]:
# Load the translated NLI collection and keep only rows that come from ANLI rounds 1-3.
dataset = load_dataset('cointegrated/nli-rus-translated-v2021')

def filter_by_source(example):
    """Return True when the example's 'source' field is one of the three ANLI rounds."""
    return example['source'] in ['anli_r1', 'anli_r2', 'anli_r3']

dataset = dataset.filter(filter_by_source)

# Integer ids for the three NLI classes.
label_to_int = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

def map_labels(example):
    """Replace a string label with its integer id, in place, and return the example.

    Labels that are not strings are left untouched. This redefinition
    replaces the earlier `map_labels` in the notebook, so it keeps the same
    guard: without it, mapping a dataset whose labels are already integers
    raises KeyError.
    """
    if isinstance(example['label'], str):
        example['label'] = label_to_int[example['label']]
    return example
# Convert string labels to integer ids.
dataset = dataset.map(map_labels)

# pandas frames for the pattern searches that follow.
train_dataset = dataset['train'].to_pandas()
eval_dataset = dataset['dev'].to_pandas()

Search through ANLI training data to find examples that align with our selected error types

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_negation_examples(data, negation_words=None, case=True):
    """Select rows whose hypothesis contains a negation word as a whole word.

    Args:
        data: DataFrame with a string 'hypothesis' column.
        negation_words: Words to look for. Defaults to
            ["no", "never", "not", "nothing", "isn't", "wasn't"].
        case: When True (default) matching is case-sensitive, so a
            sentence-initial "No" or "Never" is NOT matched; pass False to
            include capitalised forms.

    Returns:
        The matching rows of `data` with their index preserved. Rows with a
        missing hypothesis never match. An empty word list matches nothing.
    """
    import re  # local import: `re` is not imported at this point in the notebook

    if negation_words is None:
        negation_words = ["no", "never", "not", "nothing", "isn't", "wasn't"]
    if not negation_words:
        # An empty alternation would match at every word boundary.
        return data.iloc[0:0]

    # Escape each word so that punctuation in a custom list is matched literally.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in negation_words) + r')\b'
    mask = data['hypothesis'].str.contains(pattern, regex=True, case=case, na=False)
    return data[mask]

def find_attribution_examples(data, attribution_phrases=None, case=True):
    """Select rows whose premise or hypothesis contains an attribution phrase.

    Args:
        data: DataFrame with string 'premise' and 'hypothesis' columns.
        attribution_phrases: Phrases to look for. Defaults to "directed by",
            "written by", "owner of", "invented by", "owned by",
            "produced by", "discovered by", "performed by".
        case: When True (default) matching is case-sensitive.

    Returns:
        The matching rows of `data` with their index preserved. Missing text
        never matches. An empty phrase list matches nothing.
    """
    import re  # local import: `re` is not imported at this point in the notebook

    if attribution_phrases is None:
        attribution_phrases = ["directed by", "written by", "owner of", "invented by", "owned by", "produced by", "discovered by", "performed by"]
    if not attribution_phrases:
        # An empty alternation would match at every word boundary.
        return data.iloc[0:0]

    # Build the pattern once and reuse it for both columns.
    pattern = r'\b(?:' + '|'.join(re.escape(phrase) for phrase in attribution_phrases) + r')\b'
    in_hypothesis = data['hypothesis'].str.contains(pattern, regex=True, case=case, na=False)
    in_premise = data['premise'].str.contains(pattern, regex=True, case=case, na=False)
    return data[in_hypothesis | in_premise]

def calculate_word_overlap(premise, hypothesis):
    """Cosine similarity between the bag-of-words count vectors of two texts.

    The vocabulary is fitted on just these two texts with CountVectorizer's
    default tokenisation.

    Returns:
        The similarity between the two texts, or 0.0 when CountVectorizer
        rejects the pair with ValueError (for example, neither text yields a
        token, so the vocabulary is empty). Returning 0.0 keeps one
        degenerate pair from aborting a scan over a whole dataset.
    """
    texts = [premise, hypothesis]
    try:
        vectors = CountVectorizer().fit_transform(texts).toarray()
    except ValueError:
        return 0.0
    return cosine_similarity(vectors)[0, 1]

def find_high_word_overlap_examples(data, threshold=0.8):
    """Collect rows whose premise/hypothesis word overlap is at least `threshold`.

    Overlap is computed per row with `calculate_word_overlap`. The selected
    rows are assembled into a new DataFrame that keeps their original index
    labels; when nothing qualifies the result is an empty DataFrame.
    """
    selected_rows = [
        row
        for _, row in data.iterrows()
        if calculate_word_overlap(row['premise'], row['hypothesis']) >= threshold
    ]
    return pd.DataFrame(selected_rows)

In [27]:
# Run the three pattern searches on the training frame...
negation_examples = find_negation_examples(train_dataset)
attribution_examples = find_attribution_examples(train_dataset)
high_overlap_examples = find_high_word_overlap_examples(train_dataset)

# ...and on the dev frame.
negation_eval = find_negation_examples(eval_dataset)
attribution_eval = find_attribution_examples(eval_dataset)
high_overlap_eval = find_high_word_overlap_examples(eval_dataset)

In [28]:
import json

def to_en_format(example):
    """Build the output record for the English side of `example`.

    Copies 'premise', 'hypothesis' and 'label' and tags the record with
    language 'en'.
    """
    formatted = {field: example[field] for field in ('premise', 'hypothesis', 'label')}
    formatted['language'] = 'en'
    return formatted

def to_ru_format(example):
    """Build the output record for the Russian side of `example`.

    Reads the text from 'premise_ru' / 'hypothesis_ru', copies 'label', and
    tags the record with language 'ru'.
    """
    return dict(
        premise=example['premise_ru'],
        hypothesis=example['hypothesis_ru'],
        label=example['label'],
        language='ru',
    )

def save_combined_jsonl(train_data_en, train_data_ru, eval_data_en, eval_data_ru, filename):
    """Write four DataFrames into one JSONL file, tagging each record with its split.

    Records are written in this order: English train, Russian train, English
    eval, Russian eval. English frames are formatted with `to_en_format`,
    Russian ones with `to_ru_format`; every record gets a 'split' key of
    'train' or 'eval'.
    """
    sections = (
        (train_data_en, to_en_format, 'train'),
        (train_data_ru, to_ru_format, 'train'),
        (eval_data_en, to_en_format, 'eval'),
        (eval_data_ru, to_ru_format, 'eval'),
    )
    with open(filename, 'w', encoding='utf-8') as f:
        for frame, formatter, split_name in sections:
            for _, row in frame.iterrows():
                record = formatter(row)
                record['split'] = split_name
                f.write(json.dumps(record, ensure_ascii=False) + '\n')

# The same frame is passed for the English and Russian slots: the formatters
# read 'premise'/'hypothesis' and 'premise_ru'/'hypothesis_ru' from the same rows.
save_combined_jsonl(negation_examples, negation_examples, negation_eval, negation_eval, "negation_examples_comb.jsonl")
save_combined_jsonl(attribution_examples, attribution_examples, attribution_eval, attribution_eval, "attribution_examples_comb.jsonl")
save_combined_jsonl(high_overlap_examples, high_overlap_examples, high_overlap_eval, high_overlap_eval, "high_overlap_examples_comb.jsonl")

In [7]:
import datasets
# Reload the saved file to check that it parses as a dataset.
dataset = datasets.load_dataset('json', data_files="high_overlap_examples_comb.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

In [33]:
# Show the loaded splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'language', 'split'],
        num_rows: 34540
    })
})

Create Metaphor Dataset

In [12]:
import pandas as pd
import xml.etree.ElementTree as ET
import re
import random

In [5]:
# Parse the English and Russian metaphor XML files and keep their root elements.
en_file_path = 'metaphors/en_large.xml'
en_tree = ET.parse(en_file_path)
en_root = en_tree.getroot()

ru_file_path = 'metaphors/ru_large.xml'
ru_tree = ET.parse(ru_file_path)
ru_root = ru_tree.getroot()

In [6]:
def clean_text(element):
    """Return all text inside `element`, nested tags flattened, whitespace stripped.

    Returns None when `element` is None.
    """
    if element is None:
        return None
    return ''.join(element.itertext()).strip()

def is_valid_text(text):
    """Return True when `text` is non-empty and made up only of allowed characters.

    Empty strings and None are rejected.
    """
    if not text:
        return False
    # NOTE(review): inside the character class, `\"-«` is parsed as a RANGE
    # from U+0022 to U+00AB, so most ASCII punctuation and symbols (e.g. `<`,
    # `@`, `#`, `:`) are accepted too. Confirm whether a literal hyphen was
    # intended before tightening this, since that would change which rows pass.
    return re.match(r'^[\w\s.,!?\'\"-«»—\u0400-\u04FF]*$', text) is not None

In [7]:
def _extract_lm_instances(root):
    """Return one record dict per <LmInstance> child of `root`.

    Each record holds the instance's id/docid/targetConcept/type/chain
    attributes, the flattened text of ./TextContent/Current, and the 'score'
    attribute of the first MetaphoricityAnnotation (None when absent).
    """
    score_path = './Annotations/MetaphoricityAnnotations/MetaphoricityAnnotation'
    records = []
    for instance in root.findall('LmInstance'):
        # Look the annotation up once and reuse the result.
        score_element = instance.find(score_path)
        records.append({
            'ID': instance.attrib.get('id', None),
            'DocID': instance.attrib.get('docid', None),
            'Target Concept': instance.attrib.get('targetConcept', None),
            'Type': instance.attrib.get('type', None),
            'Chain': instance.attrib.get('chain', None),
            'Current Text': clean_text(instance.find('./TextContent/Current')),
            'Metaphoricity Score': score_element.attrib.get('score', None) if score_element is not None else None
        })
    return records


# Same extraction for both languages; a shared helper keeps them in sync and
# avoids rebinding the builtin name `id` at module level.
data_en = _extract_lm_instances(en_root)
df_en = pd.DataFrame(data_en)

data_ru = _extract_lm_instances(ru_root)
df_ru = pd.DataFrame(data_ru)

In [8]:
# Scores are read from XML attributes as strings; convert to numbers, turning
# anything unparseable into NaN.
df_en['Metaphoricity Score'] = pd.to_numeric(df_en['Metaphoricity Score'], errors='coerce')
df_ru['Metaphoricity Score'] = pd.to_numeric(df_ru['Metaphoricity Score'], errors='coerce')

In [9]:
# Keep rows with a metaphoricity score of exactly 3 whose text passes is_valid_text.
df_en = df_en[
    (df_en['Metaphoricity Score'] == 3) &
    df_en['Current Text'].apply(is_valid_text)
]

df_ru = df_ru[
    (df_ru['Metaphoricity Score'] == 3) &
    df_ru['Current Text'].apply(is_valid_text)
]

# Drop repeated sentences so each text appears once.
df_en = df_en.drop_duplicates(subset=['Current Text'])
df_ru = df_ru.drop_duplicates(subset=['Current Text'])

In [10]:
# Preview the filtered English rows.
df_en.head()

Unnamed: 0,ID,DocID,Target Concept,Type,Chain,Current Text,Metaphoricity Score
1,929,85,POVERTY,RECALL_VALIDATIONS,*:prep_on,"Thank Lyndon Johnson, his Great Society, and t...",3.0
15,1411,447,POVERTY,SYSTEM_VALIDATIONS,*:prep:prep_of,And breaking out of abject poverty is extremel...,3.0
33,1725,629,TAXATION,RECALL_VALIDATIONS,*:nn,"If you can show me where I have stated that ""t...",3.0
34,1726,629,TAXATION,RECALL_VALIDATIONS,*:nn,"Show me where, in this thread, I've said that ...",3.0
41,21665,10960,POVERTY,SYSTEM_VALIDATIONS,*:prep_into:nn,"Fiscal cliff: America goes to the brink, but m...",3.0


In [11]:
# Preview the filtered Russian rows.
df_ru.head()

Unnamed: 0,ID,DocID,Target Concept,Type,Chain,Current Text,Metaphoricity Score
49,17318,8903,BUREAUCRACY,RECALL_VALIDATIONS,*:релят:1-компл,> О жадности российских воров-чиновников хнают...,3.0
50,17321,8905,POVERTY,SYSTEM_VALIDATIONS,*:1-компл:prep_из,Из-за краха экономики в 90-х годах у нас практ...,3.0
53,17360,8923,GOVERNMENT,RECALL_VALIDATIONS,*:1-компл,Все только и орут что надо свергнуть власть и ...,3.0
63,17536,9000,TAXATION,RECALL_VALIDATIONS,опред:*,"А эти 99 процентов глупы, если верят, что враж...",3.0
64,17537,9000,TAXATION,RECALL_VALIDATIONS,опред:*,"А когда он увидит, что с отменой пошлин снизил...",3.0


In [13]:
# Give every premise a random target label (0, 1 or 2) for hypothesis generation.
# A dedicated seeded generator makes the labels identical on every run without
# touching the global `random` state.
LABEL_SEED = 82
label_rng = random.Random(LABEL_SEED)

# assign() returns new frames, which also avoids writing into filtered slices.
df_en = df_en.assign(
    Label=label_rng.choices([0, 1, 2], k=len(df_en)),
    Premise=df_en['Current Text'],
    Language='en',
)
df_ru = df_ru.assign(
    Label=label_rng.choices([0, 1, 2], k=len(df_ru)),
    Premise=df_ru['Current Text'],
    Language='ru',
)

In [37]:
#Generates a hypothesis given a premise with a metaphor and a label

def generate_adversarial_hypothesis(premise, label, language, id):
    """Ask the chat model for a hypothesis matching `label` for a metaphorical premise.

    Neutral examples (label 1) with an even `id` get an alternate prompt that
    asks for a loosely related hypothesis; all other examples get the
    standard prompt that names the target label.

    Args:
        premise: Premise text containing a metaphor.
        label: Integer label (0, 1 or 2).
        language: 'en' or 'ru', selecting the prompt language.
        id: Instance id; must be convertible with ``int()`` when label is 1.

    Returns:
        The model's reply with surrounding whitespace stripped. If the API
        call raises, the error is printed and `premise` is returned, so the
        caller ends up with a hypothesis identical to the premise.

    Raises:
        KeyError: if `label` is not 0, 1 or 2.
        ValueError: if `language` is neither 'en' nor 'ru'.
    """
    label_map_en = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
    label_text_english = label_map_en[label]
    label_map_ru = {0: 'вывод', 1: 'нейтральность', 2: 'противоречие'}
    label_text_russian = label_map_ru[label]

    # Fail explicitly, like the other generators, instead of reaching the API
    # call with `prompt` unbound.
    if language not in ('en', 'ru'):
        raise ValueError(f"Unsupported language: {language}")

    # `% 2` alternates the prompt by id parity. The previous `// 2 == 0` was
    # only true for ids 0 and 1, so the alternate prompt was effectively never used.
    if label == 1 and int(id) % 2 == 0:  #neutral
        if language == 'en':
            prompt = f"""Please help me create a hypothesis that is not directly connected to the following premise. Feel free to make it slightly unrelated while ensuring it remains neutral to the premise in tone and content.

Premise: {premise}
Hypothesis:"""
        else:
            prompt = f"""Пожалуйста, помогите мне создать гипотезу, которая не напрямую связана с следующим предположением. Вы можете сделать её немного несвязанной, сохраняя при этом нейтральный тон и содержание.

Предположение: {premise}
Гипотеза:"""
    else:
        if language == 'en':
            prompt = f"""Please help me create a hypothesis that is a '{label_text_english}' given the following premise. Please take into account metaphoric sentences in the premise and try not to reuse them in the hypothesis.

Premise: {premise}
Hypothesis:"""
        else:
            prompt = f"""Пожалуйста, помогите мне создать гипотезу, которая будет '{label_text_russian}' относительно следующего предположения - учитывайте метафорические предложения в предположении и старайтесь не использовать их повторно в гипотезе.

Предположение: {premise}
Гипотеза:"""

    try:
        response = openai.ChatCompletion.create(
            model='gpt-4',
            messages=[
                {'role': 'user', 'content': prompt}
            ],
            temperature=0.4,
            max_tokens=600,
            n=1,
            stop=None,
        )
        generated_hypothesis = response['choices'][0]['message']['content'].strip()
        return generated_hypothesis
    except Exception as e:
        # Best effort: report the failure and fall back to the premise text.
        print(f"An error occurred: {e}")
        return premise

In [43]:
# Cap the English side at 1200 rows (or fewer if that is all there is) and use
# a fixed seed so the same rows are drawn on every run.
EN_SAMPLE_SIZE = 1200
SAMPLE_SEED = 82

english_sample = df_en.sample(n=min(EN_SAMPLE_SIZE, len(df_en)), random_state=SAMPLE_SEED)
concat_df = pd.concat([english_sample, df_ru], ignore_index=True)

def generate_dataset_row(row):
    """Generate a hypothesis for one DataFrame row using its text, label, language and ID."""
    return generate_adversarial_hypothesis(
        premise=row['Current Text'],
        label=row['Label'],
        language=row['Language'],
        id=row['ID']
    )

# NOTE: despite its name, this column holds the generated HYPOTHESIS; the name
# is kept because later cells read it.
concat_df['Modified Premise'] = concat_df.apply(generate_dataset_row, axis=1)

In [45]:
# Inspect the combined frame with the generated column.
concat_df

Unnamed: 0,ID,DocID,Target Concept,Type,Chain,Current Text,Metaphoricity Score,Label,Premise,Language,Modified Premise
0,761624,106591,POVERTY,ANNOTATOR_EXAMPLES,*:nsubj,"Poverty is a monster that's hard to stop, and ...",3.0,2,"Poverty is a monster that's hard to stop, and ...",en,No one is taking any action to combat poverty.
1,5068630,1360529,MENTAL_CONCEPTS,SYSTEM_VALIDATIONS,*:prep_of,KS: In this marketplace of ideas that you are ...,3.0,2,KS: In this marketplace of ideas that you are ...,en,"In the marketplace of ideas, no one is worried..."
2,762754,106903,TAXATION,ANNOTATOR_EXAMPLES,*:nn,While 401(k) and profit-sharing plans offer so...,3.0,1,While 401(k) and profit-sharing plans offer so...,en,The tax benefits provided by 401(k) and profit...
3,2508811,520,GOVERNMENT,RECALL_VALIDATIONS,*:nn,From the end of WWII you have created the Welf...,3.0,2,From the end of WWII you have created the Welf...,en,"Since the end of WWII, you have fostered an en..."
4,2378318,508244,TAXATION,ANNOTATOR_EXAMPLES,*:prep_of,Tax relief is a program for eligible seniors a...,3.0,0,Tax relief is a program for eligible seniors a...,en,Eligible seniors and disabled residents can be...
...,...,...,...,...,...,...,...,...,...,...,...
2398,4499274,1181719,GOVERNMENT,ANNOTATOR_EXAMPLES,*:квазиагент,Градусник власти зашкаливает и мы получаем нез...,3.0,0,Градусник власти зашкаливает и мы получаем нез...,ru,При усилении полномочий правительства может пр...
2399,4499326,1181329,GOVERNMENT,ANNOTATOR_EXAMPLES,*:опред,"Дуумвират, как ни странно, был в этом смысле б...",3.0,2,"Дуумвират, как ни странно, был в этом смысле б...",ru,"Неожиданно, дуумвират оказался менее стабильны..."
2400,4499346,1181762,GOVERNMENT,ANNOTATOR_EXAMPLES,опред:*,"— Не думаю, что нужно создавать радужное прави...",3.0,2,"— Не думаю, что нужно создавать радужное прави...",ru,"Возможно, формирование многоцветного кабинета ..."
2401,4592565,1206918,RELIGION,SYSTEM_VALIDATIONS,*:опред,"А как про бедных детишек , которых шибко верую...",3.0,0,"А как про бедных детишек , которых шибко верую...",ru,"СМИ редко освещают ситуации, когда дети станов..."


In [46]:
from sklearn.model_selection import train_test_split

def split_and_save_jsonl_by_language(concat_df, filename):
    """Write a per-language 80/20 train/eval split of ``concat_df`` as JSONL.

    Rows whose ``'Language'`` is ``'en'`` and rows whose ``'Language'`` is
    ``'ru'`` are split separately with ``train_test_split``
    (``test_size=0.2``, ``random_state=4``); rows in any other language are
    not written. The train portions (English, then Russian) are written
    first, followed by the eval portions, one JSON object per line.

    Each record has the keys ``premise`` (from ``'Current Text'``),
    ``hypothesis`` (from ``'Modified Premise'``), ``label`` (from
    ``'Label'``), ``language`` (from ``'Language'``) and ``split``
    (``'train'`` or ``'eval'``).

    Uses the notebook-level names ``json``, ``pd`` and ``train_test_split``.
    A later cell in this notebook defines a function with the same name that
    also produces a test split; whichever definition ran last is in effect.

    Args:
        concat_df: DataFrame containing at least the four columns named above.
        filename: Path of the JSONL file to create (overwritten if present).
    """
    train_parts = []
    eval_parts = []
    for lang_code in ('en', 'ru'):
        lang_rows = concat_df[concat_df['Language'] == lang_code]
        lang_train, lang_eval = train_test_split(lang_rows, test_size=0.2, random_state=4)
        train_parts.append(lang_train)
        eval_parts.append(lang_eval)

    # Order matters: all train rows are emitted before any eval row.
    tagged_frames = (
        ('train', pd.concat(train_parts, ignore_index=True)),
        ('eval', pd.concat(eval_parts, ignore_index=True)),
    )

    with open(filename, 'w', encoding='utf-8') as out:
        for split_name, frame in tagged_frames:
            for _, row in frame.iterrows():
                # NOTE(review): the 'Modified Premise' column is exported as the
                # hypothesis -- confirm this mapping is intended.
                record = {
                    'premise': row['Current Text'],
                    'hypothesis': row['Modified Premise'],
                    'label': row['Label'],
                    'language': row['Language'],
                    'split': split_name
                }
                json.dump(record, out, ensure_ascii=False)
                out.write('\n')

The cells below create the final training, evaluation, and test data files used in the rest of the project.

In [51]:
from sklearn.model_selection import train_test_split
import pandas as pd
import json

def split_and_save_jsonl_by_language(concat_df, filename):
    """Write a per-language train/eval/test split of ``concat_df`` as JSONL.

    English (``'Language' == 'en'``) and Russian (``'Language' == 'ru'``)
    rows are split independently: 40% is first held out with
    ``train_test_split(test_size=0.4, random_state=4)`` and that hold-out is
    then halved (``test_size=0.5, random_state=4``) into eval and test,
    i.e. roughly 60/20/20. Rows in any other language are not written.

    The output file holds one JSON object per line, all train rows first,
    then eval, then test. Each object has the keys ``premise`` (from
    ``'Current Text'``), ``hypothesis`` (from ``'Modified Premise'``),
    ``label``, ``language`` and ``split`` (``'train'``, ``'eval'`` or
    ``'test'``).

    Args:
        concat_df: DataFrame containing the columns ``'Language'``,
            ``'Current Text'``, ``'Modified Premise'`` and ``'Label'``.
        filename: Path of the JSONL file to create (overwritten if present).
    """
    def three_way_split(rows):
        # 60% train, then the remaining 40% is cut in half for eval/test.
        train_part, holdout = train_test_split(rows, test_size=0.4, random_state=4)
        eval_part, test_part = train_test_split(holdout, test_size=0.5, random_state=4)
        return train_part, eval_part, test_part

    per_language = [
        three_way_split(concat_df[concat_df['Language'] == lang_code])
        for lang_code in ('en', 'ru')
    ]

    # Transpose [(train, eval, test) per language] into one frame per split,
    # keeping English rows ahead of Russian rows inside each split.
    train_data, eval_data, test_data = (
        pd.concat(list(parts), ignore_index=True) for parts in zip(*per_language)
    )

    with open(filename, 'w', encoding='utf-8') as out:
        for split_name, frame in (('train', train_data), ('eval', eval_data), ('test', test_data)):
            for _, row in frame.iterrows():
                # NOTE(review): the 'Modified Premise' column is exported as the
                # hypothesis -- confirm this mapping is intended.
                record = {
                    'premise': row['Current Text'],
                    'hypothesis': row['Modified Premise'],
                    'label': row['Label'],
                    'language': row['Language'],
                    'split': split_name
                }
                json.dump(record, out, ensure_ascii=False)
                out.write('\n')

In [52]:
# Export the language-wise split of `concat_df` to 'metaphor_training_final.jsonl';
# every record written carries a 'split' tag.
# NOTE(review): this notebook defines split_and_save_jsonl_by_language twice;
# this call presumably runs the train/eval/test version from this section,
# since a later cell extracts 'test' rows from this file -- confirm on a fresh run.
split_and_save_jsonl_by_language(concat_df, 'metaphor_training_final.jsonl')

In [50]:
import json
import random

def add_split_column(input_filename, output_filename, train_size=0.8, seed=None):
    """Copy a JSONL file, tagging every record with a random train/eval split.

    Each record read from ``input_filename`` gets a ``'split'`` key set to
    ``'train'`` with probability ``train_size`` and ``'eval'`` otherwise
    (an existing ``'split'`` value is overwritten). The tagged records are
    written to ``output_filename`` in their original order, one JSON object
    per line, without ASCII-escaping non-ASCII text.

    Args:
        input_filename: Path of the JSONL file to read.
        output_filename: Path of the JSONL file to write (overwritten).
        train_size: Probability in [0, 1] that a record is tagged ``'train'``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the assignment is reproducible across runs; when ``None``
            the module-level ``random`` generator is used, as before.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A private generator makes seeded runs reproducible without touching the
    # global random state; the unseeded path keeps the original behaviour.
    rng = random.Random(seed) if seed is not None else random

    data = []
    with open(input_filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing lines are not records; json.loads('') would raise.
                continue
            data.append(json.loads(line))

    for item in data:
        item['split'] = 'train' if rng.random() < train_size else 'eval'

    with open(output_filename, 'w', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

# Tag each record of the adversarial XNLI file with a random train/eval split
# (default train_size=0.8) and save the result as 'adversarial_premise.jsonl'.
# NOTE(review): no seed is set in this cell, so re-running it presumably
# yields a different split each time -- confirm whether that is acceptable.
add_split_column('adversarial_xnli_test.jsonl', 'adversarial_premise.jsonl')

In [53]:
def combine_anli_test_files(output_file, *input_files):
    """Concatenate several JSONL files into a single JSONL file.

    Every line of every input file is parsed as JSON. Once all inputs have
    been read, the parsed records are written to ``output_file`` one per
    line, in the order the input files were given, without ASCII-escaping
    non-ASCII text.

    Args:
        output_file: Path of the JSONL file to create (overwritten).
        *input_files: Paths of the JSONL files to read, in output order.
    """
    records = []
    for path in input_files:
        with open(path, 'r', encoding='utf-8') as src:
            for raw_line in src:
                records.append(json.loads(raw_line))

    with open(output_file, 'w', encoding='utf-8') as dst:
        for record in records:
            dst.write(json.dumps(record, ensure_ascii=False) + '\n')

# Concatenate the three anli_R*_en_ru.jsonl files, in R1, R2, R3 order, into
# a single 'anli_test.jsonl'.
combine_anli_test_files(
    'anli_test.jsonl',
    'anli_R1_en_ru.jsonl',
    'anli_R2_en_ru.jsonl',
    'anli_R3_en_ru.jsonl'
)

In [55]:
def extract_test_split(input_file, output_file):
    """Copy the records tagged ``split == 'test'`` from one JSONL file to another.

    Each line of ``input_file`` is parsed as a JSON object; objects whose
    ``'split'`` value equals ``'test'`` are kept (objects without a
    ``'split'`` key are dropped). The kept objects are written unchanged to
    ``output_file``, one per line, in their original order.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to create (overwritten).
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        # Lazy parse so each line is decoded and filtered in turn.
        parsed = (json.loads(raw_line) for raw_line in src)
        held_out = [record for record in parsed if record.get('split') == 'test']

    with open(output_file, 'w', encoding='utf-8') as dst:
        for record in held_out:
            dst.write(json.dumps(record, ensure_ascii=False))
            dst.write('\n')

# Pull the records tagged split == 'test' out of the metaphor file and save
# them separately as 'metaphor_test.jsonl'.
extract_test_split('metaphor_training_final.jsonl', 'metaphor_test.jsonl')

In [56]:
def merge_test_files(output_file, *input_files):
    """Merge JSONL files into one file with a fixed four-field schema.

    Every line of every input file is parsed as a JSON object and reduced to
    the keys ``premise``, ``hypothesis``, ``label`` and ``language``; any
    other keys are discarded. Missing keys are filled with ``''``, ``''``,
    ``None`` and ``'unknown'`` respectively. The normalized records are
    written to ``output_file`` one per line, in input order.

    A later cell in this notebook redefines this function with an
    additional ``split`` field.

    Args:
        output_file: Path of the JSONL file to create (overwritten).
        *input_files: Paths of the JSONL files to merge, in output order.
    """
    # (key, fallback) pairs; tuple order fixes the key order of each output record.
    field_defaults = (
        ('premise', ''),
        ('hypothesis', ''),
        ('label', None),
        ('language', 'unknown'),
    )

    merged = []
    for path in input_files:
        with open(path, 'r', encoding='utf-8') as src:
            for raw_line in src:
                record = json.loads(raw_line)
                merged.append({key: record.get(key, fallback) for key, fallback in field_defaults})

    with open(output_file, 'w', encoding='utf-8') as dst:
        for record in merged:
            dst.write(json.dumps(record, ensure_ascii=False) + '\n')

# Build the combined test set from the ANLI and metaphor test files. With the
# definition in this cell only premise/hypothesis/label/language are kept.
merge_test_files(
    'final_test.jsonl',
    'anli_test.jsonl',
    'metaphor_test.jsonl'
)

In [59]:
def merge_test_files(output_file, *input_files):
    """Merge JSONL files into one file with a fixed five-field schema.

    Every line of every input file is parsed as a JSON object and reduced to
    the keys ``premise``, ``hypothesis``, ``label``, ``language`` and
    ``split``; any other keys are discarded. Missing keys are filled with
    ``''``, ``''``, ``None``, ``'unknown'`` and ``''`` respectively. The
    normalized records are written to ``output_file`` one per line, in input
    order; a file listed more than once contributes its records each time.

    This replaces the earlier definition of the same name in this notebook,
    which did not carry the ``split`` field.

    Args:
        output_file: Path of the JSONL file to create (overwritten).
        *input_files: Paths of the JSONL files to merge, in output order.
    """
    def normalize(record):
        # Key order here is the key order of each emitted JSON object.
        return {
            'premise': record.get('premise', ''),
            'hypothesis': record.get('hypothesis', ''),
            'label': record.get('label', None),
            'language': record.get('language', 'unknown'),
            'split': record.get('split', ''),
        }

    merged = []
    for path in input_files:
        with open(path, 'r', encoding='utf-8') as src:
            merged.extend(normalize(json.loads(raw_line)) for raw_line in src)

    with open(output_file, 'w', encoding='utf-8') as dst:
        for record in merged:
            dst.write(json.dumps(record, ensure_ascii=False))
            dst.write('\n')

# Assemble 'final_train_2.jsonl' from the metaphor file plus the negation,
# attribution and high-overlap example files (merge_test_files is reused here
# even though the output is a training file).
# NOTE(review): 'metaphor_training_final.jsonl' is listed four times, so each
# of its records is written four times -- presumably deliberate upweighting;
# confirm.
# NOTE(review): that file also contains rows tagged 'eval' and 'test', and the
# 'split' field is passed through unchanged, so consumers presumably need to
# filter on 'split' to keep held-out rows out of training -- verify.
merge_test_files(
    'final_train_2.jsonl',
    'metaphor_training_final.jsonl',
    'negation_examples_comb.jsonl',
    'metaphor_training_final.jsonl',
    'attribution_examples_comb.jsonl',
    'metaphor_training_final.jsonl',
    'high_overlap_examples_comb.jsonl',
    'metaphor_training_final.jsonl'
)