In [None]:
%%capture
!pip install langchain-openai
!pip install langchain
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade
!pip install bert_score


In [None]:
import nltk
from datasets import load_dataset, Dataset, Features, Value, ClassLabel
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Reading the data
This section reuses the functions from the previous script to read the data.

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, json
import numpy as np
from argparse import ArgumentParser
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from pprint import pprint
from datetime import datetime
import copy

In [None]:
#@title Reading train, dev and test data
# Directory that read_data() joins with train.json, dev.json and test.json.
# NOTE(review): 'path' looks like a placeholder — set it to the real data folder before running.
folder_path = 'path'

def read_json_file(name):
    """Load and return the parsed contents of the JSON file at ``name``.

    Args:
        name: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    # Be explicit about UTF-8 so the read does not depend on the platform's default encoding.
    with open(name, 'r', encoding='utf-8') as f:
        return json.load(f)

def read_data():
    """Read the train, dev and test splits stored under ``folder_path``.

    Returns:
        A ``(train, dev, test)`` tuple with the parsed contents of
        ``train.json``, ``dev.json`` and ``test.json``, in that order.
    """
    split_files = ("train.json", "dev.json", "test.json")
    # Fixed order so callers can unpack the result positionally.
    return tuple(
        read_json_file(os.path.join(folder_path, split_file))
        for split_file in split_files
    )

# Load every split once and report the number of records in train, dev and test (in that order).
train, dev, test = read_data()
print(*map(len, (train, dev, test)))

246 50 100


## Preparing the instruction data

In [None]:
def read_json_file(name, folder_path='path'):
    """Load a JSON file that lives inside ``folder_path``.

    NOTE: this definition replaces the earlier ``read_json_file(name)`` from the
    data-loading cell. Unlike that one, it prefixes ``folder_path`` itself, so
    callers pass a file name here rather than an already-joined path.

    Args:
        name: File name (or relative path) of the JSON file inside ``folder_path``.
        folder_path: Directory that ``name`` is resolved against. Defaults to ``'path'``.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    full_path = os.path.join(folder_path, name)
    # Be explicit about UTF-8 so the read does not depend on the platform's default encoding.
    with open(full_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(json_data, file_name):
    """Serialize ``json_data`` to JSON text and write it to ``file_name``.

    The destination path is printed before the file is opened for writing.
    """
    serialized = json.dumps(json_data)
    print(file_name)
    with open(file_name, mode="w") as out_file:
        out_file.write(serialized)

# Question for every argument role, looked up as role_details[event][argument_type][argument].
role_details = read_json_file('role_definitions_question_guided.json')
pprint(role_details)
# Instruction text stored on every generated example and placed at the start of the model prompt.
instruction_prefix = '''Concisely extract the following argument from the post comment pair.
Do not use more than 12 words to describe an argument. Return 'null' if any argument is not present.'''

{'relapse': {'description': '',
             'event-specific-arguments': {'condition': 'What is the substance '
                                                       'use history/disorder '
                                                       'from which the subject '
                                                       'had previously '
                                                       'recovered or was in '
                                                       'the process of '
                                                       'recovering?',
                                          'existing/current-medications': 'Which '
                                                                          'medication '
                                                                          'did '
                                                                          'the '
                                                                          'subject '
                

In [None]:
def create_instruction_data(data, role_details, instruction=None):
    """Expand annotated post/comment pairs into one QA example per argument role.

    For every record, one example is emitted for each argument listed in
    ``role_details`` for the record's event label, walking the argument types in
    the fixed order main -> event-specific -> subject-effect.

    Args:
        data: Iterable of records with the keys 'text1' (post), 'text2' (comment),
            'label' (event type) and 'ground-truth-arguments'
            (argument type -> argument name -> answer).
        role_details: Mapping event label -> argument type -> argument name -> question.
        instruction: Instruction text stored on every example. When omitted, the
            notebook-level ``instruction_prefix`` is used.

    Returns:
        A list of dicts with the keys 'id', 'instruction', 'context', 'question'
        and 'answer'. Ids are 'ins_0', 'ins_1', ... in emission order.

    Raises:
        KeyError: If a record lacks an expected key or a ground-truth entry for an
            argument listed in ``role_details``.
    """
    if instruction is None:
        instruction = instruction_prefix

    arg_types = ['main-arguments', 'event-specific-arguments', 'subject-effect-arguments']
    instruction_dt = []
    for dt in data:
        post, comment, label = dt['text1'], dt['text2'], dt['label']
        # Post and comment together form the context the question is asked about.
        context = f'''Post: {post}\nComment: {comment}'''

        for arg_typ in arg_types:
            # Argument name -> question for the event this record is labelled with.
            role_descriptions = role_details[label][arg_typ]

            for arrg, arg_question in role_descriptions.items():
                answer = dt['ground-truth-arguments'][arg_typ][arrg]
                # A plain string is kept as is; joining it would insert ", " between
                # its characters. Any other iterable of strings is comma-joined.
                if not isinstance(answer, str):
                    answer = ", ".join(answer)
                instruction_dt.append({
                    # The running count of emitted examples doubles as the id suffix.
                    'id': f'ins_{len(instruction_dt)}',
                    'instruction': instruction,
                    'context': context,
                    'question': arg_question,
                    'answer': answer,
                })
    return instruction_dt


In [None]:
# Build the QA-style examples for each split; only the first 100 training records are used.
instruction_train, instruction_dev, instruction_test = (
    create_instruction_data(split, role_details)
    for split in (train[:100], dev, test)
)

In [None]:
import pandas as pd
from datasets import Dataset , DatasetDict

# Turn each list of example dicts into a Dataset by way of a DataFrame.
instruction_train, instruction_dev, instruction_test = (
    Dataset.from_pandas(pd.DataFrame(records))
    for records in (instruction_train, instruction_dev, instruction_test)
)

# Bundle the three splits; the dev split is stored under the key 'val'.
dataset = DatasetDict({
    'train': instruction_train,
    'val': instruction_dev,
    'test': instruction_test,
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'instruction', 'context', 'question', 'answer'],
        num_rows: 1876
    })
    val: Dataset({
        features: ['id', 'instruction', 'context', 'question', 'answer'],
        num_rows: 944
    })
    test: Dataset({
        features: ['id', 'instruction', 'context', 'question', 'answer'],
        num_rows: 1883
    })
})


In [None]:
def to_text_pair(example):
    """Collapse one instruction example into the model's input and target text.

    Args:
        example: Mapping with the keys 'instruction', 'context', 'question' and 'answer'.

    Returns:
        A dict with 'input_text' (instruction, context and question in a single
        prompt) and 'target_text' (the answer).
    """
    return {
        'input_text': f"Instruction: {example['instruction']}\nContext:\n{example['context']}\n Question:{example['question']}",
        'target_text': example['answer'],
    }


# DatasetDict.map applies the function to every split, so a single call covers
# train, val and test instead of three copies of the same lambda.
dataset = dataset.map(
    to_text_pair,
    remove_columns=['id', 'instruction', 'context', 'question', 'answer'],
)

Map:   0%|          | 0/1876 [00:00<?, ? examples/s]

Map:   0%|          | 0/944 [00:00<?, ? examples/s]

Map:   0%|          | 0/1883 [00:00<?, ? examples/s]

In [None]:
# Inspect the splits after the conversion to 'input_text' / 'target_text'.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 1876
    })
    val: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 944
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 1883
    })
})

## Model

In [None]:
# One constant for the checkpoint so the tokenizer and the model cannot drift apart.
MODEL_CHECKPOINT = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
# Collator handed to the trainer to assemble batches for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of prompt/answer pairs for seq2seq fine-tuning.

    Args:
        examples: Batch dict with 'input_text' and 'target_text' lists of strings.

    Returns:
        The tokenizer output for the inputs (truncated/padded to 512 tokens) with
        an added 'labels' entry: the target token ids (truncated/padded to 128
        tokens) in which every padding position is replaced by -100.
    """
    inputs = examples["input_text"]
    targets = examples["target_text"]

    # Tokenize inputs and targets separately
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_targets = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # The targets are padded to a fixed length, so most of each label row is padding.
    # -100 is the label value the loss ignores; leaving the pad ids in place would
    # train the model mostly on predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_targets["input_ids"]
    ]
    return model_inputs

# Map the preprocessing function across our dataset
# (batched=True, so preprocess_function receives lists of texts on each call).
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1876 [00:00<?, ? examples/s]

Map:   0%|          | 0/944 [00:00<?, ? examples/s]

Map:   0%|          | 0/1883 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized splits.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1876
    })
    val: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 944
    })
    test: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1883
    })
})

In [None]:
# Sentence-tokenizer data for nltk.sent_tokenize. Recent NLTK releases load the
# "punkt_tab" resource instead of "punkt", so fetch both to work with either.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")
def compute_metrics(eval_preds):
    """Compute ROUGE between the generated texts and the reference texts.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also be a tuple whose first element holds the ids.

    Returns:
        The result of ``metric.compute`` on the decoded predictions and labels.
    """
    preds, labels = eval_preds
    # The predictions can arrive wrapped in a tuple of outputs; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a decodable token id, so swap
    # it for the pad token in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Set up training arguments
from time import time
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir=f'/content/{time()}',  # timestamped, so every run writes to a fresh directory
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41) and the
    # old spelling is rejected by newer releases; the install cell upgrades transformers.
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept in output_dir
    num_train_epochs=2,
    predict_with_generate=True,  # evaluation/prediction use generate(), giving compute_metrics token ids
    push_to_hub=False
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],  # the dev split, stored under the 'val' key
    # NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
    # `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
%%time
# Fine-tune the model; the %%time magic above reports how long the cell takes.
trainer.train()

In [None]:
# Predict on the test set
predictions = trainer.predict(tokenized_dataset['test'])

# -100 can appear as padding in the gathered predictions and is not a decodable
# token id, so replace it with the pad token first.
generated_ids = np.where(
    predictions.predictions != -100,
    predictions.predictions,
    tokenizer.pad_token_id,
)

# Decode the generated outputs
decoded_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Print the first 5 examples (fewer if there are not that many predictions)
for i, text in enumerate(decoded_predictions[:5]):
    print("Example {}: {}".format(i+1, text))


Example 1: Individual who wakes up every morning
Example 2: Taking a dose 12 hours apart
Example 3: null
Example 4: null
Example 5: null


In [None]:
# Print every decoded prediction, one per line (one line per test example, so the output is long).
for pred in decoded_predictions:
    print(pred)

In [None]:
def create_predictions(data, role_details, decoded=None, verbose=False):
    """Attach the decoded model answers to each record as structured predictions.

    Answers are consumed strictly in order: record by record, then argument type
    (main -> event-specific -> subject-effect), then argument name as listed in
    ``role_details``. This is the same order in which ``create_instruction_data``
    emits its examples, so ``decoded`` must come from examples built from the
    same ``data`` and ``role_details``.

    Args:
        data: Iterable of records; each needs a 'label' key naming its event type.
        role_details: Mapping event label -> argument type -> argument name -> question.
        decoded: Sequence of decoded prediction strings. When omitted, the
            notebook-level ``decoded_predictions`` is used.
        verbose: When True, print each record, every (argument type, argument)
            pair with its raw prediction, and the resulting record.

    Returns:
        A list with one shallow copy of each input record, extended with a
        'predictions' key (argument type -> argument name -> list of strings).
        The input records themselves are left untouched.

    Raises:
        IndexError: If ``decoded`` holds fewer entries than there are arguments.
    """
    if decoded is None:
        decoded = decoded_predictions

    arg_types = ['main-arguments', 'event-specific-arguments', 'subject-effect-arguments']
    finallist = []
    i = 0  # position of the next unused entry in `decoded`
    for dt in data:
        if verbose:
            pprint(dt)
        label = dt['label']
        predictions = {}
        for arg_typ in arg_types:
            # Argument names for the event this record is labelled with.
            role_descriptions = role_details[label][arg_typ]
            predictions[arg_typ] = {}
            for arrg in role_descriptions:
                value = decoded[i]
                if verbose:
                    print(arg_typ, arrg)
                    print(value)
                # A prediction may list several comma-separated items; drop single
                # quotes and surrounding whitespace from each one.
                predictions[arg_typ][arrg] = [
                    item.replace("'", "").strip() for item in value.split(',')
                ]
                i += 1
        # Copy before adding the key so the caller's record is not modified.
        dx = dict(dt)
        dx['predictions'] = predictions
        if verbose:
            pprint(dx)
        finallist.append(dx)
    return finallist

In [None]:
# Pair each test record with its predictions (uses decoded_predictions from the prediction cell).
final_data = create_predictions(test, role_details)

In [None]:
def save_json(json_data, file_name):
    """Write ``json_data`` to ``file_name`` as JSON text.

    The object is serialized first, the target path is printed, and only then is
    the file opened and written.
    """
    payload = json.dumps(json_data)
    print(file_name)
    with open(file_name, "w") as handle:
        handle.write(payload)

# Run identifier used as the prefix of the output file name.
number = '2'
# NOTE(review): this rebinds the global folder_path that read_data() also reads, so
# re-running read_data() after this cell would look in the predictions folder.
folder_path= 'Argument-Extraction-Predictions'
file_name = f'{number}-generative-qa-flan-t5-base-predictions.json'
# Write the test records together with their predictions.
save_json(final_data, os.path.join(folder_path, file_name))

/content/drive/Shareddrives/Reddit Misinformation/Data/New-Avg-Run-Experiments/Argument-Extraction-Predictions/2-generative-qa-flan-t5-base-predictions.json
