### Insturction Model Inference

In [1]:
import pandas as pd
import os
import torch
from tqdm import tqdm
import transformers
import datasets 
from pprint import pprint
from datasets import load_dataset
from transformers import pipeline 
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

In [2]:
pwd

'/'

In [3]:
cd workspace

/workspace


In [4]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [5]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
2
NVIDIA GeForce RTX 3090


In [6]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [40]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

ADDITIONAL_SP_TOKENS = {'hl': '<hl>'}
tokenizer.add_special_tokens({'additional_special_tokens': list(ADDITIONAL_SP_TOKENS.values())})
model.resize_token_embeddings(len(tokenizer))
model.eval()
        
task_prefix = 'Given a passage and a highlighted answer, your goal is to generate a question about the answer. If you make a question, yes or no for first token "When" ? "There are six types of first token possible: \"What\", \"How\", \"Who\", \"When\", \"Where\", \"Which\",  \"Why\"."'
# use different length sentences to test batching
sentences = ["The majority report of the Financial Crisis Inquiry Commission, written by the six Democratic appointees, the minority report, written by 3 of the <hl> 4 <hl> Republican appointees, studies by Federal Reserve economists, and the work of several independent scholars generally contend that government affordable housing policy was not the primary cause of the financial crisis. Although they concede that governmental policies had some role in causing the crisis, they contend that GSE loans performed better than loans securitized by private investment banks, and performed better than some loans originated by institutions that held loans in their own portfolios. Paul Krugman has even claimed that the GSE never purchased subprime loans – a claim that is widely disputed."]
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
)

print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 32101. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


['yes']


### Question Generation

In [46]:
!python -m lm-question-generation.lmqg_inference.post_flan_t5_large_squad_qg_evaluation

[34m[1mwandb[0m: Currently logged in as: [33mminseok0809[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/wandb/run-20231026_081754-5wvg5dqt[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mpost_flan_t5_large_squad_qg[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/minseok0809/lmqg_qg_squad[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/minseok0809/lmqg_qg_squad/runs/5wvg5dqt[0m


Evaluation Time:  0:00:05
Valid Bleu 1:  0.592     Valid Bleu 2: 0.4391
Valid Bleu 3:  0.3469     Valid Bleu 4:  0.2808
Test Bleu 1:  0.6022     Test Bleu 2:  0.4432
Test Bleu 3:  0.346     Test Bleu 4:  0.2763


tmp_ckpt_flan_t5_large_squad_

In [None]:
# automatic evaluation에서 for idx, pair in enumerate(_pairs)의 if idx<=10 바꾸기
"""
if os.path.isfile(prediction_df_path) == True:
    prediction_df_path = 'prediction/prediction_validation.xlsx'
    
elif os.path.isfile(prediction_df_path) == False: 
    prediction_df_path = 'prediction/prediction_test.xlsx' """ 
# 파일명 이름 바꾸기 

# langugage_model.py의 for idx, encode in enumerate(loader)의 if idx<= 10 바꾸기
"""
if os.path.isfile(prediction_df_path) == True:
    prediction_df_path = 'prediction/prediction_validation.xlsx'
    
elif os.path.isfile(prediction_df_path) == False: 
    prediction_df_path = 'prediction/prediction_test.xlsx' """ 
# 파일명 이름 바꾸기 

# langugage_model.py의 def text_to_encode의 주석처리하기 (전체 데이터 하거나 cache 데이터 파일 변경할 때)
"""
if cache_path is not None and os.path.exists(cache_path):
    logging.info(f'loading preprocessed feature from {cache_path}')
    return pickle_load(cache_path)
"""

In [None]:
test_sample = pd.read_csv("data_sample/test_sample.csv")
test_prediction = pd.read_csv("prediction/prediction_test.csv")
test_prediction['Label'] = test_sample['question'].values.tolist()
test_prediction.to_csv("prediction/test_comparison.csv", index=False)
test_prediction.to_excel("prediction/test_comparison.xlsx", index=False)
dev_sample = pd.read_csv("data_sample/dev_sample.csv")
dev_prediction = pd.read_csv("prediction/prediction_validation.csv")
dev_prediction.to_csv("data/dev.csv")
dev_prediction['Label'] = dev_sample['question'].values.tolist()
dev_prediction.to_csv("prediction/dev_comparison.csv", index=False)
dev_prediction.to_excel("prediction/dev_comparison.xlsx", index=False)

### Question Completion

In [16]:
!python -m lm-question-generation.lmqg_collate_fn_inference.instruction_flan_t5_large_squad_qg_evaluation_q

[34m[1mwandb[0m: Currently logged in as: [33mminseok0809[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/workspace/wandb/run-20231031_030656-dhcp8wuf[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33minstruction_flan_t5_large_squad_qg_q[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/minseok0809/lmqg_qg_squad[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/minseok0809/lmqg_qg_squad/runs/dhcp8wuf[0m
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32101. This might induce some performance reduction as *Tensor Cores* will not be available. F

### Token Prediction by Mixed Subtask

In [64]:
test_insturction = pd.read_csv("workspace/data_wh_plus/test_instruction.csv")
test_insturction['input'] = test_insturction['instruction'] + " : " + test_insturction['paragraph_answer']
test_insturction_next_token = test_insturction[test_insturction['input'].str.contains("Given a paragraph and an answer, what is next")]
test_insturction_classification = test_insturction[test_insturction['input'].str.contains("Given a passage and a highlighted answer, your goal is to generate a question about the answer.")]
test_insturction_next_token = test_insturction_next_token.reset_index()
del test_insturction_next_token['index']
test_insturction_classification = test_insturction_classification.reset_index()
del test_insturction_classification['index']
test_insturction_next_token.to_csv("workspace/data/test_next_token.csv")
test_insturction_classification.to_csv("workspace/data/test_classification.csv")

train_insturction = pd.read_csv("workspace/data_wh_plus/train_instruction.csv")
train_insturction['input'] = train_insturction['instruction'] + " : " + train_insturction['paragraph_answer']
train_insturction_next_token = train_insturction[train_insturction['input'].str.contains("Given a paragraph and an answer, what is next")]
train_insturction_classification = train_insturction[train_insturction['input'].str.contains("Given a passage and a highlighted answer, your goal is to generate a question about the answer.")]
train_insturction_next_token = train_insturction_next_token.reset_index()
del train_insturction_next_token['index']
train_insturction_classification = train_insturction_classification.reset_index()
del train_insturction_classification['index']
train_insturction_next_token.to_csv("workspace/data/train_next_token.csv")
train_insturction_classification.to_csv("workspace/data/train_classification.csv")

dev_insturction = pd.read_csv("workspace/data_wh_plus/dev_instruction.csv")
dev_insturction['input'] = dev_insturction['instruction'] + " : " + dev_insturction['paragraph_answer']
dev_insturction_next_token = dev_insturction[dev_insturction['input'].str.contains("Given a paragraph and an answer, what is next")]
dev_insturction_classification = dev_insturction[dev_insturction['input'].str.contains("Given a passage and a highlighted answer, your goal is to generate a question about the answer.")]
dev_insturction_next_token = dev_insturction_next_token.reset_index()
del dev_insturction_next_token['index']
dev_insturction_classification = dev_insturction_classification.reset_index()
del dev_insturction_classification['index']
dev_insturction_next_token.to_csv("workspace/data/dev_next_token.csv")
dev_insturction_classification.to_csv("workspace/data/dev_classification.csv")

In [69]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/small_model_wh_plus/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data/test_classification.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

data_files = {"train": "train_classification.csv", "validation": "dev_classification.csv", "test": "test_classification.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_plus_small_classifcation_epoch_3.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_plus_small_classifcation_epoch_3.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_plus_small_classifcation_epoch_3.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

import nltk
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(reference_texts, generated_texts, blue_num):
    # print("The number of sentece:", len(reference_texts))
    bleu_scores = []
    for idx, (reference_text, generated_text) in enumerate(zip(reference_texts, generated_texts)):
        #if idx % 1000 == 0:
        #    print(idx, end=" ")
        reference = reference_text.split()
        candidate = generated_text.split()
        if blue_num == 1:
            bleu_score = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0))
        elif blue_num == 2:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 1, 0, 0))
        elif blue_num == 3:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 1, 0))
        elif blue_num == 4:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 0, 1))
        bleu_scores.append(bleu_score)
    
    return {
            'bleu' + ' ' + str(blue_num) : sum(bleu_scores) / len(bleu_scores)
        }
    
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

print("\n")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/small_model_wh_plus/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data/test_next_token.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

data_files = {"train": "train_next_token.csv", "validation": "dev_next_token.csv", "test": "test_next_token.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_plus_small_next_token_epoch_3.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_plus_small_next_token_epoch_3.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_plus_small_next_token_epoch_3.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

def evaluate(y_test, x_test):
    sum = 0
    for i, j in zip(y_test, x_test):
        if i == j:
            sum += 1
    accuracy = sum / len(y_test)
    return accuracy 

def get_counts(seq): 
    counts = {}
    for x in seq:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_plus_small_next_token_epoch_3.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_plus_small_next_token_epoch_3.xlsx", index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


Text Size: 11864
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.5037424537281828}

{'bleu 2': 0.4771999325691208}

{'bleu 3': 0.4506490222521915}

{'bleu 4': 0.3849460552933243}


Text Size: 9018
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 

Accuracy: 0.1317365269461078


In [67]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/small_model_wh_plus/epoch_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data/test_classification.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

data_files = {"train": "train_classification.csv", "validation": "dev_classification.csv", "test": "test_classification.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_plus_small_classifcation_epoch_6.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_plus_small_classifcation_epoch_6.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_plus_small_classifcation_epoch_6.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

import nltk
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(reference_texts, generated_texts, blue_num):
    # print("The number of sentece:", len(reference_texts))
    bleu_scores = []
    for idx, (reference_text, generated_text) in enumerate(zip(reference_texts, generated_texts)):
        #if idx % 1000 == 0:
        #    print(idx, end=" ")
        reference = reference_text.split()
        candidate = generated_text.split()
        if blue_num == 1:
            bleu_score = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0))
        elif blue_num == 2:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 1, 0, 0))
        elif blue_num == 3:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 1, 0))
        elif blue_num == 4:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 0, 1))
        bleu_scores.append(bleu_score)
    
    return {
            'bleu' + ' ' + str(blue_num) : sum(bleu_scores) / len(bleu_scores)
        }
    
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

print("\n")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/small_model_wh_plus/epoch_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data/test_next_token.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

data_files = {"train": "train_next_token.csv", "validation": "dev_next_token.csv", "test": "test_next_token.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_plus_small_next_token_epoch_6.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_plus_small_next_token_epoch_6.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_plus_small_next_token_epoch_6.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

def evaluate(y_test, x_test):
    sum = 0
    for i, j in zip(y_test, x_test):
        if i == j:
            sum += 1
    accuracy = sum / len(y_test)
    return accuracy 

def get_counts(seq): 
    counts = {}
    for x in seq:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_plus_small_next_token_epoch_6.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_plus_small_next_token_epoch_6.xlsx", index=False)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


Text Size: 11864
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.5770079266476259}

{'bleu 2': 0.5632305012362352}

{'bleu 3': 0.5434086311530681}

{'bleu 4': 0.5149190829399866}




Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Text Size: 9018
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 

Accuracy: 0.2162341982701264


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/small_model_wh_plus/epoch_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data/test_classification.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

data_files = {"train": "train_classification.csv", "validation": "dev_classification.csv", "test": "test_classification.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_plus_small_classifcation_epoch_6.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_plus_small_classifcation_epoch_6.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_plus_small_classifcation_epoch_6.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

import nltk
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(reference_texts, generated_texts, blue_num):
    # print("The number of sentece:", len(reference_texts))
    bleu_scores = []
    for idx, (reference_text, generated_text) in enumerate(zip(reference_texts, generated_texts)):
        #if idx % 1000 == 0:
        #    print(idx, end=" ")
        reference = reference_text.split()
        candidate = generated_text.split()
        if blue_num == 1:
            bleu_score = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0))
        elif blue_num == 2:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 1, 0, 0))
        elif blue_num == 3:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 1, 0))
        elif blue_num == 4:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 0, 1))
        bleu_scores.append(bleu_score)
    
    return {
            'bleu' + ' ' + str(blue_num) : sum(bleu_scores) / len(bleu_scores)
        }
    
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

print("\n")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/small_model_wh_plus/epoch_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data/test_next_token.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

data_files = {"train": "train_next_token.csv", "validation": "dev_next_token.csv", "test": "test_next_token.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_plus_small_next_token_epoch_6.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_plus_small_next_token_epoch_6.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_plus_small_next_token_epoch_6.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

def evaluate(y_test, x_test):
    sum = 0
    for i, j in zip(y_test, x_test):
        if i == j:
            sum += 1
    accuracy = sum / len(y_test)
    return accuracy 

def get_counts(seq): 
    counts = {}
    for x in seq:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_plus_small_next_token_epoch_6.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_plus_small_next_token_epoch_6.xlsx", index=False)

### Token Prediction by Random Long WH Prompt

In [7]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(reference_texts, generated_texts, blue_num):
    # print("The number of sentece:", len(reference_texts))
    bleu_scores = []
    for idx, (reference_text, generated_text) in enumerate(zip(reference_texts, generated_texts)):
        #if idx % 1000 == 0:
        #    print(idx, end=" ")
        reference = reference_text.split()
        candidate = generated_text.split()
        if blue_num == 1:
            bleu_score = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0))
        elif blue_num == 2:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 1, 0, 0))
        elif blue_num == 3:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 1, 0))
        elif blue_num == 4:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 0, 1))
        bleu_scores.append(bleu_score)
    
    return {
            'bleu' + ' ' + str(blue_num) : sum(bleu_scores) / len(bleu_scores)
        }

In [None]:
lmqg-eval -m "lmqg/t5-large-squad-qg" -e "./eval_metrics" -d "lmqg/qg_squad" -l "en"

In [8]:
test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
test_insturction['input'] = test_insturction['instruction'] + " : " + test_insturction['paragraph_answer']
test_insturction.to_csv("workspace/data/test.csv")
train_insturction = pd.read_csv("workspace/data_wh_zero/train_instruction.csv")
train_insturction['input'] = train_insturction['instruction'] + " : " + train_insturction['paragraph_answer']
train_insturction.to_csv("workspace/data/train.csv")
dev_insturction = pd.read_csv("workspace/data_wh_zero/dev_instruction.csv")
dev_insturction['input'] = dev_insturction['instruction'] + " : " + dev_insturction['paragraph_answer']
dev_insturction.to_csv("workspace/data/dev.csv")

In [9]:
data_files = {"train": "train.csv", "validation": "dev.csv", "test": "test.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_1.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.5392525393278271}

{'bleu 2': 0.5290138581672758}

{'bleu 3': 0.5111682400539447}

{'bleu 4': 0.4939312204989885}


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_1.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.8892974642190106}

{'bleu 2': 0.883723870532704}

{'bleu 3': 0.8605866486850978}

{'bleu 4': 0.8379130141604855}


: 

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_1.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.9307220006778704}

{'bleu 2': 0.9170030341807663}

{'bleu 3': 0.888668931319505}

{'bleu 4': 0.8627595078205555}


In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_1.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_1.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.5392525393278271}

{'bleu 2': 0.5290138581672758}

{'bleu 3': 0.5111682400539447}

{'bleu 4': 0.4939312204989885}


In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_2.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_2.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_2.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.8697595126176412}

{'bleu 2': 0.8636912789390874}

{'bleu 3': 0.8414531355360755}

{'bleu 4': 0.8190323668240054}


In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_3.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_3.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_3.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.8892974642190106}

{'bleu 2': 0.883723870532704}

{'bleu 3': 0.8605866486850978}

{'bleu 4': 0.8379130141604855}


In [37]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_4"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_4.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_4.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_4.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.8982885575152644}

{'bleu 2': 0.8927848954821311}

{'bleu 3': 0.8692683749157114}

{'bleu 4': 0.8460047201618341}


In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_5.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_5.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_5.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.9139666795419563}

{'bleu 2': 0.9057533939358806}

{'bleu 3': 0.8799730276466622}

{'bleu 4': 0.855192178017532}


In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_wh_zero/model_mtbhfb/epoch_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_large_epoch_6.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_large_epoch_6.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_large_epoch_6.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.9307220006778704}

{'bleu 2': 0.9170030341807663}

{'bleu 3': 0.888668931319505}

{'bleu 4': 0.8627595078205555}


#### Small Model

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh_zero/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_epoch_3.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_epoch_3.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_epoch_3.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

{'bleu 1': 0.5053509612579646}

{'bleu 2': 0.47682063385030765}

{'bleu 3': 0.44951112609575183}

{'bleu 4': 0.3796358732299393}


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh_zero/epoch_7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_epoch_7.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_epoch_7.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_epoch_7.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 {'bleu 1': 0.5279754274301385}

{'bleu 2': 0.5030765340526012}

{'bleu 3': 0.47850640593391774}

{'bleu 4': 0.418745785569791}


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh_zero/epoch_10"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_epoch_10.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_epoch_10.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_epoch_10.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 {'bleu 1': 0.7904469335827348}

{'bleu 2': 0.7795666554941918}

{'bleu 3': 0.7567244437612164}

{'bleu 4': 0.724020465339099}


In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh_zero/epoch_13"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_epoch_13.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_epoch_13.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_epoch_13.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.8477668401617663}

{'bleu 2': 0.8408099160108934}

{'bleu 3': 0.8187795010114632}

{'bleu 4': 0.7934929197572488}


In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh_zero/epoch_20"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh_zero/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_zero_epoch_20.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_zero_epoch_20.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_zero_epoch_20.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print("\n")
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

Text Size: 11864
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

{'bleu 1': 0.8591525836414106}

{'bleu 2': 0.8530569323070248}

{'bleu 3': 0.8318548199315497}

{'bleu 4': 0.8091706001348618}


### Token Prediction by Long WH Prompt

In [39]:
def evaluate(y_test, x_test):
    sum = 0
    for i, j in zip(y_test, x_test):
        if i == j:
            sum += 1
    accuracy = sum / len(y_test)
    return accuracy 

def get_counts(seq): 
    counts = {}
    for x in seq:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

In [40]:
test_insturction = pd.read_csv("workspace/data_wh/test_instruction.csv")
test_insturction['input'] = test_insturction['instruction'] + " : " + test_insturction['paragraph_answer']
test_insturction.to_csv("workspace/data/test.csv")
train_insturction = pd.read_csv("workspace/data_wh/train_instruction.csv")
train_insturction['input'] = train_insturction['instruction'] + " : " + train_insturction['paragraph_answer']
train_insturction.to_csv("workspace/data/train.csv")
dev_insturction = pd.read_csv("workspace/data_wh/dev_instruction.csv")
dev_insturction['input'] = dev_insturction['instruction'] + " : " + dev_insturction['paragraph_answer']
dev_insturction.to_csv("workspace/data/dev.csv")

In [41]:
data_files = {"train": "train.csv", "validation": "dev.csv", "test": "test.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/large_model_wh/epoch_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_long_large_epoch1.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_long_large_epoch1.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_long_large_epoch1.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_long_large_epoch1.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_long_large_epoch1.xlsx", index=False)

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_long_epoch3.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_long_epoch3.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_long_epoch3.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_long_epoch3.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_long_epoch3.xlsx", index=False)

Text Size: 11289
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 0.5377801399592523


In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh/epoch_7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_long_epoch7.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_long_epoch3.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_long_epoch7.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_long_epoch7.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_long_epoch7.xlsx", index=False)

Text Size: 11289
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

Accuracy: 0.5381344671804411


In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/model_wh/epoch_10"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_wh/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=32, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 1000 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
# result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/comparison_wh_long_epoch10.csv", index=False)
result_df.to_excel("workspace/inference/comparison_wh_long_epoch10.xlsx", index=False)

inference_result = pd.read_csv("workspace/inference/comparison_wh_long_epoch10.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print("\n")
print("Accuracy:", accuracy)

prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/predcition_counts_wh_long_epoch_10.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts_wh_long_epoch_10.xlsx", index=False)

Text Size: 11289
0 

Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors


1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

Accuracy: 0.5381344671804411


### WH Token Prediction

In [None]:
def evaluate(y_test, x_test):
    sum = 0
    for i, j in zip(y_test, x_test):
        if i == j:
            sum += 1
    accuracy = sum / len(y_test)
    return accuracy 

def get_counts(seq): 
    counts = {}
    for x in seq:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

In [40]:
test_insturction = pd.read_csv("workspace/data_first_token/test_instruction.csv")
test_insturction['input'] = test_insturction['instruction'] + " : " + test_insturction['paragraph_answer']
test_insturction.to_csv("workspace/data/test.csv")
train_insturction = pd.read_csv("workspace/data_first_token/train_instruction.csv")
train_insturction['input'] = train_insturction['instruction'] + " : " + train_insturction['paragraph_answer']
train_insturction.to_csv("workspace/data/train.csv")
dev_insturction = pd.read_csv("workspace/data_first_token/dev_instruction.csv")
dev_insturction['input'] = dev_insturction['instruction'] + " : " + dev_insturction['paragraph_answer']
dev_insturction.to_csv("workspace/data/dev.csv")

In [None]:
data_files = {"train": "train.csv", "validation": "dev.csv", "test": "test.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/token_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_first_token/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=8, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 100 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/result3.csv", index=False)
result_df.to_excel("workspace/inference/result3.xlsx", index=False)

Text Size: 8142
0 100 

Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors


200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 

In [25]:
inference_result = pd.read_csv("workspace/inference/result3.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print(accuracy)
prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/lmqg_counts3.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts3.xlsx", index=False)

0.43711618766887744


### Next Token Predicition

In [None]:
def evaluate(y_test, x_test):
    sum = 0
    for i, j in zip(y_test, x_test):
        if i == j:
            sum += 1
    accuracy = sum / len(y_test)
    return accuracy 

def get_counts(seq): 
    counts = {}
    for x in seq:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

In [30]:
test_insturction = pd.read_csv("workspace/data_first_and_second/test_instruction.csv")
test_insturction['input'] = test_insturction['instruction'] + " : " + test_insturction['paragraph_answer']
test_insturction.to_csv("workspace/data/test.csv", index=False)
train_insturction = pd.read_csv("workspace/data_first_and_second/train_instruction.csv")
train_insturction['input'] = train_insturction['instruction'] + " : " + train_insturction['paragraph_answer']
train_insturction.to_csv("workspace/data/train.csv", index=False)
dev_insturction = pd.read_csv("workspace/data_first_and_second/dev_instruction.csv")
dev_insturction['input'] = dev_insturction['instruction'] + " : " + dev_insturction['paragraph_answer']
dev_insturction.to_csv("workspace/data/dev.csv", index=False)

In [34]:
data_files = {"train": "train.csv", "validation": "dev.csv", "test": "test.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_first_and_second/model_fznckd/epoch_3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_first_and_second/test_instruction.csv")
inference_input_sentences = test_insturction['answer'].values.tolist()
predictions = []

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=20, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 100 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Answer':dataset['test']['answer']})
result_df['Comparsion'] = result_df['Prediction'] == result_df['Answer']
result_df.to_csv("workspace/inference/result2.csv", index=False)
result_df.to_excel("workspace/inference/result2.xlsx", index=False)

Text Size: 9018
0 100 200 

Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 

In [36]:
inference_result = pd.read_csv("workspace/inference/result2.csv")
prediction = inference_result['Prediction']
label = inference_result['Answer']

accuracy = evaluate(prediction, label)
llmqg_counts = get_counts(prediction)
inference_counts = get_counts(label)
print(accuracy)
prediction_df = pd.DataFrame(list(llmqg_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
inference_df = pd.DataFrame(list(inference_counts.items()), columns=["Value", "Count"]).sort_values('Count', ascending=False)
prediction_df.to_excel("workspace/inference/lmqg_counts.xlsx", index=False)
inference_df.to_excel("workspace/inference/inference_counts.xlsx", index=False)

0.49312486138833445


### Sentence Generation

In [None]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(reference_texts, generated_texts, blue_num):
    # print("The number of sentece:", len(reference_texts))
    bleu_scores = []
    for idx, (reference_text, generated_text) in enumerate(zip(reference_texts, generated_texts)):
        #if idx % 1000 == 0:
        #    print(idx, end=" ")
        reference = reference_text.split()
        candidate = generated_text.split()
        if blue_num == 1:
            bleu_score = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0))
        elif blue_num == 2:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 1, 0, 0))
        elif blue_num == 3:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 1, 0))
        elif blue_num == 4:
            bleu_score = sentence_bleu([reference], candidate, weights=(0, 0, 0, 1))
        bleu_scores.append(bleu_score)
    
    return {
            'bleu' + ' ' + str(blue_num) : sum(bleu_scores) / len(bleu_scores)
        }

In [None]:
test_insturction = pd.read_csv("workspace/data_curious2/test_instruction.csv")
test_insturction['input'] = test_insturction['instruction'] + " : " + test_insturction['paragraph_answer']
test_insturction.to_csv("workspace/data/test.csv", index=False)
train_insturction = pd.read_csv("workspace/data_curious2/train_instruction.csv")
train_insturction['input'] = train_insturction['instruction'] + " : " + train_insturction['paragraph_answer']
train_insturction.to_csv("workspace/data/train.csv", index=False)
dev_insturction = pd.read_csv("workspace/data_curious2/dev_instruction.csv")
dev_insturction['input'] = dev_insturction['instruction'] + " : " + dev_insturction['paragraph_answer']
dev_insturction.to_csv("workspace/data/dev.csv", index=False)

In [None]:
data_files = {"train": "train.csv", "validation": "dev.csv", "test": "test.csv"}
dataset = load_dataset("workspace/data/", data_files=data_files)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_curious2/model_qdrkjl/epoch_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
text_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)

test_insturction = pd.read_csv("workspace/data_curious2/test_instruction.csv")
inference_input_sentences = test_insturction['sentence'].values.tolist()
predictions = []
dataset = load_dataset("workspace/data/")

result_j = text_generator(KeyDataset(dataset['test'], 'input'), max_length=512, batch_size=32)
print("Text Size:", len(dataset['test']))
for idx, extracted_entities in enumerate(result_j):
    if idx % 100 == 0:
        print(idx, end=" ")
    for entity in extracted_entities:
        predictions.append(entity['generated_text'])

result_df = pd.DataFrame({'Prediction':predictions,
                                'Sentence':dataset['test']['paragraph_answer']})

result_df.to_csv("workspace/inference/result.csv", index=False)
result_df.to_excel("workspace/inference/result.xlsx", index=False)

In [76]:
inference_result = pd.read_csv("workspace/inference/result.csv")
prediction = inference_result['Prediction']
label = inference_result['sentence']
bleu_num = 1
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 2
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 3
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)
print()
bleu_num = 4
accuracy = compute_metrics(prediction, label, bleu_num)
print(accuracy)

{'bleu 1': 0.9362544497922547}

{'bleu 2': 0.9335188260055463}

{'bleu 3': 0.9309754167083432}

{'bleu 4': 0.9283027265343414}


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

test_insturction = pd.read_csv("workspace/data_curious2/test_instruction.csv")
inference_input_instructions = test_insturction['instruction'].values.tolist()
inference_input_sentences = test_insturction['sentence'].values.tolist()
inference_input_paragraph_answers = test_insturction['paragraph_answer'].values.tolist()

model_name = "workspace/tmp_instruction_flan_t5_large_squad_qg_curious2/model_qdrkjl/epoch_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
ADDITIONAL_SP_TOKENS = {'hl': '<hl>'}
tokenizer.add_special_tokens({'additional_special_tokens': list(ADDITIONAL_SP_TOKENS.values())})
model.resize_token_embeddings(len(tokenizer))          
model.eval()

lmqg_model = []
idx = 0
num = 1000
prefixs = inference_input_instructions
paragraphs = inference_input_paragraph_answers
sentences = inference_input_sentences

# prefix = "Generate setence. : "
# paragraph = ["The term melting pot was first coined to describe densely populated immigrant neighborhoods on the Lower East Side. Emilly Know about it."]
# sentences = ["The majority report of the Financial Crisis Inquiry Commission, written by the six Democratic appointees, the minority report, written by 3 of the <hl> 4 <hl> Republican appointees, studies by Federal Reserve economists, and the work of several independent scholars generally contend that government affordable housing policy was not the primary cause of the financial crisis. Although they concede that governmental policies had some role in causing the crisis, they contend that GSE loans performed better than loans securitized by private investment banks, and performed better than some loans originated by institutions that held loans in their own portfolios. Paul Krugman has even claimed that the GSE never purchased subprime loans – a claim that is widely disputed."]

for task_prefix, sentence in zip(tqdm(prefixs), paragraphs):
    inputs = tokenizer([task_prefix + " : " + sentence], return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask=inputs["attention_mask"].to(device)

    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        do_sample=False,  # disable sampling to test if batching affects output
    )

    lmqg_model.append(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

# pbar.close()
result_df = pd.DataFrame({"Prediction": lmqg_model, "Sentence": sentences})
result_df.to_excel("workspace/inference/sentence.xlsx", index=False)
result_df.to_csv("workspace/inference/sentence.csv", index=False)

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

def compute_metrics(reference_texts, generated_texts):
    bleu_scores = []
    for reference_text, generated_text in zip(reference_texts, generated_texts):
        bleu_score = sentence_bleu(reference_text, generated_text)
        bleu_scores.append(bleu_score)

    return {
            'bleu': sum(bleu_scores) / len(bleu_scores)
        }

In [None]:
accuracy = compute_metrics(inference_input_sentences[:1000], lmqg_model)
print(accuracy)