In [1]:
import pandas as pd
import numpy as np

## Imports and Setup

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from trl import SFTTrainer

In [3]:
# !nvidia-smi -L

In [4]:
# Name of the local directory holding the fine-tuned checkpoint; the loading
# cell resolves it relative to the notebook's working directory.
peft_model_id = "gpt2_finetuned"

## Evaluate the test data

In [5]:
# Load the causal-LM weights and the matching tokenizer from the same local
# checkpoint directory.
checkpoint_dir = f'./{peft_model_id}'
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
# Put the model in evaluation mode; as the last expression of the cell this
# also displays the module structure.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Read the test CSV with pandas, then wrap the frame as a `datasets.Dataset`.
test_frame = pd.read_csv('./Data/hugging_face_test.csv')
dataset_test = Dataset.from_pandas(test_frame)

In [7]:
# dataset_test['text']

In [8]:
from random import randrange

In [9]:
# Pick one test example at random and display its full text.
test_texts = dataset_test['text']
sample = test_texts[randrange(len(test_texts))]
print(sample)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
what are the values of Fiscal Year,Department Name,CalCard,Unit Price,

### Input:
Creation Date 04/14/2014 Purchase Date 04/14/2014 LPA Number nan Purchase Order Number 4500228479 Requisition Number nan Acquisition Type NON-IT Goods Sub-Acquisition Type nan Acquisition Method Fair and Reasonable Sub-Acquisition Method nan Supplier Code 1763613.0 Supplier Name Bay Medical Co., Inc Supplier Qualifications CA-SB CA-SBE DBE MBE MOSB SB Supplier Zip Code nan Item Name Take Tablets Label  17915 Item Description Take Tablets Label  17915 Quantity 6.0 Total Price $93.24 Classification Codes 42142301 Normalized UNSPSC 42142301.0 Commodity Title General use medical labels Class 42142300.0 Class Title Medical documentation products Family 42140000.0 Family Title Patient care and treatment products and supplies Segment 4200

In [10]:
# Split the sample at the response marker: everything up to and including the
# marker is the prompt, whatever follows is the ground-truth answer.
response_marker = '### Response:'
marker_pos = sample.find(response_marker)
if marker_pos == -1:
    # str.find returns -1 when the marker is absent; adding the marker length
    # to that would silently yield a meaningless split index.
    raise ValueError(f'sample does not contain {response_marker!r}')
ind = marker_pos + len(response_marker)  # index just past the marker
ground_truth_len = len(sample) - ind     # number of ground-truth characters

In [11]:
# Tokenize only the prompt part of the sample (everything before index `ind`).
# NOTE: despite its name, `input_ids` holds the tokenizer's whole return value,
# which the generation cell unpacks as keyword arguments.
prompt_text = sample[:ind]
input_ids = tokenizer(prompt_text, return_tensors="pt", truncation=True)

In [12]:
import torch

# Sample one completion for the prompt; no gradients are needed for inference.
with torch.no_grad():
    generated_ids = model.generate(
        **input_ids,
        max_new_tokens=40,
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
        # `pad_token_id` is a generation argument, not a decoding one. Passing
        # the EOS id explicitly here replaces the implicit fallback that
        # otherwise emits the "Setting `pad_token_id` to `eos_token_id`"
        # warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the single generated sequence back to text (prompt + completion),
# dropping special tokens.
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [13]:
# Locate the start of the response section in the decoded text
# (str.find yields -1 if the marker is not present).
response_marker = '### Response:'
ind_out = outputs.find(response_marker)

In [14]:
# Show the marker itself (13 characters) followed by as many generated
# characters as the ground-truth answer contains.
response_end = ind_out + ground_truth_len + 13
print(outputs[ind_out:response_end])

### Response:
2013-2014, Correctional Health Care Services, NO, $12.28,ra


In [15]:
# outputs

## Predictions for All Test Cases

In [74]:
# Disabled batch-inference loop, kept for reference. For every test sample it
# splits off the prompt at '### Response:', samples a completion, and collects
# the ground-truth text together with the generated text cut to the same
# number of characters.
# NOTE(review): `pad_token_id = ''` is passed to `batch_decode` here rather
# than to `model.generate` -- confirm/fix before re-enabling this cell.
# ground_truth = []
# predicted = []
# for i, sample in enumerate(dataset_test['text']):
#     ind = sample.find('### Response:') + 13
#     ground_truth_len = len(sample)-ind
#     input_ids = tokenizer(sample[:ind], return_tensors="pt", truncation=True)

#     # print(f'ground truth: {sample[ind:]}')
#     ground_truth.append(sample[ind:])
#     with torch.no_grad():
#         outputs = tokenizer.batch_decode(model.generate(**input_ids, 
#                                                          max_new_tokens=40, 
#                                                          do_sample=True, 
#                                                          top_p=0.9, 
#                                                          temperature=0.9
#                                                        ),
#                                          skip_special_tokens=True,
#                                         pad_token_id = '')[0]
    
    
#     ind_out = outputs.find('### Response:')+13
#     # print(f'predicted: {outputs[ind_out:ind_out+ground_truth_len]}')
#     predicted.append(outputs[ind_out:ind_out+ground_truth_len])

In [75]:
# Disabled: builds a two-column frame (groundTruth, predicted) from the
# collected lists and writes it to 'predicted_value_imputation.csv'.
# results = pd.DataFrame()
# results['groundTruth'] = ground_truth
# results['predicted'] = predicted

# results.to_csv('predicted_value_imputation.csv', index=False)

In [81]:
# Load the saved ground-truth / prediction pairs. Both columns are free text,
# so read them as strings and turn off pandas' default NA parsing: otherwise an
# empty cell, or one whose whole content is e.g. "nan" or "NA", is loaded as a
# float NaN, which the tokenizer and the edit-distance code cannot handle.
results = pd.read_csv(
    'predicted_value_imputation.csv',
    dtype=str,
    keep_default_na=False,
)

In [82]:
# !pip install evaluate
# !pip install sklearn
# !pip install scikit-learn
# !pip install editdistance

In [83]:
# Display the column labels of the loaded results frame.
results.columns

Index(['groundTruth', 'predicted'], dtype='object')

In [84]:
import evaluate
# Metric bundle used for scoring; only accuracy is enabled
# (f1 / precision / recall were left switched off).
metric_names = ["accuracy"]
clf_metrics = evaluate.combine(metric_names)

In [85]:
# Token-level accuracy: tokenize every ground-truth and predicted string, then
# feed the aligned token ids of each pair to the metric.
# NOTE: each pair is cut to the shorter of the two token sequences, so tokens
# beyond that length are not scored at all.
ref_token_ids = tokenizer(results.groundTruth.to_list())['input_ids']
pred_token_ids = tokenizer(results.predicted.to_list())['input_ids']
for ref_ids, pred_ids in zip(ref_token_ids, pred_token_ids):
    overlap = min(len(ref_ids), len(pred_ids))
    clf_metrics.add_batch(
        references=ref_ids[:overlap],
        predictions=pred_ids[:overlap],
    )


In [86]:
# Aggregate everything added to the metric so far and report it.
accuracy_scores = clf_metrics.compute()
print(f'Overall prediction accuracy: {accuracy_scores}')

Overall prediction accuracy: {'accuracy': 0.7191939783597303}


## Edit Distance

In [87]:
import editdistance

In [88]:
import random

# Inspect one randomly chosen ground-truth / prediction pair.
# randrange excludes its upper bound; random.randint(0, n) may return n itself,
# which would index one past the last row.
idx = random.randrange(results.shape[0])
ref = results.groundTruth.iloc[idx]
pred = results.predicted.iloc[idx]

# Token ids of both strings, kept available for a token-level comparison.
value_ref = tokenizer(ref)['input_ids']
value_pred = tokenizer(pred)['input_ids']

# editdistance.eval(value_ref, value_pred)
print(f'reference: {ref}\npredicted: {pred}')

# Character-level edit distance between the two raw strings.
print(f'Edit distance: {editdistance.eval(ref, pred)}')

reference: 
nan, Duncan Distribution CO LLC, 43221721.0, 
predicted: 
nan, Rumex Communications, 43221721.0,      

Edit distance: 26


## Total Edit Distance

In [89]:
# Character-level edit distance of every pair, normalized by the length of the
# longer string so each value lies in [0, 1].
ed_distance = []
for ref, pred in zip(results.groundTruth.to_list(), results.predicted.to_list()):
    longest = max(len(ref), len(pred))
    # Two empty strings are identical: report 0 instead of dividing by zero.
    ed_distance.append(editdistance.eval(ref, pred) / longest if longest else 0.0)

# Note: the "total" printed below is the sum of the normalized per-pair values,
# not a raw character count.
print(f'Total edit distance: {sum(ed_distance): .2f}, # of prediction: {len(ed_distance)}')
if ed_distance:
    print(f'Normalized edit distance: {sum(ed_distance)/len(ed_distance) : .2f}')
else:
    # No rows at all: the mean is undefined, so say so rather than crash.
    print('Normalized edit distance: n/a (no predictions)')

Total edit distance:  524.20, # of prediction: 3461
Normalized edit distance:  0.15
