In [1]:
import pandas as pd
import numpy as np

## Read Data

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from trl import SFTTrainer

In [3]:
# !nvidia-smi -L

In [4]:
peft_model_id="gpt2_finetuned"

## Evaluate the test data

In [5]:
model = AutoModelForCausalLM.from_pretrained(f'./{peft_model_id}')
tokenizer = AutoTokenizer.from_pretrained(f'./{peft_model_id}')
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
dataset_test = Dataset.from_pandas(pd.read_csv('./Data/hugging_face_test.csv'))

In [7]:
# dataset_test['text']

In [8]:
from random import randrange

In [54]:
sample = dataset_test['text'][randrange(len(dataset_test["text"]))]
print(f'{sample}')

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
what are the values of 

### Input:
Creation Date 01/22/2014 Purchase Date 11/01/2013 Fiscal Year 2013-2014 LPA Number nan Purchase Order Number 1PA3V013 Requisition Number nan Acquisition Type NON-IT Goods Sub-Acquisition Type nan Acquisition Method Informal Competitive Sub-Acquisition Method nan Department Name Forestry and Fire Protection, Department of Supplier Code 1076328.0 Supplier Name Highway Products Supplier Qualifications nan Supplier Zip Code 97503 CalCard NO Item Name PICKUP PACK Item Description ALUMINUM PICKUP PACK AND CENTER HATCH COVER FOR SECURE STORAGE OF FIRE
GEAR AND EQUIPMENT ON BATTALION CHIEF VEHICLES Quantity 2.0 Unit Price $4145.00 Total Price $8290.00 Classification Codes 25174808 Normalized UNSPSC 25174808.0 Commodity Title Vehicle accessories storage box Class 25174800.0 Class Title 

In [55]:
ind = sample.find('### Response:') + 13
ground_truth_len = len(sample)-ind

In [56]:
input_ids = tokenizer(sample[:ind], return_tensors="pt", truncation=True)#.input_ids.cuda()

In [57]:
import torch
with torch.no_grad():
    outputs = tokenizer.batch_decode(model.generate(**input_ids, 
                                                     max_new_tokens=40, 
                                                     do_sample=True, 
                                                     top_p=0.9, 
                                                     temperature=0.9
                                                   ),
                                     skip_special_tokens=True,
                                    pad_token_id = '')[0]
    #return_dict_in_generate=True, output_scores=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [58]:
ind_out = outputs.find('### Response:')

In [59]:
print(f'{outputs[ind_out:ind_out+ground_truth_len+13]}')

### Response:



In [15]:
# outputs

## Prediction all Test Cases

In [None]:
ground_truth = []
predicted = []
for i, sample in enumerate(dataset_test['text']):
    ind = sample.find('### Response:') + 13
    ground_truth_len = len(sample)-ind
    input_ids = tokenizer(sample[:ind], return_tensors="pt", truncation=True)

    # print(f'ground truth: {sample[ind:]}')
    ground_truth.append(sample[ind:])
    with torch.no_grad():
        outputs = tokenizer.batch_decode(model.generate(**input_ids, 
                                                         max_new_tokens=40, 
                                                         do_sample=True, 
                                                         top_p=0.9, 
                                                         temperature=0.9
                                                       ),
                                         skip_special_tokens=True,
                                        pad_token_id = '')[0]
    
    
    ind_out = outputs.find('### Response:')+13
    # print(f'predicted: {outputs[ind_out:ind_out+ground_truth_len]}')
    predicted.append(outputs[ind_out:ind_out+ground_truth_len])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [None]:
results = pd.DataFrame()
results['groundTruth'] = ground_truth
results['predicted'] = predicted

results.to_csv('predicted_value_imputation.csv', index=False)