In [1]:
import torch
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig, PeftModel, get_peft_config, get_peft_model
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import pandas as pd

In [2]:
## Call basemodel(llama3.1 8B) 
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    device_map='auto', 
    torch_dtype=torch.bfloat16)

tokenizer.pad_token = tokenizer.eos_token
# print(base_model.hf_device_map)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
## load data
data_pth = "/home/broodling/finQA/datasets/FinQA/dataset/train.json"

questions = []
pretext = []
posttext = []

def sequence(text_list):
  st = ""
  for line in text_list:
    if(line == "."):
      pass
    else:
      st += line + " "
  return st


with open(data_pth, "r") as file:
  data = json.load(file)

for i in data:
  questions.append(i['qa']['question'])
  pretext.append(sequence(i['pre_text']))
  posttext.append(sequence(i['post_text']))

print("Train FinQA Table Datasets: ", len(pretext), len(posttext), len(questions))
print(pretext[0])
print(posttext[0])

Train FinQA Table Datasets:  6251 6251 6251
interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% 

In [4]:
## prompt for text to table
sys_prompt = """Your job is to transform unstructured text into structured data(table). Extract numerical information from the provided text and generate a table based on the context and meaning of the numbers. The table should be labeled with appropriate column and row names that accurately represent the extracted data.

The input will be a text passage. Your goal is to:
1. Identify and extract all the numbers present in the text. Given text might contain percentage(%), dollors($) and other forms of number. 
2. For each number, provide a brief explanation of what the number represents or refers to in the context of the text. 
3. Based on the numbers and their explanations, define appropriate column names for a table that could represent the key information in the text. The numbers should be included in val, not column names.

The output SHOULD FOLLOW below format(each line indicates one extracted numbers):
|<cell>| <col> column name </col> <val> extracted number </val> |</cell>|
"""

user_prompt = """Text: In 2023, the company achieved a revenue of $ (10) million, which was a 25 (%) increase from the previous year. The profit margin was 15%.
Table: """

assistant_prompt = """|<cell>| <col> Revenue (in millions) Year 2023 </col> <val> 10 </val> |</cell>|
|<cell>| <col> Growth (%) Year 2023 </col> <val> 25 </val> |</cell>|
|<cell>| <col> Profit Margin (%) Year 2023 </col> <val> 15 </val> |</cell>|
"""

messages =[
  {"role": "system", "content": sys_prompt},
  {"role": "user", "content": user_prompt},
  {"role": "assistant", "content": assistant_prompt},
]

In [None]:
## prompt for text to table
sys_prompt = """Your job is to transform unstructured text into structured data(table). Extract numerical information from the provided text and generate a table based on the context and meaning of the numbers. The table should be labeled with appropriate column and row names that accurately represent the extracted data.

The input will be a text passage. Your goal is to:
1. Identify and extract all the numbers present in the text. Given text might contain percentage(%), dollors($) and other forms of number. 
2. For each number, provide a brief explanation of what the number represents or refers to in the context of the text. 
3. Based on the numbers and their explanations, define appropriate column names for a table that could represent the key information in the text. The numbers should be included in val, not column names.

The output SHOULD FOLLOW below format(each line indicates one extracted numbers):
|<cell>| <col> column name </col> <val> extracted number </val> |</cell>|
"""

user_prompt = """Text: In 2023, the company achieved a revenue of $ (10) million, which was a 25 (%) increase from the previous year. The profit margin was 15%.
Table: """

assistant_prompt = """|<cell>| <col> Revenue (in millions) Year 2023 </col> <val> 10 </val> |</cell>|
|<cell>| <col> Growth (%) Year 2023 </col> <val> 25 </val> |</cell>|
|<cell>| <col> Profit Margin (%) Year 2023 </col> <val> 15 </val> |</cell>|
"""

messages =[
  {"role": "system", "content": sys_prompt},
  {"role": "user", "content": user_prompt},
  {"role": "assistant", "content": assistant_prompt},
]

In [5]:
# generate table's rows and columns
pre_table = []
post_table = []

for idx in tqdm(range(0, len(questions))):
  # pretext2table
  dic1 = {"role": "user", "content": "Text: {} \nTable: ".format(pretext[idx])}
  messages.append(dic1)
  input_ids1 = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids1 = input_ids1.to(base_model.device) 

  output1 = base_model.generate(input_ids=input_ids1,
                               max_length = 12000,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response1 = tokenizer.decode(output1)
  res1 = response1.split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[2]
  res1 = res1.lstrip("\n")
  res1 = res1.rstrip("<|eot_id|>")
  # print(res1)

  pre_table.append(res1)
  del messages[-1]

  # posttext2table
  dic2 = {"role": "user", "content": "Text: {} \nTable: ".format(posttext[idx])}
  messages.append(dic2)
  input_ids2 = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids2 = input_ids2.to(base_model.device) 

  output2 = base_model.generate(input_ids=input_ids2,
                               max_length = 12000,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response2 = tokenizer.decode(output2)
  res2 = response2.split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[2]
  res2 = res2.lstrip("\n")
  res2 = res2.rstrip("<|eot_id|>")
  # print(res2)

  post_table.append(res2)
  del messages[-1]

## save results
with open("pre_table_0817.json", "w") as file1:
  json.dump(pre_table, file1)

with open("post_table_0817.json", "w") as file2:
  json.dump(post_table, file2)

  0%|          | 0/6251 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


|<cell>| <col> Interest Rate Adjustment (%) </col> <val> 2.05 </val> |</cell>|
|<cell>| <col> Interest Rate Adjustment (%) </col> <val> 2.34 </val> |</cell>|
|<cell>| <col> Basis Point Change </col> <val> 100 </val> |</cell>|
|<cell>| <col> Annual Interest Expense Change ($ million) </col> <val> 3.8 </val> |</cell>|
|<cell>| <col> Unfavorable Movement in Foreign Currency Exchange Rates (%) </col> <val> 10 </val> |</cell>|
|<cell>| <col> Unfavorable Movement in Foreign Currency Exchange Rates Impact on Earnings or Cash Flows </col> <val> Not Significant </val> |</cell


  0%|          | 1/6251 [00:14<24:48:42, 14.29s/it]

|<cell>| <col> Unfavorable Movement in Foreign Currency Exchange Rates (%) </col> <val> 10 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Asset) </col> <val> 20132 </val> |</cell>|
|<cell>| <col> Unfavorable Movement in Foreign Currency Exchange Rates (Asset) </col> <val> 9457 </val> |</cell>|
|<cell>| <col> Favorable Movement in Foreign Currency Exchange Rates (%) </col> <val> 10 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Liability) </col> <val> 6781 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Liability) </col> <val> 38294 </val> |</cell
|<cell>| <col> Year </col> <val> 2012 </val> |</cell>| 
|<cell>| <col> Number of Shares (in thousands) </col> <val>  </val> |</cell>| 
|<cell>| <col> Weighted Average Grant Date Fair Value (per share) </col> <val>  </val> |</cell


  0%|          | 1/6251 [00:20<36:21:33, 20.94s/it]


KeyboardInterrupt: 

In [20]:
# generate table's rows and columns
pre_table = []
post_table = []

for idx in tqdm(range(0, 1)):
  # pretext2table
  dic1 = {"role": "user", "content": "Text: {} \nTable: ".format(pretext[idx])}
  messages.append(dic1)
  input_ids1 = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids1 = input_ids1.to(base_model.device) 

  output1 = base_model.generate(input_ids=input_ids1,
                               max_length = 12000,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response1 = tokenizer.decode(output1)
  res1 = response1.split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[2]
  res1 = res1.lstrip("\n")
  res1 = res1.rstrip("<|eot_id|>")
  print(res1)

  pre_table.append(res1)
  del messages[-1]

  # posttext2table
  dic2 = {"role": "user", "content": "Text: {} \nTable(col and row name for variable): ".format(posttext[idx])}
  messages.append(dic2)
  input_ids2 = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids2 = input_ids2.to(base_model.device) 

  output2 = base_model.generate(input_ids=input_ids2,
                               max_length = 12000,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response2 = tokenizer.decode(output2)
  res2 = response2.split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[2]
  res2 = res2.lstrip("\n")
  res2 = res2.rstrip("<|eot_id|>")
  print(res2)

  post_table.append(res2)
  del messages[-1]


  0%|          | 0/1 [00:00<?, ?it/s]

|<cell>| <col> Interest Rate (basis points) </col> <row> As of October 31, 2009 </row> <val> 2.34 </val> |</cell>|
|<cell>| <col> Interest Rate (basis points) </col> <row> As of October 31, 2009 </row> <val> 2.05 </val> |</cell>|
|<cell>| <col> Interest Rate Change (basis points) </col> <row> Basis Point Change </row> <val> 100 </val> |</cell>|
|<cell>| <col> Interest Expense Change (in millions) </col> <row> Basis Point Change </row> <val> 3.8 </val> |</cell>|
|<cell>| <col> Unfavorable Movement in Foreign Currency Exchange Rates (%) </col> <row> Relative to October 31, 2009 </row> <val> 10 </val> |</cell>|
|<cell>| <col> Unfavorable Movement in Foreign Currency Exchange Rates (%) </col> <row> Relative to November 1, 2008 </row> <val> 10 </val> |</cell>|
|<cell>| <col> Largest Foreign Currency Exposure </col> <row> Currency </row> <val> Euro </val> |</cell>|
|<cell>| <col> Duration of Forward Foreign Currency Exchange Contracts (months) </col> <row> Minimum </row> <val> 1 </val> |</ce

100%|██████████| 1/1 [00:18<00:00, 18.68s/it]

|<cell>| <col> Fair Value of Forward Exchange Contracts (Asset) </col> <row> Unfavorable Movement (10%) </row> <val> 20132 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Liability) </col> <row> Unfavorable Movement (10%) </row> <val> 9457 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Asset) </col> <row> Favorable Movement (10%) </row> <val> -6781 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Liability) </col> <row> Favorable Movement (10%) </row> <val> -38294 </val> |</cell





In [1]:
import json

with open("/home/broodling/finQA/post_table_0817.json", "r") as file:
  post = json.load(file)

with open("/home/broodling/finQA/pre_table_0817.json", "r") as file2:
  pre = json.load(file2)

print(len(post), len(pre))

6251 6251


In [5]:
print(post[0])

|<cell>| <col> Fair Value of Forward Exchange Contracts (Asset) </col> <row> Unfavorable Movement (10%) </row> <val> 20132 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Liability) </col> <row> Unfavorable Movement (10%) </row> <val> 9457 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Asset) </col> <row> Favorable Movement (10%) </row> <val> -6781 </val> |</cell>|
|<cell>| <col> Fair Value of Forward Exchange Contracts (Liability) </col> <row> Favorable Movement (10%) </row> <val> -38294 </val> |</cell
