In [1]:
import torch
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig, PeftModel, get_peft_config, get_peft_model
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
## Call basemodel(llama3.1 8B) 
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    device_map='auto', 
    torch_dtype=torch.bfloat16)

tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
## preprocess data
import json

data_pth = "/home/broodling/finQA/datasets/FinQA/dataset/train.json"

tables_ori = []
questions = []

with open(data_pth, "r") as file:
  data = json.load(file)

for i in data:
  questions.append(i['qa']['question'])
  tables_ori.append(i['table'])

print(len(tables_ori), len(questions))
print(tables_ori[1])

6251 6251
[['', 'number of shares ( in thousands )', 'weighted average grant date fair value ( per share )'], ['restricted stock and restricted stock units at beginning of year', '407', '$ 9.84'], ['granted', '607', '18.13'], ['vested', '-134 ( 134 )', '10.88'], ['forfeited', '-9 ( 9 )', '13.72'], ['restricted stock and restricted stock units at end of year', '871', '$ 15.76']]


In [14]:
import pandas as pd

table_str = []
for table in tables_ori:
  sen = ""
  for i in range(1, len(table)):
    for j in range(1, len(table[0])):
      cell = table[0][0] +" " + table[i][0] + " of " + table[0][j] + " is " + table[i][j] + "\n"
      sen += cell
  sen = sen.replace("  ", " ")
  table_str.append(sen)

with open("table_string_0928.json", "w") as fw:
  json.dump(table_str, fw)

exam = [[
    "date",
    "citi",
    "s&p 500",
    "s&p financials"
],
[
    "31-dec-2012",
    "100.0",
    "100.0",
    "100.0"
],
[
    "31-dec-2013",
    "131.8",
    "132.4",
    "135.6"
],
[
    "31-dec-2014",
    "137.0",
    "150.5",
    "156.2"
],
[
    "31-dec-2015",
    "131.4",
    "152.6",
    "153.9"
],
[
    "31-dec-2016",
    "152.3",
    "170.8",
    "188.9"
],
[
    "31-dec-2017",
    "193.5",
    "208.1",
    "230.9"
]]

fewshot = ""
for i in range(1, len(exam)):
    for j in range(1, len(exam[0])):
        cell = exam[0][0] +" " + exam[i][0] + " of " + exam[0][j] + " is " + exam[i][j] + "\n"
        fewshot += cell
fewshot = fewshot.replace("  ", " ")
print(fewshot)


date 31-dec-2012 of citi is 100.0
date 31-dec-2012 of s&p 500 is 100.0
date 31-dec-2012 of s&p financials is 100.0
date 31-dec-2013 of citi is 131.8
date 31-dec-2013 of s&p 500 is 132.4
date 31-dec-2013 of s&p financials is 135.6
date 31-dec-2014 of citi is 137.0
date 31-dec-2014 of s&p 500 is 150.5
date 31-dec-2014 of s&p financials is 156.2
date 31-dec-2015 of citi is 131.4
date 31-dec-2015 of s&p 500 is 152.6
date 31-dec-2015 of s&p financials is 153.9
date 31-dec-2016 of citi is 152.3
date 31-dec-2016 of s&p 500 is 170.8
date 31-dec-2016 of s&p financials is 188.9
date 31-dec-2017 of citi is 193.5
date 31-dec-2017 of s&p 500 is 208.1
date 31-dec-2017 of s&p financials is 230.9



In [5]:
import pandas as pd

table_str = []
for table in tables_ori:
  header=table[0]
  sent = ""
  for col in range(1, len(table)):
    for row in range(1, len(table[col])):
      sent += "<|cell|>" + " <col> " + table[0][0] + " " + table[col][0] + " </col> " + "<row> " + table[0][row] + " </row> " + "<val> " + table[col][row] + " </val> " +"<|/cell|> "
      sent = sent.replace("  ", " ")

  # print(sent)
  table_str.append(sent)

print(table_str[2])
print(len(table_str))

<|cell|> <col> year 2018 </col> <row> gallons </row> <val> 4447 </val> <|/cell|> <|cell|> <col> year 2018 </col> <row> average priceper gallon </row> <val> $ 2.23 </val> <|/cell|> <|cell|> <col> year 2018 </col> <row> aircraft fuelexpense </row> <val> $ 9896 </val> <|/cell|> <|cell|> <col> year 2018 </col> <row> percent of totaloperating expenses </row> <val> 23.6% ( 23.6 % ) </val> <|/cell|> <|cell|> <col> year 2017 </col> <row> gallons </row> <val> 4352 </val> <|/cell|> <|cell|> <col> year 2017 </col> <row> average priceper gallon </row> <val> 1.73 </val> <|/cell|> <|cell|> <col> year 2017 </col> <row> aircraft fuelexpense </row> <val> 7510 </val> <|/cell|> <|cell|> <col> year 2017 </col> <row> percent of totaloperating expenses </row> <val> 19.6% ( 19.6 % ) </val> <|/cell|> <|cell|> <col> year 2016 </col> <row> gallons </row> <val> 4347 </val> <|/cell|> <|cell|> <col> year 2016 </col> <row> average priceper gallon </row> <val> 1.42 </val> <|/cell|> <|cell|> <col> year 2016 </col> <r

In [8]:
import pandas as pd

table_str = []
for table in tables_ori:
  sen = ""
  for i in range(1, len(table)):
    for j in range(1, len(table[0])):
      col = " <col> " + table[0][0] +" " + table[i][0] + " " + table[0][j] +  " </col>"
      val = " <val> " + table[i][j] + " </val> "
      sen += "<|cell|>" + col + val + "<|/cell|>\n"
  #print(sen)
  table_str.append(sen)

print(len(table_str))

with open("table_string.json", "w") as fw:
  json.dump(table_str, fw)

6251


In [16]:
exam = [[
    "date",
    "citi",
    "s&p 500",
    "s&p financials"
],
[
    "31-dec-2012",
    "100.0",
    "100.0",
    "100.0"
],
[
    "31-dec-2013",
    "131.8",
    "132.4",
    "135.6"
],
[
    "31-dec-2014",
    "137.0",
    "150.5",
    "156.2"
],
[
    "31-dec-2015",
    "131.4",
    "152.6",
    "153.9"
],
[
    "31-dec-2016",
    "152.3",
    "170.8",
    "188.9"
],
[
    "31-dec-2017",
    "193.5",
    "208.1",
    "230.9"
]]

fewshot = ""
for i in range(1, len(exam)):
    for j in range(1, len(exam[0])):
        col = " <col> " + exam[0][0] +" " + exam[i][0] + " " + exam[0][j] +  " </col>"
        val = " <val> " + exam[i][j] + " </val> "
        fewshot += "<|cell|>" + col + val + "<|/cell|>\n"

print(fewshot)

<|cell|> <col> date 31-dec-2012 citi </col> <val> 100.0 </val> <|/cell|>
<|cell|> <col> date 31-dec-2012 s&p 500 </col> <val> 100.0 </val> <|/cell|>
<|cell|> <col> date 31-dec-2012 s&p financials </col> <val> 100.0 </val> <|/cell|>
<|cell|> <col> date 31-dec-2013 citi </col> <val> 131.8 </val> <|/cell|>
<|cell|> <col> date 31-dec-2013 s&p 500 </col> <val> 132.4 </val> <|/cell|>
<|cell|> <col> date 31-dec-2013 s&p financials </col> <val> 135.6 </val> <|/cell|>
<|cell|> <col> date 31-dec-2014 citi </col> <val> 137.0 </val> <|/cell|>
<|cell|> <col> date 31-dec-2014 s&p 500 </col> <val> 150.5 </val> <|/cell|>
<|cell|> <col> date 31-dec-2014 s&p financials </col> <val> 156.2 </val> <|/cell|>
<|cell|> <col> date 31-dec-2015 citi </col> <val> 131.4 </val> <|/cell|>
<|cell|> <col> date 31-dec-2015 s&p 500 </col> <val> 152.6 </val> <|/cell|>
<|cell|> <col> date 31-dec-2015 s&p financials </col> <val> 153.9 </val> <|/cell|>
<|cell|> <col> date 31-dec-2016 citi </col> <val> 152.3 </val> <|/cell|>

In [25]:
## prompt for table decomposition
sys_prompt = """You will receive a text along with a table/query pair. When you receive this pair, you should review the query and extract only necessary information, especially NUMBERS, to solve the query from the table. Then, reconstruct a condensed(summerized) table with the extracted information into a following table format. In table format, each cell starts with <|cell|> token and ends with <|/cell|> tokens. Cell's column information are enclosed by <col>, </col> tokens and value(or number) is enclosed by <val>, </val> tokens. DO NOT provide the equation to solve the problem, just correctly select the relevant columns/values. Skip detailed information and ONLY answer the extracted information with the table format."""

# provide example (few-shot 1)
user_prompt = """Query: what was the percentage cumulative total return for the five year period ended 31-dec-2017 of citi common stock?
Table: {}
Sub-Table: 
""".format(fewshot)

assistant_prompt = """|<cell>| <col> date 31-dec-2012 citi </col> <val> 100.0 </val> |</cell>|\n |<cell>| <col> date 31-dec-2012 s&p 500 </col> <val> 100.0 </val> |</cell>|\n <|cell|> <col> date 31-dec-2012 s&p financials </col> <val> 100.0 </val> <|/cell|>\n <|cell|> <col> date 31-dec-2017 citi </col> <val> 193.5 </val> <|/cell|>\n <|cell|> <col> date 31-dec-2017 s&p 500 </col> <val> 208.1 </val> <|/cell|>\n <|cell|> <col> date 31-dec-2017 s&p financials </col> <val> 230.9 </val> <|/cell|>\n """

messages =[
  {"role": "system", "content": sys_prompt},
  {"role": "user", "content": user_prompt},
  {"role": "assistant", "content": assistant_prompt},
]

# Move the model to GPU if available, otherwise CPU
# if torch.cuda.is_available():
#     base_model = base_model.to("cuda")
#     print("cuda")

# print(base_model.hf_device_map)

In [26]:
from tqdm import tqdm

answers = []

for idx in tqdm(range(0, len(table_str))):
  dic = {"role": "user", "content": "Query: {} \nTable: {} \nSub-Table: ".format(questions[idx], table_str[idx])}
  messages.append(dic)
  input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids = input_ids.to(base_model.device) 

  output = base_model.generate(input_ids=input_ids,
                               max_length = 4096,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response = tokenizer.decode(output)
  res = response.split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[2]
  res = res.lstrip("\n")
  res = res.rstrip("<|eot_id|>")
  # print(res)

  answers.append(res)
  del messages[-1]


## save result
with open("subtable_0817.json", "w") as f1:
  json.dump(answers, f1)

  0%|          | 1/6251 [00:01<3:06:11,  1.79s/it]

|<cell>| <col> fair value of forward exchange contracts asset ( liability ) october 31 2009 </col> <val> $ 6427 </val> |</cell


  0%|          | 2/6251 [00:13<13:17:22,  7.66s/it]

|<cell>| <col>  restricted stock and restricted stock units at beginning of year number of shares ( in thousands ) </col> <val> 407 </val> |</cell>|
 |<cell>| <col>  granted number of shares ( in thousands ) </col> <val> 607 </val> |</cell>|
 |<cell>| <col>  vested number of shares ( in thousands ) </col> <val> -134 ( 134 ) </val> |</cell>|
 |<cell>| <col>  forfeited number of shares ( in thousands ) </col> <val> -9 ( 9 ) </val> |</cell>|
 |<cell>| <col>  restricted stock and restricted stock units at end of year number of shares ( in thousands ) </col> <val> 871 </val> |</cell>|
 |<cell>| <col>  granted weighted average grant date fair value ( per share ) </col> <val> 18.13 </val> |</cell>|
 |<cell>| <col>  vested weighted average grant date fair value ( per share ) </col> <val> 10.88 </val> |</cell>|
 |<cell>| <col>  forfeited weighted average grant date fair value ( per share ) </col> <val> 13.72 </val> |</cell>|
 |<cell>| <col>  restricted stock and restricted stock units at end of

  0%|          | 2/6251 [00:15<13:11:31,  7.60s/it]


KeyboardInterrupt: 

In [11]:
from tqdm import tqdm

answers = ["0-th as few-shot example"]

for ques, idx in zip(questions[15:20], table_str[15:20]):
  dic = {"role": "user", "content": "Query: {} \nTable: {} \nSub-Table: ".format(ques, idx)}
  messages.append(dic)
  input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids = input_ids.to(base_model.device) 

  output = base_model.generate(input_ids=input_ids,
                               max_length = 4096,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response = tokenizer.decode(output)
  res = response.split("<|start_header_id|>assistant<|end_header_id|>")[2]
  res = res.lstrip("\n")
  res = res.rstrip("<|eot_id|>")
  # print(res)

  answers.append(res)
  del messages[-1]


## save result
# with open("sub_table_0807.json", "w") as f1:
#   json.dump(answers, f1)

|<cell>| <col> labor-related deemed claim </col> <row> 2013 </row> <val> $ 1733 </val> |</cell>|
|<cell>| <col> total reorganization items net </col> <row> 2013 </row> <val> $ 2640 </val> |</cell

|<cell>| <col> obligation </col> <row> payments due by period total </row> <val> $ 37788 </val> |</cell>| 
|<cell>| <col> obligation </col> <row> payments due by period total </row> <val> $ 186792 </val> |</cell

|<cell>| <col> ( in millions ) </col> <row> long-term debt including current portion excluding capital lease obligations </row> <row> payments due by period total </row> <val> $ 6039.0 </val> |</cell>|
|<cell>| <col> ( in millions ) </col> <row> long-term debt including current portion excluding capital lease obligations </row> <row> payments due by period fiscal 2019 </row> <val> $ 726.6 </val> |</cell

|<cell>| <col> <row> owned </row> </col> <col> <row> leased </row> </col> </cell> |
|<cell>| <col> <row> united states </row> </col> <val> 41 </val> </cell> |<cell>| <col> <row> unit

In [9]:
with open("/home/broodling/finQA/tr_subtable_extract_0807.json") as file:
  test = json.load(file)

test[18]

'|<cell>| <col> </col> <row> <total> </row> <val> 41 + 2 + 11 + 26 = 80 </val> |</cell>| \n|<cell>| <col> </col> <row> <leased> </row> <val> 1 + 2014 + 2014 + 2 = 4031 </val> |</cell>| \n|<cell>| <col> </col> <row> <percent leased> </row> <val> (4031 / 80) * 100 = 50.38 </val> |</cell'

In [21]:
from tqdm import tqdm

answers = []

for idx in tqdm(range(1, len(table_str))):
  dic = {"role": "user", "content": "Query: {} \nTable: {} \nSub-Table: ".format(questions[idx], table_str[idx])}
  messages.append(dic)
  input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
  input_ids = input_ids.to(base_model.device) 
  # print(input_ids.device)

  output = base_model.generate(input_ids=input_ids,
                               max_length = 4096,
                               temperature=0.2,
                               pad_token_id = tokenizer.eos_token_id)[0]
  
  response = tokenizer.decode(output)
  res = response.split("<|start_header_id|>assistant<|end_header_id|>")[2]
  res = res.lstrip("\n")
  print(res)

  del messages[-1]

## save result
with open("sub_table.json", "w") as f1:
  json.dump(answers, f1)

100%|██████████| 1/1 [00:05<00:00,  5.81s/it]

|<cell>| <col> granted </col> <row> number of shares ( in thousands ) </row> <val> 607 </val> |</cell>|
|<cell>| <col> granted </col> <row> weighted average grant date fair value ( per share ) </row> <val> $ 18.13 </val> |</cell>|
|<cell>| <col> vested </col> <row> number of shares ( in thousands ) </row> <val> -134 ( 134 ) </val> |</cell>|
|<cell>| <col> vested </col> <row> weighted average grant date fair value ( per share ) </row> <val> $ 10.88 </val> |</cell>|<|eot_id|>





In [8]:
# generate sub table
generate_kwargs = {
    "input_ids": input_ids,
    "max_length": 4096,  # Adjust for total length
    "temperature": 0.2,
    "pad_token_id": tokenizer.eos_token_id
}

output = base_model.generate(**generate_kwargs)[0]
response = tokenizer.decode(output, skip_special_tokens=True)

print(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a table decomposer whose role is to extract only the necessary information from the given table and natural language query to create a summarized sub-table. Select the relevant rows and columns required to solve the problem and reconstruct them into a smaller table format. In table format, each cell starts with |cell| token and ends with |/cell| tokens. Cell's column and row information are enclosed by <col>, </col> and <row>, </row> tokens. Skip detailed information and ONLY answer the extracted information in table formatuser

Query: during the 2012 year, did the equity awards in which the prescribed performance milestones were achieved exceed the equity award compensation expense for equity granted during the year?
Table: <|cell|> <col> restricted stock and restricted stock units at beginning of year </col> <row> number of shares ( in thousands ) </row> <val> 407 </val> <|/cell|> <|cell|> <col> restricted

In [None]:
from accelerate import Accelerator
from accelerate.utils import gather_object

accelerator = Accelerator()

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,    
    device_map={"": accelerator.process_index},
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id) 
tokenizer.pad_token = tokenizer.eos_token

accelerator.wait_for_everyone()

input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
input_ids = input_ids.to("cuda")

with accelerator.split_between_processes(messages) as prompts:
  results = model.generate(
    input_ids= input_ids,
    max_length= 4096,  # Adjust for total length
    temperature = 0.2,
    pad_token_id= tokenizer.eos_token_id)[0]
  res = tokenizer.decode(results, skip_special_tokens=True)

results_gathered=gather_object(res)

In [4]:
import json

with open("subtable_extract_0817.json", "r") as file:
  res = json.load(file)

# print(len(res))
for re in res[1005:1010]:
  print(re)
  print()

|<cell>| <col> 2004 ( in thousands ) </col> <val> $ 31739 </val> |</cell>|
 |<cell>| <col> 2005 ( in thousands ) </col> <val> 14554 </val> |</cell>|
 |<cell>| <col> 2006 ( in thousands ) </col> <val> 18262 </val> |</cell>|
 |<cell>| <col> 2007 ( in thousands ) </col> <val> 18754 </val> |</cell>|
 |<cell>| <col> 2008 ( in thousands ) </col> <val> 22606 </val> |</cell

|<cell>| <col> cash used in investing activities 2005 </col> <val> -2047 </val> |</cell>|
 |<cell>| <col> cash used in financing activities 2005 </col> <val> -752 </val> |</cell

|<cell>| <col> december 31 ( in millions ) securities purchased under resale agreements ( a ) 2010 </col> <val> $ 222302 </val> |</cell>|
 |<cell>| <col> december 31 ( in millions ) securities borrowed ( b ) 2010 </col> <val> 123587 </val> |</cell>|
 |<cell>| <col> december 31 ( in millions ) securities sold under repurchase agreements ( c ) 2010 </col> <val> $ 262722 </val> |</cell>|
 |<cell>| <col> december 31 ( in millions ) securities loaned 2