### Prompt Experiments for the Input Format: Comma-Separated Labels + Row Context

In [None]:
#Importing Libraries
from rdflib import Graph
from pprint import pprint
import pandas as pd
import numpy as np
import pickle
import csv
import functions_multiple_tokens_HT as functions

#### Importing all pickle files

In [None]:
# Training labels (comma-separated format); the few-shot prompts below use
# them as the assistant answers of the demonstrations.
file_path = '../../input/HardTables/HT_train_label_comma.pickle'

with open(file_path, mode='rb') as handle:
    train_labels = pickle.load(handle)

In [None]:
# Candidate option lists of the training examples; joined into the
# demonstration prompts of the few-shot experiments below.
file_path = '../../input/HardTables/HT_train_choices.pickle'

with open(file_path, mode='rb') as handle:
    train_choices = pickle.load(handle)

In [None]:
# Adjust the train choices to the comma-separated format by dropping the
# leading option marker (the text before the first '. ') from every choice.
# Only the FIRST '. ' is treated as the separator, so a label that itself
# contains '. ' is kept whole instead of being cut at its second occurrence.
# Choices without any '. ' are left untouched.
train_choices = [
    [item.split('. ', 1)[1] if '. ' in item else item for item in sublist]
    for sublist in train_choices
]
train_choices

In [None]:
# Training cell values; interpolated into the demonstration prompts of the
# few-shot experiments below.
file_path = '../../input/HardTables/HT_train_vals.pkl'

with open(file_path, mode='rb') as handle:
    train_vals = pickle.load(handle)

In [None]:
# Rows of the training examples; used as row context in the Prompt 5
# demonstrations.
file_path = '../../input/HardTables/HT_train_rows.pkl'

with open(file_path, mode='rb') as handle:
    train_rows = pickle.load(handle)

In [None]:
# Rows of the test set; each row is paired with one entry of the lookup
# dictionary when the prompts are built.
file_path = '../../input/HardTables/HT_test_rows.pickle'

with open(file_path, mode='rb') as handle:
    test_rows = pickle.load(handle)

In [None]:
# Prepare the test rows for annotation: drop empty-string cells from every
# row while keeping the remaining cells in their original order.
test_rows = [
    [cell for cell in row if cell != '']
    for row in test_rows
]
test_rows

In [None]:
# Training files for the contrastive prompt (Prompt 7).

def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_choices_p7_comma = _read_pickle('../../input/p7/p7_comma_train_choices.pkl')
train_choices_p7_mc = _read_pickle('../../input/p7/p7_mc_train_choices.pkl')
train_labels_p7 = _read_pickle('../../input/p7/p7_train_labels.pkl')
train_vals_p7 = _read_pickle('../../input/p7/p7_train_vals.pkl')

In [None]:
# Entity lookup dictionary. The cells below read index 0 of each key as the
# cell's table location and index 1 as the cell value; the keys of each
# value are the candidate entities for that cell.
file_path = '../../input/HardTables/HT_dict_5.pickle'

with open(file_path, mode='rb') as handle:
    final_dict = pickle.load(handle)

final_dict


In [None]:
# Chain-of-thought training annotations (comma-separated format).
# NOTE(review): no prompt cell visible in this notebook reads
# train_labels_cot - confirm whether Prompt 5 was meant to use it.
file_path = '../../input/HardTables/train_labels_cot_comma.pkl'

with open(file_path, mode='rb') as handle:
    train_labels_cot = pickle.load(handle)

In [None]:
# Candidate entities per cell: one list per entry of final_dict, holding the
# keys of that entry's inner dictionary in insertion order.
entities_text = [list(candidates) for candidates in final_dict.values()]

entities_text

In [None]:
# Cell values: the second component of every final_dict key.
vals = [cell_key[1] for cell_key in final_dict]

vals

In [None]:
# Cell table locations: the first component of every final_dict key.
locations = [cell_key[0] for cell_key in final_dict]

locations

### 2. Connecting to OpenAI

In [None]:
#MODEL SETUP
import os
import re
import numpy as np
from dotenv import dotenv_values
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage

In [None]:
#CONNECTING WITH OPEN AI
from getpass import getpass

# Ask for the API key interactively (input is not echoed) so that it does
# not have to be written into the notebook.
OPENAI_API_KEY = getpass(prompt="OPENAI_API_KEY")

In [None]:
# Chat model used by every prompt experiment below.
chat = ChatOpenAI(
    # Pass the key that was read via getpass above. The quoted string
    # 'OPENAI_API_KEY' would send the variable's name as the key and every
    # request would be rejected as unauthenticated.
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
    model='gpt-3.5-turbo-1106',
    #max_tokens=1
)

### Prompt 1: Zero Shot: No Instructions

In [None]:
import torch
from tqdm import tqdm

# Prompt 1 (zero-shot, no instructions): every prompt consists of one system
# message listing the candidate labels and one human message with the row and
# the cell to classify.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 2          # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p1 = []

for count, (cell_key, row, candidates) in enumerate(
    tqdm(zip(final_dict, test_rows, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)} \n Respond only with the choice "),
        HumanMessage(content=f"Given the  row: {row} \n  Classify the cell: {cell_key[1]}"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p1.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p1.append(reply.content)

### Prompt 2: Zero-Shot: With Instructions

In [None]:
import torch
from tqdm import tqdm

# Prompt 2 (zero-shot with instructions): every prompt consists of a system
# message with the candidate labels, a system message with step-by-step
# instructions, and a human message with the row and the cell to classify.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 3          # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p2 = []

for count, (cell_key, row, candidates) in enumerate(
    tqdm(zip(final_dict, test_rows, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)} "),
        SystemMessage(content="Your instructions are: 1. Look at the choices given above. 2. Examine the values of the row and the given cell. 3. Select only ONE of the classes above, that best represents the meaning of the cell. 4. Respond only with the choice "),
        HumanMessage(content=f"Given the row: {row}  \n Classify the cell: {cell_key[1]}"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p2.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p2.append(reply.content)

### Prompt 3: Five-Shot with Instructions

In [None]:
from tqdm import tqdm
import torch

# Prompt 3 (five-shot with instructions): every prompt is 13 messages - five
# demonstration pairs (human question + AI answer) built from the first five
# training examples, two system messages (task and instructions) and the
# human message for the cell to classify.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 13         # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p3 = []

for count, (cell_key, row, candidates) in enumerate(
    tqdm(zip(final_dict, test_rows, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    # Demonstrations.
    for shot in range(5):
        messages += [
            HumanMessage(content=f"Your task is to annotate a cell entity by selecting the most appropriate option from the provided list: {', '.join(train_choices[shot])}. \n {train_vals[shot]}"),
            AIMessage(content=f"{train_labels[shot]}"),
        ]

    # Actual query.
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)}"),
        SystemMessage(content="Your instructions are: 1. Look at the choices given above. 2. Examine the values of the row and the given cell. 3. Select only ONE of the classes above, that best represents the meaning of the cell. 4. Respond only with the choice "),
        HumanMessage(content=f"Given the row: {row}  \n Classify the cell: {cell_key[1]}"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p3.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p3.append(reply.content)

### Prompt 4: Five-Shot without Instructions

In [None]:
from tqdm import tqdm
import torch

# Prompt 4 (five-shot without instructions): every prompt is 12 messages -
# five demonstration pairs (human question + AI answer) built from the first
# five training examples, one system message with the candidate labels and
# the human message for the cell to classify.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 12         # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p4 = []

for count, (cell_key, row, candidates) in enumerate(
    tqdm(zip(final_dict, test_rows, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    # Demonstrations.
    for shot in range(5):
        messages += [
            HumanMessage(content=f"Your task is to annotate a cell entity by selecting the most appropriate option from the provided list: {', '.join(train_choices[shot])}. \n {train_vals[shot]}"),
            AIMessage(content=f"{train_labels[shot]}"),
        ]

    # Actual query.
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)} \n Respond only with the choice "),
        HumanMessage(content=f"Given the row: {row}  \n Classify the cell: {cell_key[1]}"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p4.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p4.append(reply.content)

### Prompt 5: Chain of Thought

In [None]:
from tqdm import tqdm
import torch

# Prompt 5 (chain of thought, five-shot): every prompt is 12 messages - five
# demonstration pairs with row context built from the first five training
# examples, one system message with the candidate labels and the human
# message for the cell to classify, ending with "Let's think in steps".
# NOTE(review): the demonstrations answer with train_labels, while
# train_labels_cot is loaded earlier in the notebook and used nowhere -
# confirm which of the two was intended here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 12         # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p5 = []

for count, (cell_key, row, candidates) in enumerate(
    tqdm(zip(final_dict, test_rows, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    # Demonstrations.
    for shot in range(5):
        messages += [
            HumanMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(train_choices[shot])}. \n Given the row: \n {train_rows[shot]} Classify the cell:{train_vals[shot]}"),
            AIMessage(content=f"{train_labels[shot]}"),
        ]

    # The instruction message stays disabled for this prompt:
    #messages.append(SystemMessage(content="Your instructions are: 1. Look at the choices given above. 2. Examine the value of the text. 3. Select only ONE of the choices above, that best represents the meaning of this value. 4. Always answer in the format: \"selected choice\"."))

    # Actual query.
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)} \n Respond only with the choice "),
        HumanMessage(content=f"Given the row: {row} \n Classify the cell: {cell_key[1]} . Let's think in steps"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p5.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p5.append(reply.content)

### Prompt 6: Chain-of-Thought Zero-Shot

In [None]:
from tqdm import tqdm
import torch

# Prompt 6 (chain of thought, zero-shot): every prompt consists of one system
# message with the candidate labels and one human message with the row and
# the cell to classify, ending with "Let's think in steps".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 2          # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p6 = []

for count, (cell_key, row, candidates) in enumerate(
    tqdm(zip(final_dict, test_rows, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)} \n Respond only with the choice "),
        HumanMessage(content=f"Given the row: {row} \n Classify the cell: {cell_key[1]} . Let's think in steps"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p6.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p6.append(reply.content)

### Prompt 7: Contrastive Prompting

In [None]:
from tqdm import tqdm
import torch

# Prompt 7 (contrastive prompting): every prompt is 12 messages - five
# demonstration pairs built from the first five p7 training examples, one
# system message with the candidate labels and the human message for the
# cell to classify. This prompt does not include the row of the cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 12         # number of messages that form one complete prompt
num_rows_limit = 1000   # stop after this many cells
count = 0
messages = []
preds_p7 = []

for count, (cell_key, candidates) in enumerate(
    tqdm(zip(final_dict, entities_text), desc="Processing items", total=len(final_dict)),
    start=1,
):
    # Demonstrations.
    for shot in range(5):
        messages += [
            HumanMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(train_choices_p7_comma[shot])}. \n Classify the cell:{train_vals_p7[shot]}"),
            AIMessage(content=f"{train_labels_p7[shot]}"),
        ]

    # Actual query.
    messages += [
        SystemMessage(content=f"Your task is to classify a cell entity in a row by selecting the most appropriate option from the provided list: {', '.join(candidates)} \n Respond only with the choice "),
        HumanMessage(content=f"Classify the cell: {cell_key[1]} . Let's think in steps"),
    ]

    # A complete prompt is assembled: query the model and start a new prompt.
    if len(messages) >= batch_size:
        reply = chat(messages)
        preds_p7.append(reply.content)
        messages = []

    # Stop once the row limit is reached.
    if count >= num_rows_limit:
        break

# Send whatever is still pending after the loop.
if messages:
    reply = chat(messages)
    print(reply)
    preds_p7.append(reply.content)

In [None]:
# Persist the raw predictions of every prompt as a pickle file.
from datetime import datetime

# One timestamp shared by all output files of this run.
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')


def _dump_predictions(prompt_tag, predictions, stamp):
    """Pickle *predictions* for the given prompt and return the file path."""
    target = f'../../evaluation/Prediction_Lists/HardTables/HT_prediction_list_{prompt_tag}_{stamp}_Comma_Row.pkl'
    with open(target, 'wb') as out:
        pickle.dump(predictions, out)
    return target


pickle_file_path_p1 = _dump_predictions('p1', preds_p1, timestamp)
pickle_file_path_p2 = _dump_predictions('p2', preds_p2, timestamp)
pickle_file_path_p3 = _dump_predictions('p3', preds_p3, timestamp)
pickle_file_path_p4 = _dump_predictions('p4', preds_p4, timestamp)
pickle_file_path_p5 = _dump_predictions('p5', preds_p5, timestamp)
pickle_file_path_p6 = _dump_predictions('p6', preds_p6, timestamp)
pickle_file_path_p7 = _dump_predictions('p7', preds_p7, timestamp)

In [None]:
# Create DataFrames of all predictions with their respective links.
# The pattern is written as a raw string: in a normal string literal '\.' is
# an invalid escape sequence, which Python reports with a warning and plans
# to reject. The runtime value of the pattern is unchanged.
LETTER_PATTERN = r'[A-Z]\.'

final_df_p1 = functions.final_df_creation(preds_p1, final_dict, letter_pattern=LETTER_PATTERN)
final_df_p2 = functions.final_df_creation(preds_p2, final_dict, letter_pattern=LETTER_PATTERN)
final_df_p3 = functions.final_df_creation(preds_p3, final_dict, letter_pattern=LETTER_PATTERN)
final_df_p4 = functions.final_df_creation(preds_p4, final_dict, letter_pattern=LETTER_PATTERN)
final_df_p5 = functions.final_df_creation(preds_p5, final_dict, letter_pattern=LETTER_PATTERN)
final_df_p6 = functions.final_df_creation(preds_p6, final_dict, letter_pattern=LETTER_PATTERN)
final_df_p7 = functions.final_df_creation(preds_p7, final_dict, letter_pattern=LETTER_PATTERN)

In [None]:
# Write one CSV submission file per prompt from its prediction DataFrame.
for prompt_tag, prediction_df in (
    ("p1", final_df_p1),
    ("p2", final_df_p2),
    ("p3", final_df_p3),
    ("p4", final_df_p4),
    ("p5", final_df_p5),
    ("p6", final_df_p6),
    ("p7", final_df_p7),
):
    functions.prediction_submission_Comma_Cell(prediction_df, timestamp, prompt_tag)