## Prep

### Basics

In [1]:
#https://colab.research.google.com/drive/1BEZ_qgtVqSmOmCTuhHs7lHiYB5M5_myg?usp=sharing

import pandas as pd
import shutil
from pathlib import Path
import json
from statistics import mean
import gzip
from tqdm.auto import tqdm
import subprocess
import time
import re
import requests
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from itertools import chain
import argparse
import os
import sys
from copy import deepcopy
import torch

In [2]:
from retry import retry

In [10]:
from archetype.src.run import run
from archetype.src.data import *

### SOTAB Variables

In [6]:
# Collect every gzipped JSON table of each SOTAB split.
# Path.rglob() already searches recursively (it behaves like glob() with "**/"
# prepended to the pattern), so the pattern only needs to name the files.
val_files = list(Path("/scratch/bf996/datasets/sotab/Validation").rglob("*.json.gz"))
train_files = list(Path("/scratch/bf996/datasets/sotab/Train").rglob("*.json.gz"))
test_files = list(Path("/scratch/bf996/datasets/sotab/Test").rglob("*.json.gz"))

In [7]:
# The ground-truth CSVs are addressed relative to the SOTAB directory,
# so the working directory is switched there before reading them.
os.chdir("/scratch/bf996/datasets/sotab")
gt_df, gt_df_train, gt_df_test = (
    pd.read_csv(gt_csv)
    for gt_csv in (
        "./CTA_validation_gt.csv",
        "./CTA_training_gt.csv",
        "./CTA_test_gt.csv",
    )
)

In [8]:
# Locations of the saved CatBoost feature tables, one per split.
train_save_path = "/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/catboost_features_dataset_train.csv"
val_save_path = "/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/catboost_features_dataset_val.csv"
test_save_path = "/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/catboost_features_dataset_test.csv"

# Only the validation and test tables are loaded here; the train path is just recorded.
dfv, dft = (pd.read_csv(table_path) for table_path in (val_save_path, test_save_path))

## Model-Specific Prep

### DODUO

In [22]:
# Switch the working directory to the Doduo folder (presumably the Doduo checkout; confirm).
os.chdir("/scratch/bf996/notebooks/doduo")
# One-time setup, left disabled: download and unpack the Doduo model archive.
#!wget https://doduo-data.s3-us-west-2.amazonaws.com/model.tar.gz
#!tar -zvxf model.tar.gz

In [26]:
# Load Doduo model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiOutputClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForMultiOutputClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiOutputClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiOutputClassification were not initialized from the model checkpoint at bert-base-uncased and

### Model-Specific Functions

## Functions

In [17]:
def to_integer(val):
    """Convert ``val`` to a numeric type, downcasting to the smallest integer dtype.

    Args:
        val: A scalar, list, tuple, 1-d array or Series, as accepted by
            ``pd.to_numeric``.

    Returns:
        The numeric conversion of ``val``. When ``val`` cannot be parsed as
        numeric, ``val`` itself is returned unchanged.
    """
    # ``errors='ignore'`` is deprecated since pandas 2.2. Catching the parse
    # failure explicitly keeps the documented "return the input" behaviour
    # without relying on the deprecated flag.
    try:
        return pd.to_numeric(val, downcast='integer')
    except (ValueError, TypeError):
        return val

def fix_mode(d):
  """Collapse a Series-valued ``'mode'`` entry of ``d`` into a single plain value.

  Args:
      d: Mapping of summary statistics. It is modified in place.

  Returns:
      The same ``d`` object. If ``d['mode']`` was a ``pd.Series`` it now holds
      the first element of that Series (converted from a NumPy scalar to a
      Python scalar when applicable), or ``"N/A"`` when the Series was empty.
      Any other value, or a missing ``'mode'`` key, leaves ``d`` untouched.
  """
  mode_val = d.get('mode')
  if isinstance(mode_val, pd.Series):
    if mode_val.empty:
      # Same placeholder split_meta_features uses for an absent statistic.
      d['mode'] = "N/A"
    else:
      # Positional access: do not depend on the Series carrying a label 0.
      first = mode_val.iloc[0]
      # Only NumPy scalars expose .item(); strings and plain Python values pass through.
      d['mode'] = first.item() if hasattr(first, 'item') else first
  return d

def split_meta_features(d):
  """Lay out the summary statistics of ``d`` as a six-element Series.

  The values appear in the fixed order std, mean, median, mode, max, min;
  a statistic missing from ``d`` is represented by the string ``"N/A"``.
  """
  stat_order = ('std', 'mean', 'median', 'mode', 'max', 'min')
  return pd.Series([d.get(stat, "N/A") for stat in stat_order])



# def recompute_results(prompt_dict, prompt, model_str, cbc_pred, label_set):
#   dict_val = prompt_dict.get(prompt, -1)
#   dict_val['cbc_pred'] = cbc_pred
#   if model_str == "llama":
#     if cbc_pred and (cbc_pred in catboost_cats):
#       print(f"using cbcpred label: {cbc_pred} \n")
#       dict_val['response'] = fix_labels(cbc_pred, label_set)
#     dict_val['correct'] = ((dict_val['ground_truth'] == dict_val['response']) or (dict_val['response'] and (dict_val['response']) in dict_val['ground_truth']))
#   prompt_dict[prompt] = dict_val

# def get_df_sample_col(col, rand_seed, len_context, min_variance=2, replace=False):
#     df = pd.Series(col)
#     ignore_list = ["None", 'none', 'NaN', 'nan', 'N/A', 'na', '']
#     sample_list = list(set(p[:75] for p in pd.unique(df.astype(str)[col]) if p not in ignore_list))
#     if len(sample_list) < 1:
#       return ["None"] * len_context
#     if len(sample_list) < len_context:
#       sample_list = sample_list * len_context
#     if len(sample_list) > len_context:
#       sample_list = sample_list[:len_context]
#     assert len(sample_list) == len_context, f"An index in val_indices is length {len(sample_list)}"
#     return sample_list

# def check_substr_contains_only_set(str, acceptable_chars):
#    validation = set(str)
#    print("Checking if it contains only ",acceptable_chars)
#    if validation.issubset(acceptable_chars):
#       return True
#    else:
#       return False

# def get_cbc_pred(orig_label, numeric_labels):
#     try:
#       #FOR VALIDATION
#       #cbc_filematch = dfv[dfv['df_path'] == str(f)]
#       #FOR TEST SET
#       cbc_filematch = dft[dft['df_path'] == str(f)]
#       cbc_labelmatch = cbc_filematch[cbc_filematch['label'] == orig_label]
#       if len(cbc_labelmatch) == 1:
#         cbc_pred = numeric_labels[cbc_labelmatch['preds'].item()]
#       else:
#         cbc_pred = None
#     except Exception as e:
#       print("cbc excpetion: ")
#       print(e)
#       cbc_pred=None



## ZS Results

### Other ZS Model Results

In [11]:
# Zero-shot run on the SOTAB test files; results are written to (and resumed from) `sp`.
model_name = "flan-t5-xxl-zs"
filename = f"{model_name}-sim-10sample-v5.json"
sp = f"/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/{filename}"

run(
    model_name=model_name,
    save_path=sp,
    inputs=test_files,
    input_df=gt_df_test,
    label_set=context_labels_small,
    method=["similarity", "skip-existing"],
    resume=True,
    sample_size=10,
)



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /ext3/miniconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7026 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


Total entries: 19192 
 Accuracy: 0.3898 
 Weighted F1: 0.3603 
 Unweighted F1: 0.278 
 Correct Remap: 0 
 Total Remap: 1 
 Truncated: 0


## DoDuo Eval on SOTAB

https://github.com/megagonlabs/doduo

RESULTS

~100 SAMPLES, ALL CONTEXT LABELS: ~24%, 0.58 it/s on CPU

~100 SAMPLES, TRIM CONTEXT LABELS: ~24%, 0.58 it/s on CPU

ALL SAMPLES, SMALL CONTEXT LABELS: ~28%, 7 it/s on GPU?


In [None]:
# os.chdir("/scratch/bf996/notebooks")

# filename = "doduo-sherlock-to-sotab-trim-full-v1.json"

# sp = f"/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/{filename}"

# #run_val(model="doduo", save_path=sp, inputs=test_files, input_df=gt_df_test, label_set=context_labels, resume=False, stop_early=100)
# run_val(model="doduo", save_path=sp, inputs=test_files, input_df=gt_df_test, label_set=context_labels_trim, resume=True)

  0%|          | 0/7026 [00:00<?, ?it/s]

### D4

In [95]:
# Doduo evaluation on the D4 tables using the Sherlock-derived D4 label set.
os.chdir("/scratch/bf996/notebooks")

filename = "doduo-sherlock-to-d4-trim-full-v4.json"
sp = f"/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/{filename}"

# NOTE(review): input_df is the SOTAB test ground truth although the inputs are
# d4_dfs. Confirm this pairing is intended.
run_val(
    model="doduo",
    save_path=sp,
    inputs=d4_dfs,
    input_df=gt_df_test,
    label_set=d4_sherlock_labels,
)


  0%|          | 0/2000 [00:00<?, ?it/s]



 Overall Accuracy score was 0.322 
 Pct Eval: 1.0 

 Example errors: 

Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name


Sample Error: 
Label: ['code'] || Prediction: name




## Build Training Dataset for fine-tuning GPT / LLAMA / Sherlock

### Sherlock

In [None]:
def train_sherlock(df, gt_df, prompt_dict, model, label_indices, base_prompt):
  """Predict labels for selected columns of ``df`` and record the outcome in ``prompt_dict``.

  The selected columns are stringified, passed to ``extract_features`` (which
  writes ``../temporary.csv``), read back as float32 feature vectors and fed to
  ``model.predict``. Each predicted label is translated through
  ``sherlock_to_cta``; labels without a mapping are kept as a one-item list.

  Args:
      df: Table whose columns are being labelled.
      gt_df: Ground-truth frame with ``column_index`` and ``label`` columns.
      prompt_dict: Result mapping, updated in place.
      model: Object exposing ``predict(feature_vectors, "sherlock")``.
      label_indices: Column selectors into ``df``, also matched against
          ``gt_df['column_index']``.
      base_prompt: Prefix of the result key; the key is
          ``base_prompt + "_" + str(label_idx)``.

  Returns:
      None. Columns without exactly one ground-truth row are skipped.
  """
  feature_csv = "../temporary.csv"
  # One list of stringified cell values per selected column.
  column_values = pd.Series(df[label_indices].astype(str).T.values.tolist())
  n_columns = len(column_values)
  extract_features(feature_csv, column_values)
  feature_vectors = pd.read_csv(feature_csv, dtype=np.float32)
  raw_predictions = model.predict(feature_vectors, "sherlock")

  # Translate every prediction up front, keyed by its position among the selected columns.
  candidates_by_pos = {}
  for pos in range(n_columns):
    raw_label = raw_predictions[pos]
    candidates_by_pos[pos] = sherlock_to_cta.get(raw_label, [raw_label])

  for pos, label_idx in zip(range(n_columns), label_indices):
    result_key = base_prompt + "_" + str(label_idx)
    gt_matches = gt_df[gt_df['column_index'] == label_idx]
    if len(gt_matches) == 1:
      truth = fix_labels(gt_matches['label'].item())
      candidates = candidates_by_pos[pos]
      assert isinstance(candidates, list), "ans should be a list"
      prompt_dict[result_key] = {
          "response": candidates,
          "context": None,
          "ground_truth": truth,
          "correct": truth in candidates,
      }

### LLAMA

#### Old Command

In [None]:

# Earlier LLaMA invocation on the training split; results are saved to a Google Drive path.
run_val(
    model="llama",
    save_path="/content/drive/MyDrive/School/NYU/Dataset Search/proj/CTA_CPA_Benchmarks/wotab/llama_results_prompt_v9.json",
    inputs=train_files,
    input_df=gt_df_train,
    label_set=context_labels,
    resume=False,
    response=False,
    full=True,
    sample_size=5,
)


46790it [1:33:10,  8.37it/s]




 Overall Accuracy score was 0.0 
 Pct Eval: 7.49 

 Example errors: 

Sample Error: 
Context (500 chars):  ['SRC: virginityrocks', '$25.00', 'Sold Out', '$25.00', 'Sold Out', '$25.00', 'std: 0.9', 'mean: 7.5', 'mode: 8', 'median: 8.0', 'max: 8', 'min: 6', 'rolling-mean-window-4: [0.0]', '']
Label: price || Prediction: 


Sample Error: 
Context (500 chars):  ['SRC: scvs', "['2.0E1', '1.5E2', '0.0E0']", '0.0E0', "['0.0E0', '7.5E1', '1.0E1']", "['2.0E1', '1.5E2', '0.0E0']", '0.0E0', 'std: 7.21', 'mean: 6.58', 'mode: 4', 'median: 4.0', 'max: 27', 'min: 4', 'rolling-mean-window-4: [0.0]', '']
Label: price || Prediction: 


Sample Error: 
Context (500 chars):  ['SRC: scvs', "['GBP', 'GBP', 'GBP']", 'GBP', "['GBP', 'GBP', 'GBP']", 'GBP', "['GBP', 'GBP', 'GBP']", 'std: 5.43', 'mean: 5.63', 'mode: 4', 'median: 4.0', 'max: 21', 'min: 3', 'rolling-mean-window-4: [0.0]', '']
Label: currency || Prediction: 


Sample Error: 
Context (500 chars):  ['SRC: scvs', "['https://schema.org/InStock', 'http

#### OC + TN + SS

In [23]:
# Base URL of the text-generation endpoint; "/run/textgen" is appended for the call below.
cur_url = "https://079c124ee590ac75fe.gradio.live"
filename = "train-llama-oc+tn+ss-v1.json"
sp = f"/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/{filename}"

run_val(
    model="llama",
    save_path=sp,
    inputs=train_files,
    input_df=gt_df_train,
    response=False,
    label_set=context_labels,
    resume=True,
    table_src=True,
    summ_stats=True,
    other_col=True,
    link=f"{cur_url}/run/textgen",
    method=["similarity"],
    sample_size=5,
)

  0%|          | 0/46790 [00:00<?, ?it/s]



 Overall Accuracy score was 0.0 
 Pct Eval: 7.75 

 Example errors: 



#### Dataset Prep

In [24]:
# Calling DataFrame constructor after zipping
# both lists, with columns specified
import pandas as pd
from pathlib import Path
import io, json

filename = "train-llama-oc+tn+ss-v1.json"
sp = f"/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/{filename}"

# Load the saved mapping of prompt -> result record.
with open(sp, 'r') as jf:
  prompt_dict = json.load(jf)

# Pair every prompt with the ground-truth label stored in its record.
kl = list(prompt_dict)
values = [prompt_dict[k]['ground_truth'] for k in kl]

df = pd.DataFrame({'prompt': kl, 'completion': values}, columns=['prompt', 'completion'])
#df['prompt'] = df['prompt'].apply(lambda s : s + "$")
#df.to_csv("/content/drive/MyDrive/School/NYU/Dataset Search/proj/CTA_CPA_Benchmarks/gpt_train_v2.csv", index=False)

In [25]:
# File that receives the instruction-formatted records built from `df` below.
filename = "train-llama-oc+tn+ss-v1-formatted.json"

target_path = f"/scratch/bf996/llm_er_std/proj/CTA_CPA_Benchmarks/wotab/{filename}"

def find_context(s):
    """Return the part of ``s`` that follows the first ``"INPUT:"`` marker.

    Args:
        s: Prompt text.

    Returns:
        Everything after the first ``"INPUT:"`` with a trailing newline
        appended. When the marker is absent the whole of ``s`` is kept;
        previously the failed ``find`` (-1) caused the first five characters
        to be dropped silently.
    """
    _, marker, context = s.partition("INPUT:")
    if not marker:
        # No marker present: keep the prompt intact rather than slicing blindly.
        context = s
    return context + "\n"

# Turn each prompt/completion pair into an instruction / input / output record
# and write the records to `target_path` as a JSON list.
df = (
    df.assign(
        instruction="Select the category which best matches the input. \n",
        input=df['prompt'].apply(find_context),
        output=df['completion'] + "\n",
    )
    .drop(columns=['prompt', 'completion'])
)
df.to_json(target_path, orient='records', indent=4)

In [26]:
def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file; dicts and lists are written as JSON.

    Args:
        obj: The object to write. A ``dict`` or ``list`` is serialised with
            ``json.dump``; a ``str`` is written verbatim.
        f: A path to the location on disk, or an already-open ``io.IOBase``
            object. The file object is closed before returning in both cases.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If ``obj`` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    # Close the handle even when serialisation fails or the type is rejected,
    # so an error does not leak an open file.
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        f.close()


def jload(f, mode="r"):
    """Load a .json file and return the parsed content.

    Args:
        f: A path to the file, or an already-open ``io.IOBase`` object. The
            file object is closed before returning in both cases.
        mode: Mode for opening the file.

    Returns:
        Whatever ``json.load`` produces for the file (a dict, a list, ...).
    """
    f = _make_r_io_base(f, mode)
    # Close the handle even when the content is not valid JSON.
    try:
        return json.load(f)
    finally:
        f.close()

In [27]:
# Read the formatted records back from `target_path`.
d = jload(target_path)

In [28]:
# Show only the first few records; displaying the whole list floods the cell output.
d[:3]

[{'instruction': 'Select the category which best matches the input. \n',
  'input': '[\'SRC: sacristy\', \'Catholic Bishops of Great Britain: A Reference to Roman Catholic Bishops from 1850 to 2015\', "Into the Depths: A Chaplain\'s Reflections on Death, Dying and Pastoral Care", \'Secret Lives (part 2)\', \'All Hail the Glorious Night (and other Christmas poems): The Complete Christmas Poetry of Kevin Carey\', \'The Writing on the Wall: Everyday Phrases from the King James Bible\', \'std: 20.9\', \'mean: 56.37\', \'mode: 56\', \'median: 58.0\', \'max: 101\', \'min: 16\', \'rolling-mean-window-4: [0.0]\', \'\'] \n CATEGORY: \n',
  'output': 'book\n'},
 {'instruction': 'Select the category which best matches the input. \n',
  'input': "['SRC: sacristy', '1st April 2016', '1st June 2019', '1st October 2016', '1st September 2018', '1st October 2015', 'std: 1.89', 'mean: 15.58', 'mode: 17', 'median: 16.0', 'max: 19', 'min: 12', 'rolling-mean-window-4: [0.0]', ''] \n CATEGORY: \n",
  'outpu