# Check-worthiness detection using Large Language Models

First, the necessary python modules are imported

In [1]:
%load_ext autoreload

from claimbuster_utils import load_claimbuster_dataset
from checkthat_utils import load_check_that_dataset
import pandas as pd
from llm import load_huggingface_model, HuggingFaceModel, run_llm_cross_validation, generate_llm_predictions, ICLUsage, PromptType
from result_analysis import generate_error_analysis_report
from dataset_utils import generate_cross_validation_datasets, Dataset
import ipywidgets as widgets
import os

2024-03-19 14:06:45.799499: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 14:06:45.804559: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 14:06:45.806406: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-19 14:06:45.837214: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Generate Cross Validation datasets

In [2]:
%autoreload
claimbuster = load_claimbuster_dataset("../data/ClaimBuster/datasets")
clambuster_datasets = generate_cross_validation_datasets(
    data=claimbuster, 
    folder_path="../data/ClaimBuster/crossval"
)

checkthat = load_check_that_dataset("../data/CheckThat")
checkthat_datasets = generate_cross_validation_datasets(
    data=checkthat, 
    label_column="check_worthiness",
    folder_path="../data/CheckThat/crossval"
)

## Generate predictions

Using ipywidgets to select which model, dataset, and other parameters to generate LLM predictions

In [6]:
# General lauyout
input_style = dict(
    description_width="fit-content"
)

# Dataset 
dataset_select = widgets.Dropdown(
    options=[("ClaimBuster", Dataset.CLAIMBUSTER), ("CheckThat", Dataset.CHECK_THAT)],
    value=Dataset.CHECK_THAT,
    description="Dataset:"
)

# Model and parameters
model_select = widgets.Dropdown(
    options=[("Mistral 7B Instruct", HuggingFaceModel.MISTRAL_7B_INSTRUCT), ("Mixtral Instruct", HuggingFaceModel.MIXTRAL_INSTRUCT)],
    value=HuggingFaceModel.MISTRAL_7B_INSTRUCT,
    description="Model:",
    style=input_style
)
max_new_tokens_int_text = widgets.IntText(
    value=256,
    description="Max new tokens:",
    style=input_style
)
batch_size = widgets.IntText(
    value=128,
    description="Batch size:",
    style=input_style
)
model_and_parameters = widgets.VBox(
    [model_select, max_new_tokens_int_text, batch_size],
)

# Prompting type
prompting_type = widgets.Dropdown(
    options=[("Standard", PromptType.STANDARD), ("Chain-of-Thought", PromptType.CHAIN_OF_THOUGHT)],
    value=PromptType.STANDARD,
    description="Prompting type:",
    style=input_style
)
icl_usage = widgets.Dropdown(
    options=[("Zero-shot", ICLUsage.ZERO_SHOT), ("Few-shot", ICLUsage.FEW_SHOT)],
    value=ICLUsage.ZERO_SHOT,
    description="ICL usage:",
    style=input_style
)
prompt_use = widgets.VBox(
    [prompting_type, icl_usage]
)

accordion = widgets.Accordion([
    dataset_select,
    model_and_parameters,
    prompt_use
],
    titles=["Dataset", "Model and parameters", "Prompting type"],
)

title = widgets.HTML(
    "<h1>Generation of predictions using LLMs</h1>",
)
description = widgets.HTML(
    "<div>Set the parameters to select what dataset, model and prompting to use when generating predictions. If you experience Cuda out of memory issues, please decrease the batch size.</div>",
    layout={"font-size": '14px'}
)
start_generation_button = widgets.Button(
    description="Start generation",
    disabled=False,
    button_style="success",
    layout={"height": "40px", "width": "calc(100% - 4px)"},
)

def handle_generation_click(_):
    print("Loading dataset...")
    if dataset_select.value == Dataset.CLAIMBUSTER:
        dataset = load_claimbuster_dataset("../data/ClaimBuster/datasets")
        label_column = "Verdict"
        text_column = "Text"
    else:
        dataset = load_check_that_dataset("../data/CheckThat")
        label_column = "check_worthiness"
        text_column = "tweet_text"

    print("Loading prompts...")
    instruction_path = os.path.join(
        "../prompts",
        dataset_select.value.value,
        prompting_type.value.value,
        icl_usage.value.value,
        "instruction.txt"
    )
    with open(instruction_path, "r") as f:
        instruction = f.read().replace("\n", "")
    prompts = [ f"{instruction} '''{text}'''" for text in dataset[text_column]]
    print("Loading model...")
    pipe = load_huggingface_model(
        model_id=model_select.value, 
        max_new_tokens=max_new_tokens_int_text.value
    )

    print("Generating predictions...")
    save_path = os.path.join(
        "../results",
        dataset_select.value.value,
        model_select.value.name,
        prompting_type.value.value,
        icl_usage.value.value,
        "generated_scores.csv"
    )
    generate_llm_predictions(
        data=dataset,
        prompts=prompts,
        pipe=pipe,
        batch_size=batch_size.value,
        label_column=label_column,
        text_column=text_column,
        save_path=save_path
    )


start_generation_button.on_click(handle_generation_click)


box = widgets.Box(
    [title, description, accordion, start_generation_button],
    layout=widgets.Layout(
        padding= '16px', 
        display= "flex", 
        flex_flow="column",
        align_items="stretch",
        border="1px solid black"
    )
) 
display(box)

Box(children=(HTML(value='<h1>Generation of predictions using LLMs</h1>'), HTML(value='<div>Set the parameters…

Loading dataset...
Loading prompts...
Loading model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Generating predictions...


  0%|          | 0/1172 [00:00<?, ?it/s]

  score = 0.0 if non_check_worthy_matcher.search(response) else np.nan


## Zero-shot classification

### ClaimBuster

#### Standard

In [7]:
with open("../prompts/ClaimBuster/standard/zero-shot.txt", "r") as f:
    instruction = f.read().replace("\n", "")
data = load_claimbuster_dataset(
    "../data/ClaimBuster/datasets",
)[:10]


texts = data["Text"]
prompts = [f"{instruction} '''{text}'''" for text in texts]
zeroshot_output = f"../results/ClaimBuster/{model_id.name}/{ICLUsage.ZERO_SHOT.value}/{PromptType.STANDARD.value}/zeroshot_preds.csv"
    
generate_llm_predictions(
    data=data,
    pipe=pipe, 
    prompts=prompts, 
    save_path=zeroshot_output
)

NameError: name 'pipe' is not defined

#### Using contextual features

In [None]:
data = load_claimbuster_dataset(
    "../data/ClaimBuster/datasets",
    use_contextual_features=True,
    debate_transcripts_folder="../data/ClaimBuster/debate_transcripts",
)[:10]

contexts = data["previous_sentences"].tolist()
prompts = [
    f"{instruction} For context, the following senteces were said prior to the one in question: {context} Only evaluate the check-worthiness of the following sentence: '''{text}'''"
    for text, context in zip(texts, contexts)
]
zeroshot_output = "../results/ClaimBuster/{model_id.name}/zeroshot/zeroshot_contextual_preds.csv"

generate_llm_predictions(
    data=data,
    pipe=pipe, 
    prompts=prompts, 
    save_path=zeroshot_output
)

#### Cross validation

In [41]:
%autoreload

crossval_folder = "../data/ClaimBuster/crossval"
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/ClaimBuster/{model_id.name}/{}/zeroshot1.csv"
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    save_folder = f"../results/ClaimBuster/{model_id.name}/zeroshot"
    result, predictions = run_llm_cross_validation(
        data=dataset_with_scores, crossval_folder=crossval_folder, save_folder=save_folder
    )
    
    display(result)

Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.833402,0.896469,0.866898,0.881436,0.692513,0.749638,0.719944,0.794491,0.808268,0.80069,0.838208,0.833402,0.835305
1,0.829268,0.883382,0.876736,0.880046,0.697443,0.710564,0.703943,0.790413,0.79365,0.791995,0.830268,0.829268,0.829741
2,0.826716,0.890681,0.863347,0.876801,0.682796,0.735166,0.708014,0.786738,0.799257,0.792407,0.831273,0.826716,0.828566
3,0.83292,0.897775,0.864505,0.880826,0.690066,0.75398,0.720609,0.793921,0.809242,0.800717,0.838417,0.83292,0.83504
Average,0.830577,0.892077,0.867872,0.879777,0.690705,0.737337,0.713127,0.791391,0.802604,0.796452,0.834542,0.830577,0.832163


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.813559,0.910083,0.820023,0.862709,0.639211,0.797395,0.709594,0.774647,0.808709,0.786152,0.832707,0.813559,0.818971
1,0.822654,0.912381,0.831597,0.870118,0.655213,0.800289,0.720521,0.783797,0.815943,0.79532,0.83892,0.822654,0.827385
2,0.810174,0.902795,0.822814,0.860951,0.637441,0.778582,0.700977,0.770118,0.800698,0.780964,0.826964,0.810174,0.815235
3,0.825062,0.912137,0.835553,0.872167,0.660287,0.798842,0.722986,0.786212,0.817198,0.797577,0.840165,0.825062,0.829535
Average,0.817862,0.909349,0.827497,0.866486,0.648038,0.793777,0.71352,0.778694,0.810637,0.790003,0.834689,0.817862,0.822782


#### Error analysis

In [42]:
%autoreload
mistral_predictins = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/zeroshot/predictions.csv", index_col=0)
mixtral_predictions = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MIXTRAL_INSTRUCT.name}/zeroshot/predictions.csv", index_col=0)
lora_predictions = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/lora/predictions.csv", index_col=0)
predictions = [mistral_predictins, mistral_predictins, lora_predictions]
model_names = [HuggingFaceModel.MISTRAL_7B_INSTRUCT.name, HuggingFaceModel.MIXTRAL_INSTRUCT.name, "LORA"]
display(claimbuster.head())
generate_error_analysis_report(
    claimbuster,
    predictions=predictions,
    model_names=model_names,
    folder_path=f"../results/ClaimBuster"
)

Unnamed: 0_level_0,Verdict,Text
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1
27247,1,We're 9 million jobs short of that.
10766,1,"You know, last year up to this time, we've los..."
3327,1,And in November of 1975 I was the first presid...
19700,1,And what we've done during the Bush administra...
12600,1,Do you know we don't have a single program spo...


##################################################
#              MISTRAL_7B_INSTRUCT               #
#              False positives: 913              #
#              False negatives: 726              #
##################################################
#                MIXTRAL_INSTRUCT                #
#              False positives: 913              #
#              False negatives: 726              #
##################################################
#                      LORA                      #
#              False positives: 366              #
#              False negatives: 406              #
##################################################
#                     Total                      #
#             False positives: 1109              #
#              False negatives: 860              #
#        Overlapping false positives: 170        #
#        Overlapping false negatives: 272        #
##################################################


### CheckThat 2021 Task 1a Tweets

#### Standard

In [3]:
%autoreload
with open("../prompts/CheckThat/standard/zero-shot.txt", "r") as f:
    instruction = f.read().replace("\n", "")
data = load_check_that_dataset(
    "../data/CheckThat",
)

texts = data["tweet_text"]
prompts = [f"{instruction} '''{text}'''" for text in texts]
zeroshot_output = f"../results/CheckThat/{model_id.name}/zeroshot.csv"
generate_llm_predictions(
    data=data,
    pipe=pipe, 
    prompts=prompts, 
    save_path=zeroshot_output
)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  25981 MiB |  25981 MiB | 114559 MiB |  88578 MiB |
|       from large pool |  25899 MiB |  25899 MiB | 114475 MiB |  88576 MiB |
|       from small pool |     81 MiB |     81 MiB |     83 MiB |      2 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  25981 MiB |  25981 MiB | 114559 MiB |  88578 MiB |
|       from large pool |  25899 MiB |  25899 MiB | 114475 MiB |  88576 MiB |
|       from small pool |     81 MiB |     81 MiB |     83 MiB |      2 MiB |
|---------------------------------------------------------------

Unnamed: 0_level_0,topic_id,tweet_url,tweet_text,claim,check_worthiness
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1234964653014384644,covid-19,https://twitter.com/EricTrump/status/123496465...,Since this will never get reported by the medi...,1,1
1234869939720216578,covid-19,https://twitter.com/RealJamesWoods/status/1234...,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0,0
1234873136304267267,covid-19,https://twitter.com/hayxsmith/status/123487313...,"Folks, when you say ""The corona virus isn't a ...",0,0
1235071285027147776,covid-19,https://twitter.com/ipspankajnain/status/12350...,Just 1 case of Corona Virus in India and peop...,1,0
1234911110861594624,covid-19,https://twitter.com/PressSec/status/1234911110...,President @realDonaldTrump made a commitment...,1,1


  dataset_with_scores.loc[dataset_index, "score"] = score
100%|██████████| 1172/1172 [22:23<00:00,  1.15s/it] 


#### Chain-Of-Thought

In [4]:
%autoreload

with open("../prompts/CheckThat/CoT/zero-shot.txt", "r") as f:
    instruction = f.read().replace("\n", "")
data = load_check_that_dataset(
    "../data/CheckThat",
)

texts = data["tweet_text"]
prompts = [f"{instruction} '''{text}'''" for text in texts]
zeroshot_output = f"../results/CheckThat/{model_id.name}/zeroshot/CoT/chain-of-thought-scores.csv"
generate_llm_predictions(
    data=data,
    pipe=pipe, 
    batch_size=32,
    prompts=prompts, 
    save_path=zeroshot_output,
    label_column="check_worthiness",
    text_column="tweet_text",
)

  0%|          | 0/1172 [00:00<?, ?it/s]

  dataset_with_scores.loc[dataset_index, "score"] = score
100%|██████████| 1172/1172 [2:32:40<00:00,  7.82s/it] 


Unnamed: 0_level_0,check_worthiness,score,tweet_text,reasoning
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1234964653014384644,1,10.0,Since this will never get reported by the medi...,"The tweet contains a verifiable factual claim,..."
1234869939720216578,0,90,"Thanks, #MichaelBloomberg. Here’s a handy litt...",1. The tweet contains a verifiable factual cla...
1234873136304267267,0,90,"Folks, when you say ""The corona virus isn't a ...",1. The tweet contains a verifiable factual cla...
1235071285027147776,0,90,Just 1 case of Corona Virus in India and peop...,The tweet contains false information and has t...
1234911110861594624,1,20,President @realDonaldTrump made a commitment...,Answer:1. The tweet contains a verifiable fact...
...,...,...,...,...
1370401418960601092,0,80,If Abbott wants to open Texas for business 100...,1. The tweet contains a verifiable factual cla...
1369913482158690308,0,0,We are delighted to be joining the @peoplesvac...,The tweet does not contain any false informati...
1368629520354320391,0,0,Trust the Facts: No serious safety concerns we...,The tweet contains factual information that is...
1368586349465452548,0,80,"Maryland providers have administered 1,567,359...","The tweet contains verifiable factual claims, ..."


#### Cross validation

##### Standard

In [39]:
%autoreload
crossval_folder = "../data/CheckThat/crossval"
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/CheckThat/{model_id.name}/zeroshot/zeroshot1.csv"
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    save_folder = f"../results/CheckThat/{model_id.name}/zeroshot"
    result, predictions = run_llm_cross_validation(
        data=dataset_with_scores,
        label_column="check_worthiness",
        crossval_folder=crossval_folder,
        save_folder=save_folder,
    )
    display(result)

Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.645051,0.828402,0.651163,0.729167,0.395161,0.628205,0.485149,0.611782,0.639684,0.607158,0.713069,0.645051,0.664206
1,0.607509,0.874074,0.546296,0.672365,0.379747,0.779221,0.510638,0.62691,0.662759,0.591501,0.744166,0.607509,0.629863
2,0.672355,0.840909,0.685185,0.755102,0.418803,0.636364,0.505155,0.629856,0.660774,0.630128,0.72998,0.672355,0.689416
3,0.651877,0.835294,0.657407,0.735751,0.398374,0.636364,0.49,0.616834,0.646886,0.612876,0.720472,0.651877,0.671168
Average,0.644198,0.84467,0.635013,0.723096,0.398021,0.670038,0.497735,0.621346,0.652526,0.610416,0.726922,0.644198,0.663663


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.726962,0.816901,0.809302,0.813084,0.4875,0.5,0.493671,0.652201,0.654651,0.653377,0.729211,0.726962,0.728053
1,0.778157,0.864734,0.828704,0.846336,0.569767,0.636364,0.601227,0.717251,0.732534,0.723781,0.787217,0.778157,0.781921
2,0.744027,0.847291,0.796296,0.821002,0.511111,0.597403,0.550898,0.679201,0.696849,0.68595,0.758943,0.744027,0.750019
3,0.754266,0.842857,0.819444,0.830986,0.53012,0.571429,0.55,0.686489,0.695437,0.690493,0.76067,0.754266,0.757143
Average,0.750853,0.842946,0.813437,0.827852,0.524625,0.576299,0.548949,0.683785,0.694868,0.688401,0.75901,0.750853,0.754284


##### Chain-of-Thought

In [7]:
%autoreload

import os
crossval_folder = "../data/CheckThat/crossval"
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/CheckThat/{model_id.name}/zeroshot/CoT/chain-of-thought-scores.csv"
    if not os.path.exists(dataset_path):
        print(f"Missing generated scores for model {model_id.name}")
        continue
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    save_folder = f"../results/CheckThat/{model_id.name}/zeroshot/CoT"
    result, predictions = run_llm_cross_validation(
        data=dataset_with_scores,
        label_column="check_worthiness",
        crossval_folder=crossval_folder,
        save_folder=save_folder,
    )
    display(result)
    

Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.59727,0.854015,0.544186,0.664773,0.371795,0.74359,0.495726,0.612905,0.643888,0.58025,0.725642,0.59727,0.619771
1,0.62116,0.87234,0.569444,0.689076,0.388158,0.766234,0.515284,0.630249,0.667839,0.60218,0.745098,0.62116,0.643403
2,0.662116,0.823204,0.689815,0.75063,0.401786,0.584416,0.47619,0.612495,0.637115,0.61341,0.712456,0.662116,0.678507
3,0.651877,0.8,0.703704,0.748768,0.378641,0.506494,0.433333,0.58932,0.605099,0.591051,0.689267,0.651877,0.665873
Average,0.633106,0.83739,0.626787,0.713312,0.385095,0.650183,0.480134,0.611242,0.638485,0.596723,0.718116,0.633106,0.651889


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.716724,0.808411,0.804651,0.806527,0.468354,0.474359,0.471338,0.638383,0.639505,0.638932,0.717884,0.716724,0.717296
1,0.726962,0.830097,0.791667,0.810427,0.482759,0.545455,0.512195,0.656428,0.668561,0.661311,0.738817,0.726962,0.732052
2,0.720137,0.828431,0.782407,0.804762,0.47191,0.545455,0.506024,0.650171,0.663931,0.655393,0.734738,0.720137,0.726254
3,0.720137,0.799107,0.828704,0.813636,0.463768,0.415584,0.438356,0.631438,0.622144,0.625996,0.710981,0.720137,0.715013
Average,0.72099,0.816512,0.801857,0.808838,0.471698,0.495213,0.481978,0.644105,0.648535,0.645408,0.725605,0.72099,0.722654


#### Error analysis

In [40]:
%autoreload
folder_path = f"../results/CheckThat"
mistral_predictions = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/zeroshot/predictions.csv", index_col=0)
mixtral_predictions = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MIXTRAL_INSTRUCT.name}/zeroshot/predictions.csv", index_col=0)
lora_predictions = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/lora/predictions.csv", index_col=0)
results = [mistral_predictions, mixtral_predictions, lora_predictions]
model_names = [HuggingFaceModel.MISTRAL_7B_INSTRUCT.name, HuggingFaceModel.MIXTRAL_INSTRUCT.name, "LORA"]
generate_error_analysis_report(
    checkthat,
    predictions=results,
    model_names=model_names,
    folder_path=folder_path,
    label_column_name="check_worthiness",
    text_column_name="tweet_text",
)

##################################################
#              MISTRAL_7B_INSTRUCT               #
#              False positives: 315              #
#              False negatives: 102              #
##################################################
#                MIXTRAL_INSTRUCT                #
#              False positives: 161              #
#              False negatives: 131              #
##################################################
#                      LORA                      #
#              False positives: 115              #
#              False negatives: 77               #
##################################################
#                     Total                      #
#              False positives: 390              #
#              False negatives: 182              #
#        Overlapping false positives: 54         #
#        Overlapping false negatives: 32         #
##################################################


## ICL prompting

Using In-Context Learning prompting