# Check-worthiness detection using Large Language Models

First, import the necessary Python modules.

In [1]:
%load_ext autoreload

from claimbuster_utils import load_claimbuster_dataset
from checkthat_utils import load_check_that_dataset
import pandas as pd
from llm import load_huggingface_model, HuggingFaceModel, run_llm_cross_validation, generate_llm_predictions
from result_analysis import generate_error_analysis_report
from dataset_utils import generate_cross_validation_datasets

  from .autonotebook import tqdm as notebook_tqdm





## Generate Cross Validation datasets

In [3]:
%autoreload
claimbuster = load_claimbuster_dataset("../data/ClaimBuster_Datasets/datasets")
clambuster_datasets = generate_cross_validation_datasets(
    data=claimbuster, 
    folder_path="../data/ClaimBuster_Datasets/crossval"
)

checkthat = load_check_that_dataset("../data/CheckThat2021Task1a")
checkthat_datasets = generate_cross_validation_datasets(
    data=checkthat, 
    label_column="check_worthiness",
    folder_path="../data/CheckThat2021Task1a/crossval"
)

Found existing cross validation splits.
Found existing cross validation splits.


## Load model

In [8]:
%autoreload
# Select the model to run and load it. `model_id` and `pipe` are notebook-level
# names that the prediction cells below read (for result paths and inference).
# NOTE: the recorded output of this cell is an ImportError asking for the
# `accelerate` package; install it in the kernel environment before running.
model_id = HuggingFaceModel.MIXTRAL_INSTRUCT
pipe = load_huggingface_model(model_id)


config.json: 100%|██████████| 720/720 [00:00<?, ?B/s] 


ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`

## Zero-shot classification

### ClaimBuster

#### Standard

In [7]:
# Read the zero-shot instruction and collapse it onto a single line.
with open("../prompts/ClaimBuster/standard/zero-shot.txt", "r") as prompt_file:
    instruction = prompt_file.read().replace("\n", "")

# Only the first ten rows are used here.
data = load_claimbuster_dataset("../data/ClaimBuster_Datasets/datasets")[:10]

# One prompt per sentence: the instruction followed by the triple-quoted sentence.
texts = data["Text"]
prompts = ["{} '''{}'''".format(instruction, text) for text in texts]

# Predictions are stored per model, using the `model_id` set in the model-loading cell.
zeroshot_output = (
    f"../results/ClaimBuster/{model_id.name}/zeroshot/zeroshot_preds.csv"
)

generate_llm_predictions(
    data=data,
    pipe=pipe,
    prompts=prompts,
    save_path=zeroshot_output,
)

NameError: name 'pipe' is not defined

#### Using contextual features

In [None]:
# Zero-shot prompting with debate context: load the first ten ClaimBuster rows
# together with the sentences that preceded each one in the transcript.
# NOTE: relies on `instruction`, `pipe` and `model_id` defined by earlier cells.
data = load_claimbuster_dataset(
    "../data/ClaimBuster_Datasets/datasets",
    use_contextual_features=True,
    debate_transcripts_folder="../data/ClaimBuster_Datasets/debate_transcripts",
)[:10]

# Take the sentences from THIS cell's frame instead of reusing `texts` left over
# from the standard zero-shot cell, so texts and contexts are guaranteed to be
# aligned row by row.
# Assumes the contextual frame keeps the "Text" column — TODO confirm.
texts = data["Text"]
contexts = data["previous_sentences"].tolist()

# NOTE(review): "senteces" is a typo in the prompt text; it is left untouched
# here because changing it alters the model input of already recorded runs.
prompts = [
    f"{instruction} For context, the following senteces were said prior to the one in question: {context} Only evaluate the check-worthiness of the following sentence: '''{text}'''"
    for text, context in zip(texts, contexts)
]

# Must be an f-string: without the prefix the literal text "{model_id.name}"
# ends up in the path instead of the model's name.
zeroshot_output = f"../results/ClaimBuster/{model_id.name}/zeroshot/zeroshot_contextual_preds.csv"

generate_llm_predictions(
    data=data,
    pipe=pipe,
    prompts=prompts,
    save_path=zeroshot_output,
)

#### Cross validation

In [29]:
%autoreload

crossval_folder = "../data/ClaimBuster_Datasets/crossval"
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/ClaimBuster/{model_id.name}/zeroshot/zeroshot1.csv"
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    save_folder = f"../results/ClaimBuster/{model_id.name}/zeroshot"
    result = run_llm_cross_validation(
        data=dataset_with_scores, crossval_folder=crossval_folder, save_folder=save_folder
    )
    
    display(result)

Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.833402,0.896469,0.866898,0.881436,0.692513,0.749638,0.719944,0.794491,0.808268,0.80069,0.838208,0.833402,0.835305
1,0.829268,0.883382,0.876736,0.880046,0.697443,0.710564,0.703943,0.790413,0.79365,0.791995,0.830268,0.829268,0.829741
2,0.826716,0.890681,0.863347,0.876801,0.682796,0.735166,0.708014,0.786738,0.799257,0.792407,0.831273,0.826716,0.828566
3,0.83292,0.897775,0.864505,0.880826,0.690066,0.75398,0.720609,0.793921,0.809242,0.800717,0.838417,0.83292,0.83504
Average,0.830577,0.892077,0.867872,0.879777,0.690705,0.737337,0.713127,0.791391,0.802604,0.796452,0.834542,0.830577,0.832163


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.813559,0.910083,0.820023,0.862709,0.639211,0.797395,0.709594,0.774647,0.808709,0.786152,0.832707,0.813559,0.818971
1,0.822654,0.912381,0.831597,0.870118,0.655213,0.800289,0.720521,0.783797,0.815943,0.79532,0.83892,0.822654,0.827385
2,0.810174,0.902795,0.822814,0.860951,0.637441,0.778582,0.700977,0.770118,0.800698,0.780964,0.826964,0.810174,0.815235
3,0.825062,0.912137,0.835553,0.872167,0.660287,0.798842,0.722986,0.786212,0.817198,0.797577,0.840165,0.825062,0.829535
Average,0.817862,0.909349,0.827497,0.866486,0.648038,0.793777,0.71352,0.778694,0.810637,0.790003,0.834689,0.817862,0.822782


#### Error analysis

In [99]:
%autoreload
mistral_results = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/zeroshot1.csv", index_col=0)
mixtral_results = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MIXTRAL_INSTRUCT.name}/zeroshot1.csv", index_col=0)
results = [mistral_results, mixtral_results]
models = [HuggingFaceModel.MISTRAL_7B_INSTRUCT, HuggingFaceModel.MIXTRAL_INSTRUCT]
generate_error_analysis_report(
    results=results,
    models=models,
    folder_path=f"../results/ClaimBuster"
)

##################################################
#              MISTRAL_7B_INSTRUCT               #
#              False positives: 413              #
#              False negatives: 624              #
#              Empty predictions: 0              #
#             Wrong output format: 0             #
##################################################
#                MIXTRAL_INSTRUCT                #
#             False positives: 1193              #
#              False negatives: 388              #
#              Empty predictions: 8              #
#            Wrong output format: 259            #
##################################################
#                     Total                      #
#             False positives: 1324              #
#              False negatives: 757              #
#        Overlapping false positives: 282        #
#        Overlapping false negatives: 255        #
##################################################


### CheckThat 2021 Task 1a Tweets

In [3]:
%autoreload
with open("../prompts/CheckThat/standard/zero-shot.txt", "r") as f:
    instruction = f.read().replace("\n", "")
data = load_check_that_dataset(
    "../data/CheckThat2021Task1a",
)

texts = data["tweet_text"]
prompts = [f"{instruction} '''{text}'''" for text in texts]
zeroshot_output = f"../results/CheckThat/{model_id.name}/zeroshot.csv"
generate_llm_predictions(
    data=data,
    pipe=pipe, 
    prompts=prompts, 
    save_path=zeroshot_output
)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  25981 MiB |  25981 MiB | 114559 MiB |  88578 MiB |
|       from large pool |  25899 MiB |  25899 MiB | 114475 MiB |  88576 MiB |
|       from small pool |     81 MiB |     81 MiB |     83 MiB |      2 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  25981 MiB |  25981 MiB | 114559 MiB |  88578 MiB |
|       from large pool |  25899 MiB |  25899 MiB | 114475 MiB |  88576 MiB |
|       from small pool |     81 MiB |     81 MiB |     83 MiB |      2 MiB |
|---------------------------------------------------------------

Unnamed: 0_level_0,topic_id,tweet_url,tweet_text,claim,check_worthiness
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1234964653014384644,covid-19,https://twitter.com/EricTrump/status/123496465...,Since this will never get reported by the medi...,1,1
1234869939720216578,covid-19,https://twitter.com/RealJamesWoods/status/1234...,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0,0
1234873136304267267,covid-19,https://twitter.com/hayxsmith/status/123487313...,"Folks, when you say ""The corona virus isn't a ...",0,0
1235071285027147776,covid-19,https://twitter.com/ipspankajnain/status/12350...,Just 1 case of Corona Virus in India and peop...,1,0
1234911110861594624,covid-19,https://twitter.com/PressSec/status/1234911110...,President @realDonaldTrump made a commitment...,1,1


  dataset_with_scores.loc[dataset_index, "score"] = score
100%|██████████| 1172/1172 [22:23<00:00,  1.15s/it] 


#### Cross validation

In [26]:
%autoreload
crossval_folder = "../data/CheckThat2021Task1a/crossval"
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/CheckThat/{model_id.name}/zeroshot/zeroshot1.csv"
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    save_folder = f"../results/CheckThat/{model_id.name}/zeroshot"
    result = run_llm_cross_validation(
        data=dataset_with_scores,
        label_column="check_worthiness",
        crossval_folder=crossval_folder,
        save_folder=save_folder,
    )
    display(result)
    


Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.645051,0.828402,0.651163,0.729167,0.395161,0.628205,0.485149,0.611782,0.639684,0.607158,0.713069,0.645051,0.664206
1,0.607509,0.874074,0.546296,0.672365,0.379747,0.779221,0.510638,0.62691,0.662759,0.591501,0.744166,0.607509,0.629863
2,0.672355,0.840909,0.685185,0.755102,0.418803,0.636364,0.505155,0.629856,0.660774,0.630128,0.72998,0.672355,0.689416
3,0.651877,0.835294,0.657407,0.735751,0.398374,0.636364,0.49,0.616834,0.646886,0.612876,0.720472,0.651877,0.671168
Average,0.644198,0.84467,0.635013,0.723096,0.398021,0.670038,0.497735,0.621346,0.652526,0.610416,0.726922,0.644198,0.663663


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,accuracy,0_precision,0_recall,0_f1-score,1_precision,1_recall,1_f1-score,macro avg_precision,macro avg_recall,macro avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_f1-score
0,0.726962,0.816901,0.809302,0.813084,0.4875,0.5,0.493671,0.652201,0.654651,0.653377,0.729211,0.726962,0.728053
1,0.778157,0.864734,0.828704,0.846336,0.569767,0.636364,0.601227,0.717251,0.732534,0.723781,0.787217,0.778157,0.781921
2,0.744027,0.847291,0.796296,0.821002,0.511111,0.597403,0.550898,0.679201,0.696849,0.68595,0.758943,0.744027,0.750019
3,0.754266,0.842857,0.819444,0.830986,0.53012,0.571429,0.55,0.686489,0.695437,0.690493,0.76067,0.754266,0.757143
Average,0.750853,0.842946,0.813437,0.827852,0.524625,0.576299,0.548949,0.683785,0.694868,0.688401,0.75901,0.750853,0.754284


#### Error analysis

In [98]:
%autoreload
folder_path = f"../results/CheckThat"
mistral_results = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/zeroshot1.csv", index_col=0)
mixtral_results = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MIXTRAL_INSTRUCT.name}/zeroshot1.csv", index_col=0)
results = [mistral_results, mixtral_results]
models = [HuggingFaceModel.MISTRAL_7B_INSTRUCT, HuggingFaceModel.MIXTRAL_INSTRUCT]
generate_error_analysis_report(
    results=results,
    models=models,
    folder_path=folder_path,
    label_column_name="check_worthiness",
    text_column_name="tweet_text",
)

##################################################
#              MISTRAL_7B_INSTRUCT               #
#              False positives: 376              #
#              False negatives: 36               #
#              Empty predictions: 1              #
#             Wrong output format: 0             #
##################################################
#                MIXTRAL_INSTRUCT                #
#              False positives: 484              #
#              False negatives: 20               #
#              Empty predictions: 3              #
#            Wrong output format: 25             #
##################################################
#                     Total                      #
#              False positives: 568              #
#              False negatives: 51               #
#        Overlapping false positives: 292        #
#         Overlapping false negatives: 5         #
##################################################


## LoRA fine-tuning

Using LoRA (Low-Rank Adaptation) to fine-tune the Mistral 7B Instruct model.