# Check-worthiness detection using Large Language Models

First, import the necessary Python modules.

In [2]:
%load_ext autoreload

from claimbuster_utils import load_claimbuster_dataset
from checkthat_utils import load_check_that_dataset
from tqdm.auto import tqdm
import json
import numpy as np
import re
import torch
import pandas as pd
from llm import load_huggingface_model, HuggingFaceModel, run_llm_cross_validation
import os
from error_analysis import generate_error_analysis_report

  from .autonotebook import tqdm as notebook_tqdm





## Load model

In [8]:
%autoreload
# Select the model to run. `model_id.name` is also used by the inference cells
# below to build the per-model results path.
model_id = HuggingFaceModel.MIXTRAL_INSTRUCT
# `pipe` is called on the prompt dataset in the inference cells below.
# NOTE(review): presumably a Hugging Face text-generation pipeline - confirm in llm.py.
pipe = load_huggingface_model(model_id)


config.json: 100%|██████████| 720/720 [00:00<?, ?B/s] 


ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`

## Zero-shot classification

### ClaimBuster

In [7]:
# Number of leading rows to keep for a quick smoke test; set to None to run on
# the full dataset. NOTE: this cell later writes its results to
# `zeroshot_output`, so a truncated run replaces an existing results file with
# only these rows.
N_SAMPLES = 10

# The prompt template is flattened onto a single line by dropping its newlines.
with open("../prompts/ClaimBuster/standard/zero-shot.txt", "r") as f:
    instruction = f.read().replace("\n", "")
use_contextual = False
data = load_claimbuster_dataset(
    "../data/ClaimBuster_Datasets/datasets",
    use_contextual_features=use_contextual,
    debate_transcripts_folder="../data/ClaimBuster_Datasets/debate_transcripts",
)
if N_SAMPLES is not None:
    data = data[:N_SAMPLES]


texts = data["Text"]
if not use_contextual:
    # Plain prompt: instruction followed by the triple-quoted sentence.
    prompts = [f"{instruction} '''{text}'''" for text in texts]
    zeroshot_output = f"../results/ClaimBuster/{model_id.name}/zeroshot1.csv"
else:
    # Contextual prompt: the preceding sentences are inserted before the
    # sentence that has to be judged.
    contexts = data["previous_sentences"].tolist()
    prompts = [
        f"{instruction} For context, the following senteces were said prior to the one in question: {context} Only evaluate the check-worthiness of the following sentence: '''{text}'''"
        for text, context in zip(texts, contexts)
    ]
    # NOTE(review): unlike the standard path this one is not per-model, so
    # contextual runs of different models write to the same file - confirm.
    zeroshot_output = "../results/ClaimBuster/zeroshot_contextual.csv"
# Make sure the target folder exists for either branch before results are written.
os.makedirs(os.path.dirname(zeroshot_output), exist_ok=True)


class ProgressDataset(torch.utils.data.Dataset):
    """Expose an indexable collection through the torch ``Dataset`` interface.

    Length and item lookups are delegated unchanged to the wrapped object,
    which stays reachable as ``self.dataset``. In this notebook it wraps the
    list of prompts that is handed to ``pipe``.
    """

    def __init__(self, dataset) -> None:
        """Keep a reference to ``dataset``; no copy is made."""
        self.dataset = dataset

    def __len__(self) -> int:
        """Return the number of items in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx`` of the wrapped collection."""
        return self.dataset[idx]


prompts_data = ProgressDataset(prompts)

display(data.head())
# First "{...}" span in the (newline-stripped) response; expected to be the JSON answer.
dict_matcher = re.compile(r"{.*}")
# Fallback: the first run of digits following the word "score"/"Score".
score_matcher = re.compile(r"([Ss]core[^\d]*)(\d+)")
# Fallback: phrases treated as an explicit score of 0.
non_check_worthy_matcher = re.compile(
    r"(non-checkworthy)|(not check-worthy)|(non check-worthy)"
)

# Results are collected in prompt order and attached to the frame in one step,
# instead of writing row by row with label-based `.loc`.
scores = []
reasonings = []
responses = pipe(prompts_data, batch_size=128)
for result in tqdm(responses, total=len(prompts)):
    response = result[0]["generated_text"].replace("\n", "")
    try:
        parsed_json = json.loads(dict_matcher.search(response).group(0))
        score, reasoning = parsed_json["score"], parsed_json["reasoning"]
    except (json.decoder.JSONDecodeError, AttributeError, KeyError):
        # No usable JSON answer: try to recover a score from the free text and
        # keep the full response as the reasoning.
        score_match = score_matcher.search(response)
        if score_match is not None:
            # Convert the captured digits so the column does not mix strings
            # with numbers.
            score = float(score_match[2])
        elif non_check_worthy_matcher.search(response):
            score = 0.0
        else:
            score = np.nan
        reasoning = response
    scores.append(score)
    reasonings.append(reasoning)

dataset_with_scores = data.assign(score=scores, reasoning=reasonings)
# Set the following column order: Verdict, score, Text, reasoning, previous_sentences
columns = ["Verdict", "score", "Text", "reasoning"]
if use_contextual:
    columns.append("previous_sentences")
dataset_with_scores = dataset_with_scores[columns]
dataset_with_scores.to_csv(zeroshot_output, index=True)

NameError: name 'pipe' is not defined

#### Cross validation

In [9]:
%autoreload
# Print the number of empty scores
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/ClaimBuster/{model_id.name}/zeroshot1.csv"
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    display(run_llm_cross_validation(dataset_with_scores))


Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro,test_accuracy
0,1.371314,0.006999,0.790566,0.80769,0.798018,0.830095
1,1.369287,0.010589,0.803436,0.806818,0.805089,0.840017
2,1.406544,0.007,0.777518,0.788547,0.782555,0.818859
3,1.389679,0.008304,0.794504,0.807362,0.800335,0.833333
Average,1.384206,0.008223,0.791506,0.802604,0.796499,0.830576


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro,test_accuracy
0,1.30078,0.007999,0.783159,0.82101,0.795382,0.820587
1,1.340834,0.008,0.783971,0.808995,0.793886,0.823894
2,1.299942,0.010236,0.770906,0.802724,0.781971,0.810587
3,1.465163,0.009,0.777252,0.809816,0.788636,0.816377
Average,1.35168,0.008809,0.778822,0.810636,0.789969,0.817861


#### Error analysis

In [99]:
%autoreload
mistral_results = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/zeroshot1.csv", index_col=0)
mixtral_results = pd.read_csv(f"../results/ClaimBuster/{HuggingFaceModel.MIXTRAL_INSTRUCT.name}/zeroshot1.csv", index_col=0)
results = [mistral_results, mixtral_results]
models = [HuggingFaceModel.MISTRAL_7B_INSTRUCT, HuggingFaceModel.MIXTRAL_INSTRUCT]
generate_error_analysis_report(
    results=results,
    models=models,
    folder_path=f"../results/ClaimBuster"
)

##################################################
#              MISTRAL_7B_INSTRUCT               #
#              False positives: 413              #
#              False negatives: 624              #
#              Empty predictions: 0              #
#             Wrong output format: 0             #
##################################################
#                MIXTRAL_INSTRUCT                #
#             False positives: 1193              #
#              False negatives: 388              #
#              Empty predictions: 8              #
#            Wrong output format: 259            #
##################################################
#                     Total                      #
#             False positives: 1324              #
#              False negatives: 757              #
#        Overlapping false positives: 282        #
#        Overlapping false negatives: 255        #
##################################################


### CheckThat 2021 Task 1a Tweets

In [3]:
%autoreload
print(torch.cuda.memory_summary())
with open("../prompts/CheckThat/standard/zero-shot.txt", "r") as f:
    instruction = f.read().replace("\n", "")
data = load_check_that_dataset(
    "../data/CheckThat2021Task1a",
)

texts = data["tweet_text"]
prompts = [f"{instruction} '''{text}'''" for text in texts]
zeroshot_output = f"../results/CheckThat/{model_id.name}/zeroshot1.csv"
os.makedirs(os.path.dirname(zeroshot_output), exist_ok=True)

class ProgressDataset(torch.utils.data.Dataset):
    """Torch ``Dataset`` view over any object supporting ``len()`` and indexing.

    The wrapped object is stored as ``self.dataset`` and every lookup is
    forwarded to it as-is. Here it wraps the prompt list passed to ``pipe``.
    """

    def __init__(self, dataset) -> None:
        """Store ``dataset`` by reference."""
        self.dataset = dataset

    def __len__(self) -> int:
        """Number of items in the wrapped object."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Item of the wrapped object at position ``idx``."""
        return self.dataset[idx]


prompts_data = ProgressDataset(prompts)

display(data.head())
# First "{...}" span in the (newline-stripped) response; expected to be the JSON answer.
dict_matcher = re.compile(r"{.*}")
# Fallback: the first run of digits following the word "score"/"Score".
score_matcher = re.compile(r"([Ss]core[^\d]*)(\d+)")
# Fallback: phrases treated as an explicit score of 0.
non_check_worthy_matcher = re.compile(
    r"(non-checkworthy)|(not check-worthy)|(non check-worthy)"
)

# Results are collected in prompt order and attached to the frame in one step,
# instead of writing row by row with label-based `.loc`.
scores = []
reasonings = []
with torch.no_grad():
    responses = pipe(prompts_data, batch_size=128)
    for result in tqdm(responses, total=len(prompts)):
        response = result[0]["generated_text"].replace("\n", "")
        try:
            parsed_json = json.loads(dict_matcher.search(response).group(0))
            score, reasoning = parsed_json["score"], parsed_json["reasoning"]
        except (json.decoder.JSONDecodeError, AttributeError, KeyError):
            # No usable JSON answer: try to recover a score from the free text
            # and keep the full response as the reasoning.
            score_match = score_matcher.search(response)
            if score_match is not None:
                # Convert the captured digits so the column does not mix
                # strings with numbers.
                score = float(score_match[2])
            elif non_check_worthy_matcher.search(response):
                score = 0.0
            else:
                score = np.nan
            reasoning = response
        scores.append(score)
        reasonings.append(reasoning)

dataset_with_scores = data.assign(score=scores, reasoning=reasonings)
columns = ["check_worthiness", "score", "tweet_text", "reasoning"]
dataset_with_scores = dataset_with_scores[columns]
dataset_with_scores.to_csv(zeroshot_output, index=True)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  25981 MiB |  25981 MiB | 114559 MiB |  88578 MiB |
|       from large pool |  25899 MiB |  25899 MiB | 114475 MiB |  88576 MiB |
|       from small pool |     81 MiB |     81 MiB |     83 MiB |      2 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  25981 MiB |  25981 MiB | 114559 MiB |  88578 MiB |
|       from large pool |  25899 MiB |  25899 MiB | 114475 MiB |  88576 MiB |
|       from small pool |     81 MiB |     81 MiB |     83 MiB |      2 MiB |
|---------------------------------------------------------------

Unnamed: 0_level_0,topic_id,tweet_url,tweet_text,claim,check_worthiness
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1234964653014384644,covid-19,https://twitter.com/EricTrump/status/123496465...,Since this will never get reported by the medi...,1,1
1234869939720216578,covid-19,https://twitter.com/RealJamesWoods/status/1234...,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0,0
1234873136304267267,covid-19,https://twitter.com/hayxsmith/status/123487313...,"Folks, when you say ""The corona virus isn't a ...",0,0
1235071285027147776,covid-19,https://twitter.com/ipspankajnain/status/12350...,Just 1 case of Corona Virus in India and peop...,1,0
1234911110861594624,covid-19,https://twitter.com/PressSec/status/1234911110...,President @realDonaldTrump made a commitment...,1,1


  dataset_with_scores.loc[dataset_index, "score"] = score
100%|██████████| 1172/1172 [22:23<00:00,  1.15s/it] 


#### Cross validation

In [6]:
%autoreload
for model_id in HuggingFaceModel:
    print(f"Running 4 fold cross validation for model {model_id.name}")
    dataset_path = f"../results/CheckThat/{model_id.name}/zeroshot/zeroshot1.csv"
    dataset_with_scores = pd.read_csv(dataset_path, index_col=0)
    display(run_llm_cross_validation(dataset_with_scores, label_column="check_worthiness"))


Running 4 fold cross validation for model MISTRAL_7B_INSTRUCT


Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro,test_accuracy
0,0.726544,0.007985,0.646576,0.671049,0.652261,0.703072
1,0.6652,0.009818,0.677212,0.70662,0.685311,0.733788
2,0.647787,0.005998,0.644918,0.680886,0.588405,0.59727
3,0.637105,0.007004,0.547064,0.560726,0.521384,0.549488
Average,0.669159,0.007701,0.628943,0.65482,0.61184,0.645904


Running 4 fold cross validation for model MIXTRAL_INSTRUCT


Unnamed: 0,fit_time,score_time,test_precision_macro,test_recall_macro,test_f1_macro,test_accuracy
0,0.634165,0.006297,0.668719,0.68381,0.674556,0.733788
1,0.639037,0.008455,0.634033,0.646314,0.638583,0.706485
2,0.64943,0.006923,0.702764,0.729317,0.711971,0.761092
3,0.662586,0.005998,0.746323,0.719487,0.730459,0.802048
Average,0.646304,0.006918,0.68796,0.694732,0.688892,0.750853


#### Error analysis

In [98]:
%autoreload
folder_path = f"../results/CheckThat"
mistral_results = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MISTRAL_7B_INSTRUCT.name}/zeroshot1.csv", index_col=0)
mixtral_results = pd.read_csv(f"{folder_path}/{HuggingFaceModel.MIXTRAL_INSTRUCT.name}/zeroshot1.csv", index_col=0)
results = [mistral_results, mixtral_results]
models = [HuggingFaceModel.MISTRAL_7B_INSTRUCT, HuggingFaceModel.MIXTRAL_INSTRUCT]
generate_error_analysis_report(
    results=results,
    models=models,
    folder_path=folder_path,
    label_column_name="check_worthiness",
    text_column_name="tweet_text",
)

##################################################
#              MISTRAL_7B_INSTRUCT               #
#              False positives: 376              #
#              False negatives: 36               #
#              Empty predictions: 1              #
#             Wrong output format: 0             #
##################################################
#                MIXTRAL_INSTRUCT                #
#              False positives: 484              #
#              False negatives: 20               #
#              Empty predictions: 3              #
#            Wrong output format: 25             #
##################################################
#                     Total                      #
#              False positives: 568              #
#              False negatives: 51               #
#        Overlapping false positives: 292        #
#         Overlapping false negatives: 5         #
##################################################
