In [1]:
# LLMs
import pandas as pd
import re


from textblob import TextBlob

def classify_prediction(prediction):
    """Map a raw model answer to one of the labels "True", "False" or "NEI".

    Answers that start with ``Prediction: <label>`` (case-insensitive,
    optionally followed by a period and/or an explanation) are mapped directly
    to the canonical spelling of that label. Anything else falls back to a
    TextBlob sentiment heuristic.

    Args:
        prediction: The model's answer text.

    Returns:
        "True", "False" or "NEI".
    """
    # Normalise surrounding whitespace before matching.
    prediction = prediction.strip()

    # Standard format: "Prediction: True/False/NEI", possibly followed by more
    # text. The word boundary keeps e.g. "Truely" from matching.
    match = re.match(r"Prediction:\s*(True|False|NEI)\b", prediction, re.IGNORECASE)
    if match:
        # str.capitalize() would turn "NEI" into "Nei", so map explicitly.
        canonical = {"true": "True", "false": "False", "nei": "NEI"}
        return canonical[match.group(1).lower()]

    # Non-standard format: fall back to sentiment polarity as a rough proxy.
    blob = TextBlob(prediction)
    polarity = blob.sentiment.polarity

    # Positive polarity -> "True", negative -> "False", neutral -> "NEI".
    if polarity > 0:
        return "True"
    elif polarity < 0:
        return "False"
    else:
        return "NEI"





# Load the model generations: a JSON-lines file with one record per line.
metadata = pd.read_json(
    "./predicts/predict.json",
    dtype={'num': str},
    lines=True,
)
metadata

def get_response(output):
    """Extract the assistant's reply from a raw generation string.

    The reply is the text between the first "Assistant:" marker and the next
    one (or the end of the string), with newlines replaced by spaces and
    surrounding whitespace stripped.

    Args:
        output: Full generated text, expected to contain "Assistant:".

    Returns:
        The cleaned reply, or "" when the marker is absent, so one malformed
        row does not abort a whole ``.apply`` with an IndexError.
    """
    segments = output.split("Assistant:")
    if len(segments) < 2:
        # No marker at all: nothing that can be attributed to the assistant.
        return ""
    return segments[1].replace("\n", " ").strip()
# Keep only the assistant's reply from each raw generation.
metadata['Assistant'] = metadata['output'].apply(get_response)

# Tag every reply with the class chosen by classify_prediction,
# formatted as "<reply> - <class>".
metadata['Classified_Assistant'] = [
    f"{reply} - {classify_prediction(reply)}" for reply in metadata['Assistant']
]

metadata

Unnamed: 0,output,Assistant
0,Human: \nTask: Please determine whether the cl...,Prediction: True
1,Human: \nTask: Please determine whether the cl...,Prediction: True.
2,Human: \nTask: Please determine whether the cl...,Prediction: True.
3,Human: \nTask: Please determine whether the cl...,Prediction: True.
4,Human: \nTask: Please determine whether the cl...,Prediction: False.
5,Human: \nTask: Please determine whether the cl...,Prediction: True.
6,Human: \nTask: Please determine whether the cl...,Prediction: True Explanation:
7,Human: \nTask: Please determine whether the cl...,Prediction: False Explanation: A screenshot o...
8,Human: \nTask: Please determine whether the cl...,Prediction: False Explanation:
9,Human: \nTask: Please determine whether the cl...,Prediction: True Explanation: This statement ...


In [2]:
def get_label(text):
    """Convert a "Prediction: <label>" answer into a numeric class id.

    Returns 0 for False, 1 for True, 2 for NEI and 99 when no usable label is
    found. Prints "no prediction" when the text mentions one of the label
    words but contains no "Prediction: <word>" pattern.
    """
    lowered = text.lower()
    # Quick reject: none of the label words appear anywhere in the text.
    if not any(keyword in lowered for keyword in ("true", "false", "nei")):
        return 99

    found = re.search(r"Prediction: (\w+)", text)
    if found is None:
        print("no prediction")
        return 99

    # Unrecognised words after "Prediction:" map to the sentinel 99.
    label_ids = {"false": 0, "true": 1, "nei": 2}
    return label_ids.get(found.group(1).lower(), 99)

def get_exp(text):
    """Extract the explanation that follows "Explanation:" in an answer.

    Args:
        text: Answer text, e.g. "Prediction: True.\\nExplanation: ...".

    Returns:
        Everything after the first "Explanation:" with surrounding whitespace
        stripped, or "" (after printing "no explanation") when the marker is
        missing.
    """
    # DOTALL lets "." span newlines, so a multi-line explanation is captured
    # in full rather than being cut at its first line break.
    match = re.search(r"Explanation:(.*)", text, re.DOTALL)
    if match:
        return match.group(1).strip()

    print("no explanation")
    return ""
    
# Parse the numeric label and the explanation out of each assistant reply.
# Only the 'Assistant' column is needed, so apply the parsers to that column
# directly instead of iterating over whole rows with axis=1.
metadata['pre'] = metadata['Assistant'].apply(get_label)
metadata['exp'] = metadata['Assistant'].apply(get_exp)
metadata

no prediction
no prediction
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation
no explanation


Unnamed: 0,output,Assistant,pre,exp
0,Human: \nTask: Please determine whether the cl...,Prediction: True,1,
1,Human: \nTask: Please determine whether the cl...,Prediction: True.,1,
2,Human: \nTask: Please determine whether the cl...,Prediction: True.,1,
3,Human: \nTask: Please determine whether the cl...,Prediction: True.,1,
4,Human: \nTask: Please determine whether the cl...,Prediction: False.,0,
5,Human: \nTask: Please determine whether the cl...,Prediction: True.,1,
6,Human: \nTask: Please determine whether the cl...,Prediction: True Explanation:,1,
7,Human: \nTask: Please determine whether the cl...,Prediction: False Explanation: A screenshot o...,0,A screenshot of a tweet purportedly sent by Pr...
8,Human: \nTask: Please determine whether the cl...,Prediction: False Explanation:,0,
9,Human: \nTask: Please determine whether the cl...,Prediction: True Explanation: This statement ...,1,This statement is true because Mcconnell did i...


In [3]:
# Load the reference test set: a JSON-lines file with one record per line.
df_test = pd.read_json(
    "./data/practice_data/instruct_data/FMD_test.json",
    dtype={'num': str},
    lines=True,
)
df_test

Unnamed: 0,instruction,input,output
0,Task: Please determine whether the claim is Tr...,,"Prediction: True.\nExplanation: In June 2021, ..."
1,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: This is a real...
2,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: Hualapai India...
3,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: Here is how Wa...
4,Task: Please determine whether the claim is Tr...,,Prediction: False.\nExplanation: On 27 June 20...
5,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: Appealing unsu...
6,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: The post whic...
7,Task: Please determine whether the claim is Tr...,,Prediction: False.\nExplanation: Twitter's adv...
8,Task: Please determine whether the claim is Tr...,,Prediction: False.\nExplanation: Beginning wit...
9,Task: Please determine whether the claim is Tr...,,Prediction: NEI.\nExplanation: As Senate Major...


In [4]:
# Parse the gold label and gold explanation out of each reference output.
# Only the 'output' column is needed, so apply the parsers to that column
# directly instead of iterating over whole rows with axis=1.
df_test['gold_label'] = df_test['output'].apply(get_label)
df_test['gold_exp'] = df_test['output'].apply(get_exp)
df_test

Unnamed: 0,instruction,input,output,gold_label,gold_exp
0,Task: Please determine whether the claim is Tr...,,"Prediction: True.\nExplanation: In June 2021, ...",1,"In June 2021, Las Vegas Raiders defensive end ..."
1,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: This is a real...,1,"This is a real phone number, a real program, a..."
2,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: Hualapai India...,1,Hualapai Indian Reservationadjacent to Arizona...
3,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: Here is how Wa...,1,"Here is how Walker phrased the claim Aug. 3, 2..."
4,Task: Please determine whether the claim is Tr...,,Prediction: False.\nExplanation: On 27 June 20...,0,On 27 June 2018 the Facebook page Uncle Sam's ...
5,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: Appealing unsu...,1,Appealing unsuccessfully to spend more money f...
6,Task: Please determine whether the claim is Tr...,,Prediction: True.\nExplanation: The post whic...,1,The post which was critical of Biden and the ...
7,Task: Please determine whether the claim is Tr...,,Prediction: False.\nExplanation: Twitter's adv...,0,Twitter's advanced search tool returns no resu...
8,Task: Please determine whether the claim is Tr...,,Prediction: False.\nExplanation: Beginning wit...,0,"Beginning with the upper left-hand photo, it i..."
9,Task: Please determine whether the claim is Tr...,,Prediction: NEI.\nExplanation: As Senate Major...,2,"As Senate Majority Leader, McConnell received ..."


In [5]:
import pandas as pd
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score

# Gold and predicted labels are compared position by position.
# NOTE(review): assumes metadata and df_test list the same examples in the
# same order - confirm against how predict.json was produced.
gold = df_test['gold_label']
pred = metadata['pre']

accuracy = accuracy_score(gold, pred)
print("Accuracy:", accuracy)

# Some labels can have no predicted or no gold samples (e.g. the sentinel 99
# for unparseable answers). Score those classes as 0 explicitly instead of
# relying on the default, which does the same but emits a warning.
precision = precision_score(gold, pred, average='weighted', zero_division=0)
print('Precision:', precision)

recall = recall_score(gold, pred, average='weighted', zero_division=0)
print('Recall:', recall)

# Micro-averaged F1; in this single-label setup it matches the accuracy.
f1 = f1_score(gold, pred, average='micro')
print("F1:", f1)

Accuracy: 0.54
Precision: 0.5272527472527472
Recall: 0.54
F1: 0.54


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
import evaluate
# Score predicted explanations against the gold explanations with ROUGE.
rouge = evaluate.load('rouge')
predictions = metadata['exp'].tolist()
references = df_test['gold_exp'].tolist()

result = rouge.compute(references=references, predictions=predictions)
# Keep ROUGE-1, ROUGE-2 and ROUGE-L, in that order.
rouge_results = [result[metric] for metric in ('rouge1', 'rouge2', 'rougeL')]
rouge_results

  from .autonotebook import tqdm as notebook_tqdm


[0.11232930752357453, 0.03680620726751431, 0.06717691801007096]

In [7]:
# Score predicted explanations against the gold explanations with BERTScore.
bert_score = evaluate.load('bertscore')
predictions = metadata['exp'].tolist()
references = df_test['gold_exp'].tolist()

# Coerce every prediction to str before handing it to the metric.
f_n = [str(p) for p in predictions]

result = bert_score.compute(
    predictions=f_n,
    references=references,
    model_type="bert-base-uncased",
    batch_size=16,
)
# Report the mean of the per-example F1 values.
bertscore_overall = sum(result['f1']) / len(result['f1'])
bertscore_overall



0.2897277420759201

In [8]:
# Overall score: unweighted mean of the classification F1 and ROUGE-1.
final_score = 0.5 * (f1 + rouge_results[0])

final_score

0.32616465376178727