## LTP Analysis

In [1]:
import pandas as pd
import re
import numpy
import random
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
def level_1_mapping(column):
    level_1 = {
        'hasty generalization': 'fallacy of logic',
        'slippery slope': 'fallacy of logic',
        'causal oversimplification': 'fallacy of logic',
        'appeal to ridicule': 'appeal to emotion',
        'appeal to nature': 'fallacy of credibility',
        'false causality': 'fallacy of logic',
        'ad populum': 'fallacy of credibility',
        'ad hominem': 'fallacy of credibility',
        'false analogy': 'fallacy of logic',
        'false dilemma': 'fallacy of credibility',
        'appeal to fear': 'appeal to emotion',
        'appeal to authority': 'fallacy of credibility',
        'appeal to worse problem': 'appeal to emotion',
        'circular reasoning': 'fallacy of logic',
        'guilt by association': 'fallacy of credibility',
        'appeal to anger': 'appeal to emotion',
        'straw man': 'fallacy of logic',
        'appeal to tradition': 'fallacy of credibility',
        'equivocation': 'fallacy of logic',
        'fallacy of division': 'fallacy of logic',
        'tu quoque': 'fallacy of credibility',
        'appeal to positive emotion': 'appeal to emotion',
        'appeal to pity': 'appeal to emotion',
        'appeal to emotion (level 1)': 'appeal to emotion'

    }
    return column.map(level_1).fillna(column)

In [3]:
def level_0_mapping(column):
    level_0 = {
        'hasty generalization': 'fallacy detected',
        'slippery slope': 'fallacy detected',
        'causal oversimplification': 'fallacy detected',
        'appeal to ridicule': 'fallacy detected',
        'appeal to nature': 'fallacy detected',
        'false causality': 'fallacy detected',
        'ad populum': 'fallacy detected',
        'ad hominem': 'fallacy detected',
        'false analogy': 'fallacy detected',
        'false dilemma': 'fallacy detected',
        'appeal to fear': 'fallacy detected',
        'appeal to authority': 'fallacy detected',
        'appeal to worse problem': 'fallacy detected',
        'circular reasoning': 'fallacy detected',
        'guilt by association': 'fallacy detected',
        'appeal to anger': 'fallacy detected',
        'strawman': 'fallacy detected',
        'appeal to tradition': 'fallacy detected',
        'equivocation': 'fallacy detected',
        'fallacy of division': 'fallacy detected',
        'tu quoque': 'fallacy detected',
        'appeal to positive emotion': 'fallacy detected',
        'appeal to pity': 'fallacy detected',
        'appeal to emotion (level 1)': 'fallacy detected'
    }

    return column.map(level_0).fillna('no fallacy detected')

In [5]:
file_path = 'canonical/results_canonical_spans/results-cot-mafalda-spans-gemma_canonical_new.csv'  # Replace with the actual file path
results = pd.read_csv(file_path)

# Map actual_label column using the level_1 dictionary
results['expected_label_level1'] =  level_1_mapping(results['expected_label'])
results['expected_label_level0'] = level_0_mapping(results['expected_label'])
results['canonical_level1'] = level_1_mapping(results['canonical'])
results['canonical_level0'] = level_0_mapping(results['canonical'])
results.head()

Unnamed: 0.1,Unnamed: 0,text,expected_label,actual_label,result,canonical,expected_label_level1,expected_label_level0,canonical_level1,canonical_level0
0,0,I keep seeing if an adventure mode needs to ex...,slippery slope,answer: appeal to emotional manipulation.\n\nt...,False,no match,fallacy of logic,fallacy detected,no match,no fallacy detected
1,1,That leads to me believe that most cat lovers ...,hasty generalization,answer: appeal to stereotype.\n\nthe premise s...,False,no match,fallacy of logic,fallacy detected,no match,no fallacy detected
2,2,"Someone once told me they have an""alt"" cause t...",false analogy,answer: straw man fallacy.\n\nthe person has m...,False,straw man,fallacy of logic,fallacy detected,fallacy of logic,no fallacy detected
3,3,Joe Biden will lose to Trump if he is the nomi...,guilt by association,answer: appeal to pity.\n\nthe author suggests...,False,appeal to pity,fallacy of credibility,fallacy detected,appeal to emotion,fallacy detected
4,4,Joe Biden will lose to Trump if he is the nomi...,causal oversimplification,answer: appeal to anger.\n\nthe text relies on...,False,appeal to anger,fallacy of logic,fallacy detected,appeal to emotion,fallacy detected


In [6]:
def calc(expected, canonical, method='weighted'):
    expected_labels = expected
    processed_labels  = canonical
    
    accuracy = accuracy_score(expected_labels, processed_labels)
    precision = precision_score(expected_labels, processed_labels, average=method)  # adjust average method as necessary
    recall = recall_score(expected_labels, processed_labels, average=method)  # adjust average method as necessary
    f1 = f1_score(expected_labels, processed_labels, average=method)  # adjust average method as necessary

    return accuracy, precision, recall, f1

In [17]:
level_0_res = pd.DataFrame()
level_1_res = pd.DataFrame()
level_2_res = pd.DataFrame()

method = 'weighted'        

for prompting_technique in ['cot', 'cot-sc', 'rar-1', 'rar-2', 'tot']:
    for model in ['gemma', 'mistral', 'openchat']:
        file_path = 'canonical/results_canonical_spans/results-{}-mafalda-spans-{}_canonical_new.csv'.format(prompting_technique, model)
        results = pd.read_csv(file_path)
        
        results['expected_label_level1'] =  level_1_mapping(results['expected_label'])
        results['expected_label_level0'] = level_0_mapping(results['expected_label'])
        results['canonical_level1'] = level_1_mapping(results['canonical'])
        results['canonical_level0'] = level_0_mapping(results['canonical'])

        expected_labels_level_0 = results['expected_label_level0']
        processed_labels_level_0  = results['canonical_level0']
        
        accuracy_level_0, precision_level_0, recall_level_0, f1_level_0 = calc(expected_labels_level_0, processed_labels_level_0, method)

        level_0_res = level_0_res._append({
            'model': model,
            'prompting_technique': prompting_technique,
            'accuracy': accuracy_level_0,
            'precision': precision_level_0,
            'recall': recall_level_0,
            'f1': f1_level_0
        }, ignore_index=True)

        expected_labels_level_1 = results['expected_label_level1']
        processed_labels_level_1 = results['canonical_level1']
        
        accuracy_level_1, precision_level_1, recall_level_1, f1_level_1 = calc(expected_labels_level_1, processed_labels_level_1, method)

        level_1_res = level_1_res._append({
            'model': model,
            'prompting_technique': prompting_technique,
            'accuracy': accuracy_level_1,
            'precision': precision_level_1,
            'recall': recall_level_1,
            'f1': f1_level_1
        }, ignore_index=True)

        expected_labels_level_1 = results['expected_label_level1']
        processed_labels_level_1 = results['canonical_level1']
        
        accuracy_level_1, precision_level_1, recall_level_1, f1_level_1 = calc(expected_labels_level_1, processed_labels_level_1, method)

        level_1_res = level_1_res._append({
            'model': model,
            'prompting_technique': prompting_technique,
            'accuracy': accuracy_level_1,
            'precision': precision_level_1,
            'recall': recall_level_1,
            'f1': f1_level_1
        }, ignore_index=True)

        expected_labels_level_2 = results['expected_label']
        processed_labels_level_2 = results['canonical']
        
        accuracy_level_2, precision_level_2, recall_level_2, f1_level_2 = calc(expected_labels_level_2, processed_labels_level_2, method)

        level_2_res = level_2_res._append({
            'model': model,
            'prompting_technique': prompting_technique,
            'accuracy': accuracy_level_2,
            'precision': precision_level_2,
            'recall': recall_level_2,
            'f1': f1_level_2
        }, ignore_index=True)


level_0_res.to_csv("results/level-0-spans.csv", sep=",")
level_1_res.to_csv("results/level-1-spans.csv", sep=",")
level_2_res.to_csv("results/level-2-spans.csv", sep=",")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [8]:
level_0_res

Unnamed: 0,model,prompting_technique,accuracy,precision,recall,f1
0,gemma,cot,0.53125,0.64831,0.53125,0.55622
1,mistral,cot,0.639676,0.705338,0.639676,0.659207
2,openchat,cot,0.55157,0.722928,0.55157,0.568921
3,gemma,cot-sc,0.53125,0.643748,0.53125,0.556512
4,mistral,cot-sc,0.674009,0.718436,0.674009,0.687857
5,openchat,cot-sc,0.511211,0.712323,0.511211,0.52323
6,gemma,rar-1,0.673554,0.671964,0.673554,0.672748
7,mistral,rar-1,0.726087,0.669376,0.726087,0.659297
8,openchat,rar-1,0.705628,0.697025,0.705628,0.700851
9,gemma,rar-2,0.604743,0.627901,0.604743,0.615203


In [9]:
level_0_model = level_0_res.groupby('model')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_0_model

Unnamed: 0_level_0,accuracy,precision,recall,f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gemma,0.572624,0.651648,0.572624,0.588753
mistral,0.680155,0.689971,0.680155,0.663134
openchat,0.595628,0.712465,0.595628,0.600456


In [10]:
level_0_model.to_csv("results/level-0-model-spans.csv", sep=",")

In [11]:
level_1_model = level_1_res.groupby('model')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_1_model

Unnamed: 0_level_0,accuracy,precision,recall,f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gemma,0.319062,0.49906,0.319062,0.342755
mistral,0.420114,0.417903,0.420114,0.394202
openchat,0.343301,0.503793,0.343301,0.348713


In [12]:
level_1_model.to_csv("results/level-1-model-spans.csv", sep=",")

In [13]:
level_0_prompt = level_0_res.groupby('prompting_technique')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_0_prompt

Unnamed: 0_level_0,accuracy,precision,recall,f1
prompting_technique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cot,0.574165,0.692192,0.574165,0.594783
cot-sc,0.572157,0.691502,0.572157,0.589199
rar-1,0.701756,0.679455,0.701756,0.677632
rar-2,0.677924,0.667895,0.677924,0.658009
tot,0.554676,0.692427,0.554676,0.567616


In [14]:
level_0_prompt.to_csv("results/level-0-prompt-spans.csv", sep=",")

In [15]:
level_1_prompt = level_1_res.groupby('prompting_technique')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_1_prompt

Unnamed: 0_level_0,accuracy,precision,recall,f1
prompting_technique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cot,0.291029,0.500985,0.291029,0.311133
cot-sc,0.30957,0.551568,0.30957,0.336845
rar-1,0.464134,0.435904,0.464134,0.42625
rar-2,0.436751,0.409419,0.436751,0.406435
tot,0.302645,0.470051,0.302645,0.328788


In [16]:
level_1_prompt.to_csv("results/level-1-prompt-spans.csv", sep=",")