## LTP Analysis

In [1]:
import pandas as pd
import re
import numpy
import random
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
def level_1_mapping(column):
    level_1 = {
        'hasty generalization': 'fallacy of logic',
        'slippery slope': 'fallacy of logic',
        'causal oversimplification': 'fallacy of logic',
        'appeal to ridicule': 'appeal to emotion',
        'appeal to nature': 'fallacy of credibility',
        'false causality': 'fallacy of logic',
        'ad populum': 'fallacy of credibility',
        'ad hominem': 'fallacy of credibility',
        'false analogy': 'fallacy of logic',
        'false dilemma': 'fallacy of credibility',
        'appeal to fear': 'appeal to emotion',
        'appeal to authority': 'fallacy of credibility',
        'appeal to worse problem': 'appeal to emotion',
        'circular reasoning': 'fallacy of logic',
        'guilt by association': 'fallacy of credibility',
        'appeal to anger': 'appeal to emotion',
        'straw man': 'fallacy of logic',
        'appeal to tradition': 'fallacy of credibility',
        'equivocation': 'fallacy of logic',
        'fallacy of division': 'fallacy of logic',
        'tu quoque': 'fallacy of credibility',
        'appeal to positive emotion': 'appeal to emotion',
        'appeal to pity': 'appeal to emotion',
        'appeal to emotion (level 1)': 'appeal to emotion'

    }
    return column.map(level_1).fillna(column)

In [3]:
def level_0_mapping(column):
    level_0 = {
        'hasty generalization': 'fallacy detected',
        'slippery slope': 'fallacy detected',
        'causal oversimplification': 'fallacy detected',
        'appeal to ridicule': 'fallacy detected',
        'appeal to nature': 'fallacy detected',
        'false causality': 'fallacy detected',
        'ad populum': 'fallacy detected',
        'ad hominem': 'fallacy detected',
        'false analogy': 'fallacy detected',
        'false dilemma': 'fallacy detected',
        'appeal to fear': 'fallacy detected',
        'appeal to authority': 'fallacy detected',
        'appeal to worse problem': 'fallacy detected',
        'circular reasoning': 'fallacy detected',
        'guilt by association': 'fallacy detected',
        'appeal to anger': 'fallacy detected',
        'strawman': 'fallacy detected',
        'appeal to tradition': 'fallacy detected',
        'equivocation': 'fallacy detected',
        'fallacy of division': 'fallacy detected',
        'tu quoque': 'fallacy detected',
        'appeal to positive emotion': 'fallacy detected',
        'appeal to pity': 'fallacy detected',
        'appeal to emotion (level 1)': 'fallacy detected'
    }

    return column.map(level_0).fillna('no fallacy detected')

In [5]:
file_path = 'canonical/results_canonical_spans/results-cot-mafalda-spans-gemma_canonical.csv'  # Replace with the actual file path
results = pd.read_csv(file_path)

# Map actual_label column using the level_1 dictionary
results['expected_label_level1'] =  level_1_mapping(results['expected_label'])
results['expected_label_level0'] = level_0_mapping(results['expected_label'])
results['canonical_level1'] = level_1_mapping(results['canonical'])
results['canonical_level0'] = level_0_mapping(results['canonical'])
results.head()

Unnamed: 0.1,Unnamed: 0,text,expected_label,actual_label,result,canonical,expected_label_level1,expected_label_level0,canonical_level1,canonical_level0
0,0,I keep seeing if an adventure mode needs to ex...,slippery slope,answer: appeal to emotional manipulation.\n\nt...,False,no match,fallacy of logic,fallacy detected,no match,no fallacy detected
1,1,That leads to me believe that most cat lovers ...,hasty generalization,answer: appeal to stereotype.\n\nthe premise s...,False,no match,fallacy of logic,fallacy detected,no match,no fallacy detected
2,2,"Someone once told me they have an""alt"" cause t...",false analogy,answer: straw man fallacy.\n\nthe person has m...,False,straw man,fallacy of logic,fallacy detected,fallacy of logic,no fallacy detected
3,3,Joe Biden will lose to Trump if he is the nomi...,guilt by association,answer: appeal to pity.\n\nthe author suggests...,False,appeal to pity,fallacy of credibility,fallacy detected,appeal to emotion,fallacy detected
4,4,Joe Biden will lose to Trump if he is the nomi...,causal oversimplification,answer: appeal to anger.\n\nthe text relies on...,False,appeal to anger,fallacy of logic,fallacy detected,appeal to emotion,fallacy detected


In [6]:
def calc(expected, canonical, method='weighted'):
    expected_labels = expected
    processed_labels  = canonical
    
    accuracy = accuracy_score(expected_labels, processed_labels)
    precision = precision_score(expected_labels, processed_labels, average=method)  # adjust average method as necessary
    recall = recall_score(expected_labels, processed_labels, average=method)  # adjust average method as necessary
    f1 = f1_score(expected_labels, processed_labels, average=method)  # adjust average method as necessary

    return accuracy, precision, recall, f1

In [17]:
level_0_res = pd.DataFrame()
level_1_res = pd.DataFrame()

method = 'weighted'        

for prompting_technique in ['cot', 'cot-sc', 'rar-1', 'rar-2', 'tot']:
    for model in ['gemma', 'mistral', 'openchat']:
        file_path = 'canonical/results_canonical_spans/results-{}-mafalda-spans-{}_canonical.csv'.format(prompting_technique, model)
        results = pd.read_csv(file_path)
        
        results['expected_label_level1'] =  level_1_mapping(results['expected_label'])
        results['expected_label_level0'] = level_0_mapping(results['expected_label'])
        results['canonical_level1'] = level_1_mapping(results['canonical'])
        results['canonical_level0'] = level_0_mapping(results['canonical'])

        expected_labels_level_0 = results['expected_label_level0']
        processed_labels_level_0  = results['canonical_level0']
        
        accuracy_level_0, precision_level_0, recall_level_0, f1_level_0 = calc(expected_labels_level_0, processed_labels_level_0, method)

        level_0_res = level_0_res._append({
            'model': model,
            'prompting_technique': prompting_technique,
            'accuracy': accuracy_level_0,
            'precision': precision_level_0,
            'recall': recall_level_0,
            'f1': f1_level_0
        }, ignore_index=True)

        expected_labels_level_1 = results['expected_label_level1']
        processed_labels_level_1 = results['canonical_level1']
        
        accuracy_level_1, precision_level_1, recall_level_1, f1_level_1 = calc(expected_labels_level_1, processed_labels_level_1, method)

        level_1_res = level_1_res._append({
            'model': model,
            'prompting_technique': prompting_technique,
            'accuracy': accuracy_level_1,
            'precision': precision_level_1,
            'recall': recall_level_1,
            'f1': f1_level_1
        }, ignore_index=True)

level_0_res.to_csv("results/level-0-spans.csv", sep=",")
level_1_res.to_csv("results/level-1-spans.csv", sep=",")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [18]:
level_0_res

Unnamed: 0,model,prompting_technique,accuracy,precision,recall,f1
0,gemma,cot,0.535714,0.655242,0.535714,0.560276
1,mistral,cot,0.638211,0.703953,0.638211,0.657696
2,openchat,cot,0.55157,0.722928,0.55157,0.568921
3,gemma,cot-sc,0.53125,0.643748,0.53125,0.556512
4,mistral,cot-sc,0.669604,0.716168,0.669604,0.683938
5,openchat,cot-sc,0.506726,0.710263,0.506726,0.518113
6,gemma,rar-1,0.673554,0.675142,0.673554,0.674337
7,mistral,rar-1,0.721739,0.660641,0.721739,0.656442
8,openchat,rar-1,0.692641,0.691119,0.692641,0.691867
9,gemma,rar-2,0.599206,0.628112,0.599206,0.6119


In [19]:
level_0_model = level_0_res.groupby('model')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_0_model

Unnamed: 0_level_0,accuracy,precision,recall,f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gemma,0.572409,0.653712,0.572409,0.589222
mistral,0.676703,0.686401,0.676703,0.660864
openchat,0.590454,0.712443,0.590454,0.596795


In [20]:
level_0_model.to_csv("results/level-0-model-spans.csv", sep=",")

In [21]:
level_1_model = level_1_res.groupby('model')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_1_model

Unnamed: 0_level_0,accuracy,precision,recall,f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gemma,0.315348,0.500211,0.315348,0.34025
mistral,0.418508,0.419074,0.418508,0.393865
openchat,0.336771,0.503197,0.336771,0.344881


In [22]:
level_1_model.to_csv("results/level-1-model-spans.csv", sep=",")

In [23]:
level_0_prompt = level_0_res.groupby('prompting_technique')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_0_prompt

Unnamed: 0_level_0,accuracy,precision,recall,f1
prompting_technique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cot,0.575165,0.694041,0.575165,0.595631
cot-sc,0.569193,0.69006,0.569193,0.586187
rar-1,0.695978,0.675634,0.695978,0.674215
rar-2,0.675848,0.669853,0.675848,0.659478
tot,0.549758,0.691338,0.549758,0.562623


In [24]:
level_0_prompt.to_csv("results/level-0-prompt-spans.csv", sep=",")

In [25]:
level_1_prompt = level_1_res.groupby('prompting_technique')[['accuracy', 'precision', 'recall', 'f1']].mean()
level_1_prompt

Unnamed: 0_level_0,accuracy,precision,recall,f1
prompting_technique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cot,0.291485,0.506443,0.291485,0.312077
cot-sc,0.306607,0.550103,0.306607,0.334341
rar-1,0.455601,0.433166,0.455601,0.421074
rar-2,0.428986,0.408422,0.428986,0.402823
tot,0.3017,0.472669,0.3017,0.328012


In [26]:
level_1_prompt.to_csv("results/level-1-prompt-spans.csv", sep=",")