In [25]:
import pandas as pd
import os
import sys
# Put the project checkout on sys.path so the project-local imports that follow
# can be resolved (they are assumed to live under this directory).
# The location can be overridden through the PAPPA_ROOT environment variable;
# the previously hard-coded directory stays as the default, so existing setups
# keep working unchanged.
path = os.environ.get('PAPPA_ROOT', '/Users/llupo/dev/pappa')
# Guarded so that re-running the cell does not append the same entry twice.
if path not in sys.path:
    sys.path.append(path)

from lm_classifiers import LMClassifier
from task_manager import TaskManager


In [16]:
# Majority-vote baseline: for each annotation dimension, take the label most
# annotators agreed on and write it, one label per line, to a
# raw_predictions.txt file under ../tmp/pappa/<dim>/annotators_majority/.
DIMS = ('dim1', 'dim2', 'dim3')
ANNOTATORS = ['elin', 'lena', 'oscar']
IN_TEMPLATE = '../data/pappa/human_annotation/{}.csv'
OUT_TEMPLATE = '../tmp/pappa/{}/annotators_majority/raw_predictions.txt'


def read_annotations(csv_path):
    """Read a ';'-separated annotation file; missing cells become the string 'NA'."""
    return pd.read_csv(csv_path, sep=';').fillna('NA')


inpath1, inpath2, inpath3 = [IN_TEMPLATE.format(dim) for dim in DIMS]

df1, df2, df3 = [read_annotations(src) for src in (inpath1, inpath2, inpath3)]

# Row-wise mode over the three annotator columns gives the majority label.
# NOTE(review): on a row where annotators tie, `mode` yields several columns
# and only column 0 is kept; pandas is expected to return tied modes in
# sorted order, so the first label in sort order wins - confirm this is the
# intended tie-break.
for annotations in (df1, df2, df3):
    annotations['majority'] = annotations[ANNOTATORS].mode(axis=1)[0]

outpath1, outpath2, outpath3 = [OUT_TEMPLATE.format(dim) for dim in DIMS]
destinations = (outpath1, outpath2, outpath3)

# Make sure every target directory exists before anything is written.
for destination in destinations:
    os.makedirs(os.path.dirname(destination), exist_ok=True)

# One label per line: no index column, no header row.
for annotations, destination in zip((df1, df2, df3), destinations):
    annotations['majority'].to_csv(destination, index=False, header=False)

In [52]:
# Compute the performance of the annotators' majority label on dim1.
# main.py is run with --evaluation_only True: according to the log printed
# below, it works in ../tmp/pappa/dim1/annotators_majority (output_dir plus
# "<instruction>_<model_name>") and loads the raw predictions stored there
# instead of producing new ones. That is the directory where the
# majority-label cell writes raw_predictions.txt, so that cell has to be run
# first.
# NOTE(review): --log_to_file False presumably keeps the log in the cell
# output rather than in a file - confirm against main.py.
!python ../main.py \
    --data_file ../data/pappa/human_annotation/dim1.csv \
    --instruction annotators \
    --task_file ../tasks/pappa/dim1.json \
    --prompt_suffix "\\nLabel:" \
    --model_name majority \
    --max_len_model 512 \
    --output_dir ../tmp/pappa/dim1 \
    --evaluation_only True \
    --log_to_file False \
    --raw_predictions_good True

2023-09-19 20:39:28 [main] Working on ../tmp/pappa/dim1/annotators_majority
2023-09-19 20:39:28 [main] Evaluation only. Loading raw predictions.
2023-09-19 20:39:28 [main] Gold labels found. Evaluating predictions.
2023-09-19 20:39:28 [lm_classifiers] Evaluating predictions...
2023-09-19 20:39:28 [lm_classifiers] 
         prediction         gold_elin  ...        gold_oscar          gold_agg
0           PASSIVE           PASSIVE  ...           PASSIVE           PASSIVE
1                NA           PASSIVE  ...                NA                NA
2  ACTIVE_POS_OTHER  ACTIVE_POS_OTHER  ...           PASSIVE  ACTIVE_POS_OTHER
3  ACTIVE_POS_OTHER  ACTIVE_POS_OTHER  ...  ACTIVE_POS_OTHER  ACTIVE_POS_OTHER
4           PASSIVE           PASSIVE  ...           PASSIVE           PASSIVE

[5 rows x 5 columns]

2023-09-19 20:39:28 [lm_classifiers] KAPPA:
         elin    lena   oscar     agg   model  mean_non_agg
elin   100.00   48.68   49.93   67.28   69.49         49.30
lena    48.68  100.00  

In [53]:
# Compute the performance of the annotators' majority label on dim2.
# main.py is run with --evaluation_only True: according to the log printed
# below, it works in ../tmp/pappa/dim2/annotators_majority (output_dir plus
# "<instruction>_<model_name>") and loads the raw predictions stored there
# instead of producing new ones. That is the directory where the
# majority-label cell writes raw_predictions.txt, so that cell has to be run
# first.
# NOTE(review): --log_to_file False presumably keeps the log in the cell
# output rather than in a file - confirm against main.py.
!python ../main.py \
    --data_file ../data/pappa/human_annotation/dim2.csv \
    --instruction annotators \
    --task_file ../tasks/pappa/dim2.json \
    --prompt_suffix "\\nLabel:" \
    --model_name majority \
    --max_len_model 512 \
    --output_dir ../tmp/pappa/dim2 \
    --evaluation_only True \
    --log_to_file False \
    --raw_predictions_good True


2023-09-20 10:30:22 [main] Working on ../tmp/pappa/dim2/annotators_majority
2023-09-20 10:30:22 [main] Evaluation only. Loading raw predictions.
2023-09-20 10:30:22 [main] Gold labels found. Evaluating predictions.
2023-09-20 10:30:22 [lm_classifiers] Evaluating predictions...
2023-09-20 10:30:22 [lm_classifiers] 
  prediction gold_elin gold_lena gold_oscar  gold_agg
0   EXPLICIT  EXPLICIT  EXPLICIT   EXPLICIT  EXPLICIT
1         NA  EXPLICIT        NA         NA        NA
2   EXPLICIT  IMPLICIT  EXPLICIT   EXPLICIT  IMPLICIT
3   EXPLICIT  EXPLICIT  EXPLICIT   EXPLICIT  EXPLICIT
4   EXPLICIT  EXPLICIT  EXPLICIT   EXPLICIT  EXPLICIT

2023-09-20 10:30:23 [lm_classifiers] KAPPA:
         elin    lena   oscar     agg   model  mean_non_agg
elin   100.00   23.55   29.94   94.76   54.07         26.74
lena    23.55  100.00   29.56   31.08   59.72         26.55
oscar   29.94   29.56  100.00   36.24   65.26         29.75
agg     94.76   31.08   36.24  100.00   60.44         54.03
model   54.07  

In [54]:
# Compute the performance of the annotators' majority label on dim3.
# main.py is run with --evaluation_only True: according to the log printed
# below, it works in ../tmp/pappa/dim3/annotators_majority (output_dir plus
# "<instruction>_<model_name>") and loads the raw predictions stored there
# instead of producing new ones. That is the directory where the
# majority-label cell writes raw_predictions.txt, so that cell has to be run
# first.
# NOTE(review): --log_to_file False presumably keeps the log in the cell
# output rather than in a file - confirm against main.py.
!python ../main.py \
    --data_file ../data/pappa/human_annotation/dim3.csv \
    --instruction annotators \
    --task_file ../tasks/pappa/dim3.json \
    --prompt_suffix "\\nLabel:" \
    --model_name majority \
    --max_len_model 512 \
    --output_dir ../tmp/pappa/dim3 \
    --evaluation_only True \
    --log_to_file False \
    --raw_predictions_good True


2023-09-20 10:32:20 [main] Working on ../tmp/pappa/dim3/annotators_majority
2023-09-20 10:32:20 [main] Evaluation only. Loading raw predictions.
2023-09-20 10:32:20 [main] Gold labels found. Evaluating predictions.
2023-09-20 10:32:20 [lm_classifiers] Evaluating predictions...
2023-09-20 10:32:20 [lm_classifiers] 
    prediction    gold_elin    gold_lena   gold_oscar gold_agg
0        IDEAL        IDEAL        IDEAL  DESCRIPTIVE    IDEAL
1           NA  DESCRIPTIVE           NA           NA       NA
2  DESCRIPTIVE        IDEAL  DESCRIPTIVE  DESCRIPTIVE    IDEAL
3        IDEAL        IDEAL        IDEAL        IDEAL    IDEAL
4  DESCRIPTIVE        IDEAL  DESCRIPTIVE  DESCRIPTIVE    IDEAL

2023-09-20 10:32:20 [lm_classifiers] KAPPA:
         elin    lena   oscar     agg   model  mean_non_agg
elin   100.00   34.47   48.82   97.26   66.04         41.64
lena    34.47  100.00   39.52   38.31   60.64         37.00
oscar   48.82   39.52  100.00   52.40   79.48         44.17
agg     97.26   38.31