In [3]:
import collections
import pandas as pd
import numpy as np
import sys
# Register the project root on the import path (once) so that the
# project-local modules imported below can be resolved.
path = '/Users/llupo/dev/pappa'
if path not in sys.path:
    sys.path.append(path)

from classifiers import LMClassifier
from task_manager import TaskManager

In [29]:
import os

# Results directory of the run to post-process; swap the two assignments to
# switch to the text-davinci-003 run.
#output_dir = '/Users/llupo/dev/pappa/results/pappa/alldim/alldata_long_fewshot_text-davinci-003'
output_dir = '/Users/llupo/dev/pappa/results/pappa/alldim/alldata_long_fewshot_gpt-4'

# Every line of the file becomes one entry of `predictions`.
predictions_path = os.path.join(output_dir, 'raw_predictions.txt')
with open(predictions_path, 'r') as f:
    predictions = f.read().splitlines()

# Quick look: how many predictions were read and what the first few look like.
print(len(predictions))
print(predictions[:5])

1910
['passive, explicit, descriptive', 'passive, explicit, descriptive', 'active_positive_other, explicit, ideal', 'active_positive_other, explicit, ideal', 'active_positive_other, explicit, ideal']


In [30]:
from logging import getLogger
logger = getLogger(__name__)

# NOTE: this in-notebook definition shadows the `LMClassifier` name imported
# from `classifiers` in the first cell.
class LMClassifier:
    """Map raw language-model completions onto a fixed label vocabulary.

    Args:
        labels_dict: when ``label_dims == 1``, a flat mapping
            ``{label text searched in the prediction: label code}``;
            otherwise a mapping ``{dimension name: flat mapping}`` with one
            entry per classification dimension, in positional order.
        label_dims: number of classification tasks solved per prediction.
            Must be greater than 0.
        default_label: key looked up in the label mapping(s) when no label
            can be found in a prediction.
    """

    def __init__(
            self,
            labels_dict,
            label_dims,
            default_label,
            ):

        self.labels_dict = labels_dict
        # check the dimensionality of the labels:
        # dimensionality greater than 1 means dealing with
        # multiple classification tasks at a time
        self.label_dims = label_dims
        assert self.label_dims > 0, "Labels dimensions must be greater than 0."
        self.default_label = default_label

    def generate_predictions(self):
        """Produce raw predictions; not implemented in this class."""
        raise NotImplementedError

    @staticmethod
    def _matching_labels(text, label_map):
        """Return the codes of all labels of `label_map` found in `text`, in dict order."""
        return [code for label, code in label_map.items() if label in text]

    def _labels_for_prediction(self, prediction):
        """Return one label code per dimension for a single lowercase prediction."""
        # When the prediction has exactly one comma-separated segment per
        # dimension, each dimension is first matched against its own segment.
        # Matching every dimension against the whole string lets a label that
        # is shared between dimensions (e.g. "not_applicable") and that was
        # predicted for ONE dimension override the labels of all the others.
        segments = [segment.strip() for segment in prediction.split(",")]
        is_positional = len(segments) == len(self.labels_dict)

        labels = []
        for position, label_map in enumerate(self.labels_dict.values()):
            matches = []
            if is_positional:
                matches = self._matching_labels(segments[position], label_map)
            if not matches:
                # free-form prediction, or nothing usable in the segment:
                # search the whole prediction
                matches = self._matching_labels(prediction, label_map)
            labels.append(matches[0] if matches else label_map.get(self.default_label))
        return labels

    def retrieve_predicted_labels(self, predictions, prompts, only_dim=None):
        """Extract the predicted label code(s) from each raw prediction.

        Matching is by substring on the lowercased prediction; when several
        labels match, the first one in dict order is kept, and when none
        matches the code of `default_label` is used.

        Args:
            predictions: iterable of raw prediction strings.
            prompts: value(s) stored in the ``prompt`` column of the result.
            only_dim: for multi-dimensional labels, positional index of the
                single dimension to keep; ``None`` keeps all dimensions.

        Returns:
            A DataFrame with columns ``prompt`` and ``prediction`` (single
            dimension, or `only_dim` given), or ``prediction_dim1..N`` plus
            ``prompt`` (all dimensions).
        """

        # convert the predictions to lowercase
        predictions = list(map(str.lower, predictions))

        # retrieve the labels that are contained in the predictions
        predicted_labels = []
        if self.label_dims == 1:
            # retrieve a single label for each prediction since a single classification task is performed at a time
            logger.info("Retrieving predictions...")
            for prediction in predictions:
                matches = self._matching_labels(prediction, self.labels_dict)
                if matches:
                    predicted_labels.append(matches[0])
                else:
                    predicted_labels.append(self.labels_dict.get(self.default_label))
            # Count the number of predictions of each type and print the result
            logger.info(collections.Counter(predicted_labels))
        else:
            # retrieve multiple labels for each prediction since multiple classification tasks are performed at a time
            logger.info(f"Retrieving predictions for {self.label_dims} dimensions...")
            for prediction in predictions:
                predicted_labels.append(self._labels_for_prediction(prediction))
            # Count the number of predictions of each type and print the result
            # (str() keeps the log line from raising on None / non-string codes)
            logger.info(collections.Counter([",".join(map(str, labels)) for labels in predicted_labels]))

        # Add the data to a DataFrame
        if self.label_dims == 1:
            df = pd.DataFrame({'prompt': prompts, 'prediction': predicted_labels})
        else:
            if only_dim is not None:
                # retrieve only the predictions for a specific dimension
                logger.info(f"Retrieved predictions for dimension {only_dim}")
                df = pd.DataFrame({'prompt': prompts, 'prediction': pd.DataFrame(predicted_labels).to_numpy()[:,only_dim]})
            else:
                logger.info("Retrieved predictions for all dimensions")
                df = pd.DataFrame(predicted_labels).fillna(self.default_label)
                # rename columns to prediction_n
                df.columns = [f"prediction_dim{i}" for i in range(1, len(df.columns)+1)]
                # add prompts to df
                df['prompt'] = prompts

        return df

# Task configuration. For every dimension, "labels" maps the lowercase label
# text searched in the raw predictions to the code stored in the results.
# Key order matters: when several labels match, the first one listed wins.
labels_dict = {
    "label_dims": 3,
    "labels": {
        "dim1": {
            "not_applicable": "NA",
            "passive": "PASSIVE",
            "active_negative": "ACTIVE_NEG",
            "active_positive_challenging": "ACTIVE_POS_CHALLENGING",
            "active_positive_caring": "ACTIVE_POS_CARING",
            "active_positive_other": "ACTIVE_POS_OTHER",
        },
        "dim2": {
            "not_applicable": "NA",
            "explicit": "EXPLICIT",
            "implicit": "IMPLICIT",
        },
        "dim3": {
            "not_applicable": "NA",
            "descriptive": "DESCRIPTIVE",
            "ideal": "IDEAL",
        },
    },
    "read_function": "pappa",
}

# NOTE(review): these two names are not used by the visible cells; the
# classifier below receives "not_applicable" as its default label instead.
default_label = 'NA'
labels_allowed = 3

# Classifier over all three dimensions; "not_applicable" is the fallback key.
classifier = LMClassifier(
    default_label="not_applicable",
    label_dims=labels_dict['label_dims'],
    labels_dict=labels_dict['labels'],
)

# No prompts are attached here, so the `prompt` column stays empty.
df_predicted_labels = classifier.retrieve_predicted_labels(
    predictions=predictions, prompts=None, only_dim=None
)

df_predicted_labels

Unnamed: 0,prediction_dim1,prediction_dim2,prediction_dim3,prompt
0,PASSIVE,EXPLICIT,DESCRIPTIVE,
1,PASSIVE,EXPLICIT,DESCRIPTIVE,
2,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,
3,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,
4,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,
...,...,...,...,...
1905,ACTIVE_POS_CARING,IMPLICIT,DESCRIPTIVE,
1906,ACTIVE_POS_OTHER,IMPLICIT,DESCRIPTIVE,
1907,ACTIVE_NEG,EXPLICIT,DESCRIPTIVE,
1908,ACTIVE_NEG,EXPLICIT,DESCRIPTIVE,


In [42]:
# Load the annotated sample that the predictions are compared against.
path_data = '/Users/llupo/dev/pappa/data/pappa/F_sample_clean_after1993.xlsx'
pd_data = pd.read_excel(path_data, engine='openpyxl')
# (rows, columns) of the loaded sheet
pd_data.shape


(1910, 19)

In [43]:
# Put predictions and annotated data side by side: outer join on the row
# index of both frames, so rows present in only one frame are kept.
df_join = pd.merge(
    df_predicted_labels,
    pd_data,
    how='outer',
    left_index=True,
    right_index=True,
)
print(df_join.shape)
df_join.tail(20)

(1910, 23)


Unnamed: 0.1,prediction_dim1,prediction_dim2,prediction_dim3,prompt,Unnamed: 0,text_clean,filename,year,category,text_lemmas,...,label_str,dim1,dim2,dim3,election_period,contains_FATHER,contains_MOTHER,is_positive,dim1_reduced,dim1_binary
1890,ACTIVE_POS_OTHER,EXPLICIT,DESCRIPTIVE,,360779,bland annat har riksdagen beslutat om att förl...,gp05sfu1y,2001,Yttrande,riksdag förlänga föräldrapenning 00 ersättning...,...,"FATHER, MOTHER, PARENTS",PASSIVE,EXPLICIT,DESCRIPTIVE,1998,True,True,0.0,PASSIVE,PASSIV
1891,PASSIVE,EXPLICIT,DESCRIPTIVE,,360783,underhållsstödet bör enligt vår mening ersätta...,gp05sfu1y,2001,Yttrande,underhållsstöde böra mening ersätta ensamståen...,...,"FATHER, PARENTS",ACTIVE_POS_OTHER,EXPLICIT,IDEAL,1998,True,False,1.0,ACTIVE_POS,ACTIVE
1892,ACTIVE_POS_OTHER,EXPLICIT,DESCRIPTIVE,,361787,inom ramen för den förlängda föräldraförsäkrin...,gn05sfu7y,1999,Yttrande,ram förlänga föräldraförsäkring sammanlagd 00 ...,...,"FATHER, MOTHER, PARENTS",ACTIVE_POS_OTHER,EXPLICIT,IDEAL,1998,True,True,1.0,ACTIVE_POS,ACTIVE
1893,PASSIVE,EXPLICIT,IDEAL,,361789,vidare föreslås att ensamstående mödrar bör ku...,gn05sfu7y,1999,Yttrande,föreslå ensamstående böra överlåta ` ` pappada...,...,FATHER,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,1998,True,False,1.0,ACTIVE_POS,ACTIVE
1894,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,,361791,vad gäller förslag om att ensamstående mödrar ...,gn05sfu7y,1999,Yttrande,förslag ensamstående böra överlåta ` ` pappada...,...,FATHER,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,1998,True,False,1.0,ACTIVE_POS,ACTIVE
1895,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,,361794,vad gäller förslag om att ensamstående mödrar ...,gn05sfu7y,1999,Yttrande,förslag ensamstående böra överlåta ` ` pappada...,...,FATHER,ACTIVE_POS_OTHER,EXPLICIT,IDEAL,1998,True,False,1.0,ACTIVE_POS,ACTIVE
1896,ACTIVE_POS_OTHER,EXPLICIT,DESCRIPTIVE,,362347,resultaten pekade på att när det gäller transp...,h70wrfr10,2019,Övrigt,resultat peka transportval vara förebild döttr...,...,FATHER,ACTIVE_POS_OTHER,IMPLICIT,DESCRIPTIVE,2018,True,False,1.0,ACTIVE_POS,ACTIVE
1897,PASSIVE,IMPLICIT,DESCRIPTIVE,,362443,hans pappa skulle gå och hälsa på sin bästa ko...,h80wrfr9,2020,Övrigt,hälsa bra kompis pappa försvann tid,...,FATHER,,,DESCRIPTIVE,2018,True,False,0.0,,
1898,ACTIVE_POS_CARING,IMPLICIT,DESCRIPTIVE,,362773,om barnet dör verkar dock fäder på lång sikt k...,h30wrfr13,2015,Övrigt,barn dö verka far lång sikt drabba psykisk ohä...,...,FATHER,PASSIVE,EXPLICIT,DESCRIPTIVE,2014,True,False,0.0,PASSIVE,PASSIV
1899,ACTIVE_POS_CARING,EXPLICIT,DESCRIPTIVE,,362774,en ny svensk studie visar att en större andel ...,h30wrfr13,2015,Övrigt,ny svensk studie visa andel far lida posttraum...,...,FATHER,PASSIVE,EXPLICIT,DESCRIPTIVE,2014,True,False,0.0,PASSIVE,PASSIV


In [45]:
# Write the joined frame to a new workbook; the row index is not exported.
outpath = '/Users/llupo/dev/pappa/data/pappa/F_sample_clean_after1993_gpt4predictions.xlsx'
df_join.to_excel(outpath, index=False)

In [None]:
# NOTE(review): `prompts` and `only_dim` are not assigned in any cell visible
# here, and this cell has no execution count — confirm they are defined
# before running it. It also overwrites `df_predicted_labels`.
df_predicted_labels = classifier.retrieve_predicted_labels(
    predictions=predictions, prompts=prompts, only_dim=only_dim
)