In [165]:
import collections
import pandas as pd
import numpy as np

In [171]:
class LMClassifier:
    """Base class that turns free-text model generations into label predictions.

    Labels are recovered by case-insensitive substring matching: a label is
    considered predicted when its name (a key of ``labels_dict``) occurs
    anywhere in the lowercased generation. When several names of the same
    label set occur, the one that comes first in the dictionary's iteration
    order wins (not the one that appears first in the text).
    """

    # Reserved key of ``labels_dict`` that stores the number of label
    # dimensions. It is metadata and must never be matched as a label name.
    _LABEL_DIMS_KEY = 'label_dims'

    def __init__(self, input_texts, labels_dict, gold_labels):
        """
        Args:
            input_texts (List[str]): List of input texts to generate predictions for.
            labels_dict (dict): Label definition in one of two shapes.
                Single task: ``{label_name: label_value, ...}``.
                Multiple tasks: ``{'label_dims': n, dim_name: {label_name: label_value, ...}, ...}``
                with one nested mapping per dimension. Label names are
                compared against lowercased predictions.
            gold_labels (Union[pd.DataFrame, List[str]]): Gold labels corresponding to the input texts.
        """
        self.input_texts = input_texts
        self.labels_dict = labels_dict
        self.gold_labels = gold_labels

        # check the dimensionality of the labels:
        # dimensionality greater than 1 means dealing with
        # multiple classification tasks at a time
        self.label_dims = self.labels_dict.get(self._LABEL_DIMS_KEY, 1)
        assert self.label_dims > 0, "Labels dimensions must be greater than 0."

    def generate_predictions(self):
        """Produce raw predictions; not implemented in this base class."""
        raise NotImplementedError

    @staticmethod
    def _first_matching_label(prediction, label_map, default_label, skip_key=None):
        """Return the value of the first name in ``label_map`` contained in ``prediction``, else ``default_label``."""
        for label_name, label_value in label_map.items():
            if label_name == skip_key:
                continue
            if label_name in prediction:
                return label_value
        return default_label

    def retrieve_predicted_labels(self, predictions, default_label, prompts, only_dim=None):
        """Extract the predicted label(s) from each raw prediction string.

        Args:
            predictions (List[str]): Raw generated texts, one per prompt.
            default_label: Value used when no label name is found in a prediction.
            prompts: Prompts to store next to the predictions (may be ``None``).
            only_dim (Optional[int]): With multiple label dimensions, the
                positional index of the single dimension to return. Ignored
                when there is only one dimension.

        Returns:
            pd.DataFrame: With one dimension or ``only_dim`` set, the columns
            are ``prompt`` and ``prediction``. Otherwise the columns are
            ``prediction_dim1`` ... ``prediction_dimN`` followed by ``prompt``.
        """
        # convert the predictions to lowercase
        predictions = list(map(str.lower, predictions))

        # retrieve the labels that are contained in the predictions
        if self.label_dims == 1:
            # retrieve a single label for each prediction since a single classification task is performed at a time
            print("Retrieving predictions...")
            predicted_labels = [
                # the reserved metadata key is skipped so it can never be returned as a label
                self._first_matching_label(prediction, self.labels_dict, default_label, skip_key=self._LABEL_DIMS_KEY)
                for prediction in predictions
            ]
            # Count the number of predictions of each type and print the result
            print(collections.Counter(predicted_labels))
        else:
            # retrieve multiple labels for each prediction since multiple classification tasks are performed at a time
            print(f"Retrieving predictions for {self.label_dims} dimensions...")
            dims = [dim for dim in self.labels_dict if dim != self._LABEL_DIMS_KEY]
            predicted_labels = [
                [self._first_matching_label(prediction, self.labels_dict[dim], default_label) for dim in dims]
                for prediction in predictions
            ]
            # Count the number of predictions of each type and print the result.
            # Labels are stringified first because str.join rejects non-string
            # values such as integer label IDs.
            print(collections.Counter(",".join(map(str, labels)) for labels in predicted_labels))

        print(predicted_labels)

        # Add the data to the DataFrame
        if self.label_dims == 1:
            df = pd.DataFrame({'prompt': prompts, 'prediction': predicted_labels})
        elif only_dim is not None:
            # retrieve only the predictions for a specific dimension
            print(f"Retrieved predictions for dimension {only_dim}")
            # Plain per-row indexing also works when there are no predictions,
            # where slicing a column out of an empty 2-D array would fail.
            df = pd.DataFrame({'prompt': prompts, 'prediction': [labels[only_dim] for labels in predicted_labels]})
        else:
            print("Retrieved predictions for all dimensions")
            df = pd.DataFrame(predicted_labels).fillna(default_label)
            # rename columns to prediction_dim<n>, numbered from 1
            df.columns = [f"prediction_dim{i}" for i in range(1, len(df.columns)+1)]
            # add prompts to df
            df['prompt'] = prompts

        return df

# Example usage: extract one label per dimension from raw model generations.
predictions = [
    'not_applicable\nRole: NA\nRole: PASSIVE, EXPLICIT, IDEAL',
    'NA, NA, PASSIVE, ACTIVE_POS_OTHER, PASSIVE, ACTIVE',
    'NA',
    # "\n" replaces the original "\A", which is not a valid escape sequence.
    'NA\nNA\nACTIVE_POSITIVE_OTHER, EXPLICIT, DESCRIPTIVE\nACTIVE',
    'NA, IMPLICIT, DESCRIPTIVE'
    ]

# One nested mapping per label dimension; 'label_dims' holds the number of dimensions.
# Keys are the lowercase names searched for in the predictions, values are the labels returned.
labels_dict = {
    'label_dims': 3,
    'dim1': {
        'not_applicable': 'NA',
        'passive': 'PASSIVE',
        'active_negative': 'ACTIVE_NEG',
        'active_positive_challenging': 'ACTIVE_POS_CHALLENGING',
        'active_positive_caring': 'ACTIVE_POS_CARING',
        'active_positive_other': 'ACTIVE_POS_OTHER',
    },
    'dim2': {
        'explicit': 'EXPLICIT',
        'implicit': 'IMPLICIT',
    },
    'dim3': {
        'descriptive': 'DESCRIPTIVE',
        'ideal': 'IDEAL'
    }
    }

# Label assigned when none of a dimension's names occurs in a prediction.
default_label='NA'
labels_allowed=3

classifier = LMClassifier(
    input_texts=None,
    labels_dict=labels_dict,
    gold_labels=None
    )

# only_dim=0 keeps only the first dimension ('dim1') in the returned frame.
df_predicted_labels = classifier.retrieve_predicted_labels(
    predictions=predictions,
    default_label=default_label,
    prompts=None,
    only_dim=0
    )

df_predicted_labels

# for i, prediction in enumerate(predictions):
#     print(f"{prediction}")
#     print(f"----------- \n{result[i]}")
#     print("\n")





Retrieving predictions for 3 dimensions...
Counter({'NA,EXPLICIT,IDEAL': 1, 'PASSIVE,NA,NA': 1, 'NA,NA,NA': 1, 'ACTIVE_POS_OTHER,EXPLICIT,DESCRIPTIVE': 1, 'NA,IMPLICIT,DESCRIPTIVE': 1})
[['NA', 'EXPLICIT', 'IDEAL'], ['PASSIVE', 'NA', 'NA'], ['NA', 'NA', 'NA'], ['ACTIVE_POS_OTHER', 'EXPLICIT', 'DESCRIPTIVE'], ['NA', 'IMPLICIT', 'DESCRIPTIVE']]
Retrieved predictions for dimension 0


Unnamed: 0,prompt,prediction
0,,
1,,PASSIVE
2,,
3,,ACTIVE_POS_OTHER
4,,


In [100]:
# Scratch check: number of labels matched per prediction. An empty row would
# fall back to `default_label`, which is defined elsewhere in the notebook.
predicted_labels = [
    ['not_applicable', 'passive', 'explicit', 'ideal'],
    ['passive'],
    [''],
    ['active_positive_other', 'explicit', 'descriptive'],
    ['implicit', 'descriptive'],
]
[len(labels) or default_label for labels in predicted_labels]

[4, 1, 1, 3, 2]

In [170]:
# Scratch check: convert the per-prediction label lists to a 2-D array and
# slice out the first column (the labels of the first dimension).
predicted_labels = pd.DataFrame(
    [
        ['NA', 'EXPLICIT', 'IDEAL'],
        ['PASSIVE', 'NA', 'NA'],
        ['NA', 'NA', 'NA'],
        ['ACTIVE_POS_OTHER', 'EXPLICIT', 'DESCRIPTIVE'],
        ['NA', 'IMPLICIT', 'DESCRIPTIVE'],
    ]
).to_numpy()
predicted_labels[:, 0]

array(['NA', 'PASSIVE', 'NA', 'ACTIVE_POS_OTHER', 'NA'], dtype=object)

In [78]:
# Scratch check: for every prediction, collect all top-level keys of
# `labels_dict` that occur in it; fall back to [default_label] when none do.
# `predictions`, `labels_dict` and `default_label` come from other cells.
predicted_labels = []
for prediction in predictions:
    labels_in_prediction = [name for name in labels_dict if name in prediction]
    if not labels_in_prediction:
        predicted_labels.append([default_label])
    else:
        predicted_labels.append(labels_in_prediction)
print(predicted_labels)
        

[['not_applicable', 'passive', 'explicit', 'ideal'], ['passive'], 'NA', ['active_positive_other', 'explicit', 'descriptive'], ['implicit', 'descriptive']]
