In [44]:
import json
import random
import re
from collections import Counter
from pathlib import Path

import pandas as pd

In [45]:
# TODO
# - agreement/consistency of model

In [46]:
STEP_NUM = {
            'A1': 4,
            'B1': 5,
            'C1': 10,
            'D1': 5,
        }

In [47]:
class AnalyseLLMParticipant:
    def __init__(self, output_file):
        self.output_file = output_file
        self.config, self.results = self.load_data()
        self.questions = self.load_questions()
        self.df = self.split_by_prompt()
        self.parse_generation_for_selection()
        self.response_eq_example()
        self.response_length_gt_steps()
        self.response_eq_all_steps()
        self.proportion_of_all_steps()
        self.meta_object_selection()

    def load_data(self):
        with open(self.output_file) as f:
            data = json.load(f)

        config = data['config']
        results = data['results']

        return config, results

    def load_questions(self):

        questions = Path(f'../resources/data/select.json')
        with open(questions) as f:
            data = json.load(f)

        return data

    def split_by_prompt(self):

        qids = [r['question_id'] for r in self.results]
        templates = [r['template_id'] for r in self.results]
        responses = [r['response'] for r in self.results]


        model_family = self.config['MODEL'].split('/')[0]
        system = []
        user = []
        generation = []

        if model_family in ['']:


            system_prompt = 'system\n\n'
            user_prompt = "user\n\n"
            generation_prompt = 'assistant\n\n'

            for response in responses:
                system.append(response[len(system_prompt):response.index(user_prompt)])
                user.append(response[response.index(user_prompt) + len(user_prompt):response.index(generation_prompt)])
                generation.append(response[response.index(generation_prompt) + len(generation_prompt):])

        elif model_family in ['mistralai']:

            generation_prompt = '   '
            for response in responses:
                system.append(None)
                user.append(response[:response.index(generation_prompt)])
                generation.append(response[response.index(generation_prompt) + len(generation_prompt):])

        elif model_family in ['google', 'microsoft']:

            # generation simply follows system prompt, so split after system prompt
            system_prompt = self.config['SYSTEM_CONTENT']
            # get index at which system prompt ends
            system_prompt_end = len(system_prompt)
            for response in responses:
                system.append(None)
                user.append(response[:system_prompt_end])
                generation.append(response[system_prompt_end:])

        # dictionary with {qid: [system, user, generation]}
        return pd.DataFrame({'qid': qids,
                             'template': templates,
                             'num_steps': [STEP_NUM[t] for t in templates],
                             'system': system,
                             'user': user,
                             'generation': generation})

    def get_template_from_qid(
            self,
            qid,
            return_steps_or_id: str = 'steps',
        ):

        template = self.questions[qid]['template_id']

        if return_steps_or_id == 'steps':
            return STEP_NUM[template]
        else:
            return template


    def parse_generation_for_selection(self):

        def matches_format(generation):
            # regex for matching lists e.g., [1, 2, 3] or [a, b, c]
            r = re.compile(r'\[.*?\]')
            matches = r.findall(generation)
            if matches:
                match = matches[-1]
                try:
                    test = [str(m) for m in match[1:-1].split(',')]
                except ValueError:
                    test = None
                return match

        def valid_generation(template, generation):
            if generation is not None:
                try:
                    valid = [int(m) for m in generation[1:-1].split(',')]
                except ValueError:
                    valid = None
            else:
                valid = None

            # compare to number in different column
            if valid is not None and len(valid) <= STEP_NUM[template]:
                return valid

            return None

        self.df['matches_format'] = self.df['generation'].apply(lambda x: matches_format(x))
        # compare to number in df['num_steps']
        self.df['valid_generation'] = self.df[['template', 'matches_format']].apply(lambda x: valid_generation(x['template'], x['matches_format']), axis=1)


    def response_eq_example(self):

        example = self.config['EXAMPLE']
        # count number of times example == df['matches_format']
        self.df['response_eq_example'] = self.df['matches_format'].apply(lambda x: True if x == example else False)

    def response_length_gt_steps(self):

        # count cases where the assistant generation has more steps than the example
        self.df['response_length_gt_steps'] = self.df.apply(lambda row: True if row['valid_generation'] is not None and len(row['valid_generation']) > row['num_steps'] else False, axis=1)

    def response_eq_all_steps(self):

        # count cases where assistant generation is just the full list of steps
        self.df['response_eq_all_steps'] = self.df.apply(lambda row: True if row['valid_generation'] == [i for i in range(1, row['num_steps'] + 1)] else False, axis=1)

    def proportion_of_all_steps(self):

        # calculate the proportion of steps that the model selected
        def proportion_of_all_steps(generation, template):
            if generation is not None:
                return len(generation) / STEP_NUM[template]
            return None

        self.df['proportion_of_all_steps'] = self.df.apply(lambda row: proportion_of_all_steps(row['valid_generation'], row['template']), axis=1)

    def meta_object_selection(self):

        example = self.questions[list(self.questions.keys())[0]]
        xp = example['explanation']
        meta_steps = [i['step'] for i in xp if i['label'] == 'meta']
        object_steps = [i['step'] for i in xp if i['label'] == 'object']

        def meta_selections(generation):
            if generation is not None:
                return [i for i in generation if i in meta_steps]
            return None

        def meta_selections_pc(generation):
            if generation is not None:
                return len([i for i in generation if i in meta_steps]) / len(meta_steps)
            return None

        def object_selections(generation):
            if generation is not None:
                return [i for i in generation if i in object_steps]
            return None

        def object_selections_pc(generation):
            if generation is not None:
                return len([i for i in generation if i in object_steps]) / len(object_steps)
            return None

        self.df['meta_selections'] = self.df['valid_generation'].apply(lambda x: meta_selections(x))
        self.df['meta_selections_pc'] = self.df['valid_generation'].apply(lambda x: meta_selections_pc(x))
        self.df['object_selections'] = self.df['valid_generation'].apply(lambda x: object_selections(x))
        self.df['object_selections_pc'] = self.df['valid_generation'].apply(lambda x: object_selections_pc(x))


In [57]:
outputs = Path('outputs/select').iterdir()
output = sorted(outputs)[-1]
with output.open() as f:
    data = json.load(f)

config = data['config']
for k, v in config.items():
    print(f'{k}: {v}')

T = AnalyseLLMParticipant(output)
T.df = T.df[T.df['template'] == 'A1']

MODEL: google/gemma-7b
TEMPERATURE: 0.3
BATCH_SIZE: 20
EXAMPLE: [1, 2, 3, 4]
FORMAT_VERSION: 2
SYSTEM_CONTENT: Your role is to select, from a list the steps, those that are most important for inclusion in a summary explanation of that process. Format your output as a list, for example [1, 2, 3, 4]. Output only this short summary paragraph and nothing else.
INPUTS: ../resources/data/select.json
DESCRIPTION: testing


In [58]:
T.df

Unnamed: 0,qid,template,num_steps,system,user,generation,matches_format,valid_generation,response_eq_example,response_length_gt_steps,response_eq_all_steps,proportion_of_all_steps,meta_selections,meta_selections_pc,object_selections,object_selections_pc
200,46e19d8ebb024aa8,A1,4,,"Your role is to select, from a list the steps,...","Question: In 2026, what will be the populatio...","[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
201,970d19defb444708,A1,4,,"Your role is to select, from a list the steps,...",Question: What will be New Caledonia's urban ...,"[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
202,5c772a08b5024293,A1,4,,"Your role is to select, from a list the steps,...",Question: What will be the energy consumption...,"[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
203,b863227a2bb543a7,A1,4,,"Your role is to select, from a list the steps,...",Question: What will be the GDP of Cook Island...,"[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
204,24891bb689534d71,A1,4,,"Your role is to select, from a list the steps,...",Question: What will be Jordan's energy consum...,"[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,3b43f8e3b3f949a9,A1,4,,"Your role is to select, from a list the steps,...",Question: What will be the birth rate of Nort...,"[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
296,ee40aba68e064a09,A1,4,,"Your role is to select, from a list the steps,...","Question: In 2028, what will be the Netherlan...","[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
297,0717bd9203c04977,A1,4,,"Your role is to select, from a list the steps,...",Question: What will be the population density...,"[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0
298,5621b54b5e114102,A1,4,,"Your role is to select, from a list the steps,...","Question: In 2029, what will be Israel's popu...","[1, 2, 3, 4]","[1, 2, 3, 4]",True,False,True,1.0,"[1, 3]",0.666667,"[2, 4]",1.0


In [59]:
T.df['matches_format'].value_counts()

[1, 2, 3, 4]                                                                                                                       90
[1 fundament fundament fundament fundament fundament fundament fundament fundament fundament fundament ...]                         1
[1 fundament fundament fundament fundament fundament fundament fundament fundament fundament fundament fundament fundament ...]     1
Name: matches_format, dtype: int64

In [60]:
T.df['valid_generation'].value_counts()

[1, 2, 3, 4]    90
Name: valid_generation, dtype: int64

In [61]:
T.df['response_eq_example'].value_counts()

True     90
False    10
Name: response_eq_example, dtype: int64

In [62]:
T.df['response_length_gt_steps'].value_counts()

False    100
Name: response_length_gt_steps, dtype: int64

In [63]:
T.df['response_eq_all_steps'].value_counts()

True     90
False    10
Name: response_eq_all_steps, dtype: int64

In [64]:
T.df['proportion_of_all_steps'].value_counts()

1.0    90
Name: proportion_of_all_steps, dtype: int64

In [65]:
T.df['meta_selections'].value_counts()

[1, 3]    90
Name: meta_selections, dtype: int64

In [66]:
T.df['meta_selections_pc'].value_counts().sort_index()

0.666667    90
Name: meta_selections_pc, dtype: int64

In [67]:
T.df['object_selections'].value_counts()

[2, 4]    90
Name: object_selections, dtype: int64

In [68]:
T.df['object_selections_pc'].value_counts().sort_index()

1.0    90
Name: object_selections_pc, dtype: int64

In [69]:
# templates = ['A1', 'B1', 'C1', 'D1']

# output = Path('outputs/2024-05-24T16:16:02.060434.json')
# output = Path('outputs/2024-05-24T16:16:02.061578.json')
# # output = Path('outputs/2024-05-24T16:16:02.062049.json')
# with output.open() as f:
#     data = json.load(f)


# eq_example = []
# length_gt_steps = []
# eq_steps = []
# for t in templates:
#     T = PromptParticipant(output, t)
#     # count occurences != none
#     eq_example.append(sum(T.df['response_eq_example']))
#     length_gt_steps.append(sum(T.df['response_length_gt_steps']))
#     eq_steps.append(sum(T.df['response_eq_all_steps']))

# # create dataframe from results
# comp_df = pd.DataFrame({
#     'template': templates,
#     'num_steps': STEP_NUM.values(),
#     'eq_example': eq_example,
#     'length_gt_steps': length_gt_steps,
#     'eq_steps': eq_steps
# })

# print(comp_df)