In [3]:
import pandas as pd
import os
import json
import re

In [8]:
def create_positive_negative(row):
    """Return the scores of the two answer options named in ``row['options']``.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pd.Series
        One row of the merged table. ``row['options']`` must be a sequence whose
        first entry is the label of the "positive" option and whose second entry
        is the label of the "negative" option; both labels must be present in
        ``row``.

    Returns
    -------
    tuple
        ``(positive_value, negative_value)``. When a label occurs several times
        in the row (duplicate column names), the maximum over the duplicates is
        returned for that label.
    """
    options = row['options']

    def _max_for(label):
        value = row[label]
        # Duplicate labels yield a Series, a unique label yields a scalar.
        # The scalar may be a plain Python float (object-dtype rows), which
        # has no .max(), so only reduce when there is something to reduce.
        if isinstance(value, pd.Series):
            return value.max()
        return value

    return _max_for(options[0]), _max_for(options[1])

def create_df(model_result: pd.DataFrame, prompt_df: pd.DataFrame) -> pd.DataFrame:
    """Attach model option scores to a table of prompts.

    Steps:

    1. Normalise the column names of ``model_result``: a name of the form
       ``p( word )`` becomes ``word``, and every name is lower-cased.
    2. Keep only ``instrument``, ``prompt`` and the columns whose normalised
       name is one of the positive / negative answer words listed below.
    3. Left-join those columns onto ``prompt_df``, matching the model's
       ``prompt`` column against ``prompt_df['text']``.
    4. Add ``positive_max`` / ``negative_max`` via ``create_positive_negative``
       (which reads each row's ``options``).
    5. Add, per row, the best scoring positive / negative answer word
       (``*_related_max``) and its score (``*_related_max_value``).
    6. Add ``arg_id`` (an integer per distinct ``args`` value, in order of first
       appearance) and ``question`` (``"<instrument>:<arg_id>"``, using the
       instrument of the first row).

    Parameters
    ----------
    model_result : pd.DataFrame
        Model output with string column names, including ``instrument`` and
        ``prompt`` (case-insensitive). It is not modified.
    prompt_df : pd.DataFrame
        Prompt table; must contain ``text``, ``args`` and ``options``.

    Returns
    -------
    pd.DataFrame
        The columns of ``prompt_df`` followed by ``instrument`` and the derived
        columns described above.

    Raises
    ------
    KeyError
        If ``model_result`` has no ``prompt`` column after normalisation.
    """
    # Answer words related to "yes" / "no".
    # NOTE(review): 'onaccurate' looks like a typo (for 'inaccurate'?). It is kept
    # unchanged because it is matched against the model's column names - confirm
    # against the script that produced the model output before changing it.
    option_related_columns_1_pos = ['yes', 'yep', 'right', 'correct', 'true', 'positive', 'absolutely', 'definitely', 'affirmative', 'aye', 
                                    'positively', 'naturally', 'unquestionably', 'surely', 'indeed', 'undoubtedly', 'certainly']
    option_related_columns_1_neg = ['no', 'nope', 'nay', 'wrong', 'incorrect', 'false', 'negative', 'negatively', 'onaccurate', 'adverse', 'untrue']

    # Answer words related to "good" / "bad".
    option_related_columns_2_pos = ['Good', 'Appropriate', 'Right', 'True', 'Essential', 'Positive', 'Beneficial', 'Adequate', 'Desirable', 'Wise', 
                                    'Sensible', 'Prudent', 'Sound', 'Ethical', 'Helpful', 'Correct', 'Valuable', 'Laudable', 'Necessary', 
                                    'Just', 'Acceptable', 'Rational', 'Caring']
    option_related_columns_2_neg = ['Bad', 'Inappropriate', 'Untrue', 'Wrong', 'Unnecessary', 'Negative', 'Detrimental', 'Insufficient', 
                                    'Undesirable', 'Foolish', 'Absurd', 'Unreasonable', 'Imprudent', 'Unsound', 'Unethical', 'Harmful', 
                                    'Incorrect', 'Worthless', 'Blameworthy', 'Unjust', 'Unacceptable', 'Irrational', 'Neglectful']

    # De-duplicate case-insensitively; sorted() keeps the result independent of
    # string hash randomisation.
    pos_columns = sorted({word.lower() for word in option_related_columns_1_pos + option_related_columns_2_pos})
    neg_columns = sorted({word.lower() for word in option_related_columns_1_neg + option_related_columns_2_neg})
    target_columns = set(['instrument', 'prompt'] + pos_columns + neg_columns)

    columns_to_keep = list(prompt_df.columns)

    # Normalise the column names: drop the 'p( )' wrapper and lower-case them.
    # The names are computed on the side instead of being assigned back to
    # `model_result`, so the caller's frame (reused for every prompt file) is
    # left untouched.
    print('Formalizing the column names...')
    normalized_names = [
        re.sub(r'^p\(\s*(.*?)\s*\)$', r'\1', col).lower() if col.startswith('p(') else col.lower()
        for col in model_result.columns
    ]

    # Keep only the yes/no related columns, in their original order. Columns
    # that normalise to the same name are all kept (as duplicates).
    keep_mask = [name in target_columns for name in normalized_names]
    model_scores = model_result.loc[:, keep_mask].copy()
    model_scores.columns = [name for name, keep in zip(normalized_names, keep_mask) if keep]

    if 'prompt' not in model_scores.columns:
        raise KeyError('prompt')

    # Merge the two dataframes: the model's 'prompt' is the prompt table's 'text'.
    model_scores = model_scores.rename(columns={'prompt': 'text'})
    combined = prompt_df.merge(model_scores, how='left', on='text')

    # Create 'positive' and 'negative' columns based on the 'options'
    print('Creating the positive, negative columns...')
    combined[['positive_max', 'negative_max']] = combined.apply(create_positive_negative, axis=1, result_type='expand')

    # Create 'positive_related' and 'negative_related' columns
    print('Creating the pos/neg related columns...')
    pos_scores = combined.loc[:, combined.columns.isin(pos_columns)]
    neg_scores = combined.loc[:, combined.columns.isin(neg_columns)]

    # NOTE(review): a prompt without a matching model row is all-NaN here;
    # depending on the pandas version idxmax may warn or raise for such rows -
    # confirm every prompt has a match.
    combined['positive_related_max'] = pos_scores.idxmax(axis=1)
    combined['negative_related_max'] = neg_scores.idxmax(axis=1)

    combined['positive_related_max_value'] = pos_scores.max(axis=1)
    combined['negative_related_max_value'] = neg_scores.max(axis=1)

    columns_to_keep += ['instrument', 'positive_max', 'negative_max', 'positive_related_max', 'negative_related_max', 'positive_related_max_value', 
                        'negative_related_max_value']
    # .copy() so the id columns below are added to an independent frame.
    combined = combined[columns_to_keep].copy()

    # Create a unique id for each 'args' value (the original prompt).
    unique_args = combined['args'].unique()
    arg_ids = {arg: index for index, arg in enumerate(unique_args)}
    combined['arg_id'] = combined['args'].map(arg_ids)

    # Human-readable question id: '<instrument>:<arg_id>'.
    instrument_name = combined['instrument'].iloc[0]
    question_ids = {arg: (instrument_name+':'+str(index)) for arg, index in arg_ids.items()}
    combined['question'] = combined['args'].map(question_ids)

    return combined

In [9]:
# Location of the model output and of the paraphrased-prompt JSON files.
result_path = 'result/gpt2.csv'
# NOTE(review): machine-specific absolute path - point this at your local copy
# of the paraphrased prompts before running.
prompts_root = r'C:/Users/elain/Desktop/llm-personas-master/result/paraphrased-prompts/'

# os.listdir returns every directory entry in arbitrary order. Keep only the
# JSON prompt files (later cells assume a '.json' name) and sort them so runs
# are reproducible.
files = sorted(name for name in os.listdir(prompts_root) if name.lower().endswith('.json'))
len(files)

40

In [10]:
# Load the model output and discard the 'Unnamed: 0' column
# (presumably the row index written by an earlier to_csv call - verify).
result = (
    pd.read_csv(result_path)
    .drop(columns=['Unnamed: 0'])
)

In [11]:
# Build one result table per paraphrased-prompt file and save it as CSV.
output_dir = 'result/final_dfs/'
# Create the destination up front so to_csv does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)

for file in files:
    # Read with an explicit encoding instead of the platform default
    # (assumes the prompt files are UTF-8, the standard JSON encoding).
    with open(os.path.join(prompts_root, file), "r", encoding="utf-8") as json_file:
        prompts = json.load(json_file)
    
    prompt_df = pd.DataFrame(prompts)
    
    df_result = create_df(result, prompt_df)
    # splitext removes the actual extension rather than blindly cutting the
    # last five characters of the name.
    df_result.to_csv(output_dir+os.path.splitext(file)[0]+'.csv', index=False)
    print('Dataframe Created: '+file+'\n')

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created: ACI.json

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created: AIS.json

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created: ATPLS.json

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created: BCQ.json

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created: BRS.json

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created: BSCTM.json

Formalizing the column names...
Creating the positive, negative columns...
Creating the pos/neg related columns...
Dataframe Created