# TOP BASED ON LOGITS RATHER THAN LORS

In [1]:
import json 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import Counter

# Read every result file into a dictionary. JSON text is UTF-8 by
# specification, so the encoding is pinned instead of being left to the
# platform default.
def _load_json(path):
    """Return the parsed content of the JSON file at *path*."""
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Model 1 (Meta-Llama-3-8B): female / male / neutral files
female_data_model1 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_llms_Meta-Llama-3-8B_incremental_FEMALE_1610_1.json')
male_data_model1 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_llms_Meta-Llama-3-8B_incremental_MALE_1610_1.json')
neutral_data_model1 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_llms_Meta-Llama-3-8B_incremental_NEUT_1610_1.json')

# Model 2 (Meta-Llama-2-7B): female / male / neutral files
female_data_model2 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_Meta-Llama-2-7B_incremental_FEMALE_1610_1.json')
male_data_model2 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_Meta-Llama-2-7B_incremental_MALE_1610_1.json')
neutral_data_model2 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_Meta-Llama-2-7B_incremental_NEUT_1610_1.json')

# "NOCON" files for both models (presumably the prompts without a
# diagnosis -- confirm against the script that produced them)
nocon_data_model1 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_llms_Meta-Llama-3-8B_incremental_NOCON_1610_1.json')
nocon_data_model2 = _load_json('/Users/ninaliem/thesis_master/results/rq2/adjective_probabilities_Meta-Llama-2-7B_incremental_NOCON_1610_1.json')



In [2]:
def aggregate_data(data, gender, model):
    """Flatten ``{prompt: {adjective: details}}`` into one record per adjective.

    Args:
        data (dict): Mapping of prompt -> {adjective: details}; every details
            dict must provide "logit", "antonym", "antonym_logit" and "favored".
        gender (str): Label stored in the "Gender" field of every record.
        model (str): Label stored in the "Model" field of every record.

    Returns:
        list: One dict per (prompt, adjective) pair, in iteration order.
    """
    return [
        {
            "Prompt": prompt_text,
            "Descriptor": descriptor,
            "Logit": info["logit"],
            "Antonym": info["antonym"],
            "Antonym Logit": info["antonym_logit"],
            "Favored": info["favored"],
            "Gender": gender,
            "Model": model,
        }
        for prompt_text, per_adjective in data.items()
        for descriptor, info in per_adjective.items()
    ]

# Flatten each loaded file with aggregate_data, tagging its records with the
# gender of the source file and the model label.
female_aggregated_model1, male_aggregated_model1, neutral_aggregated_model1 = (
    aggregate_data(raw, gender_label, "LLaMA-3-8b")
    for raw, gender_label in (
        (female_data_model1, "Female"),
        (male_data_model1, "Male"),
        (neutral_data_model1, "Neutral"),
    )
)

female_aggregated_model2, male_aggregated_model2, neutral_aggregated_model2 = (
    aggregate_data(raw, gender_label, "LLaMA-2-7b")
    for raw, gender_label in (
        (female_data_model2, "Female"),
        (male_data_model2, "Male"),
        (neutral_data_model2, "Neutral"),
    )
)

# One flat list: model 1 first (female, male, neutral), then model 2
all_data = [
    *female_aggregated_model1,
    *male_aggregated_model1,
    *neutral_aggregated_model1,
    *female_aggregated_model2,
    *male_aggregated_model2,
    *neutral_aggregated_model2,
]

# One DataFrame row per record
df = pd.DataFrame(all_data)

# Diagnosis phrases searched for in the prompts (all lower-case). The order
# matters: extract_diagnosis returns the first phrase that matches.
diagnoses = [
    "schizophrenia",
    "depression",
    "generalised anxiety disorder",
    "bipolar disorder",
    "borderline personality disorder",
    "a mental illness",
    "antisocial personality disorder",
    "post-traumatic stress disorder",
    "obsessive-compulsive disorder",
    "dissociative identity disorder",
]


def extract_diagnosis(prompt):
    """Return the first entry of ``diagnoses`` contained in *prompt*.

    Matching is a case-insensitive substring test against the lower-cased
    prompt. Returns None when no entry matches.
    """
    lowered = prompt.lower()
    return next((name for name in diagnoses if name in lowered), None)

# Long-format reshape of df: every input row yields two output rows, first
# the descriptor (labelled 'negative'), then its antonym (labelled 'positive').
reshaped_data = []

for _, row in df.iterrows():
    # Diagnosis phrase found in the prompt text (None when nothing matches)
    diagnosis = extract_diagnosis(row['Prompt'])

    # (label, column holding the adjective, column holding its logit)
    for label, adjective_col, logit_col in (
        ('negative', 'Descriptor', 'Logit'),
        ('positive', 'Antonym', 'Antonym Logit'),
    ):
        reshaped_data.append({
            'Prompt': row['Prompt'],
            'adjective': row[adjective_col],
            'logit': row[logit_col],
            'label': label,
            'Gender': row['Gender'],
            'Model': row['Model'],
            'diagnosis': diagnosis,
        })

# Columns: 'Prompt', 'adjective', 'logit', 'label', 'Gender', 'Model', 'diagnosis'
reshaped_df = pd.DataFrame(reshaped_data)
# Severity class assigned to each diagnosis phrase
severity_mapping = {
    "schizophrenia": "severe",
    "depression": "not severe",
    "generalised anxiety disorder": "not severe",
    "bipolar disorder": "severe",
    "borderline personality disorder": "severe",
    # unspecified diagnosis, categorised as not severe
    "a mental illness": "not severe",
    "antisocial personality disorder": "severe",
    "post-traumatic stress disorder": "not severe",
    "obsessive-compulsive disorder": "not severe",
    "dissociative identity disorder": "severe",
}


def add_severity_label(row):
    """Return the severity class for ``row['diagnosis']``.

    Diagnoses missing from ``severity_mapping`` yield "unknown".
    """
    return severity_mapping.get(row['diagnosis'], "unknown")

# Row-wise lookup of the severity class, stored as a new 'severity' column
reshaped_df['severity'] = reshaped_df.apply(add_severity_label, axis=1)

# Mean logit for every (Model, diagnosis, label, adjective) combination
aggregated_df = (
    reshaped_df
    .groupby(['Model', 'diagnosis', 'label', 'adjective'])['logit']
    .mean()
    .reset_index()
)

# Pronoun -> gender lookup. Insertion order is the match priority (male, then
# female, then neutral): the first pronoun of this table that occurs anywhere
# in a prompt decides its gender, regardless of where it sits in the prompt.
_PRONOUN_TO_GENDER = {
    "he": "Male",
    "him": "Male",
    "his": "Male",
    "she": "Female",
    "her": "Female",
    "hers": "Female",
    "they": "Neutral",
    "them": "Neutral",
    "their": "Neutral",
    "theirs": "Neutral",
}


def _extract_gender_from_pronouns(prompt):
    """Return "Male", "Female" or "Neutral" from the pronouns in *prompt*, else "Unknown"."""
    import re  # local import keeps this block independent of the file's import cell

    # Tokenise on runs of letters rather than on whitespace, so a pronoun
    # followed by punctuation ("her.", "she,") or inside a contraction
    # ("they're" -> "they", "re") is still recognised as a whole word.
    tokens = set(re.findall(r"[a-z]+", prompt.lower()))
    for pronoun, gender in _PRONOUN_TO_GENDER.items():
        if pronoun in tokens:
            return gender
    return "Unknown"


def aggregate_data_no_diagnosis(data, model):
    """
    Aggregate data without explicit diagnoses in the keys.

    Every (prompt, adjective) pair produces two records: the adjective itself
    labelled 'negative' and its antonym labelled 'positive'. The gender of a
    record is derived from the pronouns of its prompt ("Unknown" if none is
    found) and the diagnosis is always the string 'None'.

    Args:
        data (dict): The input data structured as {prompt: {adjective: details}};
            each details dict must provide "logit", "antonym" and "antonym_logit".
        model (str): The name of the model used.

    Returns:
        list: A list of dictionaries containing the reshaped data.
    """
    reshaped_data = []

    for prompt, adjectives in data.items():
        gender = _extract_gender_from_pronouns(prompt)

        for adjective, details in adjectives.items():
            # Descriptor first, antonym second
            for label, word, logit in (
                ('negative', adjective, details["logit"]),
                ('positive', details["antonym"], details["antonym_logit"]),
            ):
                reshaped_data.append({
                    'Prompt': prompt,
                    'adjective': word,
                    'logit': logit,
                    'label': label,
                    'Gender': gender,
                    'Model': model,
                    'diagnosis': 'None',  # literal string, not the None object
                })

    return reshaped_data

# Reshape the diagnosis-free files of both models into long-format records
aggregated_model1 = aggregate_data_no_diagnosis(nocon_data_model1, "LLaMA-3-8b")
aggregated_model2 = aggregate_data_no_diagnosis(nocon_data_model2, "LLaMA-2-7b")

all_nocon = [*aggregated_model1, *aggregated_model2]
nocon_data = pd.DataFrame(all_nocon)

# Stack the diagnosis rows and the diagnosis-free rows under a fresh 0..n-1 index
res = pd.concat([reshaped_df, nocon_data], ignore_index=True, axis=0)

In [3]:
# Keep every row whose diagnosis is anything other than 'a mental illness'
res = res[~(res['diagnosis'] == 'a mental illness')]
# (disabled) res2 = res2[res2['diagnosis'] != 'None']


In [7]:

# Top-n rows of a group, one row per adjective
def get_top_n_unique(group, n=20):
    """Return the *n* highest-logit rows of *group*, one row per adjective.

    Rows are ordered by 'logit' descending. When an adjective occurs more
    than once, only its first row in that order is kept.
    """
    ranked = group.sort_values(by='logit', ascending=False)
    return ranked.drop_duplicates(subset=['adjective']).head(n)

# Top-20 distinct adjectives for every (Model, diagnosis) group
top_n_unique_adjectives = (
    res.groupby(['Model', 'diagnosis'])
    .apply(get_top_n_unique, n=20)
    .reset_index(drop=True)
)

# One logit per (Model, diagnosis, adjective). Adjectives are already unique
# within a group at this point, so each mean is taken over a single row.
lol = (
    top_n_unique_adjectives
    .groupby(['Model', 'diagnosis', 'adjective'])['logit']
    .mean()
    .reset_index()
)

# Step 1: order by Model and diagnosis, highest logit first inside each group
df_sorted = lol.sort_values(
    by=['Model', 'diagnosis', 'logit'],
    ascending=[True, True, False],
)

# Step 2: 1-based position of each adjective inside its (Model, diagnosis) group
df_sorted['rank'] = df_sorted.groupby(['Model', 'diagnosis']).cumcount() + 1

# Step 3: wide table with (Model, rank) rows, one column per diagnosis and
# the adjective as the cell value
df_pivoted = df_sorted.pivot_table(
    values='adjective',
    index=['Model', 'rank'],
    columns='diagnosis',
    aggfunc='first',
)

# Step 4: drop the 'diagnosis' header label and name the two row levels
df_pivoted.index.names = ['Model', 'Rank']
df_pivoted.columns.name = None

# Short column labels for the diagnoses
rename_dict = {
    'depression': 'DEPR',
    'generalised anxiety disorder': 'ANX',
    'obsessive-compulsive disorder': 'OCD',
    'post-traumatic stress disorder': 'PTSD',
    'antisocial personality disorder': 'APD',
    'schizophrenia': 'SCHI',
    'bipolar disorder': 'BIP',
    'borderline personality disorder': 'BPD',
    'dissociative identity disorder': 'DID',
}

# Column order of the final table; 'None' is the diagnosis-free condition
desired_order = [
    'None', 'DEPR', 'ANX', 'OCD', 'PTSD', 'APD', 'SCHI', 'BIP', 'BPD', 'DID',
]

# Abbreviate the labels, then select the columns in the desired order
df_pivoted = df_pivoted.rename(columns=rename_dict)[desired_order]

df_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,None,DEPR,ANX,OCD,PTSD,APD,SCHI,BIP,BPD,DID
Model,Rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LLaMA-2-7b,1,inactive,inactive,inactive,inactive,inactive,abnormal,inactive,inactive,inactive,different
LLaMA-2-7b,2,incompetent,incompetent,incompetent,incompetent,incompetent,abusive,incompetent,incompetent,incompetent,inactive
LLaMA-2-7b,3,insecure,insecure,insecure,insecure,insecure,inactive,insecure,insecure,insecure,incompetent
LLaMA-2-7b,4,innocent,unapproachable,confident,overbearing,shady,incompetent,normal,unapproachable,unapproachable,insecure
LLaMA-2-7b,5,withdrawn,uncapable,overbearing,shady,shameless,insecure,intelligent,uncapable,uncapable,unapproachable
LLaMA-2-7b,6,unapproachable,unexaggerated,shady,shameless,shy,unapproachable,unapproachable,uncurable,uncurable,uncapable
LLaMA-2-7b,7,uncurable,unfamiliar,shameless,shy,tense,uncapable,uncapable,unexaggerated,unexaggerated,uncurable
LLaMA-2-7b,8,unexaggerated,unhealthy,shy,kind,quiet,unexaggerated,uncurable,unfamiliar,unfamiliar,unexaggerated
LLaMA-2-7b,9,unhealthy,unhelpful,mature,responsible,unapproachable,unhinged,unexaggerated,unfriendly,unhealthy,unfriendly
LLaMA-2-7b,10,unhelpful,unhinged,quiet,intelligent,unexaggerated,unlawful,unfamiliar,unhealthy,unhelpful,unhealthy


In [10]:
# Top-n rows of a group, one row per adjective
def get_top_n_unique(group, n=20):
    """Return the *n* highest-logit rows of *group*, one row per adjective.

    Rows are ordered by 'logit' descending. When an adjective occurs more
    than once, only its first row in that order is kept.
    """
    ranked = group.sort_values(by='logit', ascending=False)
    return ranked.drop_duplicates(subset=['adjective']).head(n)

# Top-20 distinct adjectives for every (Model, diagnosis, Gender) group
top_n_unique_adjectives = (
    res.groupby(['Model', 'diagnosis', 'Gender'])
    .apply(get_top_n_unique, n=20)
    .reset_index(drop=True)
)

# One logit per (Model, diagnosis, Gender, adjective). Adjectives are already
# unique within a group at this point, so each mean is taken over a single row.
lol = (
    top_n_unique_adjectives
    .groupby(['Model', 'diagnosis', 'Gender', 'adjective'])['logit']
    .mean()
    .reset_index()
)

# Step 1: order by Model, diagnosis and Gender, highest logit first per group
df_sorted = lol.sort_values(
    by=['Model', 'diagnosis', 'Gender', 'logit'],
    ascending=[True, True, True, False],
)

# Step 2: 1-based position of each adjective inside its group
df_sorted['rank'] = df_sorted.groupby(['Model', 'diagnosis', 'Gender']).cumcount() + 1

# Step 3: wide table with (Model, rank) rows and two-level (diagnosis, Gender)
# columns, the adjective being the cell value
df_pivoted = df_sorted.pivot_table(
    values='adjective',
    index=['Model', 'rank'],
    columns=['diagnosis', 'Gender'],
    aggfunc='first',
)

# Step 4: name the two row levels.
# NOTE(review): with two-level columns the `.name = None` assignment does not
# appear to clear the level names -- the printed table still shows
# 'diagnosis' and 'Gender'. Kept as is; `columns.names` would be the
# per-level attribute if the labels should really be removed.
df_pivoted.index.names = ['Model', 'Rank']
df_pivoted.columns.name = None

# Short column labels for the diagnoses
rename_dict = {
    'depression': 'DEPR',
    'generalised anxiety disorder': 'ANX',
    'obsessive-compulsive disorder': 'OCD',
    'post-traumatic stress disorder': 'PTSD',
    'antisocial personality disorder': 'APD',
    'schizophrenia': 'SCHI',
    'bipolar disorder': 'BIP',
    'borderline personality disorder': 'BPD',
    'dissociative identity disorder': 'DID',
}

# Order of the top-level (diagnosis) columns; 'None' is the diagnosis-free condition
desired_order = [
    'None', 'DEPR', 'ANX', 'OCD', 'PTSD', 'APD', 'SCHI', 'BIP', 'BPD', 'DID',
]

# Abbreviate the labels, then select the diagnosis blocks in the desired order
df_pivoted = df_pivoted.rename(columns=rename_dict)[desired_order]

# Show the resulting DataFrame
print(df_pivoted)


diagnosis                  None                                 \
Gender                   Female            Male        Neutral   
Model      Rank                                                  
LLaMA-2-7b 1           inactive        inactive       inactive   
           2        incompetent     incompetent    incompetent   
           3           insecure        insecure       insecure   
           4           innocent        innocent       innocent   
           5          withdrawn  unapproachable    overbearing   
           6          confident       uncapable      uncapable   
           7     unapproachable   unexaggerated  unexaggerated   
           8          uncapable      unfriendly     unfriendly   
           9      unexaggerated       unhealthy      unhealthy   
           10        unfamiliar       unhelpful      unhelpful   
           11        unfriendly        unhinged       unlawful   
           12          unhinged        unlawful      unlovable   
          

In [11]:
# Notebook display of the current df_pivoted table (rich rendering of the cell's last expression)
df_pivoted

Unnamed: 0_level_0,diagnosis,None,None,None,DEPR,DEPR,DEPR,ANX,ANX,ANX,OCD,...,SCHI,BIP,BIP,BIP,BPD,BPD,BPD,DID,DID,DID
Unnamed: 0_level_1,Gender,Female,Male,Neutral,Female,Male,Neutral,Female,Male,Neutral,Female,...,Neutral,Female,Male,Neutral,Female,Male,Neutral,Female,Male,Neutral
Model,Rank,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
LLaMA-2-7b,1,inactive,inactive,inactive,inactive,inactive,inactive,overbearing,confident,inactive,inactive,...,inactive,inactive,inactive,inactive,unapproachable,unapproachable,inactive,inactive,inactive,different
LLaMA-2-7b,2,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent,inactive,overbearing,incompetent,incompetent,...,incompetent,incompetent,incompetent,incompetent,uncurable,uncapable,incompetent,incompetent,incompetent,inactive
LLaMA-2-7b,3,insecure,insecure,insecure,insecure,insecure,insecure,incompetent,shady,insecure,insecure,...,insecure,insecure,insecure,insecure,unexaggerated,uncurable,insecure,insecure,insecure,incompetent
LLaMA-2-7b,4,innocent,innocent,innocent,unapproachable,unapproachable,unapproachable,insecure,shameless,overbearing,overbearing,...,uncapable,unapproachable,unapproachable,unapproachable,unfamiliar,unexaggerated,inconsiderate,unapproachable,different,insecure
LLaMA-2-7b,5,withdrawn,unapproachable,overbearing,uncapable,uncapable,uncapable,confident,shy,unapproachable,intelligent,...,uncurable,uncapable,uncurable,uncapable,unfriendly,unfamiliar,emotional,uncapable,unapproachable,normal
LLaMA-2-7b,6,confident,uncapable,uncapable,uncurable,unfamiliar,uncurable,mature,inactive,uncapable,normal,...,unexaggerated,uncurable,unexaggerated,uncurable,unhealthy,unfriendly,overbearing,uncurable,uncapable,uncapable
LLaMA-2-7b,7,unapproachable,unexaggerated,unexaggerated,unfriendly,unfriendly,unexaggerated,quiet,incompetent,uncurable,unapproachable,...,unfamiliar,unexaggerated,unfriendly,unfriendly,unhelpful,unhealthy,manipulative,unexaggerated,unexaggerated,uncurable
LLaMA-2-7b,8,uncapable,unfriendly,unfriendly,unhealthy,unhealthy,unfamiliar,shady,insecure,unexaggerated,uncapable,...,unfriendly,unfriendly,unhealthy,unhealthy,unhinged,unhelpful,sensitive,unfamiliar,unfamiliar,unexaggerated
LLaMA-2-7b,9,unexaggerated,unhealthy,unhealthy,unhelpful,unhelpful,unfriendly,shameless,mature,unfriendly,uncurable,...,unhealthy,unhealthy,unhelpful,unhelpful,unlawful,unlawful,dramatic,unfriendly,unfriendly,unhealthy
LLaMA-2-7b,10,unfamiliar,unhelpful,unhelpful,unhinged,unhinged,unhealthy,shy,quiet,unhelpful,unexaggerated,...,unhinged,unhinged,unhinged,unhinged,unlovable,unlovable,uncapable,unhinged,unhelpful,unhelpful


In [14]:
# The diagnosis labels of df_pivoted were abbreviated earlier in the file
# ('APD', 'PTSD', ...), so selecting by the long names raises KeyError when the
# file runs top to bottom. Map the two abbreviations needed here back to
# their long names first. rename skips labels that are absent, so this also
# works if the columns still carry the long names.
ha = df_pivoted.rename(columns={
    'APD': 'antisocial personality disorder',
    'PTSD': 'post-traumatic stress disorder',
})[['None', 'antisocial personality disorder', 'post-traumatic stress disorder']]

In [15]:
# Render the selected columns as a LaTeX table and print the source
latex_table = ha.to_latex(
    caption="Top 10 Adjectives by Model, Diagnosis, and Gender",
    label="tab:top_adjectives",
    longtable=False,     # regular table environment rather than longtable
    multirow=False,      # \multirow merging disabled
    multicolumn=False,   # \multicolumn merging disabled
    escape=False,        # cell text is written verbatim, without LaTeX escaping
)
print(latex_table)

\begin{table}
\centering
\caption{Top 10 Adjectives by Model, Diagnosis, and Gender}
\label{tab:top_adjectives}
\begin{tabular}{lllllllllll}
\toprule
           & diagnosis &            None &                 &                & antisocial personality disorder &                 &                 & post-traumatic stress disorder &                &                 \\
           & Gender &          Female &            Male &        Neutral &                          Female &            Male &         Neutral &                         Female &           Male &         Neutral \\
Model & Rank &                 &                 &                &                                 &                 &                 &                                &                &                 \\
\midrule
LLaMA-2-7b & 1  &        inactive &        inactive &       inactive &                        inactive &        abnormal &        inactive &                       inactive &       inactive &        inact

  ha.to_latex(
