Imports

In [1]:
import json 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import Counter




Loading data

In [2]:
# Directory holding the RQ2 adjective-probability JSON dumps.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of the results before running the notebook elsewhere.
RESULTS_DIR = '/Users/ninaliem/thesis_master/results/rq2'


def load_results(filename, results_dir=RESULTS_DIR):
    """Read one JSON results file from ``results_dir`` and return the parsed object.

    The file is opened explicitly as UTF-8 so that loading does not depend on
    the platform's default locale encoding.

    Parameters
    ----------
    filename : str
        Name of the JSON file inside ``results_dir``.
    results_dir : str
        Directory that contains the file (defaults to ``RESULTS_DIR``).
    """
    with open(f'{results_dir}/{filename}', 'r', encoding='utf-8') as file:
        return json.load(file)


# Model 1 = Meta-Llama-3-8B, model 2 = Meta-Llama-2-7B (per the file names).
# Each model has one file per prompt condition: FEMALE, MALE, NEUT and NOCON.
female_data_model1 = load_results('adjective_probabilities_llms_Meta-Llama-3-8B_incremental_FEMALE_1610_1.json')
male_data_model1 = load_results('adjective_probabilities_llms_Meta-Llama-3-8B_incremental_MALE_1610_1.json')
neutral_data_model1 = load_results('adjective_probabilities_llms_Meta-Llama-3-8B_incremental_NEUT_1610_1.json')

female_data_model2 = load_results('adjective_probabilities_Meta-Llama-2-7B_incremental_FEMALE_1610_1.json')
male_data_model2 = load_results('adjective_probabilities_Meta-Llama-2-7B_incremental_MALE_1610_1.json')
neutral_data_model2 = load_results('adjective_probabilities_Meta-Llama-2-7B_incremental_NEUT_1610_1.json')

# NOCON files are the ones later processed without a diagnosis
# (presumably "no context" prompts — confirm against the generation script).
nocon_data_model1 = load_results('adjective_probabilities_llms_Meta-Llama-3-8B_incremental_NOCON_1610_1.json')
nocon_data_model2 = load_results('adjective_probabilities_Meta-Llama-2-7B_incremental_NOCON_1610_1.json')

Calculating log odds ratio

In [3]:
def aggregate_data(data, gender, model):
    """Flatten nested ``{prompt: {adjective: details}}`` results into row dicts.

    Parameters
    ----------
    data : dict
        Mapping of prompt text to a mapping of adjective to a details dict
        that provides the keys ``"logit"``, ``"antonym"``, ``"antonym_logit"``
        and ``"favored"``.
    gender : str
        Label stored in the ``"Gender"`` field of every row.
    model : str
        Label stored in the ``"Model"`` field of every row.

    Returns
    -------
    list of dict
        One record per (prompt, adjective) pair, in iteration order of ``data``.
    """
    return [
        {
            "Prompt": prompt_text,
            "Descriptor": descriptor,
            "Logit": info["logit"],
            "Antonym": info["antonym"],
            "Antonym Logit": info["antonym_logit"],
            "Favored": info["favored"],
            "Gender": gender,
            "Model": model,
        }
        for prompt_text, per_adjective in data.items()
        for descriptor, info in per_adjective.items()
    ]

# Flatten every gendered condition of the first model into row records.
female_aggregated_model1 = aggregate_data(data=female_data_model1, gender="Female", model="LLaMA-3-8b")
male_aggregated_model1 = aggregate_data(data=male_data_model1, gender="Male", model="LLaMA-3-8b")
neutral_aggregated_model1 = aggregate_data(data=neutral_data_model1, gender="Neutral", model="LLaMA-3-8b")

# Same for the second model.
female_aggregated_model2 = aggregate_data(data=female_data_model2, gender="Female", model="LLaMA-2-7b")
male_aggregated_model2 = aggregate_data(data=male_data_model2, gender="Male", model="LLaMA-2-7b")
neutral_aggregated_model2 = aggregate_data(data=neutral_data_model2, gender="Neutral", model="LLaMA-2-7b")

# One long list of records: model 1 first, then model 2, each in
# Female / Male / Neutral order.
all_data = [
    *female_aggregated_model1,
    *male_aggregated_model1,
    *neutral_aggregated_model1,
    *female_aggregated_model2,
    *male_aggregated_model2,
    *neutral_aggregated_model2,
]

df = pd.DataFrame(all_data)

# Diagnosis phrases searched for in prompts; list order decides which one is
# returned when a prompt contains more than one.
diagnoses = [
    "schizophrenia",
    "depression",
    "generalised anxiety disorder",
    "bipolar disorder",
    "borderline personality disorder",
    "a mental illness",
    "antisocial personality disorder",
    "post-traumatic stress disorder",
    "obsessive-compulsive disorder",
    "dissociative identity disorder",
]


def extract_diagnosis(prompt):
    """Return the first entry of ``diagnoses`` found in ``prompt``, else ``None``.

    Matching is a case-insensitive substring test on the lower-cased prompt.
    """
    lowered = prompt.lower()
    return next((name for name in diagnoses if name in lowered), None)

# Each wide row becomes two long rows: the descriptor (labelled 'negative')
# followed by its antonym (labelled 'positive').
# Tuples are (adjective column, logit column, polarity label).
_polarity_columns = (
    ('Descriptor', 'Logit', 'negative'),
    ('Antonym', 'Antonym Logit', 'positive'),
)

reshaped_data = []

for _, row in df.iterrows():
    # The diagnosis is looked up once per prompt and shared by both long rows.
    diagnosis = extract_diagnosis(row['Prompt'])

    for _adjective_col, _logit_col, _polarity in _polarity_columns:
        reshaped_data.append({
            'Prompt': row['Prompt'],
            'adjective': row[_adjective_col],
            'logit': row[_logit_col],
            'label': _polarity,
            'Gender': row['Gender'],
            'Model': row['Model'],
            'diagnosis': diagnosis,
        })

reshaped_df = pd.DataFrame(reshaped_data)

# Severity class assigned to each diagnosis phrase.
severity_mapping = {
    "schizophrenia": "severe",
    "depression": "not severe",
    "generalised anxiety disorder": "not severe",
    "bipolar disorder": "severe",
    "borderline personality disorder": "severe",
    # Unspecified condition; deliberately grouped with the non-severe ones.
    "a mental illness": "not severe",
    "antisocial personality disorder": "severe",
    "post-traumatic stress disorder": "not severe",
    "obsessive-compulsive disorder": "not severe",
    "dissociative identity disorder": "severe",
}


def add_severity_label(row):
    """Return the severity class for ``row['diagnosis']``, or ``"unknown"`` if unmapped."""
    key = row['diagnosis']
    if key in severity_mapping:
        return severity_mapping[key]
    return "unknown"

# Tag every long row with the severity class of its diagnosis
# ("unknown" when the diagnosis is not in the mapping).
_severity_labels = reshaped_df.apply(add_severity_label, axis=1)
reshaped_df['severity'] = _severity_labels

# Mean logit per (model, diagnosis, polarity label, adjective).
# NOTE(review): `aggregated_df` is rebuilt from `res` at the top of the next
# cell before it is read, so this first value looks unused — confirm before
# removing it.
_mean_logit_keys = ['Model', 'diagnosis', 'label', 'adjective']
_mean_logits = reshaped_df.groupby(_mean_logit_keys)['logit'].mean()
aggregated_df = _mean_logits.reset_index()

def aggregate_data_no_diagnosis(data, model):
    """Flatten results for prompts without a diagnosis into long-format row dicts.

    For every (prompt, adjective) pair two records are produced: the adjective
    itself labelled ``'negative'`` and its antonym labelled ``'positive'``.
    The gender is inferred from pronouns in the prompt and the diagnosis is
    the literal string ``'None'``.

    Parameters
    ----------
    data : dict
        Mapping of prompt text to a mapping of adjective to a details dict
        providing ``"logit"``, ``"antonym"`` and ``"antonym_logit"``.
    model : str
        Label stored in the ``'Model'`` field of every record.

    Returns
    -------
    list of dict
        Two records per (prompt, adjective) pair, negative first.
    """
    import re  # local import: only the pronoun tokeniser below needs it

    # Checked in insertion order, so if a prompt contains pronouns of several
    # genders the earliest entry here decides the result.
    pronouns_to_gender = {
        "he": "Male",
        "him": "Male",
        "his": "Male",
        "she": "Female",
        "her": "Female",
        "hers": "Female",
        "they": "Neutral",
        "them": "Neutral",
        "their": "Neutral",  # possessive form, parallel to "his" / "her"
        "theirs": "Neutral",
    }

    def extract_gender_from_pronouns(prompt):
        """Return 'Male', 'Female' or 'Neutral' from pronouns, else 'Unknown'."""
        # Tokenise on alphabetic runs rather than whitespace so pronouns next to
        # punctuation ("her.", "she,", "he's") are still whole-word matches.
        tokens = set(re.findall(r"[a-z]+", prompt.lower()))
        for pronoun, gender in pronouns_to_gender.items():
            if pronoun in tokens:
                return gender
        return "Unknown"  # no listed pronoun occurs in the prompt

    reshaped_data = []

    for prompt, adjectives in data.items():
        gender = extract_gender_from_pronouns(prompt)

        for adjective, details in adjectives.items():
            # The listed adjective is recorded as the negative term ...
            reshaped_data.append({
                'Prompt': prompt,
                'adjective': adjective,
                'logit': details["logit"],
                'label': 'negative',
                'Gender': gender,
                'Model': model,
                'diagnosis': 'None'
            })

            # ... and its antonym as the positive counterpart.
            reshaped_data.append({
                'Prompt': prompt,
                'adjective': details["antonym"],
                'logit': details["antonym_logit"],
                'label': 'positive',
                'Gender': gender,
                'Model': model,
                'diagnosis': 'None'
            })

    return reshaped_data

# Long-format records for the no-diagnosis prompts of each model.
aggregated_model1 = aggregate_data_no_diagnosis(data=nocon_data_model1, model="LLaMA-3-8b")
aggregated_model2 = aggregate_data_no_diagnosis(data=nocon_data_model2, model="LLaMA-2-7b")

all_nocon = [*aggregated_model1, *aggregated_model2]
nocon_data = pd.DataFrame(all_nocon)

# Stack the diagnosis rows on top of the no-diagnosis rows with a fresh index.
res = pd.concat((reshaped_df, nocon_data), ignore_index=True)

In [4]:
# Mean logit per (model, diagnosis, polarity label, adjective) over all rows of `res`.
_mean_logits_all = res.groupby(['Model', 'diagnosis', 'label', 'adjective'])['logit'].mean()
aggregated_df = _mean_logits_all.reset_index()

# Leave out the unspecified "a mental illness" condition.
aggregated_df = aggregated_df.loc[aggregated_df['diagnosis'].ne('a mental illness')]

def calculate_log_odds_ratios(group1, group2):
    """Average logit difference of each ``group1`` adjective against all of ``group2``.

    For an adjective with logit ``l1`` the result is the mean of ``l1 - l2``
    over every row ``l2`` of ``group2``, which equals ``l1 - mean(group2 logits)``.
    Computing it in that closed form avoids a nested Python loop and works by
    position, so adjectives that occur more than once in ``group2`` (e.g. in a
    concatenated negative + positive pool) are handled row by row.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Frames with an ``'adjective'`` column and a numeric ``'logit'`` column.

    Returns
    -------
    dict
        ``{adjective: mean difference}`` in order of first appearance in
        ``group1``. If an adjective occurs several times in ``group1`` its
        values are averaged. All values are NaN when ``group2`` is empty.
    """
    logits_group1 = group1.set_index('adjective')['logit']
    reference_mean = group2['logit'].mean()

    differences = logits_group1 - reference_mean
    # sort=False keeps the adjectives in their original order; grouping only
    # has an effect when group1 contains repeated adjectives.
    return differences.groupby(level=0, sort=False).mean().to_dict()

def analyze_adjectives(aggregated_df):
    """Build one table of averaged logit differences per comparison type.

    For every (Model, diagnosis) group the rows are split by ``label`` into a
    negative and a positive pool. Each pool is then compared, via
    ``calculate_log_odds_ratios``, against itself, the other pool and the
    concatenation of both.

    Parameters
    ----------
    aggregated_df : pandas.DataFrame
        Frame with the columns ``'Model'``, ``'diagnosis'``, ``'label'``,
        ``'adjective'`` and ``'logit'``.

    Returns
    -------
    dict of pandas.DataFrame
        Keyed by ``'negative_vs_negative'``, ``'negative_vs_positive'``,
        ``'negative_vs_combined'``, ``'positive_vs_positive'``,
        ``'positive_vs_negative'`` and ``'positive_vs_combined'``. Each frame is
        indexed by (Model, Diagnosis) and has one column per adjective.
    """
    # (result key, pool whose adjectives are scored, pool used as reference)
    comparisons = (
        ('negative_vs_negative', 'negative', 'negative'),
        ('negative_vs_positive', 'negative', 'positive'),
        ('negative_vs_combined', 'negative', 'combined'),
        ('positive_vs_positive', 'positive', 'positive'),
        ('positive_vs_negative', 'positive', 'negative'),
        ('positive_vs_combined', 'positive', 'combined'),
    )

    # model -> diagnosis -> comparison key -> {adjective: value}
    per_model = {}
    for (model, diagnosis), group in aggregated_df.groupby(['Model', 'diagnosis']):
        negatives = group[group['label'] == 'negative']
        positives = group[group['label'] == 'positive']
        pools = {
            'negative': negatives,
            'positive': positives,
            'combined': pd.concat([negatives, positives]),
        }

        per_model.setdefault(model, {})[diagnosis] = {
            key: calculate_log_odds_ratios(pools[subject], pools[reference])
            for key, subject, reference in comparisons
        }

    # Flatten the nested dict into one (Model, Diagnosis)-indexed table per key.
    tables = {}
    for key, _, _ in comparisons:
        flat = {
            (model, diagnosis): ratios[key]
            for model, by_diagnosis in per_model.items()
            for diagnosis, ratios in by_diagnosis.items()
        }
        table = pd.DataFrame(flat).T
        table.index.names = ['Model', 'Diagnosis']
        tables[key] = table

    return tables

# One table per comparison type, built from the per-group mean logits.
comparison_dfs = analyze_adjectives(aggregated_df=aggregated_df)


In [5]:
# Keys of the comparison tables that score negative / positive adjectives.
# Each list goes: same pool, opposite pool, combined pool.
negative_comparisons = [f'negative_vs_{pool}' for pool in ('negative', 'positive', 'combined')]
positive_comparisons = [f'positive_vs_{pool}' for pool in ('positive', 'negative', 'combined')]

def aggregate_comparisons(comparison_dfs, comparison_keys, label):
    """Average several comparison tables into one long-format frame.

    Parameters
    ----------
    comparison_dfs : dict of pandas.DataFrame
        Tables indexed by (Model, Diagnosis) with one column per adjective.
    comparison_keys : list of str
        Keys of ``comparison_dfs`` to include.
    label : str
        Value written to the ``'label'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``diagnosis``, ``label``, ``adjective`` and
        ``log_odds_ratio``, where the last one is the mean over the selected
        comparisons.
    """
    long_frames = []
    for key in comparison_keys:
        long_frame = (
            comparison_dfs[key]
            .reset_index()
            .melt(
                id_vars=['Model', 'Diagnosis'],
                var_name='adjective',
                value_name='log_odds_ratio',
            )
            .assign(label=label, comparison=key)
            .rename(columns={'Diagnosis': 'diagnosis'})
        )
        long_frames.append(long_frame)

    stacked = pd.concat(long_frames, ignore_index=True)

    # Collapse the different comparisons into a single mean per adjective.
    return (
        stacked
        .groupby(['Model', 'diagnosis', 'label', 'adjective'])['log_odds_ratio']
        .mean()
        .reset_index()
    )

# Mean score per adjective across the three comparisons of each polarity.
negative_averaged = aggregate_comparisons(
    comparison_dfs, comparison_keys=negative_comparisons, label='negative'
)
positive_averaged = aggregate_comparisons(
    comparison_dfs, comparison_keys=positive_comparisons, label='positive'
)

# Negative rows first, then positive rows; original row indices are kept.
averaged = pd.concat((negative_averaged, positive_averaged))

Ranking based on LOR

In [None]:
# Keep every diagnosis except the unspecified "a mental illness" condition.
res2 = averaged.loc[averaged['diagnosis'].ne('a mental illness')]

def get_top_n_unique(group, n=10, by='log_odds_ratio'):
    """Return the ``n`` highest-scoring rows of ``group`` with unique adjectives.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with an ``'adjective'`` column and the score column named by ``by``.
    n : int
        Maximum number of rows to return.
    by : str
        Column to rank on, highest first (defaults to ``'log_odds_ratio'``).

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows, sorted by ``by`` in descending order, keeping only
        the best-scoring row of each adjective.
    """
    ranked = (
        group
        # A row without a score cannot be ranked; without this, NaN rows would
        # fill the tail of the result whenever fewer than n scored rows exist.
        .dropna(subset=[by])
        .sort_values(by=by, ascending=False)
        # After the descending sort the first occurrence is the best one.
        .drop_duplicates(subset=['adjective'])
    )
    return ranked.head(n)

_lor_group_cols = ['Model', 'diagnosis']

# Up to 20 best-scoring unique adjectives for every (model, diagnosis) pair.
top_n_unique_adjectives = (
    res2.groupby(_lor_group_cols)
    .apply(get_top_n_unique, n=20)
    .reset_index(drop=True)
)

# One mean score per (model, diagnosis, adjective).
top = (
    top_n_unique_adjectives
    .groupby(_lor_group_cols + ['adjective'])['log_odds_ratio']
    .mean()
    .reset_index()
)

# Rank within each (model, diagnosis) group, highest score first.
df_sorted = top.sort_values(
    by=_lor_group_cols + ['log_odds_ratio'],
    ascending=[True, True, False],
)
df_sorted['rank'] = df_sorted.groupby(_lor_group_cols).cumcount() + 1

# Wide table: one row per (model, rank), one column per diagnosis, with the
# adjective as the cell value.
df_pivoted = df_sorted.pivot_table(
    index=['Model', 'rank'],
    columns='diagnosis',
    values='adjective',
    aggfunc='first',
)
df_pivoted.columns.name = None  # drop the 'diagnosis' header label
df_pivoted.index.names = ['Model', 'Rank']

df_pivoted

In [18]:
# Short column headers for the diagnosis names.
rename_dict = {
    'depression': 'DEPR',
    'generalised anxiety disorder': 'ANX',
    'obsessive-compulsive disorder': 'OCD',
    'post-traumatic stress disorder': 'PTSD',
    'antisocial personality disorder': 'APD',
    'schizophrenia': 'SCHI',
    'bipolar disorder': 'BIP',
    'borderline personality disorder': 'BPD',
    'dissociative identity disorder': 'DID',
}

# Column order of the final table; 'None' is the no-diagnosis condition.
desired_order = [
    'None', 'DEPR', 'ANX', 'OCD', 'PTSD', 'APD', 'SCHI', 'BIP', 'BPD', 'DID'
]

# Abbreviate the headers, then select the columns in the order above.
df_pivoted = df_pivoted.rename(columns=rename_dict)[desired_order]

df_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,None,DEPR,ANX,OCD,PTSD,APD,SCHI,BIP,BPD,DID
Model,Rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LLaMA-2-7b,1,inactive,inactive,inactive,inactive,inactive,inactive,inactive,inactive,inactive,inactive
LLaMA-2-7b,2,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent
LLaMA-2-7b,3,insecure,insecure,insecure,insecure,insecure,insecure,insecure,insecure,insecure,insecure
LLaMA-2-7b,4,innocent,overbearing,overbearing,overbearing,overbearing,unexaggerated,uncurable,overbearing,overbearing,different
LLaMA-2-7b,5,withdrawn,unapproachable,unapproachable,withdrawn,withdrawn,unfamiliar,unfamiliar,unapproachable,unapproachable,withdrawn
LLaMA-2-7b,6,unfamiliar,uncapable,uncapable,unfamiliar,uncurable,unfriendly,unfriendly,uncapable,uncurable,innocent
LLaMA-2-7b,7,unfriendly,uncurable,uncurable,unfriendly,unfamiliar,unhelpful,unhealthy,uncurable,unfamiliar,unhelpful
LLaMA-2-7b,8,unhealthy,unfamiliar,unfamiliar,unhealthy,unfriendly,unhinged,unhelpful,unfamiliar,unfriendly,unhinged
LLaMA-2-7b,9,unhelpful,unfriendly,unfriendly,unhelpful,unhealthy,unlawful,unhinged,unfriendly,unhealthy,unlawful
LLaMA-2-7b,10,unhinged,unhealthy,unhealthy,unhinged,unhelpful,unlovable,unlawful,unhealthy,unhelpful,unlovable


In [19]:
# NOTE(review): the caption mentions "None" and "Gender", but the table printed
# here is indexed by Model/Rank with one column per diagnosis — confirm the
# wording before using it in the thesis.
latex_table = df_pivoted.to_latex(
    multirow=False,      # do not merge repeated index labels with \multirow
    multicolumn=False,   # do not merge column headers with \multicolumn
    longtable=False,     # emit a regular tabular, not a longtable environment
    caption="LOR-based Top 10 Adjectives for None by Model, Diagnosis, and Gender",
    label="tab:top_adjectives",
    escape=False         # write cell text verbatim; LaTeX special characters are not escaped
)

print(latex_table)

\begin{table}
\centering
\caption{LOR-based Top 10 Adjectives for None by Model, Diagnosis, and Gender}
\label{tab:top_adjectives}
\begin{tabular}{llllllllllll}
\toprule
           &    &           None &            DEPR &             ANX &            OCD &           PTSD &            APD &           SCHI &             BIP &             BPD &            DID \\
Model & Rank &                &                 &                 &                &                &                &                &                 &                 &                \\
\midrule
LLaMA-2-7b & 1  &       inactive &        inactive &        inactive &       inactive &       inactive &       inactive &       inactive &        inactive &        inactive &       inactive \\
           & 2  &    incompetent &     incompetent &     incompetent &    incompetent &    incompetent &    incompetent &    incompetent &     incompetent &     incompetent &    incompetent \\
           & 3  &       insecure &        insecure &

  df_pivoted.to_latex(


Ranking based on logit, per gender

In [35]:
def get_top_n_unique(group, n=20):
    """Return the ``n`` rows of ``group`` with the highest ``'logit'``, one per adjective.

    Rows are sorted by ``'logit'`` in descending order and only the first
    (highest) row of each adjective is kept.

    Note: this definition replaces the earlier ``get_top_n_unique`` in this
    notebook, which ranks on ``'log_odds_ratio'`` instead.
    """
    by_logit_desc = group.sort_values(by='logit', ascending=False)
    return by_logit_desc.drop_duplicates(subset=['adjective'], keep='first').head(n)

# Up to 20 highest-logit unique adjectives per (model, diagnosis, gender).
top_n_unique_adjectives = (
    res.groupby(['Model', 'diagnosis', 'Gender'])
    .apply(lambda group: get_top_n_unique(group, n=20))
    .reset_index(drop=True)
)

# One mean logit per (model, diagnosis, gender, adjective).
top = top_n_unique_adjectives.groupby(['Model', 'diagnosis', 'Gender', 'adjective'])['logit'].mean().reset_index()

# Rank within each (model, diagnosis, gender) group, highest logit first.
df_sorted = top.sort_values(by=['Model', 'diagnosis', 'Gender', 'logit'], ascending=[True, True, True, False])

df_sorted['rank'] = df_sorted.groupby(['Model', 'diagnosis', 'Gender']).cumcount() + 1

# Wide table: rows are (model, rank), columns are (diagnosis, gender) and the
# cell value is the adjective.
df_pivoted = df_sorted.pivot_table(index=['Model', 'rank'], columns=['diagnosis', 'Gender'], values='adjective', aggfunc='first')

# The columns form a two-level MultiIndex here. Assigning `columns.name` leaves
# the level names ('diagnosis', 'Gender') in place, so clear `names` to actually
# remove them from the header.
df_pivoted.columns.names = [None, None]
df_pivoted.index.names = ['Model', 'Rank']  # Set the index names for clarity

# Short column headers for the diagnosis names.
rename_dict = {
    'depression': 'DEPR',
    'generalised anxiety disorder': 'ANX',
    'obsessive-compulsive disorder': 'OCD',
    'post-traumatic stress disorder': 'PTSD',
    'antisocial personality disorder': 'APD',
    'schizophrenia': 'SCHI',
    'bipolar disorder': 'BIP',
    'borderline personality disorder': 'BPD',
    'dissociative identity disorder': 'DID'
}

df_pivoted = df_pivoted.rename(columns=rename_dict)

# Order of the top-level (diagnosis) columns; 'None' is the no-diagnosis condition.
desired_order = [
    'None', 'DEPR', 'ANX', 'OCD', 'PTSD', 'APD', 'SCHI', 'BIP', 'BPD', 'DID'
]

df_pivoted = df_pivoted[desired_order]

df_pivoted


Unnamed: 0_level_0,diagnosis,None,None,None,DEPR,DEPR,DEPR,ANX,ANX,ANX,OCD,...,SCHI,BIP,BIP,BIP,BPD,BPD,BPD,DID,DID,DID
Unnamed: 0_level_1,Gender,Female,Male,Neutral,Female,Male,Neutral,Female,Male,Neutral,Female,...,Neutral,Female,Male,Neutral,Female,Male,Neutral,Female,Male,Neutral
Model,Rank,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
LLaMA-2-7b,1,inactive,inactive,inactive,inactive,inactive,inactive,overbearing,confident,inactive,inactive,...,inactive,inactive,inactive,inactive,unapproachable,unapproachable,inactive,inactive,inactive,different
LLaMA-2-7b,2,incompetent,incompetent,incompetent,incompetent,incompetent,incompetent,inactive,overbearing,incompetent,incompetent,...,incompetent,incompetent,incompetent,incompetent,uncurable,uncapable,incompetent,incompetent,incompetent,inactive
LLaMA-2-7b,3,insecure,insecure,insecure,insecure,insecure,insecure,incompetent,shady,insecure,insecure,...,insecure,insecure,insecure,insecure,unexaggerated,uncurable,insecure,insecure,insecure,incompetent
LLaMA-2-7b,4,innocent,innocent,innocent,unapproachable,unapproachable,unapproachable,insecure,shameless,overbearing,overbearing,...,uncapable,unapproachable,unapproachable,unapproachable,unfamiliar,unexaggerated,inconsiderate,unapproachable,different,insecure
LLaMA-2-7b,5,withdrawn,unapproachable,overbearing,uncapable,uncapable,uncapable,confident,shy,unapproachable,intelligent,...,uncurable,uncapable,uncurable,uncapable,unfriendly,unfamiliar,emotional,uncapable,unapproachable,normal
LLaMA-2-7b,6,confident,uncapable,uncapable,uncurable,unfamiliar,uncurable,mature,inactive,uncapable,normal,...,unexaggerated,uncurable,unexaggerated,uncurable,unhealthy,unfriendly,overbearing,uncurable,uncapable,uncapable
LLaMA-2-7b,7,unapproachable,unexaggerated,unexaggerated,unfriendly,unfriendly,unexaggerated,quiet,incompetent,uncurable,unapproachable,...,unfamiliar,unexaggerated,unfriendly,unfriendly,unhelpful,unhealthy,manipulative,unexaggerated,unexaggerated,uncurable
LLaMA-2-7b,8,uncapable,unfriendly,unfriendly,unhealthy,unhealthy,unfamiliar,shady,insecure,unexaggerated,uncapable,...,unfriendly,unfriendly,unhealthy,unhealthy,unhinged,unhelpful,sensitive,unfamiliar,unfamiliar,unexaggerated
LLaMA-2-7b,9,unexaggerated,unhealthy,unhealthy,unhelpful,unhelpful,unfriendly,shameless,mature,unfriendly,uncurable,...,unhealthy,unhealthy,unhelpful,unhelpful,unlawful,unlawful,dramatic,unfriendly,unfriendly,unhealthy
LLaMA-2-7b,10,unfamiliar,unhelpful,unhelpful,unhinged,unhinged,unhealthy,shy,quiet,unhelpful,unexaggerated,...,unhinged,unhinged,unhinged,unhinged,unlovable,unlovable,uncapable,unhinged,unhelpful,unhelpful
