In [27]:
import json
import numpy as np
from pathlib import Path
import pandas as pd
import random

def read_jsonl(file_path):
    # List to store the extracted data
    data = []

    # Read and process the JSONL file
    with open(file_path, 'r') as file:
        for line in file:
            json_data = json.loads(line)  # Parse each line as JSON
            
            # Extract the custom_id
            custom_id = json_data.get("custom_id")
            
            # Access the nested "content" field inside "choices"
            choices = json_data.get("response", {}).get("body", {}).get("choices", [])
            for choice in choices:
                message = choice.get("message", {})
                content = message.get("content")  # Extract the "content" text
                
                if content:
                    # Append the custom_id and content to the data list
                    data.append({"custom_id": custom_id, "text": content})

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)
    return df

#helper lambda function
def extract_vote(text):
    if "[[yes]]" in text and "[[no]]" in text:
        return pd.NA
    elif "[[yes]]" in text:
        return 1
    elif "[[no]]" in text:
        return 0
    else:
        return pd.NA

def extract_data(df):
    #extract the index so we can do a join operation later on 
    df['index'] = df['custom_id'].str.split('-').str[-1].astype(int)

    #extract the batch number for solumn naming 
    num = df['custom_id'].str.split('-').str[-2].astype(int)[0]

    #set index
    df = df.set_index("index").drop(columns=["custom_id"])

    #rename
    df  = df.rename(columns={"text" : f"Text_{num}"})

    #get binary votes from models 
    df[f'Vote_{num}'] = df[f'Text_{num}'].apply(lambda x: extract_vote(x))
    return df


def join_votes(df_list): 
    df = pd.concat(df_list, axis="columns")
    df = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9", "Vote_10"]]
    df["Proportion_Yes"] = df.mean(axis="columns")
    df["Proportion_Yes_10"] = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9"]].mean(axis='columns')
    
    #
    df["Proportion_Yes_9a"] = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8"]].mean(axis='columns')
    df["Proportion_Yes_9b"] = df[["Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9"]].mean(axis='columns')
    
    df["Proportion_Yes"] = df.mean(axis="columns")

    df["Majority_Vote"] = df["Proportion_Yes"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)

    #Fix cases where this is still a tie!
    df.loc[df["Majority_Vote"].isna(), "Majority_Vote"] = df["Proportion_Yes_10"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)
    random.seed(537)
    if random.random() > 0.5: 
        df.loc[df["Majority_Vote"].isna(), "Majority_Vote"] = df["Proportion_Yes_9a"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)
    else: 
        df.loc[df["Majority_Vote"].isna(), "Majority_Vote"] = df["Proportion_Yes_9b"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)


    df = df.drop(columns=['Proportion_Yes_10', 'Proportion_Yes_9a', 'Proportion_Yes_9b'])

    return df

# Path to the folder
folder_path = Path('../results/mistral/')

# Get all file paths in the folder
file_paths = list(folder_path.rglob('*')) 


df_list = []
for path in file_paths:
    df = read_jsonl(path)
    df = extract_data(df)
    df_list.append(df)
df = join_votes(df_list)

df


Unnamed: 0_level_0,Vote_0,Vote_1,Vote_2,Vote_3,Vote_4,Vote_5,Vote_6,Vote_7,Vote_8,Vote_9,Vote_10,Proportion_Yes,Majority_Vote
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1,1,1,1,1,1,1,1,1,1,1.0,1
1,1,1,0,0,0,1,,0,0,1,1,0.478175,0
2,,1,1,1,1,1,1,1,1,1,1,1.0,1
3,1,1,1,1,1,,1,1,1,1,,1.0,1
4,1,1,,1,,1,1,1,1,1,1,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2445,1,1,1,1,1,1,1,1,1,1,1,1.0,1
2446,1,1,1,1,1,1,1,1,1,1,1,1.0,1
2447,1,1,1,1,1,1,1,1,1,1,1,1.0,1
2448,0,1,1,1,1,1,1,0,0,0,1,0.63798,1


In [28]:
#Finding errors
votes = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9", "Vote_10"]]
print(votes.isna().sum())

Vote_0     67
Vote_1     66
Vote_2     45
Vote_3     51
Vote_4     60
Vote_5     50
Vote_6     43
Vote_7     44
Vote_8     61
Vote_9     56
Vote_10    69
dtype: int64


In [29]:
df.mean()

Vote_0            0.739095
Vote_1            0.746195
Vote_2            0.746737
Vote_3            0.742845
Vote_4            0.743127
Vote_5            0.751532
Vote_6            0.742054
Vote_7            0.764778
Vote_8            0.758621
Vote_9            0.760344
Vote_10           0.760708
Proportion_Yes    0.747848
Majority_Vote     0.771635
dtype: object

In [30]:
ballotq = pd.read_csv('../data/clean_ballot_qs.csv', index_col=0)
result = pd.concat([ballotq, df], axis='columns')
result = result[["Year", "State", "Topic_Areas", "Election", "Percent_Yes", "Pass", "Proportion_Yes", "Majority_Vote"]]
result

Unnamed: 0,Year,State,Topic_Areas,Election,Percent_Yes,Pass,Proportion_Yes,Majority_Vote
0,2000,Alabama,Civil & Constitutional Law,General,0.600,1,1.0,1
1,2000,Alabama,Judiciary,General,0.570,1,0.478175,0
2,2000,Alabama,Local Government,General,0.630,1,1.0,1
3,2000,Alabama,Human Services,General,0.570,1,1.0,1
4,2000,Alabama,Bond Measures,General,0.630,1,1.0,1
...,...,...,...,...,...,...,...,...
2492,2014,Wyoming,Education: Higher Ed,General,0.297,0,0.652054,1
2493,2016,Wyoming,Budgets,General,0.563,1,0.376835,0
2494,2020,Wyoming,Energy & Electric Utilities,General,0.510,1,0.188418,0
2495,2022,Wyoming,Judiciary,General,0.390,0,0.927273,1


In [31]:
np.sum(result["Majority_Vote"].isna())

np.int64(1)

In [32]:
no_ties = result.dropna(subset="Majority_Vote")
print("Among results without ties, percentage of LLM responses aligning with voters")
print(np.mean(no_ties["Pass"]==no_ties["Majority_Vote"]))

Among results without ties, percentage of LLM responses aligning with voters
0.6987179487179487
