In [24]:
import pandas as pd 
import json 

def read_jsonl(file_path):
    """Load chat-completion responses from a JSONL file into a DataFrame.

    Every non-blank line is parsed as one JSON object. For each entry of the
    object's "choices" list whose "message" carries non-empty "content", one
    row is emitted holding the object's "task_id" and that content.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSONL file to read.

    Returns
    -------
    pandas.DataFrame
        Columns "task_id" and "text", one row per non-empty choice. Both
        columns exist even when no row could be extracted, so downstream
        column lookups do not fail on an empty file.

    Notes
    -----
    Parsing is best-effort: a line that is not valid JSON is reported on
    stdout and skipped instead of raising.
    """
    rows = []

    # Decode explicitly as UTF-8 so non-ASCII model output does not depend on
    # the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) are not parse errors.
            if not line.strip():
                continue

            try:
                json_data = json.loads(line)  # Parse each line as JSON
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                continue

            task_id = json_data.get("task_id")

            # `or` guards against explicit nulls ("choices": null / "message": null),
            # which .get() would otherwise hand back as None.
            for choice in json_data.get("choices") or []:
                message = choice.get("message") or {}
                content = message.get("content")

                if content:
                    rows.append({"task_id": task_id, "text": content})

    return pd.DataFrame(rows, columns=["task_id", "text"])

# Helper that turns a model response into a binary vote
def extract_vote(text):
    """Map a response string to a vote.

    Returns 1 when only "[[yes]]" occurs in ``text``, 0 when only "[[no]]"
    occurs, and ``pd.NA`` when both markers or neither marker is present.
    """
    has_yes = "[[yes]]" in text
    has_no = "[[no]]" in text

    # Both markers (contradictory) or neither marker (no verdict): undecided.
    if has_yes == has_no:
        return pd.NA
    return 1 if has_yes else 0


def extract_data(df, i):
    """Build the text/vote frame for the rows whose task_id starts with ``task-{i}-``.

    The last dash-separated piece of each task_id becomes the integer row
    index. The second-to-last piece of the first matching task_id, as an
    integer, is used as the suffix of the two output columns:
    ``Text_<num>`` (the original "text" column) and ``Vote_<num>`` (the
    result of ``extract_vote`` on that text). The "task_id" column itself is
    dropped.

    Note: when no row matches the prefix, taking the first task_id fails
    with an IndexError.
    """
    prefix = f'task-{i}-'
    batch = df[df['task_id'].str.startswith(prefix)].copy()

    # Split each id once; both the row index and the column suffix come from it.
    id_parts = batch['task_id'].str.split('-')
    batch['index'] = id_parts.str[-1].astype(int)
    num = id_parts.str[-2].astype(int).iloc[0]

    text_col = f"Text_{num}"
    vote_col = f"Vote_{num}"

    # Index by the trailing number so the per-batch frames can be aligned later.
    batch = (
        batch
        .set_index('index')
        .drop(columns=['task_id'])
        .rename(columns={"text": text_col})
    )

    # Binary vote (1 / 0 / pd.NA) parsed from each response text.
    batch[vote_col] = batch[text_col].apply(extract_vote)

    return batch


def join_votes(df_list, vote_columns=None):
    """Combine per-batch frames and summarise the votes of every row.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames sharing a row index, each carrying a ``Vote_<n>`` column whose
        values are 1, 0 or ``pd.NA``.
    vote_columns : list of str, optional
        Vote columns to keep and aggregate. Defaults to every column whose
        name starts with ``"Vote_"``, in concatenation order, so the function
        works for any number of batches.

    Returns
    -------
    pandas.DataFrame
        The selected vote columns (unchanged) plus
        ``Proportion_Yes`` - mean of the non-missing votes in the row
        (NaN when every vote is missing), and
        ``Majority_Vote`` - 1 if the proportion is above 0.5, 0 if below,
        ``pd.NA`` for an exact tie or when no vote is available.

    Raises
    ------
    KeyError
        If no vote column is found, or a requested column is absent.
    """
    combined = pd.concat(df_list, axis="columns")

    if vote_columns is None:
        vote_columns = [c for c in combined.columns if str(c).startswith("Vote_")]
        if not vote_columns:
            raise KeyError("no 'Vote_' columns found in the concatenated frames")

    # Work on an explicit copy: adding columns to a bare column slice triggers
    # pandas' SettingWithCopyWarning.
    votes = combined[list(vote_columns)].copy()

    # Votes are stored as Python objects (1 / 0 / pd.NA). Coerce to floats with
    # NaN so the row-wise mean does not rely on object-dtype reductions.
    numeric_votes = votes.apply(pd.to_numeric, errors="coerce")
    votes["Proportion_Yes"] = numeric_votes.mean(axis="columns")

    def _majority(proportion):
        # Exact ties (0.5) and rows without any usable vote (NaN) stay
        # undecided for now and are reported as pd.NA.
        if proportion > 0.5:
            return 1
        if proportion < 0.5:
            return 0
        return pd.NA

    votes["Majority_Vote"] = votes["Proportion_Yes"].apply(_majority)
    return votes


# Load the two response files
df1 = read_jsonl('../results/llama/all_responses.jsonl')
df2 = read_jsonl('../results/llama/all_responses2.jsonl')

# Stack both files, then order the rows by the two numbers embedded in the
# task_id ("task-<first>-...-<last>"). The helper columns exist only for
# sorting and are dropped again before the index is renumbered.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .assign(
        first_number=lambda d: d['task_id'].str.extract(r'task-(\d+)-')[0].astype(int),
        last_number=lambda d: d['task_id'].str.extract(r'-(\d+)$')[0].astype(int),
    )
    .sort_values(by=['first_number', 'last_number'])
    .drop(columns=['first_number', 'last_number'])
    .reset_index(drop=True)
)

# One text/vote frame per task prefix 0..9
df_list = []
for i in range(10):
    split_df = extract_data(df, i)
    df_list.append(split_df)

# From here on `df` holds the joined votes rather than the raw responses
df = join_votes(df_list)



In [26]:
# Finding errors: per vote column, count the rows with no usable vote (NA)
votes = df[[f"Vote_{k}" for k in range(10)]]
print(votes.isna().sum())

Vote_0     65
Vote_1     80
Vote_2     73
Vote_3     57
Vote_4     71
Vote_5    317
Vote_6    355
Vote_7    313
Vote_8    319
Vote_9    334
dtype: int64


In [27]:
# Column-wise mean of every vote column and of the two summary columns.
# Votes are coded 1 (yes) / 0 (no), so each Vote_<n> mean is the share of
# "yes" among that column's votes (pandas skips missing values by default).
df.mean()

Vote_0            0.819079
Vote_1            0.816301
Vote_2            0.805693
Vote_3            0.815984
Vote_4            0.826463
Vote_5            0.783028
Vote_6             0.79972
Vote_7            0.783425
Vote_8             0.78191
Vote_9            0.791493
Proportion_Yes    0.803987
Majority_Vote     0.844771
dtype: object

In [28]:
# Ballot-question data; the CSV's first column is used as the row index
ballotq = pd.read_csv('../data/clean_ballot_qs.csv', index_col=0)

# Put the ballot data and the vote summary side by side (aligned on the row
# index) and keep only the columns needed for the comparison.
result = pd.concat([ballotq, df], axis='columns').loc[
    :,
    ["Year", "State", "Topic_Areas", "Election", "Percent_Yes", "Pass", "Proportion_Yes", "Majority_Vote"],
]
result

Unnamed: 0,Year,State,Topic_Areas,Election,Percent_Yes,Pass,Proportion_Yes,Majority_Vote
0,2000,Alabama,Civil & Constitutional Law,General,0.600,1,1.0,1
1,2000,Alabama,Judiciary,General,0.570,1,0.8,1
2,2000,Alabama,Local Government,General,0.630,1,1.0,1
3,2000,Alabama,Human Services,General,0.570,1,0.9,1
4,2000,Alabama,Bond Measures,General,0.630,1,1.0,1
...,...,...,...,...,...,...,...,...
2492,2014,Wyoming,Education: Higher Ed,General,0.297,0,0.7,1
2493,2016,Wyoming,Budgets,General,0.563,1,0.7,1
2494,2020,Wyoming,Energy & Electric Utilities,General,0.510,1,0.6,1
2495,2022,Wyoming,Judiciary,General,0.390,0,0.9,1


In [29]:
import numpy as np
# Rows whose Majority_Vote is missing (join_votes leaves ties undecided) are
# excluded before comparing the LLM majority with the actual ballot outcome.
no_ties = result.dropna(subset="Majority_Vote")
print("Among results without ties, percentage of LLM responses aligning with voters")
agreement = no_ties["Pass"].eq(no_ties["Majority_Vote"])
print(np.mean(agreement))

Among results without ties, percentage of LLM responses aligning with voters
0.6956699346405228
