In [49]:
import pandas as pd 
import json 
import random

def read_jsonl(file_path):
    # List to store the extracted data
    data = []

    # Read and process the JSONL file
    with open(file_path, 'r') as file:
        for line in file:
            try:
                json_data = json.loads(line)  # Parse each line as JSON
                task_id = json_data.get("task_id")  # Extract the task_id
                
                # Access the nested "content" field inside "choices"
                choices = json_data.get("choices", [])
                for choice in choices:
                    message = choice.get("message", {})
                    content = message.get("content")  # Extract the text content
                    
                    if content:
                        # Append the task_id and content to the data list
                        data.append({"task_id": task_id, "text": content})
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)
    return df

#helper lambda function
def extract_vote(text):
    if "[[yes]]" in text and "[[no]]" in text:
        return pd.NA
    elif "[[yes]]" in text:
        return 1
    elif "[[no]]" in text:
        return 0
    else:
        return pd.NA


def extract_data(df, i):
    # Filter rows where 'task_id' starts with the desired prefix
    filtered_df = df[df['task_id'].str.startswith(f'task-{i}-')].copy()    

    # Extract the index to enable a join operation later
    filtered_df['index'] = filtered_df['task_id'].str.split('-').str[-1].astype(int)
    
    # Extract the batch number for column naming
    num = filtered_df['task_id'].str.split('-').str[-2].astype(int).iloc[0]
    
    # Set the index
    filtered_df = filtered_df.set_index('index').drop(columns=['task_id'])
    
    # Rename the column for text
    filtered_df = filtered_df.rename(columns={"text": f"Text_{num}"})
    
    # Get binary votes from models
    filtered_df[f'Vote_{num}'] = filtered_df[f'Text_{num}'].apply(lambda x: extract_vote(x))
    
    return filtered_df


def join_votes(df_list): 
    df = pd.concat(df_list, axis="columns")
    df = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9", "Vote_10"]]
    df["Proportion_Yes"] = df.mean(axis="columns")
    df["Proportion_Yes_10"] = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9"]].mean(axis='columns')
    
    #
    df["Proportion_Yes_9a"] = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8"]].mean(axis='columns')
    df["Proportion_Yes_9b"] = df[["Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9"]].mean(axis='columns')
    
    df["Proportion_Yes"] = df.mean(axis="columns")

    df["Majority_Vote"] = df["Proportion_Yes"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)

    #Fix cases where this is still a tie!
    df.loc[df["Majority_Vote"].isna(), "Majority_Vote"] = df["Proportion_Yes_10"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)
    random.seed(537)
    if random.random() > 0.5: 
        df.loc[df["Majority_Vote"].isna(), "Majority_Vote"] = df["Proportion_Yes_9a"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)
    else: 
        df.loc[df["Majority_Vote"].isna(), "Majority_Vote"] = df["Proportion_Yes_9b"].apply(lambda x: 1 if x > 0.5 else 0 if x < 0.5 else pd.NA)


    df = df.drop(columns=['Proportion_Yes_10', 'Proportion_Yes_9a', 'Proportion_Yes_9b'])

    return df


df1 = read_jsonl('../results/llama/all_responses.jsonl')
df2 = read_jsonl('../results/llama/all_responses2.jsonl')
df3 = read_jsonl('../results/llama/all_responses_3.jsonl')

df = pd.concat([df1, df2, df3], ignore_index=True)

# Extract numeric parts
df['first_number'] = df['task_id'].str.extract(r'task-(\d+)-')[0].astype(int)
df['last_number'] = df['task_id'].str.extract(r'-(\d+)$')[0].astype(int)

# Sort by first_number and then by last_number
df = df.sort_values(by=['first_number', 'last_number']).drop(columns=['first_number', 'last_number'])

# Reset index if needed
df = df.reset_index(drop=True)

df_list = []
for i in range(11): 
    split_df = extract_data(df, i)
    df_list.append(split_df)

df = join_votes(df_list)



In [50]:
#Finding errors
votes = df[["Vote_0", "Vote_1", "Vote_2", "Vote_3", "Vote_4", "Vote_5", "Vote_6", "Vote_7", "Vote_8", "Vote_9", "Vote_10"]]
print(votes.isna().sum())

Vote_0      65
Vote_1      80
Vote_2      73
Vote_3      57
Vote_4      71
Vote_5     317
Vote_6     355
Vote_7     313
Vote_8     319
Vote_9     334
Vote_10    201
dtype: int64


In [51]:
df

Unnamed: 0_level_0,Vote_0,Vote_1,Vote_2,Vote_3,Vote_4,Vote_5,Vote_6,Vote_7,Vote_8,Vote_9,Vote_10,Proportion_Yes,Majority_Vote
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1,1,1,1,1,1,1,1,1,1,1.0,1
1,1,1,1,0,1,0,1,1,1,1,1,0.811582,1
2,1,1,1,1,1,1,1,1,1,1,1,1.0,1
3,1,1,1,1,1,1,1,1,0,1,1,0.905791,1
4,1,1,1,1,1,1,,1,1,,1,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2492,0,0,1,1,1,0,1,1,1,1,1,0.724781,1
2493,1,1,1,0,0,0,1,1,1,1,1,0.717374,1
2494,1,1,1,1,0,0,1,1,0,0,0,0.557845,1
2495,1,1,0,1,1,1,1,1,1,1,1,0.905791,1


In [52]:
import numpy as np
ballotq = pd.read_csv('../data/clean_ballot_qs.csv', index_col=0)
result = pd.concat([ballotq, df], axis='columns')
result = result[["Year", "State", "Topic_Areas", "Election", "Percent_Yes", "Pass", "Proportion_Yes", "Majority_Vote"]]
np.sum(result["Majority_Vote"].isna())

np.int64(0)

In [53]:
import numpy as np
no_ties = result.dropna(subset="Majority_Vote")
print("Among results without ties, percentage of LLM responses aligning with voters")
print(np.mean(no_ties["Pass"]==no_ties["Majority_Vote"]))

Among results without ties, percentage of LLM responses aligning with voters
0.6924309171005206
