In [1]:
import glob
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from helpers.metrics import get_bootstrap
from helpers.process_results import (
    majority_vote,
    process_crowdsourcing_data,
    process_gpt_response,
)
from scipy.stats import t
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

warnings.filterwarnings("ignore")

In [2]:
np.random.seed(1234)

# Accuracy and Basic Data Statistics

## Prepare data and get statistics

### Create the ground truth dataframe for Q1

In [3]:
votes_df = pd.read_json("../data/processed_data/votes_df.json")

ground_truth_df = votes_df[["debate_id", "more_convincing_arguments"]]
ground_truth_df = ground_truth_df.rename(columns={"more_convincing_arguments": "q1"})
ground_truth_df = (
    ground_truth_df.groupby("debate_id")
    .apply(lambda x: majority_vote(x, "q1"))
    .to_frame()
    .rename(columns={0: "q1"})
    .reset_index()
)

ground_truth_df.head(10)

Unnamed: 0,debate_id,q1
0,0,Con
1,1,Con
2,3,Pro
3,4,Con
4,5,Con
5,7,Con
6,8,Pro
7,9,Tie
8,10,Pro
9,11,Con


### Collect all LLM data in one DF

In [4]:
propositions_df = pd.read_json("../data/raw_data/propositions.json")
PoliPropDataset = list(propositions_df.debate_id.unique())

full_list = []
all_results_files = glob.glob("../results/*/q*/*q*")

for file in all_results_files:
    model = file.split("/")[2]
    question = file.split("/")[3]

    df = pd.read_json(file)
    df["model"] = model
    df["question"] = question

    full_list.append(df)

full_df = pd.concat(full_list)
full_df = full_df[full_df.debate_id.isin(PoliPropDataset)]
full_df = process_gpt_response(full_df)

full_df = full_df[
    [
        "question",
        "debate_length",
        "model",
        "debate_id",
        "voter_id",
        "gpt_response",
        "agreed_before",
        "agreed_after",
        "correct_form",
        "answer_extracted",
    ]
]
full_df = full_df.rename(
    columns={
        "gpt_response": "response",
        "agreed_before": "q2",
        "agreed_after": "q3",
    }
)
full_df = full_df.merge(ground_truth_df, on="debate_id")
full_df.head()

Unnamed: 0,question,debate_length,model,debate_id,voter_id,response,q2,q3,correct_form,answer_extracted,q1
0,q1,full,GPT-3.5,358,,Pro,,,True,True,Con
1,q2,full,GPT-3.5,358,imabench,Con,Tie,Tie,True,True,Con
2,q2,full,GPT-3.5,358,9spaceking,Con,Tie,Tie,True,True,Con
3,q3,full,GPT-3.5,358,imabench,Pro,Tie,Tie,True,True,Con
4,q3,full,GPT-3.5,358,9spaceking,Pro,Tie,Tie,True,True,Con


### Get Datasets

In [5]:
# ensuring all the models and questions have the same debate ids and voter ids
# display(full_df.groupby(["question", "model"]).debate_id.nunique()) # UNCOMMENT to view
# display(full_df.groupby(["question", "model"]).voter_id.nunique()) # UNCOMMENT line to view

In [19]:
propositions_df

Unnamed: 0,debate_id,proposition
0,358,The September 11Th Attacks Were Orchestrated B...
1,376,The September 11Th Attacks Were Orchestrated B...
2,384,The Buildings Collapsed On September 11Th Nece...
3,551,Poverty Can Be Eliminated By Creating A Privat...
4,553,"In The Case Of District Of Columbia V. Heller,..."
...,...,...
1042,76836,Voluntary Abortion Should Be Legal.
1043,76983,Women Make Better Politicians Than Men.
1044,76991,Women Should Be Allowed To Serve On U.S. Subma...
1047,77285,"Having A ""Green"" Infrastructure Would Help Ame..."


In [25]:
propositions_df[propositions_df.debate_id == 553].proposition.values[0]

"In The Case Of District Of Columbia V. Heller, Heller'S Argument Is Consistent With The Constituion."

In [6]:
short_debates = (
    full_df[full_df.debate_length == "full"]
    .groupby(["question", "model"])
    .debate_id.unique()
)
short_current_set = set(short_debates[0])
for sds in short_debates:
    short_current_set = short_current_set.intersection(set(sds))


trimmed_debates = full_df.groupby(["question", "model"]).debate_id.unique()
trimmed_current_set = set(trimmed_debates[0])
for tds in trimmed_debates:
    trimmed_current_set = trimmed_current_set.intersection(set(tds))

SHORT = list(short_current_set)
TRIMMED = list(trimmed_current_set)
ABORTION = list(
    propositions_df[
        (propositions_df.proposition.str.lower().str.contains("abortion"))
    ].debate_id.unique()
)
GAY_MARRIAGE = list(
    propositions_df[
        (
            propositions_df.proposition.str.lower().str.contains("same sex")
            | propositions_df.proposition.str.lower().str.contains("gay")
            | propositions_df.proposition.str.lower().str.contains("same-sex")
        )
        & (propositions_df.proposition.str.lower().str.contains("marriage"))
    ].debate_id.unique()
)

CAPITAL_PUNISHMENT = list(
    propositions_df[
        (
            propositions_df.proposition.str.lower().str.contains("death penalty")
            | propositions_df.proposition.str.lower().str.contains("capital punishment")
        )
    ].debate_id.unique()
)

ISSUES = list(set(ABORTION + GAY_MARRIAGE + CAPITAL_PUNISHMENT))

DATASETS = [TRIMMED, SHORT, ISSUES]
DATASET_NAMES = ["Trimmed", "Short", "Issues"]

In [7]:
votes_filtered_df = pd.read_json("../data/filtered_data/votes_filtered_df.json")

num_debates = []
num_votes = []
for debate_ids in [TRIMMED, SHORT, ISSUES]:
    num_debates.append(len(debate_ids))
    num_votes.append(
        (len(votes_filtered_df[votes_filtered_df.debate_id.isin(debate_ids)]))
    )

datasets_df = pd.DataFrame(
    {
        "Dataset": DATASET_NAMES,
        "Number of debates": num_debates,
        "Number of votes": num_votes,
    }
)
datasets_df

Unnamed: 0,Dataset,Number of debates,Number of votes
0,Trimmed,833,4871
1,Short,276,1538
2,Issues,127,836


### Prepare MTurk data

In [8]:
# load all crowdsourcing files
crowd_files = glob.glob("../data/raw_data/crowd/*")
crowd_dfs = []
for file in crowd_files:
    crowd_df = pd.read_csv(file)
    crowd_dfs.append(crowd_df)

# create one dataframe of all crowdsourcing data
crowd_df = pd.concat(crowd_dfs).reset_index(drop=True)
crowd_df = process_crowdsourcing_data(crowd_df)
crowd_df = crowd_df.groupby(["debate_id", "voter_id"]).sample(1)
crowd_df = pd.melt(
    crowd_df,
    id_vars=["debate_id", "voter_id"],
    value_vars=["q1", "q2", "q3"],
    var_name="question",
    value_name="response",
)

crowd_df["voter_id"] = crowd_df.apply(
    lambda x: np.nan if x.question == "q1" else x.voter_id, axis=1
)
crowd_df = crowd_df.merge(
    full_df.groupby(["question", "debate_id", "voter_id"], dropna=False)
    .first()
    .reset_index()[["question", "debate_id", "voter_id", "q1", "q2", "q3"]],
    on=["question", "debate_id", "voter_id"],
)

crowd_df.head()

Unnamed: 0,debate_id,voter_id,question,response,q1,q2,q3
0,706,,q1,Con,Pro,,
1,706,,q1,Con,Pro,,
2,706,,q1,Pro,Pro,,
3,706,,q1,Con,Pro,,
4,706,,q1,Con,Pro,,


### Get table for correct form and answer extracted

In [9]:
correct_form_df = (
    full_df.groupby(["question", "model"])[["correct_form", "answer_extracted"]]
    .mean()
    .reset_index()
)

correct_form_df[["correct_form", "answer_extracted"]] = (
    correct_form_df[["correct_form", "answer_extracted"]] * 100
)

print(correct_form_df.to_latex(index=False, float_format="%.2f", position="h"))

\begin{table}[h]
\begin{tabular}{llrr}
\toprule
question & model & correct_form & answer_extracted \\
\midrule
q1 & GPT-3.5 & 99.88 & 100.00 \\
q1 & GPT-4 & 99.06 & 100.00 \\
q1 & Llama & 0.00 & 94.95 \\
q1 & Mistral & 62.79 & 95.07 \\
q2 & GPT-3.5 & 99.84 & 99.88 \\
q2 & GPT-4 & 100.00 & 100.00 \\
q2 & Llama & 0.00 & 97.13 \\
q2 & Mistral & 67.13 & 100.00 \\
q3 & GPT-3.5 & 99.82 & 99.94 \\
q3 & GPT-4 & 99.61 & 99.98 \\
q3 & Llama & 0.00 & 91.50 \\
q3 & Mistral & 17.22 & 79.72 \\
\bottomrule
\end{tabular}
\end{table}



## Get the accuracies for each configuration

In [10]:
questions = []
datasets = []
models = []
accuracies = []
confidence_intervals = []
pro_recalls = []
con_recalls = []

for question in full_df.question.unique():
    accuracy, recalls, _, ci = get_bootstrap(
        crowd_df[(crowd_df.debate_id.isin(ISSUES)) & (crowd_df.question == question)],
        question,
    )

    questions.append(question)
    datasets.append("Issues")
    models.append("MTurk")
    accuracies.append(accuracy)
    confidence_intervals.append(ci)
    pro_recalls.append(recalls[0])
    con_recalls.append(recalls[1])

    for model in full_df.model.unique():
        for dataset, name in zip(
            [TRIMMED, SHORT, ISSUES], ["Trimmed", "Short", "Issues"]
        ):

            temp_df = full_df[
                (full_df.question == question)
                & (full_df.model == model)
                & (full_df.debate_id.isin(dataset))
            ]
            accuracy, recalls, _, ci = get_bootstrap(temp_df, question)

            questions.append(question)
            datasets.append(name)
            models.append(model)
            accuracies.append(accuracy)
            confidence_intervals.append(ci)
            pro_recalls.append(recalls[0])
            con_recalls.append(recalls[1])

In [11]:
results = pd.DataFrame(
    {
        "Q": questions,
        "Dataset": datasets,
        "Model": models,
        "Pro Recall (\%)": pro_recalls,
        "Con Recall (\%)": con_recalls,
        "Accuracy (\%)": accuracies,
        "Accuracy CI (95\%)": confidence_intervals,
    }
)

results["Dataset"] = pd.Categorical(results["Dataset"], ["Trimmed", "Short", "Issues"])
results["Model"] = pd.Categorical(
    results["Model"], ["GPT-3.5", "GPT-4", "Llama", "Mistral", "MTurk"]
)
results = results.sort_values(["Q", "Dataset", "Model"])

print(results.to_latex(index=False, float_format="%.2f", position="h"))

\begin{table}[h]
\begin{tabular}{lllrrrl}
\toprule
Q & Dataset & Model & Pro Recall (\%) & Con Recall (\%) & Accuracy (\%) & Accuracy CI (95\%) \\
\midrule
q1 & Trimmed & GPT-3.5 & 68.77 & 30.73 & 42.74 & (33.0, 52.0) \\
q1 & Trimmed & GPT-4 & 54.89 & 77.07 & 60.50 & (51.0, 70.0) \\
q1 & Trimmed & Llama & 49.84 & 0.24 & 23.65 & (15.0, 32.0) \\
q1 & Trimmed & Mistral & 74.13 & 13.00 & 36.01 & (27.0, 46.0) \\
q1 & Short & GPT-3.5 & 64.29 & 27.70 & 39.13 & (30.0, 48.0) \\
q1 & Short & GPT-4 & 57.14 & 77.03 & 61.96 & (53.0, 72.0) \\
q1 & Short & Llama & 35.71 & 0.00 & 17.75 & (11.0, 27.0) \\
q1 & Short & Mistral & 73.47 & 20.27 & 39.13 & (29.0, 49.0) \\
q1 & Issues & GPT-3.5 & 69.09 & 29.03 & 44.09 & (35.0, 54.0) \\
q1 & Issues & GPT-4 & 69.09 & 66.13 & 62.20 & (53.0, 72.0) \\
q1 & Issues & Llama & 47.27 & 0.00 & 24.41 & (16.0, 33.0) \\
q1 & Issues & Mistral & 70.91 & 9.68 & 37.01 & (28.0, 46.0) \\
q1 & Issues & MTurk & 56.34 & 39.69 & 44.19 & (34.0, 54.0) \\
q2 & Trimmed & GPT-3.5 & 28.51

In [None]:
results

### Get Crowd Data

## Process the crowdsourcing results

In [None]:
# debates_df = pd.read_json("../data/filtered_data/debates_filtered_df.json")
# debates_df["start_date"] = pd.to_datetime(debates_df["start_date"])

# users_df = pd.read_json("../data/processed_data/users_df.json")
# users_df = users_df.reset_index(names="voter_id")

In [None]:
## TODO: figure out what i want to fo with these responses
# full_df[(full_df.gpt_response.str.contains("Con") & (full_df.gpt_response != "Con")) |
#              (full_df.gpt_response.str.contains("Pro") & (full_df.gpt_response != "Pro")) |
#              (full_df.gpt_response.str.contains("Tie") & (full_df.gpt_response != "Tie"))].groupby(["model", "question"]).count()

In [None]:
# binary_df = []
# binary_class_files = glob.glob("../results/binary_class/*")

# for file in binary_class_files:
#     model = file.split(".json")[0].split("/")[-1]
#     df = pd.read_json(file)
#     df["model"] = model

#     binary_df.append(df)

# binary_df = pd.concat(binary_df)
# binary_df = process_gpt_response(binary_df)
# binary_df.head(2)

In [None]:
# models = []
# accuracies = []
# pro_recalls = []
# con_recalls = []
# confidence_intervals = []

# for model in list(binary_df.model.unique()):
#     model_df = binary_df[binary_df.model == model]
#     accuracy, recalls, precisions, bounds = get_bootstrap(model_df, "agreed_before")
#     models.append(model)
#     accuracies.append(accuracy)
#     pro_recalls.append(recalls[0])
#     con_recalls.append(recalls[1])
#     confidence_intervals.append(bounds)

# results = pd.DataFrame(
#     {
#         "Model": models,
#         "Pro Recall (\%)": pro_recalls,
#         "Con Recall (\%)": con_recalls,
#         "Accuracy": accuracies,
#         "Accuracy CI (95\%)": confidence_intervals,
#     }
# )

# results = results.sort_values(["Model"]).reset_index(drop=True)
# results["Model"] = results.Model.str.capitalize()
# results["Model"] = results.Model.str.replace("Gpt", "GPT")
# results

In [None]:
# print(results.to_latex(index=False, float_format="%.2f", position="h"))

# Accuracy

In [None]:
voter_agg = votes_df[["debate_id", "voter_id", "more_convincing_arguments"]].merge(
    ground_truth, on="debate_id"
)

voter_agg_temp = voter_agg[voter_agg.debate_id.isin(PoliProp)]
models.append("VoterAgg")
questions.append("1")
datasets.append("Trimmed")
accuracies.append(
    (
        voter_agg_temp.more_convincing_arguments_x
        == voter_agg_temp.more_convincing_arguments_y
    ).sum()
    / len(voter_agg_temp)
    * 100
)
# pro_recalls.append("--")
# con_recalls.append("--")
confidence_intervals.append("--")


voter_agg_temp = voter_agg[voter_agg.debate_id.isin(PoliPropShort)]
models.append("VoterAgg")
questions.append("1")
datasets.append("Short")
accuracies.append(
    (
        voter_agg_temp.more_convincing_arguments_x
        == voter_agg_temp.more_convincing_arguments_y
    ).sum()
    / len(voter_agg_temp)
    * 100
)
pro_recalls.append("--")
con_recalls.append("--")
confidence_intervals.append("--")

voter_agg_temp = voter_agg[voter_agg.debate_id.isin(PoliPropCrowd)]
models.append("VoterAgg")
questions.append("1")
datasets.append("Crowd")
accuracies.append(
    (
        voter_agg_temp.more_convincing_arguments_x
        == voter_agg_temp.more_convincing_arguments_y
    ).sum()
    / len(voter_agg_temp)
    * 100
)
pro_recalls.append("--")
con_recalls.append("--")
confidence_intervals.append("--")

In [None]:
sns.set_palette(sns.color_palette("bright"))
fig, axs = plt.subplots(1, 3, sharey=True, sharex=True, figsize=(15, 5))
axs = axs.flatten()
models = ["GPT-3.5", "GPT-4", "Llama", "Mistral"]
baselines = results[~results.Model.isin(models)]
for i, ax in enumerate(axs):
    sns.barplot(
        data=results[(results.Model.isin(models)) & (results.Q == str(i + 1))],
        x="Model",
        y="Accuracy",
        hue="Dataset",
        ax=ax,
    )

    c = 3
    for _, row in baselines[baselines.Q == str(i + 1)].iterrows():
        if row["Model"] != "Voteragg":
            ax.axhline(
                y=row["Accuracy"],
                c=sns.color_palette("colorblind")[c],
                label=row["Model"],
            )
            c += 1
    ax.set_ylim([0, 100])
    ax.set_title(f"Question {i+1}")
    ax.legend()

plt.suptitle("Barplots of Accuracies")
plt.show()

# Abortion Table

In [None]:
abortion_files = glob.glob("../results/abortion/*/*")

In [12]:
crowd_df

Unnamed: 0,debate_id,voter_id,question,response,q1,q2,q3
0,706,,q1,Con,Pro,,
1,706,,q1,Con,Pro,,
2,706,,q1,Pro,Pro,,
3,706,,q1,Con,Pro,,
4,706,,q1,Con,Pro,,
...,...,...,...,...,...,...,...
2249,76836,Ragnar,q3,Pro,Pro,Pro,Pro
2250,76836,Splenic_Warrior,q3,Pro,Pro,Tie,Tie
2251,76836,birdlandmemories,q3,Pro,Pro,Con,Con
2252,76836,debatability,q3,Pro,Pro,Tie,Tie


In [None]:
def process_llm_response(
    df: pd.DataFrame, column: str = "gpt_response", reasoning=False
) -> pd.DataFrame:
    df = df.reset_index(names="vote_id")
    if reasoning:
        df.rename(columns={"gpt_response": "gpt_answer"}, inplace=True)
        df["gpt_response"] = df.gpt_answer.apply(
            lambda x: x.title().split("Answer: ")[-1]
        )
        df = df.drop(columns="gpt_answer")
    df[column] = df[column].str.replace(".", "")
    df[column] = df[column].str.replace(" ", "")

    df[column] = df[column].apply(
        lambda x: (
            x.replace(" ", "")
            if any(vote in x for vote in ["Pro", "Con", "Tie"]) and len(x) <= 10
            else "other"
        )
    )
    return df

In [None]:
models = []
datasets = []
big_issues = []
reasoning = []
accuracies = []
pro_recalls = []
con_recalls = []
cis = []

for file in abortion_files:
    for debates, dataset in zip(
        [PoliPropAbortion, PoliPropCrowdAbortion], ["Abortion", "CrowdAbortion"]
    ):
        df = pd.read_json(file)
        df = df[df.debate_id.isin(debates)]
        model = file.split("/")[3]

        if "BI" in file:
            big_issues.append("Yes")
        else:
            big_issues.append("No")

        if "R" in file:
            reasoning.append("Yes")
            df = process_llm_response(df, reasoning=True)
        else:
            reasoning.append("No")
            df = process_llm_response(df, reasoning=False)

        accuracy, recalls, precisions, bounds = get_bootstrap(
            df, "agreed_before", "gpt_response"
        )

        models.append(model)
        datasets.append(dataset)
        accuracies.append(accuracy)
        pro_recalls.append(recalls[0])
        con_recalls.append(recalls[1])
        cis.append(bounds)

In [None]:
results = pd.DataFrame(
    {
        "Model": models,
        "Dataset": datasets,
        "Big Issues": big_issues,
        "Reasoning": reasoning,
        "Accuracy": accuracies,
        "95 \% CI": cis,
    }
)

results = results.sort_values(["Model", "Big Issues", "Reasoning"]).reset_index(
    drop=True
)
results["Model"] = results.Model.str.capitalize()
results["Model"] = results.Model.str.replace("Gpt", "GPT")
results = results.sort_values(["Dataset", "Model", "Big Issues", "Reasoning"])

In [None]:
print(results.to_latex(index=False, float_format="%.2f", position="h"))

# Regression

In [52]:
# some things needed for regression
def to_stance(row, column):
    if row[column] == "Pro":
        return 1
    elif row[column] == "Con":
        return -1
    else:
        return 0


with open("task_configs.json") as f:
    task_config = json.load(f)

demographic_features = task_config["demographic_columns"]
demographic_features.remove("birthday")
big_issues_features = task_config["big_issue_columns"]

In [87]:
current_issue = "Abortion"

if current_issue == "Gay Marriage":
    debates = GAY_MARRIAGE
    path_to_stances = "../data/processed_data/gay_marriage_props.json"
elif current_issue == "Abortion":
    debates = ABORTION
    path_to_stances = "../data/processed_data/abortion_props.json"
elif current_issue == "Capital Punishment":
    debates = CAPITAL_PUNISHMENT
    path_to_stances = "../data/processed_data/capital_punishment_props.json"

In [88]:
# merge df with user demographics
users_df = pd.read_json("../data/processed_data/users_df.json")
users_df = users_df.reset_index().rename(columns={"index": "voter_id"})

issue_df = votes_df[votes_df.debate_id.isin(debates)][
    ["debate_id", "voter_id", "agreed_before"]
]
issue_df = issue_df.merge(users_df, on="voter_id")

# merge with debates df to add start date for age calculation
debates_df = pd.read_json("../data/processed_data/debates_df.json")
debates_df["start_date"] = pd.to_datetime(debates_df["start_date"])
issue_df = issue_df.merge(
    debates_df[["debate_id", "start_date"]], on="debate_id"
)


# age calculation
issue_df["birthday"] = pd.to_datetime(issue_df.birthday)
issue_df["age"] = (
    issue_df.start_date - issue_df.birthday
) / pd.Timedelta("365 days")
issue_df["age"] = (issue_df.age.max() - issue_df.age) / (
    issue_df.age.max() - issue_df.age.min()
)
issue_df["age"] = issue_df.age.fillna(issue_df.age.mean())


# merge with the stance
stances = pd.read_json(path_to_stances)
issue_df = issue_df.merge(stances, on="debate_id")


# Turn stances into -1,0,1
issue_df["agreed_before"] = issue_df.apply(
    lambda x: to_stance(x, "agreed_before"), axis=1
)
issue_df["stance"] = issue_df.apply(
    lambda x: to_stance(x, "stance"), axis=1
)
issue_df["agreed_before"] = (
    issue_df.agreed_before * issue_df.stance
)

# display df
issue_df.head()

Unnamed: 0,debate_id,voter_id,agreed_before,birthday,education,ethnicity,gender,income,interested,party,...,torture,united_nations,war_in_afghanistan,war_on_terror,welfare,num_big_issues,start_date,age,proposition,stance
0,1289,Mangani,-1,NaT,Some College,Latino,Male,"$75,000 to $100,000",in Women,Undecided,...,,Pro,Pro,Con,Con,25,2008-10-25,0.78175,Abortion Is An Inalienable Right.,1
1,1289,JBlake,0,1984-10-01,Graduate Degree,White,Male,,in Women,Democratic Party,...,Con,Pro,Con,Con,Pro,47,2008-10-25,0.667191,Abortion Is An Inalienable Right.,1
2,1289,InquireTruth,0,NaT,Graduate Degree,White,Male,"$75,000 to $100,000",in Women,Republican Party,...,Con,Con,Con,Con,Con,48,2008-10-25,0.78175,Abortion Is An Inalienable Right.,1
3,1289,KRFournier,-1,NaT,Bachelors Degree,White,Male,"$50,000 to $75,000",in Women,Republican Party,...,,Und,,,Con,27,2008-10-25,0.78175,Abortion Is An Inalienable Right.,1
4,1824,Mangani,1,NaT,Some College,Latino,Male,"$75,000 to $100,000",in Women,Undecided,...,,Pro,Pro,Con,Con,25,2007-12-24,0.78175,Abortion Should Be Illegal.,-1


In [89]:
# create features
df_dummies = pd.get_dummies(
    issue_df[["debate_id", "agreed_before"] + demographic_features]
)
df_dummies_BI = pd.get_dummies(
    issue_df[
        ["debate_id", "agreed_before"] + demographic_features + big_issues_features
    ]
)

features = [
    col for col in df_dummies.columns if col not in ["debate_id", "agreed_before"]
]
features_BI = [
    col for col in df_dummies_BI.columns if col not in ["debate_id", "agreed_before"]
]

In [90]:
# train models and collect results
num_splits = 20
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

models = []
accuracies = []
confidence_intervals = []

for clf in [LR_clf, GB_clf]:
    scores = []
    scores_BI = []
    for i in range(num_splits):
        # split on the debates
        train, test = train_test_split(list(df.debate_id.unique()))
        # get train and test set
        df_train = df_dummies.copy()[df_dummies.debate_id.isin(train)]
        df_test = df_dummies.copy()[df_dummies.debate_id.isin(test)]

        df_train_BI = df_dummies_BI.copy()[df_dummies_BI.debate_id.isin(train)]
        df_test_BI = df_dummies_BI.copy()[df_dummies_BI.debate_id.isin(test)]

        # get features and outputs
        X_train = pd.get_dummies(df_train[features])
        X_test = pd.get_dummies(df_test[features])
        y_train = df_train["agreed_before"]
        y_test = df_test["agreed_before"]

        # get features and outputs
        X_train_BI = pd.get_dummies(df_train_BI[features_BI])
        X_test_BI = pd.get_dummies(df_test_BI[features_BI])
        y_train_BI = df_train_BI["agreed_before"]
        y_test_BI = df_test_BI["agreed_before"]

        score = clf.fit(X_train, y_train).score(X_test, y_test)
        scores.append(score)

        score = clf.fit(X_train_BI, y_train_BI).score(X_test_BI, y_test_BI)
        scores_BI.append(score)

    if clf == LR_clf:
        models.append("Logistic Regression")
        models.append("Logistic Regression")
    elif clf == GB_clf:
        models.append("Gradient Boosting")
        models.append("Gradient Boosting")

    sample_mean = np.mean(scores)
    sample_std = np.std(scores, ddof=1)  # using ddof=1 for sample standard deviation

    # Step 2: Determine the t-value for a 95% confidence interval
    confidence_level = 0.95
    degrees_of_freedom = len(scores) - 1
    t_value = t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

    # Step 3: Calculate confidence interval
    margin_of_error = t_value * (sample_std / np.sqrt(len(scores)))
    confidence_interval = (
        round((sample_mean - margin_of_error) * 100, 2),
        round((sample_mean + margin_of_error) * 100, 2),
    )

    accuracies.append(round(sample_mean * 100, 2))
    confidence_intervals.append(confidence_interval)

    sample_mean = np.mean(scores_BI)
    sample_std = np.std(scores_BI, ddof=1)  # using ddof=1 for sample standard deviation

    # Step 2: Determine the t-value for a 95% confidence interval
    confidence_level = 0.95
    degrees_of_freedom = len(scores_BI) - 1
    t_value = t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

    # Step 3: Calculate confidence interval
    margin_of_error = t_value * (sample_std / np.sqrt(len(scores_BI)))
    confidence_interval = (
        round((sample_mean - margin_of_error) * 100, 2),
        round((sample_mean + margin_of_error) * 100, 2),
    )

    accuracies.append(round(sample_mean * 100, 2))
    confidence_intervals.append(confidence_interval)

In [91]:
regressions = pd.DataFrame(
    {
        "Model": models,
        "Issue": [current_issue] * 4,
        "Big Issues": ["No", "Yes", "No", "Yes"],
        "Reasoning": ["--"] * 4,
        "Accuracy": accuracies,
        "95 \% CI": confidence_intervals,
    }
)

print(regressions.to_latex(index=False, float_format="%.2f", position="h"))

\begin{table}[h]
\begin{tabular}{llllrl}
\toprule
Model & Issue & Big Issues & Reasoning & Accuracy & 95 \% CI \\
\midrule
Logistic Regression & Abortion & No & -- & 55.14 & (52.52, 57.76) \\
Logistic Regression & Abortion & Yes & -- & 59.74 & (55.57, 63.91) \\
Gradient Boosting & Abortion & No & -- & 56.14 & (54.42, 57.85) \\
Gradient Boosting & Abortion & Yes & -- & 59.29 & (56.83, 61.75) \\
\bottomrule
\end{tabular}
\end{table}



In [100]:
accuracy, _, _, ci = get_bootstrap(
    crowd_df[(crowd_df.debate_id.isin(ABORTION)) & (crowd_df.question == "q2")],
    "q2",
)
print(accuracy," & ", ci)

39.81  &  (31.0, 50.0)


In [97]:
accuracy, _, _, ci = get_bootstrap(
    crowd_df[(crowd_df.debate_id.isin(GAY_MARRIAGE)) & (crowd_df.question == "q2")],
    "q2",
)
print(accuracy," & ", ci)

41.43  &  (31.0, 51.0)


In [98]:
accuracy, _, _, ci = get_bootstrap(
    crowd_df[(crowd_df.debate_id.isin(CAPITAL_PUNISHMENT)) & (crowd_df.question == "q2")],
    "q2",
)
print(accuracy," & ", ci)

25.83  &  (17.0, 35.0)


In [None]:
def get_full_name(row):
    name = row.Model
    if row["Big Issues"] == "Yes":
        name += "-BI"
    if row["Reasoning"] == "Yes":
        name += "-R"
    return name


def get_BI_R(row):
    name = ""
    if row["Big Issues"] == "Yes":
        if name == "":
            name += "BI"
        else:
            name += "-BI"
    if row["Reasoning"] == "Yes":
        if name == "":
            name += "R"
        else:
            name += "-R"
    if name == "":
        return "None"
    return name


results["full_name"] = results.apply(lambda x: get_full_name(x), axis=1)
results["BI-R"] = results.apply(lambda x: get_BI_R(x), axis=1)

In [None]:
baselines

In [None]:
sns.set_palette(sns.color_palette("bright"))

models = ["GPT-3.5", "GPT-4", "Llama", "Mistral"]
baselines = results[~results.Model.isin(models)]

sns.barplot(
    data=results[results.Model.isin(models)], x="Model", y="Accuracy", hue="BI-R"
)
c = 4
for _, row in baselines.iterrows():
    plt.axhline(
        y=row["Accuracy"], label=row["full_name"], c=sns.color_palette("bright")[c]
    )
    c += 1
plt.legend()
plt.ylim([0, 100])
plt.tight_layout()

In [None]:
print(results.to_latex(index=False, float_format="%.2f", position="h"))

# "Stacked" Model

In [None]:
full_q1 = full_df[(full_df.question == "q1")]
full_q1 = full_q1[["debate_id", "gpt_response", "model"]]
full_q1

In [None]:
full_q2 = full_df[(full_df.question == "q2")]
full_q2 = full_q2[["debate_id", "voter_id", "gpt_response", "model"]]
full_q2.head(2)

In [None]:
full_q3 = full_df[full_df.question == "q3"]
full_q3 = full_q3[["debate_id", "voter_id", "gpt_response", "model"]]
full_q3

In [None]:
df1 = pd.get_dummies(
    full_q1.pivot(index="debate_id", columns="model", values="gpt_response").dropna()
).merge(ground_truth, on="debate_id")
df1["more_convincing_arguments"] = df1["more_convincing_arguments"].apply(
    lambda x: 1 if x == "Pro" else x
)
df1["more_convincing_arguments"] = df1["more_convincing_arguments"].apply(
    lambda x: 0 if x == "Tie" else x
)
df1["more_convincing_arguments"] = df1["more_convincing_arguments"].apply(
    lambda x: -1 if x == "Con" else x
)
df1

In [None]:
df2 = pd.get_dummies(
    full_q2.pivot(
        index=["debate_id", "voter_id"], columns="model", values="gpt_response"
    )
).merge(
    full_df[["debate_id", "voter_id", "agreed_before"]], on=["debate_id", "voter_id"]
)
df2["agreed_before"] = df2["agreed_before"].apply(lambda x: 1 if x == "Pro" else x)
df2["agreed_before"] = df2["agreed_before"].apply(lambda x: 0 if x == "Tie" else x)
df2["agreed_before"] = df2["agreed_before"].apply(lambda x: -1 if x == "Con" else x)
df2 = df2.drop_duplicates()
df2

In [None]:
df3 = pd.get_dummies(
    full_q3.pivot(
        index=["debate_id", "voter_id"], columns="model", values="gpt_response"
    )
).merge(
    full_df[["debate_id", "voter_id", "agreed_after"]], on=["debate_id", "voter_id"]
)
df3["agreed_after"] = df3["agreed_after"].apply(lambda x: 1 if x == "Pro" else x)
df3["agreed_after"] = df3["agreed_after"].apply(lambda x: 0 if x == "Tie" else x)
df3["agreed_after"] = df3["agreed_after"].apply(lambda x: -1 if x == "Con" else x)
df3 = df3.drop_duplicates()
df3

In [None]:
# Q1
num_splits = 20
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

for clf in [LR_clf, GB_clf]:
    scores = []
    for i in range(num_splits):
        # split on the debates
        train, test = train_test_split(list(df1.debate_id.unique()))

        # get train and test set
        df1_train = df1.copy()[df1.debate_id.isin(train)]
        df1_test = df1.copy()[df1.debate_id.isin(test)]

        # get features and outputs
        X_train = df1_train.drop(columns=["debate_id", "more_convincing_arguments"])
        X_test = df1_test.drop(columns=["debate_id", "more_convincing_arguments"])
        y_train = df1_train["more_convincing_arguments"]
        y_test = df1_test["more_convincing_arguments"]

        score = GB_clf.fit(X_train, y_train).score(X_test, y_test)
        scores.append(score)

    # Step 1: Calculate sample mean and sample standard deviation
    sample_mean = np.mean(scores)
    sample_std = np.std(scores, ddof=1)  # using ddof=1 for sample standard deviation

    # Step 2: Determine the t-value for a 95% confidence interval
    confidence_level = 0.95
    degrees_of_freedom = len(scores) - 1
    t_value = t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

    # Step 3: Calculate confidence interval
    margin_of_error = t_value * (sample_std / np.sqrt(len(scores)))
    confidence_interval = (
        round((sample_mean - margin_of_error) * 100, 2),
        round((sample_mean + margin_of_error) * 100, 2),
    )

    print("Sample Mean:", round(sample_mean * 100, 2))
    print("Confidence Interval (95%):", confidence_interval)

In [None]:
# Q2, Q3

num_splits = 20
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

for df, output in zip([df2, df3], ["agreed_before", "agreed_after"]):
    for clf in [LR_clf, GB_clf]:
        scores = []
        for i in range(num_splits):
            # split on the debates
            train, test = train_test_split(list(df.debate_id.unique()))

            # get train and test set
            df_train = df.copy()[df.debate_id.isin(train)]
            df_test = df.copy()[df.debate_id.isin(test)]

            # get features and outputs
            X_train = df_train.drop(columns=["debate_id", "voter_id", output])
            X_test = df_test.drop(columns=["debate_id", "voter_id", output])
            y_train = df_train[output]
            y_test = df_test[output]

            score = GB_clf.fit(X_train, y_train).score(X_test, y_test)
            scores.append(score)

        sample_mean = np.mean(scores)
        sample_std = np.std(
            scores, ddof=1
        )  # using ddof=1 for sample standard deviation

        # Step 2: Determine the t-value for a 95% confidence interval
        confidence_level = 0.95
        degrees_of_freedom = len(scores) - 1
        t_value = t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

        # Step 3: Calculate confidence interval
        margin_of_error = t_value * (sample_std / np.sqrt(len(scores)))
        confidence_interval = (
            round((sample_mean - margin_of_error) * 100, 2),
            round((sample_mean + margin_of_error) * 100, 2),
        )

        print("Sample Mean:", round(sample_mean * 100, 2))
        print("Confidence Interval (95%):", confidence_interval)