In [1]:
import pandas as pd
import glob
import json
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [2]:
cd ..

/Users/paularescala/Documents/Professional/Masters-Thesis-2023/debate-gpt


In [3]:
from debate_gpt.results_analysis.metrics import get_bootstrap
from debate_gpt.results_analysis.helpers import get_train_test, get_metrics

# Load Data

In [4]:
# Read the five LLM-output files in one pass; the unpacking order follows the
# order of the paths below.
q1, q2, q3, binary, issues = (
    pd.read_json(path)
    for path in (
        "data/tidy/llm_outputs/q1.json",
        "data/tidy/llm_outputs/q2.json",
        "data/tidy/llm_outputs/q3.json",
        "data/tidy/llm_outputs/q2-binary.json",
        "data/tidy/llm_outputs/q2-issues.json",
    )
)

# dataset_dict is indexed by dataset name further down ("Trimmed", "Issues")
# and its values are used to filter rows by debate_id.
with open("data/tidy/datasets/datasets.json") as datasets_file:
    dataset_dict = json.load(datasets_file)

In [None]:
# Distinct model names, in order of first appearance in q1.
model_list = q1.model.unique().tolist()

# Answer Extraction

In [None]:
# For every question, compute per model the percentage of responses that were
# already in processed form and the percentage that resolved to Pro/Con/Tie.
dfs = []
for question, df in (("q1", q1), ("q2", q2), ("q3", q3)):
    # NOTE: these two flag columns are written onto q1/q2/q3 themselves.
    df["correct_form"] = df.processed_gpt_response == df.gpt_response
    df["answer_extracted"] = df.processed_gpt_response.isin(["Pro", "Con", "Tie"])

    rate_columns = ["correct_form", "answer_extracted"]
    rates = df.groupby("model")[rate_columns].mean().reset_index()
    rates["question"] = question
    for column in rate_columns:
        rates[column] = rates[column] * 100  # fraction -> percentage
    dfs.append(rates)

In [None]:
# Stack the per-question summaries and put the columns in presentation order.
stacked_rates = pd.concat(dfs)
answer_extraction_df = stacked_rates[
    ["question", "model", "correct_form", "answer_extracted"]
]

In [None]:
# Export the answer-extraction summary as a LaTeX table (two decimals, no index).
with open("data/tidy/latex_tables/answer_extraction.txt", "w") as latex_file:
    latex_table = answer_extraction_df.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_table)

# Basic Results

In [None]:
# One (question, model, dataset, accuracy, ci) record per combination; the
# iteration order is question -> dataset -> model.
results = []

for question, df in (("q1", q1), ("q2", q2), ("q3", q3)):
    for dataset in ("Trimmed", "Issues"):
        dataset_ids = dataset_dict[dataset]
        for model in model_list:
            model_df = df[df.model == model]
            temp_df = model_df[model_df.debate_id.isin(dataset_ids)]
            # get_bootstrap returns four values; only the first and last are
            # kept (presumably accuracy and its confidence interval — confirm
            # against debate_gpt.results_analysis.metrics).
            accuracy, _, _, ci = get_bootstrap(temp_df)
            results.append((question, model, dataset, accuracy, ci))

# Split the records into the parallel lists used to build the results table.
questions = [record[0] for record in results]
models = [record[1] for record in results]
datasets = [record[2] for record in results]
accuracies = [record[3] for record in results]
confidence_intervals = [record[4] for record in results]

In [None]:
# Assemble the primary results table from the parallel result lists.
# The two LaTeX-escaped headers are raw strings: "\%" is not a valid escape
# sequence in a normal string literal (SyntaxWarning from Python 3.12), while
# r"\%" yields the same backslash-percent text without the warning.
primary_table = pd.DataFrame(
    {
        "Question": questions,
        "Model": models,
        "Dataset": datasets,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# Normalise model names for display. regex=False makes both replacements plain
# substring substitutions regardless of the installed pandas' default.
primary_table["Model"] = (
    primary_table.Model.str.capitalize()
    .str.replace("Gpt-", "GPT-", regex=False)
    .str.replace("Mturk", "MTurk", regex=False)
)

# Ordered categoricals make sort_values follow the presentation order below
# instead of alphabetical order; names outside these lists become NaN.
primary_table["Model"] = pd.Categorical(
    primary_table["Model"], ["Llama", "Mistral", "GPT-3.5", "GPT-4", "MTurk"]
)
primary_table["Dataset"] = pd.Categorical(
    primary_table["Dataset"], ["Trimmed", "Short", "Issues"]
)
primary_table = primary_table.sort_values(["Question", "Dataset", "Model"])

In [None]:
# Preview the first rows of the sorted primary results table.
primary_table.head()

In [None]:
# Export the primary results as a LaTeX table (two decimals, no index).
with open("data/tidy/latex_tables/primary_results.txt", "w") as latex_file:
    latex_table = primary_table.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_table)

# Binary

In [None]:
# Compare binary vs. 3-class accuracy per (dataset, model). Each combination
# contributes two consecutive entries: "Binary" first, then "3-class".
datasets, models, types = [], [], []
accuracies, confidence_intervals = [], []

for dataset in ("Trimmed", "Issues"):
    for model in model_list:
        # Binary run restricted to the current dataset.
        model_df = binary[binary.model == model]
        temp_df = model_df[model_df.debate_id.isin(dataset_dict[dataset])]
        binary_accuracy, _, _, binary_ci = get_bootstrap(temp_df)

        # 3-class (q2) run restricted to the debates scored in the binary run.
        scored_ids = list(temp_df.debate_id.unique())
        model_df = q2[q2.model == model]
        temp_df = model_df[model_df.debate_id.isin(scored_ids)]
        three_class_accuracy, _, _, three_class_ci = get_bootstrap(temp_df)

        models += [model, model]
        datasets += [dataset, dataset]
        accuracies += [binary_accuracy, three_class_accuracy]
        confidence_intervals += [binary_ci, three_class_ci]
        types += ["Binary", "3-class"]

In [None]:
# Assemble the binary vs. 3-class comparison table.
# The LaTeX-escaped headers are raw strings: "\%" is an invalid escape sequence
# in a normal string literal (SyntaxWarning from Python 3.12); r"\%" produces
# the same backslash-percent text without the warning.
binary_table = pd.DataFrame(
    {
        "Model": models,
        "Dataset": datasets,
        "Classes": types,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# Normalise model names for display. regex=False makes both replacements plain
# substring substitutions regardless of the installed pandas' default.
binary_table["Model"] = (
    binary_table.Model.str.capitalize()
    .str.replace("Gpt-", "GPT-", regex=False)
    .str.replace("Mturk", "MTurk", regex=False)
)

# Ordered categoricals so sorting follows presentation order; names outside
# these lists become NaN.
binary_table["Model"] = pd.Categorical(
    binary_table["Model"], ["Llama", "Mistral", "GPT-3.5", "GPT-4", "MTurk"]
)
binary_table["Dataset"] = pd.Categorical(
    binary_table["Dataset"], ["Trimmed", "Short", "Issues"]
)
binary_table = binary_table.sort_values(["Dataset", "Classes", "Model"])
binary_table

In [None]:
# Export the binary comparison as a LaTeX table (two decimals, no index).
with open("data/tidy/latex_tables/binary_results.txt", "w") as latex_file:
    latex_table = binary_table.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_table)

# Issues

In [None]:
def _override_for_mturk(column, mturk_value):
    """Return a row function: `mturk_value` for MTurk rows, else the row's own `column`."""

    def pick(row):
        if row.model == "MTurk":
            return mturk_value
        return row[column]

    return pick


# MTurk rows are forced to big_issues=False and reasoning=True; every other
# row keeps the value it already had.
issues["big_issues"] = issues.apply(_override_for_mturk("big_issues", False), axis=1)
issues["reasoning"] = issues.apply(_override_for_mturk("reasoning", True), axis=1)

In [None]:
# Accuracy for every model under each (big_issues, reasoning) setting.
# Iteration order: model -> big_issues (True, False) -> reasoning (True, False).
issue_results = []

for model in model_list:
    model_df = issues[issues.model == model]
    for bi in (True, False):
        for r in (True, False):
            matches_setting = (model_df.reasoning == r) & (model_df.big_issues == bi)
            temp_df = model_df[matches_setting]
            # Some combinations may select no rows; whatever get_bootstrap
            # returns for them is recorded as-is.
            accuracy, _, _, ci = get_bootstrap(temp_df)
            issue_results.append((model, accuracy, ci, bi, r))

# Split the records into the parallel lists used to build the issues table.
models = [record[0] for record in issue_results]
accuracies = [record[1] for record in issue_results]
confidence_intervals = [record[2] for record in issue_results]
big_issues = [record[3] for record in issue_results]
reasoning = [record[4] for record in issue_results]

In [None]:
# Assemble the issues table from the parallel result lists.
# The LaTeX-escaped headers are raw strings: "\%" is an invalid escape sequence
# in a normal string literal (SyntaxWarning from Python 3.12); r"\%" produces
# the same backslash-percent text without the warning.
issues_table = pd.DataFrame(
    {
        "Model": models,
        "Big Issues": big_issues,
        "Reasoning": reasoning,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# Normalise model names for display. regex=False makes both replacements plain
# substring substitutions regardless of the installed pandas' default.
issues_table["Model"] = (
    issues_table.Model.str.capitalize()
    .str.replace("Gpt-", "GPT-", regex=False)
    .str.replace("Mturk", "MTurk", regex=False)
)

# Ordered categorical so sorting follows presentation order; names outside
# this list become NaN.
issues_table["Model"] = pd.Categorical(
    issues_table["Model"], ["Llama", "Mistral", "GPT-3.5", "GPT-4", "MTurk"]
)
issues_table = issues_table.sort_values(["Model", "Big Issues", "Reasoning"])

# Drop the combinations for which no accuracy was computed.
issues_table = issues_table.dropna(subset=[r"Accuracy (\%)"])

In [None]:
# Display the issues table (rows with a missing accuracy were dropped above).
issues_table

## Regressions

In [None]:
# Chained assignment: the three frames are bound to their individual names
# and, in the same order, to the issues_dfs list.
abortion_df, gay_marriage_df, capital_punishment_df = issues_dfs = [
    pd.read_json(path)
    for path in (
        "data/tidy/regression_files/abortion.json",
        "data/tidy/regression_files/gay_marriage.json",
        "data/tidy/regression_files/capital_punishment.json",
    )
]

with open("config/task_configs.json") as config_file:
    task_config = json.load(config_file)

big_issues_features = task_config["big_issue_columns"]
demographic_features = task_config["demographic_columns"]
# remove() edits the list held inside task_config in place and raises
# ValueError if "birthday" is not present.
demographic_features.remove("birthday")

In [None]:
# debate_id and agreed_before are carried through the encoding but are never
# used as model features.
key_columns = ["debate_id", "agreed_before"]

# One-hot encode each issue frame twice: demographics only, and demographics
# plus the big-issue columns (the *_BI variants).
dummies_dataframes = [
    pd.get_dummies(issue_df[key_columns + demographic_features])
    for issue_df in issues_dfs
]
dummies_dataframes_BI = [
    pd.get_dummies(issue_df[key_columns + demographic_features + big_issues_features])
    for issue_df in issues_dfs
]

# Feature names per frame, in the encoded frame's column order.
features = [
    [col for col in encoded.columns if col not in key_columns]
    for encoded in dummies_dataframes
]
features_BI = [
    [col for col in encoded.columns if col not in key_columns]
    for encoded in dummies_dataframes_BI
]

In [None]:
kf = KFold(n_splits=20)
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

models, big_issues, accuracies, confidence_intervals = [], [], [], []

for classifier_name, classifier in (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
):
    # Fold scores are pooled over all three issue frames.
    scores, scores_BI = [], []

    for i, df in enumerate(issues_dfs):
        debate_ids = np.array(list(df.debate_id.unique()))

        # Folds are formed over unique debate ids rather than over rows.
        for train_index, test_index in kf.split(debate_ids):
            # Demographics-only split; get_train_test is called without an
            # explicit target column here, so it uses its default.
            X_train, y_train, X_test, y_test = get_train_test(
                train_index, test_index, debate_ids, dummies_dataframes[i], features[i]
            )
            # Same fold with the big-issue features included.
            X_train_BI, y_train_BI, X_test_BI, y_test_BI = get_train_test(
                train_index,
                test_index,
                debate_ids,
                dummies_dataframes_BI[i],
                features_BI[i],
            )

            # The same estimator object is refit for each feature set.
            scores.append(classifier.fit(X_train, y_train).score(X_test, y_test))
            scores_BI.append(
                classifier.fit(X_train_BI, y_train_BI).score(X_test_BI, y_test_BI)
            )

    accuracy, ci = get_metrics(scores)
    accuracy_BI, ci_BI = get_metrics(scores_BI)

    # Two result rows per classifier: without ("No") and with ("Yes") big issues.
    for uses_big_issues, fold_accuracy, fold_ci in (
        ("No", accuracy, ci),
        ("Yes", accuracy_BI, ci_BI),
    ):
        models.append(classifier_name)
        big_issues.append(uses_big_issues)
        accuracies.append(fold_accuracy)
        confidence_intervals.append(fold_ci)

In [None]:
# Table of cross-validated classifier accuracies.
# The LaTeX-escaped headers are raw strings: "\%" is an invalid escape sequence
# in a normal string literal (SyntaxWarning from Python 3.12). The runtime value
# is unchanged (a literal backslash followed by %), so these columns still line
# up with the same-named columns of issues_table when concatenated.
regression_results = pd.DataFrame(
    {
        "Model": models,
        "Big Issues": big_issues,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# Append the classifier rows below the LLM rows. regression_results has no
# "Reasoning" column, so that column is NaN for the appended rows.
# NOTE(review): "Big Issues" holds True/False in issues_table but "Yes"/"No"
# here — confirm the mixed representation is intended for the exported table.
issues_table_complete = pd.concat([issues_table, regression_results])
issues_table_complete

In [None]:
# Export the combined issues/regression table as LaTeX (two decimals, no index).
with open("data/tidy/latex_tables/issues_results.txt", "w") as latex_file:
    latex_table = issues_table_complete.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_table)

# Stacked Model

In [18]:
# One row per debate with one column per model holding its processed answer;
# debates missing an answer from any model are dropped. MTurk rows are excluded.
llm_votes = (
    q1[q1.model != "MTurk"]
    .pivot(index="debate_id", columns="model", values="processed_gpt_response")
    .dropna()
)
truth = q1[["debate_id", "ground_truth"]].drop_duplicates()
df = pd.get_dummies(llm_votes).merge(truth, on="debate_id")

# Encode the label as Pro=1, Tie=0, Con=-1; any other value is left untouched.
label_codes = {"Pro": 1, "Tie": 0, "Con": -1}
df["ground_truth"] = df.ground_truth.apply(
    lambda label: label_codes.get(label, label)
)

non_feature_columns = ("debate_id", "ground_truth")
features = [col for col in df.columns if col not in non_feature_columns]

kf = KFold(n_splits=20)
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

# Fresh result lists; the cells for questions 2 and 3 append to these.
models, accuracies, confidence_intervals, questions = [], [], [], []

for classifier_name, classifier in (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
):
    debate_ids = np.array(list(df.debate_id.unique()))
    scores = []

    # Folds are formed over unique debate ids.
    for train_index, test_index in kf.split(debate_ids):
        X_train, y_train, X_test, y_test = get_train_test(
            train_index, test_index, debate_ids, df, features, "ground_truth"
        )
        scores.append(classifier.fit(X_train, y_train).score(X_test, y_test))

    accuracy, ci = get_metrics(scores)

    questions.append("1")
    models.append(classifier_name)
    accuracies.append(accuracy)
    confidence_intervals.append(ci)



In [21]:
# One row per (debate, voter) with one column per model holding its processed
# answer; rows missing an answer from any model are dropped. MTurk is excluded.
vote_keys = ["debate_id", "voter_id"]
llm_votes = (
    q2[q2.model != "MTurk"]
    .pivot(index=vote_keys, columns="model", values="processed_gpt_response")
    .dropna()
)
truth = q2[vote_keys + ["ground_truth"]].drop_duplicates()
df = pd.get_dummies(llm_votes).merge(truth, on=vote_keys)

# Encode the label as Pro=1, Tie=0, Con=-1; any other value is left untouched.
label_codes = {"Pro": 1, "Tie": 0, "Con": -1}
df["ground_truth"] = df.ground_truth.apply(
    lambda label: label_codes.get(label, label)
)

non_feature_columns = ("debate_id", "voter_id", "ground_truth")
features = [col for col in df.columns if col not in non_feature_columns]

kf = KFold(n_splits=20)
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

# Results are appended to the lists created by the question-1 cell.
for classifier_name, classifier in (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
):
    debate_ids = np.array(list(df.debate_id.unique()))
    scores = []

    # Folds are formed over unique debate ids.
    for train_index, test_index in kf.split(debate_ids):
        X_train, y_train, X_test, y_test = get_train_test(
            train_index, test_index, debate_ids, df, features, "ground_truth"
        )
        scores.append(classifier.fit(X_train, y_train).score(X_test, y_test))

    accuracy, ci = get_metrics(scores)

    questions.append("2")
    models.append(classifier_name)
    accuracies.append(accuracy)
    confidence_intervals.append(ci)



In [22]:
# One row per (debate, voter) with one column per model holding its processed
# answer; rows missing an answer from any model are dropped. MTurk is excluded.
vote_keys = ["debate_id", "voter_id"]
llm_votes = (
    q3[q3.model != "MTurk"]
    .pivot(index=vote_keys, columns="model", values="processed_gpt_response")
    .dropna()
)
truth = q3[vote_keys + ["ground_truth"]].drop_duplicates()
df = pd.get_dummies(llm_votes).merge(truth, on=vote_keys)

# Encode the label as Pro=1, Tie=0, Con=-1; any other value is left untouched.
label_codes = {"Pro": 1, "Tie": 0, "Con": -1}
df["ground_truth"] = df.ground_truth.apply(
    lambda label: label_codes.get(label, label)
)

non_feature_columns = ("debate_id", "voter_id", "ground_truth")
features = [col for col in df.columns if col not in non_feature_columns]

kf = KFold(n_splits=20)
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

# Results are appended to the lists created by the question-1 cell.
for classifier_name, classifier in (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
):
    debate_ids = np.array(list(df.debate_id.unique()))
    scores = []

    # Folds are formed over unique debate ids.
    for train_index, test_index in kf.split(debate_ids):
        X_train, y_train, X_test, y_test = get_train_test(
            train_index, test_index, debate_ids, df, features, "ground_truth"
        )
        scores.append(classifier.fit(X_train, y_train).score(X_test, y_test))

    accuracy, ci = get_metrics(scores)

    questions.append("3")
    models.append(classifier_name)
    accuracies.append(accuracy)
    confidence_intervals.append(ci)



In [23]:
# Summary table of the stacked classifiers, one row per (question, classifier).
# The LaTeX-escaped headers are raw strings: "\%" is an invalid escape sequence
# in a normal string literal (SyntaxWarning from Python 3.12); r"\%" produces
# the same backslash-percent text without the warning.
stacked_model = pd.DataFrame(
    {
        "Question": questions,
        "Model": models,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

stacked_model

Unnamed: 0,Question,Model,Accuracy (\%),95\% Confidence Interval
0,1,Logistic Regression,61.94,"(58.54, 65.34)"
1,1,Gradient Boosting,60.77,"(57.31, 64.23)"
2,2,Logistic Regression,57.25,"(52.59, 61.91)"
3,2,Gradient Boosting,56.57,"(52.2, 60.94)"
4,3,Logistic Regression,54.75,"(49.84, 59.65)"
5,3,Gradient Boosting,53.75,"(49.31, 58.19)"


In [24]:
# Export the stacked-model summary as a LaTeX table (two decimals, no index).
with open("data/tidy/latex_tables/stacked_model.txt", "w") as latex_file:
    latex_table = stacked_model.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_table)