In [2]:
cd ..

/Users/paularescala/Documents/Professional/Masters-Thesis-2023/debate-gpt


In [3]:
import pandas as pd
import glob
import json
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import seaborn as sns
from debate_gpt.results_analysis.analysis_helpers import get_train_test, get_metrics, get_bootstrap, calculate_cohens_kappa
import matplotlib.pyplot as plt

# Load Data

In [4]:
# Outputs for the three questions, read from the tidy llm_outputs folder.
q1, q2, q3 = (
    pd.read_json("data/tidy/llm_outputs/q1.json"),
    pd.read_json("data/tidy/llm_outputs/q2.json"),
    pd.read_json("data/tidy/llm_outputs/q3.json"),
)
# The two additional question-2 files (binary and issues).
binary, issues = (
    pd.read_json("data/tidy/llm_outputs/q2-binary.json"),
    pd.read_json("data/tidy/llm_outputs/q2-issues.json"),
)
votes_df = pd.read_json("data/processing/processed_data/votes_df.json")

# dataset_dict is indexed by dataset name ("Trimmed", "Issues") further down.
with open("data/tidy/datasets/datasets.json") as datasets_file:
    dataset_dict = json.loads(datasets_file.read())

In [7]:
# Distinct values of q1's "model" column, in order of first appearance.
model_list = list(pd.unique(q1["model"]))

# Inter-Annotator Agreement (IAA)

In [None]:
# One row per (debate, voter) pair and one column per
# (model, big_issues, reasoning) combination, holding the processed response.
iaa_df = issues.pivot(
    index=["debate_id", "voter_id"],
    columns=["model", "big_issues", "reasoning"],
    values="processed_gpt_response",
)

# Flatten the three-level column index into labels such as "<model>-bi-r":
# "-bi" is appended when big_issues is truthy, "-r" when reasoning is truthy.
names = [
    model_name + ("-bi" if uses_big_issues else "") + ("-r" if uses_reasoning else "")
    for model_name, uses_big_issues, uses_reasoning in iaa_df.columns
]
iaa_df.columns = names
iaa_df

In [None]:
# Cohen's kappa for every ordered pair of columns in iaa_df (each column is
# also paired with itself), collected in long form so the next cell can pivot
# the values into a square matrix.
names1 = []
names2 = []
kappas = []
for column1 in iaa_df.columns:
    for column2 in iaa_df.columns:
        # Only the first of the three values returned by the helper is kept.
        kappa, _, _ = calculate_cohens_kappa(iaa_df[[column1, column2]])

        names1.append(column1)
        names2.append(column2)
        kappas.append(kappa)

In [None]:
# Long-form pairwise kappas -> square matrix indexed by the two column labels.
pairwise_kappas = pd.DataFrame({"model1": names1, "model2": names2, "kappa": kappas})
kappa_df = pairwise_kappas.pivot(index="model1", columns="model2", values="kappa")

# Annotated heatmap of the agreement matrix.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(kappa_df, annot=True, ax=ax)
plt.show()

# Answer Extraction

In [None]:
# Per model and question: percentage of responses that were already in the
# processed form, and percentage whose processed value is Pro, Con or Tie.
dfs = []
for question, responses in (("q1", q1), ("q2", q2), ("q3", q3)):
    processed = responses.processed_gpt_response
    # NOTE: these two assignments add columns to q1 / q2 / q3 themselves.
    responses["correct_form"] = processed == responses.gpt_response
    responses["answer_extracted"] = processed.isin(["Pro", "Con", "Tie"])

    rates = responses.groupby("model")[["correct_form", "answer_extracted"]].mean()
    rates = rates.reset_index()
    rates["question"] = question
    # Turn the per-model fractions into percentages.
    for rate_column in ("correct_form", "answer_extracted"):
        rates[rate_column] = rates[rate_column] * 100
    dfs.append(rates)

In [None]:
# Stack the per-question summaries and put the identifying columns first.
answer_extraction_columns = ["question", "model", "correct_form", "answer_extracted"]
answer_extraction_df = pd.concat(dfs).loc[:, answer_extraction_columns]

In [None]:
# Write the answer-extraction summary as a LaTeX table: no index column,
# floats with two decimals, table placement "h".
with open("data/tidy/latex_tables/answer_extraction.txt", "w") as latex_file:
    latex_text = answer_extraction_df.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_text)

# Basic Results

In [10]:
# Accuracy and bootstrap confidence interval for every question / dataset /
# model combination, plus one "VoterAgg" row per dataset for question 1.
questions = []
datasets = []
models = []
accuracies = []
confidence_intervals = []

# Seeded so the VoterAgg interval is the same on every Restart & Run All.
voter_rng = np.random.RandomState(0)

for df, question in zip([q1, q2, q3], ["1", "2", "3"]):
    for dataset in ["Trimmed", "Issues"]:
        for model in model_list:
            # No MTurk row is produced for the Trimmed dataset.
            if model == "MTurk" and dataset == "Trimmed":
                continue
            model_df = df[df.model == model]
            temp_df = model_df[model_df.debate_id.isin(dataset_dict[dataset])]
            accuracy, _, _, ci = get_bootstrap(temp_df)

            questions.append(question)
            models.append(model)
            datasets.append(dataset)
            accuracies.append(accuracy)
            confidence_intervals.append(ci)

            # The VoterAgg row is added once per dataset: the llama iteration
            # of question 1 serves as the trigger and supplies the per-debate
            # ground truth that the individual votes are compared against.
            if question == "1" and model == "llama":
                ground_truth_per_debate = (
                    temp_df.groupby("debate_id")
                    .ground_truth.first()
                    .to_frame()
                    .reset_index()
                )
                voter_agg_df = (
                    votes_df[["debate_id", "voter_id", "more_convincing_arguments"]]
                    .merge(ground_truth_per_debate, on="debate_id")
                    .dropna()
                )
                voter_agg = (
                    (
                        voter_agg_df.more_convincing_arguments
                        == voter_agg_df.ground_truth
                    ).sum()
                    / len(voter_agg_df)
                    * 100
                )

                # Get the confidence interval: resample the votes with
                # replacement 1000 times and recompute the accuracy each time.
                stats = []
                for _ in range(1000):
                    resample = voter_agg_df.sample(
                        len(voter_agg_df), replace=True, random_state=voter_rng
                    )
                    stats.append(
                        (
                            resample.more_convincing_arguments == resample.ground_truth
                        ).sum()
                        / len(resample)
                        * 100
                    )
                stats.sort()

                questions.append(question)
                models.append("VoterAgg")
                datasets.append(dataset)
                accuracies.append(voter_agg)
                # Positions 25 and 975 of the 1000 sorted values bound the
                # central 95% of the resampled accuracies.
                confidence_intervals.append(
                    (round(stats[25], 2), round(stats[975], 2))
                )

In [11]:
# Assemble the primary results table. The two LaTeX-flavoured headers are raw
# strings so that "\%" is kept literally rather than being parsed as a Python
# escape sequence.
primary_table = pd.DataFrame(
    {
        "Question": questions,
        "Model": models,
        "Dataset": datasets,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# capitalize() lower-cases everything after the first character, so the
# mixed-case display names are restored afterwards.
primary_table["Model"] = (
    primary_table.Model.str.capitalize()
    .str.replace("Gpt-", "GPT-")
    .str.replace("Mturk", "MTurk")
    .str.replace("Voteragg", "VoterAgg")
)
# Ordered categoricals make sort_values follow the listed order instead of
# alphabetical order; values outside the listed categories become NaN.
primary_table["Model"] = pd.Categorical(
    primary_table["Model"],
    ["Llama", "Mistral", "GPT-3.5", "GPT-4", "VoterAgg", "MTurk"],
)
primary_table["Dataset"] = pd.Categorical(
    primary_table["Dataset"], ["Trimmed", "Short", "Issues"]
)
primary_table = primary_table.sort_values(["Question", "Dataset", "Model"])

In [13]:
# with open("latex_tables/primary_results.txt", "w") as f:
#     f.write(primary_table.to_latex(index=False, float_format="%.2f", position="h"))

# Binary

In [None]:
# Binary vs. 3-class accuracy per dataset and model. The 3-class (q2) rows are
# restricted to the debates present in the corresponding binary subset.
datasets, models, types = [], [], []
accuracies, confidence_intervals = [], []

for dataset in ("Trimmed", "Issues"):
    for model in model_list:
        binary_model = binary[binary.model == model]
        binary_subset = binary_model[
            binary_model.debate_id.isin(dataset_dict[dataset])
        ]

        shared_debates = list(binary_subset.debate_id.unique())
        q2_model = q2[q2.model == model]
        three_class_subset = q2_model[q2_model.debate_id.isin(shared_debates)]

        # Same bookkeeping for both variants; only the label and frame differ.
        for label, subset in (
            ("Binary", binary_subset),
            ("3-class", three_class_subset),
        ):
            accuracy, _, _, ci = get_bootstrap(subset)
            models.append(model)
            datasets.append(dataset)
            accuracies.append(accuracy)
            confidence_intervals.append(ci)
            types.append(label)

In [None]:
# Assemble the binary vs. 3-class table. The LaTeX-flavoured headers are raw
# strings so that "\%" is kept literally rather than being parsed as a Python
# escape sequence.
binary_table = pd.DataFrame(
    {
        "Model": models,
        "Dataset": datasets,
        "Classes": types,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# capitalize() lower-cases everything after the first character, so the
# mixed-case display names are restored afterwards.
binary_table["Model"] = (
    binary_table.Model.str.capitalize()
    .str.replace("Gpt-", "GPT-")
    .str.replace("Mturk", "MTurk")
)
# Ordered categoricals make sort_values follow the listed order; values
# outside the listed categories become NaN.
binary_table["Model"] = pd.Categorical(
    binary_table["Model"], ["Llama", "Mistral", "GPT-3.5", "GPT-4", "MTurk"]
)
binary_table["Dataset"] = pd.Categorical(
    binary_table["Dataset"], ["Trimmed", "Short", "Issues"]
)
binary_table = binary_table.sort_values(["Dataset", "Classes", "Model"])
binary_table

In [None]:
# Write the binary results as a LaTeX table: no index column, floats with two
# decimals, table placement "h".
with open("data/tidy/latex_tables/binary_results.txt", "w") as latex_file:
    latex_text = binary_table.to_latex(
        index=False, float_format="%.2f", position="h"
    )
    latex_file.write(latex_text)

# Issues

In [None]:
def _mturk_override(column, forced_value):
    """Build a row function: `forced_value` for MTurk rows, else the row's own `column` value."""

    def pick(row):
        if row.model == "MTurk":
            return forced_value
        return row[column]

    return pick


# MTurk rows are pinned to big_issues=False and reasoning=True; every other
# row keeps its existing flags. This overwrites both columns of `issues`.
issues["big_issues"] = issues.apply(_mturk_override("big_issues", False), axis=1)
issues["reasoning"] = issues.apply(_mturk_override("reasoning", True), axis=1)

In [None]:
# Accuracy and bootstrap confidence interval for each model under every
# (big_issues, reasoning) combination.
models, accuracies, confidence_intervals = [], [], []
big_issues, reasoning = [], []

for model in model_list:
    model_df = issues[issues.model == model]
    for bi in (True, False):
        has_big_issues_flag = model_df.big_issues == bi
        for r in (True, False):
            temp_df = model_df[(model_df.reasoning == r) & has_big_issues_flag]
            accuracy, _, _, ci = get_bootstrap(temp_df)

            models.append(model)
            accuracies.append(accuracy)
            confidence_intervals.append(ci)
            big_issues.append(bi)
            reasoning.append(r)

In [None]:
# Assemble the issues table. The LaTeX-flavoured headers are raw strings so
# that "\%" is kept literally rather than being parsed as a Python escape
# sequence.
issues_table = pd.DataFrame(
    {
        "Model": models,
        "Big Issues": big_issues,
        "Reasoning": reasoning,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

# capitalize() lower-cases everything after the first character, so the
# mixed-case display names are restored afterwards.
issues_table["Model"] = (
    issues_table.Model.str.capitalize()
    .str.replace("Gpt-", "GPT-")
    .str.replace("Mturk", "MTurk")
)
# Ordered categorical so sort_values follows the listed order; values outside
# the listed categories become NaN.
issues_table["Model"] = pd.Categorical(
    issues_table["Model"], ["Llama", "Mistral", "GPT-3.5", "GPT-4", "MTurk"]
)
issues_table = issues_table.sort_values(["Model", "Big Issues", "Reasoning"])
# Drop rows whose accuracy is missing.
issues_table = issues_table.dropna(subset=[r"Accuracy (\%)"])

In [None]:
# Display the issues results table.
issues_table

# Regressions

In [None]:
# Regression inputs for the three issues, each read from a plain file and a
# "_BI" file. The dict keys matter: the cross-validation cell routes scores by
# whether the key contains "BI".
issues_dfs = {
    "abortion_df": pd.read_json("data/tidy/regression_files/abortion.json"),
    "abortion_BI_df": pd.read_json("data/tidy/regression_files/abortion_BI.json"),
    "gay_marriage_df": pd.read_json("data/tidy/regression_files/gay_marriage.json"),
    "gay_marriage_BI_df": pd.read_json(
        "data/tidy/regression_files/gay_marriage_BI.json"
    ),
    "capital_punishment_df": pd.read_json(
        "data/tidy/regression_files/capital_punishment.json"
    ),
    "capital_punishment_BI_df": pd.read_json(
        "data/tidy/regression_files/capital_punishment_BI.json"
    ),
}

# Keep the individual frames available under their own names as well.
abortion_df = issues_dfs["abortion_df"]
abortion_BI_df = issues_dfs["abortion_BI_df"]
gay_marriage_df = issues_dfs["gay_marriage_df"]
gay_marriage_BI_df = issues_dfs["gay_marriage_BI_df"]
capital_punishment_df = issues_dfs["capital_punishment_df"]
capital_punishment_BI_df = issues_dfs["capital_punishment_BI_df"]

In [None]:
# 20 folds drawn over each frame's unique debate ids (no shuffling).
kf = KFold(n_splits=20)
# NOTE(review): newer scikit-learn releases deprecate `multi_class` - confirm
# against the installed version.
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

models, big_issues, accuracies, confidence_intervals = [], [], [], []

named_classifiers = (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
)
for classifier_name, classifier in named_classifiers:
    # Fold scores are pooled across the three issues, separately for the
    # frames whose key contains "BI" and for those whose key does not.
    scores = []
    scores_BI = []

    for issue, df in issues_dfs.items():
        debate_ids = np.array(list(df.debate_id.unique()))
        bucket = scores_BI if "BI" in issue else scores

        for train_index, test_index in kf.split(debate_ids):
            # get features and outputs
            X_train, y_train, X_test, y_test = get_train_test(
                train_index, test_index, debate_ids, df
            )
            bucket.append(classifier.fit(X_train, y_train).score(X_test, y_test))

    accuracy, ci = get_metrics(scores)
    accuracy_BI, ci_BI = get_metrics(scores_BI)

    # Two result rows per classifier: without ("No") and with ("Yes") big issues.
    models.extend([classifier_name, classifier_name])
    big_issues.extend(["No", "Yes"])
    accuracies.extend([accuracy, accuracy_BI])
    confidence_intervals.extend([ci, ci_BI])

In [None]:
# Regression results table. The LaTeX-flavoured headers are raw strings so
# that "\%" is kept literally rather than being parsed as a Python escape
# sequence.
regression_results = pd.DataFrame(
    {
        "Model": models,
        "Big Issues": big_issues,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)
regression_results

In [None]:
# issues_table_complete = pd.concat([issues_table, regression_results])
# issues_table_complete
# with open("data/tidy/latex_tables/issues_results.txt", "w") as f:
#     f.write(
#         issues_table_complete.to_latex(index=False, float_format="%.2f", position="h")
#     )

# Stacked Model

In [None]:
# Stacked-model inputs, one frame per question, keyed by question label.
stacked_dfs = {
    "q1": pd.read_json("data/tidy/regression_files/q1-stacked.json"),
    "q2": pd.read_json("data/tidy/regression_files/q2-stacked.json"),
    "q3": pd.read_json("data/tidy/regression_files/q3-stacked.json"),
}

# Keep the individual frames available under their own names as well.
q1_stacked = stacked_dfs["q1"]
q2_stacked = stacked_dfs["q2"]
q3_stacked = stacked_dfs["q3"]

In [None]:
# 20 folds drawn over each frame's unique debate ids (no shuffling).
kf = KFold(n_splits=20)
# NOTE(review): newer scikit-learn releases deprecate `multi_class` - confirm
# against the installed version.
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

models, accuracies, confidence_intervals, questions = [], [], [], []

named_classifiers = (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
)
for classifier_name, classifier in named_classifiers:
    for question, df in stacked_dfs.items():
        debate_ids = np.array(list(df.debate_id.unique()))

        # One score per fold; "ground_truth" is passed through to get_train_test.
        scores = []
        folds = kf.split(debate_ids)
        for train_index, test_index in folds:
            X_train, y_train, X_test, y_test = get_train_test(
                train_index, test_index, debate_ids, df, "ground_truth"
            )
            fitted = classifier.fit(X_train, y_train)
            scores.append(fitted.score(X_test, y_test))

        accuracy, ci = get_metrics(scores)

        # One result row per (classifier, question).
        questions.append(question)
        models.append(classifier_name)
        accuracies.append(accuracy)
        confidence_intervals.append(ci)

In [None]:
# Stacked-model results table. The LaTeX-flavoured headers are raw strings so
# that "\%" is kept literally rather than being parsed as a Python escape
# sequence.
stacked_model = pd.DataFrame(
    {
        "Question": questions,
        "Model": models,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)

stacked_model

In [None]:
# with open("data/tidy/latex_tables/stacked_model.txt", "w") as f:
#     f.write(
#         stacked_model.to_latex(index=False, float_format="%.2f", position="h")
#     )

# Stacked Issues

In [None]:
# Stacked big-issue inputs, one frame per issue.
# NOTE: this rebinds abortion_df, gay_marriage_df, capital_punishment_df and
# issues_dfs, which the Regressions section bound to different files; re-run
# that section's loading cell before re-running its cross-validation.
issues_dfs = {
    "abortion_df": pd.read_json(
        "data/tidy/regression_files/abortion_BI_stacked.json"
    ),
    "gay_marriage_df": pd.read_json(
        "data/tidy/regression_files/gay_marriage_BI_stacked.json"
    ),
    "capital_punishment_df": pd.read_json(
        "data/tidy/regression_files/capital_punishment_BI_stacked.json"
    ),
}

# Keep the individual frames available under their own names as well.
abortion_df = issues_dfs["abortion_df"]
gay_marriage_df = issues_dfs["gay_marriage_df"]
capital_punishment_df = issues_dfs["capital_punishment_df"]

In [None]:
# 20 folds drawn over each frame's unique debate ids (no shuffling).
kf = KFold(n_splits=20)
# NOTE(review): newer scikit-learn releases deprecate `multi_class` - confirm
# against the installed version.
LR_clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=500)
GB_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)

models, accuracies, confidence_intervals = [], [], []

named_classifiers = (
    ("Logistic Regression", LR_clf),
    ("Gradient Boosting", GB_clf),
)
for classifier_name, classifier in named_classifiers:
    # Fold scores from all issues are pooled into one list per classifier.
    scores = []

    for df in issues_dfs.values():
        debate_ids = np.array(list(df.debate_id.unique()))

        for train_index, test_index in kf.split(debate_ids):
            # get features and outputs
            X_train, y_train, X_test, y_test = get_train_test(
                train_index, test_index, debate_ids, df
            )
            fitted = classifier.fit(X_train, y_train)
            scores.append(fitted.score(X_test, y_test))

    accuracy, ci = get_metrics(scores)
    models.append(classifier_name)
    accuracies.append(accuracy)
    confidence_intervals.append(ci)

In [None]:
# Stacked-issues results table (rebinds `regression_results`). The
# LaTeX-flavoured headers are raw strings so that "\%" is kept literally
# rather than being parsed as a Python escape sequence.
regression_results = pd.DataFrame(
    {
        "Model": models,
        r"Accuracy (\%)": accuracies,
        r"95\% Confidence Interval": confidence_intervals,
    }
)
regression_results