In [2]:
import pandas as pd
import json
import numpy as np
import tiktoken

In [3]:
# Read the four input tables (debates, rounds, users, votes) in one pass;
# the targets on the left line up positionally with the paths below.
debates_df, rounds_df, users_df, votes_df = (
    pd.read_json(json_path)
    for json_path in (
        "../data/filtered_data/debates_titles.json",
        "../data/processed_data/rounds_df.json",
        "../data/processed_data/users_df.json",
        "../data/filtered_data/votes_filtered_df.json",
    )
)

  users_df = pd.read_json("../data/processed_data/users_df.json")
  users_df = pd.read_json("../data/processed_data/users_df.json")
  users_df = pd.read_json("../data/processed_data/users_df.json")


In [3]:
# Columns of `users_df` that are rendered into each voter's "Label: Value"
# description by `get_user_info`; the list order is the output order.
demographic_columns = [
    "birthday",
    "education",
    "ethnicity",
    "gender",
    "income",
    "party",
    "political_ideology",
    "religious_ideology",
]

In [4]:
def get_debate(rounds_df, debate_id: int) -> str:
    """Serialize every round of debate `debate_id` into one JSON string.

    Rows of `rounds_df` whose `debate_id` column equals `debate_id` are
    grouped by their `round` value, giving a JSON object shaped like:

        {"Round 0": {<side>: <text>, <side>: <text>},
         "Round 1": {<side>: <text>, <side>: <text>},
         ...}

    Rounds and sides appear in the order they are first encountered in
    `rounds_df`; if a side occurs twice within one round, the later row wins.
    A debate with no matching rows yields "{}".
    """
    matching_rows = rounds_df[rounds_df.debate_id == debate_id]

    by_round = {}
    for _, entry in matching_rows.iterrows():
        # Create the per-round mapping on first sight, then store this side's text.
        by_round.setdefault(f"Round {entry['round']}", {})[entry.side] = entry.text

    return json.dumps(by_round)


def get_user_info(
    users_df: pd.DataFrame, voter_id: str, demographic_columns: list[str]
) -> str:
    """Describe user `voter_id` as newline-separated "Label: Value" pairs.

    Args:
        users_df: User table whose index holds the user ids.
        voter_id: Index label of the user to look up in `users_df`.
        demographic_columns: Columns of `users_df` to report, in output order.

    Returns:
        One line per non-missing column, formatted as "Label: Value", where
        the label is the column name with underscores replaced by spaces and
        title-cased (e.g. "political_ideology" -> "Political Ideology").
        Columns whose value is missing for this user are omitted; the result
        is an empty string when every requested column is missing.

    Raises:
        KeyError: If `voter_id` is not in the index of `users_df`, or a
            requested column does not exist.
    """
    known_values = users_df.loc[voter_id, demographic_columns].dropna()

    # Format with an f-string so non-string values (numbers, timestamps, ...)
    # are rendered instead of raising TypeError on `str + non-str`.
    return "\n".join(
        f"{column.replace('_', ' ').title()}: {value}"
        for column, value in known_values.items()
    )

In [5]:
# Factor applied to the token count to estimate a word count
# (presumably the ~750 words per 1000 tokens rule of thumb -- confirm).
WORDS_PER_TOKEN = 750 / 1000
# Debates whose estimated word count exceeds this are discarded.
MAX_DEBATE_WORDS = 1500

# Build the tokenizer once instead of looking it up again for every row.
encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Remove debates whose proposition is exactly "drop" or "skip". The explicit
# copy makes the column assignments below write to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
debates_df = debates_df[~debates_df.proposition.isin(["drop", "skip"])].copy()

# JSON rendering of all rounds of each debate.
debates_df["debate"] = debates_df["debate_id"].map(
    lambda debate_id: get_debate(rounds_df, debate_id)
)
# Estimated length of that rendering in words, derived from its token count.
debates_df["word_count"] = debates_df["debate"].map(
    lambda debate: len(encoder.encode(debate)) * WORDS_PER_TOKEN
)
debates_df = debates_df[debates_df["word_count"] <= MAX_DEBATE_WORDS]

In [6]:
# Pair every remaining debate with its votes. No `on=` is given, so pandas
# joins on all column names the two frames have in common (inner join).
df = debates_df.merge(votes_df)

# Build each voter's demographic description from the `voter_id` column
# directly. Mapping over the column avoids materialising a Series per row and,
# unlike `DataFrame.apply(axis=1)`, still yields a Series when the merge
# result is empty.
df["user_info"] = df["voter_id"].map(
    lambda voter_id: get_user_info(users_df, voter_id, demographic_columns)
)

In [7]:
# Narrow the merged frame to the five columns that get exported.
output_columns = ["debate_id", "voter_id", "proposition", "debate", "user_info"]
df = df.loc[:, output_columns]

In [15]:
# Number of distinct debates present in the debate/voter table.
df["debate_id"].nunique()

118

In [8]:
# Persist the full debate/voter table. `index=False` is not passed, so the
# DataFrame index is written as the first (unnamed) CSV column.
df.to_csv("../data/processed_data/crowdsourcing_full.csv")

In [9]:
# Rows whose proposition mentions "abortion", ignoring case. `regex=False`
# matches the word as a literal substring, and `na=False` counts a missing
# proposition as a non-match so the mask stays strictly boolean (a NaN in the
# mask would make the boolean indexing below fail).
is_abortion = df.proposition.str.lower().str.contains(
    "abortion", regex=False, na=False
)
abortion_debates = df[is_abortion]
abortion_debate_ids = abortion_debates.debate_id.unique()
abortion_debate_ids

array([ 1289,  1382,  1453,  1548,  1805,  1826,  1897,  1911,  2072,
        2407,  3529, 27538,  1609])

In [10]:
# Ids of every debate in `df` that is not an abortion debate, kept in order of
# first appearance. The set only serves as the membership lookup.
abortion_id_lookup = set(abortion_debate_ids)
debate_ids = df.debate_id.unique()
non_abortion_ids = [
    debate_id for debate_id in debate_ids if debate_id not in abortion_id_lookup
]

In [11]:
num_samples = 40

# Seeded generator so that Restart & Run All picks the same debates each time.
selection_rng = np.random.default_rng(42)

# Every abortion debate is always included; the rest of the quota is filled
# with randomly drawn non-abortion debates.
num_to_select = num_samples - len(abortion_debate_ids)

# replace=False draws distinct ids. Sampling with replacement (the default)
# can repeat an id, leaving fewer than `num_samples` unique debates. This
# raises ValueError if there are fewer candidates than `num_to_select`.
selected_ids = list(
    selection_rng.choice(non_abortion_ids, size=num_to_select, replace=False)
) + list(abortion_debate_ids)

In [13]:
# Display how many non-abortion debates are drawn to fill the sample quota.
num_to_select

27

In [12]:
# Export one randomly chosen row (i.e. one voter) per selected debate.
# `random_state` pins the draw so the exported sample is reproducible across
# runs. `reset_index()` keeps the rows' previous index as an "index" column,
# and `to_csv` additionally writes the new index as the first, unnamed column.
(
    df[df.debate_id.isin(selected_ids)]
    .groupby("debate_id")
    .sample(n=1, random_state=42)
    .reset_index()
    .to_csv("../data/processed_data/crowdsourcing_sample.csv")
)