In [1]:
import json
import glob

import pandas as pd
import numpy as np

from pydantic import BaseModel
from openai import OpenAI

from tqdm import tqdm

# Register tqdm's `progress_apply` methods on pandas objects so that long
# applies can display a progress bar.
tqdm.pandas()

In [2]:
# Shared API client for the question-cleaning calls.
# NOTE(review): no api_key is passed here, so credentials presumably come from
# the environment — confirm before running on a new machine.
client = OpenAI()

In [3]:
# System prompt for the question-cleaning call: task instructions followed by
# four few-shot Statement/Question pairs. The example statements are kept
# verbatim (including their original punctuation); only the target questions
# are curated, so they must be well-formed.
context_prompt = """
You are a QUD parsing expert.
Your task is to clean and extract the central question from a moderator's statement.
The statement may already be clear and concise. In that case, simply repeat it.
The statement may be verbose and contain irrelevant information. In that case, remove the irrelevant information.
Stay as close to the original statement as possible.

Examples for this question generation are:

Statement: Mr. Vice President, your campaign stresses the value of your eight year experience, and the question arises as to whether that experience was as an observer or as a participant or as an initiator of policy- making. Would you tell us please specifically what major proposals you have made in the last eight years that have been adopted by the Administration?
Question: What major proposals you have made in the last eight years that have been adopted by the Administration?

Statement: New question. Are there issues of character that distinguish you from Vice President Gore?
Question: Are there issues of character that distinguish you from your opponent?

Statement: Mr. Vice President, Im struck by your discussion of women and the sanctity of life. And it leads me to recall your own phrase, that you are haunted by the lives which children in our inner cities live. Certainly the evidence is compelling. Theres an explosion of single parent families. And by any measure, these single parent families, many with unwanted children, are the source of poverty, school drop outs, crime, which many people in the inner city simply feel is out of control. If it haunts you so, why over the eight years of the Reagan- Bush administration have so many programs designed to help the inner cities been eliminated or cut?
Question: Why over the eight years of your administration have so many programs designed to help the inner cities been eliminated or cut?

Statement: How do you bring back— specifically bring back jobs, American manufacturers? How do you make them bring the jobs back?
Question: How do you bring back American manufacturing jobs?

Please generate one question and one question only without any prefaces.
"""

In [4]:
def get_clean_question(prompt, model="gpt-4o-mini", temperature=0):
    """Extract the central question from a moderator's statement.

    Sends ``context_prompt`` as the system message and ``prompt`` as the user
    message, then returns the text of the first returned choice.

    Args:
        prompt: The raw moderator statement to clean.
        model: Name of the chat model to query. Defaults to the model this
            notebook has been using.
        temperature: Sampling temperature. Defaults to 0 so that re-running
            the cleaning over the same statement is as repeatable as the API
            allows.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    # No ``response_format`` is supplied, so the structured-output ``parse``
    # helper is not needed; a plain chat completion returns the same field.
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": context_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return completion.choices[0].message.content

In [5]:
data_base_dir = "data/by_date"

# Load one DataFrame per debate, keyed by the date found in its first row.
# Only `*.csv` entries are read so that stray files or sub-directories in the
# data folder cannot break `pd.read_csv`.
dfs = {}
for fpath in sorted(glob.glob(data_base_dir + "/*.csv")):
    df = pd.read_csv(fpath)
    date = df["date"].iloc[0]
    dfs[date] = df

# Derive the date list from the dict keys: it is sorted and contains each date
# exactly once, even if two files happened to carry the same date.
dates = sorted(dfs)

In [6]:
# Display the frame belonging to the first entry of `dates`.
first_date = dates[0]
dfs[first_date]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert,party,question_clean
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,,,,,,Democratic,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,,,,,,Republican,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Howard Smith,Three minutes and twenty seconds for each cand...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
64,Richard Nixon,"Thank you, Mr. Smith. Senator Kennedy. First o...",Pres,1960,1960-09-26,1,,,,,,,Republican,
65,Howard Smith,"Senator Kennedy, your conclusion.",Pres,1960,1960-09-26,0,,,,,,,Moderator,
66,John Kennedy,The point was made by Mr. Nixon that the Sovie...,Pres,1960,1960-09-26,1,,,,,,,Democratic,


In [7]:
# Total number of rows across all loaded frames.
sum(map(len, dfs.values()))

2716

In [8]:
# Tab-separated overview per debate: position, date, row count, column count.
for i, d in enumerate(dates):
    n_rows, n_cols = dfs[d].shape
    print(i, d, n_rows, n_cols, sep="\t")

0	1960-09-26	68	14
1	1976-09-23	94	14
2	1980-09-21	76	14
3	1984-10-07	134	14
4	1988-09-25	161	14
5	1992-10-11	92	14
6	1996-10-06	144	14
7	2000-10-03	166	14
8	2004-09-30	142	14
9	2008-09-26	189	14
10	2012-10-03	210	14
11	2016-09-26	308	14
12	2020-09-29	932	14


In [9]:
# Position in `dates` of the debate used by the single-file cells that follow.
i = 0

In [12]:
# Read the selected debate's CSV from disk and preview its first ten rows.
csv_path = f"{data_base_dir}/{dates[i]}.csv"
pd.read_csv(csv_path).head(10)

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert,party,question_clean
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,,,,,,Democratic,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,,,,,,Republican,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
5,Bob Fleming,"Senator, the Vice President in his campaign ha...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
6,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,Which political party's policies and leadershi...,"Senator, the Vice President in his campaign ha...",0.955318,0.945188,0.880888,0.71502,Democratic,Which point of view and which party do we want...
7,Howard Smith,"Mr. Nixon, would you like to comment on that s...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
8,Richard Nixon,I have no comment. Mr.,Pres,1960,1960-09-26,1,,,,,,,Republican,
9,Howard Smith,The next question; Mr. Novins.,Pres,1960,1960-09-26,0,,,,,,,Moderator,


In [11]:
def _clean_or_none(row):
    """Return the cleaned question for a row, or None when it has no question."""
    raw_question = row["question"]
    if pd.isna(raw_question):
        return None
    return get_clean_question(raw_question)


# Fill `question_clean` for the selected debate only, showing a progress bar.
selected_frame = dfs[dates[i]]
selected_frame["question_clean"] = selected_frame.progress_apply(
    _clean_or_none, axis=1
)

100%|██████████| 68/68 [00:08<00:00,  7.87it/s]


In [13]:
# Preview the first ten rows of the in-memory frame for the selected debate.
selected_date = dates[i]
dfs[selected_date].head(10)

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert,party,question_clean
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,,,,,,Democratic,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,,,,,,Republican,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
5,Bob Fleming,"Senator, the Vice President in his campaign ha...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
6,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,Which political party's policies and leadershi...,"Senator, the Vice President in his campaign ha...",0.955318,0.945188,0.880888,0.71502,Democratic,Why do you think people should vote for you ra...
7,Howard Smith,"Mr. Nixon, would you like to comment on that s...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
8,Richard Nixon,I have no comment. Mr.,Pres,1960,1960-09-26,1,,,,,,,Republican,
9,Howard Smith,The next question; Mr. Novins.,Pres,1960,1960-09-26,0,,,,,,,Moderator,


In [14]:
# Write the selected debate's frame to its per-date CSV, without the index.
out_path = f"{data_base_dir}/{dates[i]}.csv"
dfs[dates[i]].to_csv(out_path, index=False)

In [16]:
# Read the selected debate's CSV back from disk and preview its first ten rows.
saved_path = f"{data_base_dir}/{dates[i]}.csv"
pd.read_csv(saved_path).head(10)

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert,party,question_clean
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,,,,,,Democratic,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,,,,,,Moderator,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,,,,,,Republican,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
5,Bob Fleming,"Senator, the Vice President in his campaign ha...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
6,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,Which political party's policies and leadershi...,"Senator, the Vice President in his campaign ha...",0.955318,0.945188,0.880888,0.71502,Democratic,Why do you think people should vote for you ra...
7,Howard Smith,"Mr. Nixon, would you like to comment on that s...",Pres,1960,1960-09-26,0,,,,,,,Moderator,
8,Richard Nixon,I have no comment. Mr.,Pres,1960,1960-09-26,1,,,,,,,Republican,
9,Howard Smith,The next question; Mr. Novins.,Pres,1960,1960-09-26,0,,,,,,,Moderator,


In [17]:
def _clean_question_or_none(question):
    """Clean one moderator question; entries without a question yield None."""
    if pd.isna(question):
        return None
    return get_clean_question(question)


# Clean every debate and write each one out as soon as it is finished, so the
# dates already processed are on disk if a later request fails.
# `i` stays bound as the loop index, matching the notebook's earlier cells.
for i, d in enumerate(dates):
    frame = dfs[d]
    # Only the `question` column is read, so apply over that column directly
    # instead of building a full row object for every record.
    frame["question_clean"] = frame["question"].progress_apply(
        _clean_question_or_none
    )
    frame.to_csv(f"{data_base_dir}/{d}.csv", index=False)

100%|██████████| 68/68 [00:07<00:00,  9.49it/s]
100%|██████████| 94/94 [00:13<00:00,  6.80it/s]
100%|██████████| 76/76 [00:08<00:00,  8.94it/s]
100%|██████████| 134/134 [00:23<00:00,  5.81it/s]
100%|██████████| 161/161 [00:18<00:00,  8.93it/s]
100%|██████████| 92/92 [00:07<00:00, 12.38it/s]
100%|██████████| 144/144 [00:22<00:00,  6.42it/s]
100%|██████████| 166/166 [00:19<00:00,  8.62it/s]
100%|██████████| 142/142 [00:17<00:00,  8.03it/s]
100%|██████████| 189/189 [00:12<00:00, 14.77it/s]
100%|██████████| 210/210 [00:20<00:00, 10.16it/s]
100%|██████████| 308/308 [00:17<00:00, 17.88it/s]
100%|██████████| 932/932 [00:28<00:00, 32.34it/s]
