In [30]:
import os
from bs4 import BeautifulSoup
import json

In [31]:
subject = "Joe Biden"
compiled_transcripts_raw_file = "compiled_transcripts/biden/biden.json"
output_dir = "finalized_data/biden"
output_file_prefix = "biden"

# ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)





# example: [
# {'speaker': 'Joe Biden', 'text': "And let's see if we can get an agreement to kickstart this. And then fight over what's left and see if I can get it done without Republicans, if need be."},
# {'speaker': 'Josh Boak', 'text': "I'll answer whatever you need."}
# ]
compiled_convos = []
with open(compiled_transcripts_raw_file, "r") as f:
    compiled_transcripts_raw = json.load(f)
    for convo in compiled_transcripts_raw:
        compiled_convos.append(
            {
                "speaker": convo["speaker"],
                "text": convo["text"],
            }
        )


# preview the data by printing the first 10 conversations
for i in range(10):
    print(compiled_convos[i])

{'speaker': 'D.L. Hughley', 'text': 'Hello, President Biden. How are you, young man?'}
{'speaker': 'Joe Biden', 'text': "How are you doing? It's been a while, man. How are you doing?"}
{'speaker': 'D.L. Hughley', 'text': 'Yeah. The last time we talked, I had COVID, and we actually talked so long. By the time we finished, my 14-day quarantine was up. So -- so, we had a good conversation. How are you doing?'}
{'speaker': 'Joe Biden', 'text': "I'm doing well. I'm doing well."}
{'speaker': 'D.L. Hughley', 'text': "I'm curious because it is Election Day. And the last time we talked, you weren't president, and now, you are. And I think I've been certainly impressed by your legislative accomplishments. I think you've done things. I think you've basically kept a lot of the promises you made on the campaign trail. Do you think that that's resonated with the American people?"}
{'speaker': 'Joe Biden', 'text': 'Well, you know, people are really hurting, and -- and they were hurting a lot when we 

In [57]:
# here we create a series of JSON cronological conversations with varying context lengths in the format of
# {
# context:
# 	“”
# 		INTERVIEWER: INTERVIEWER_RESPONSE_1
# 		BIDEN: BIDEN_RESPONSE_1
# 		up to n=3, always ending with INTERVIEWER
#       ...
# 		INTERVIEWER: INTERVIEWER_RESPONSE_n
# 	“”
# subject: “Joe Biden”
# response (the response to the last question in the context): “BIDEN_RESPONSE_n”
# }

# we'll create this as follows:
# for each text spoken the subject, we'll create n-1 length datas with the previous (n-1)*2 texts
# so if n=3, we'll create a dataset with context of the previous 5 texts, the previous 3 texts, and the previous 1 text

dataset = []
i = 5
n = 3
while i < len(compiled_convos):
    # ignore case
    if subject.lower() in compiled_convos[i]["speaker"].lower():
        for m in range(n):
            # print(compiled_convos[i - n + m * 2]["speaker"])
            if i - n + m * 2 >= 0:
                # if the speaker is the subject, we'll add the speaker name otherwise we'll replace the name with "INTERVIEWER"

                name = "PersonA" if subject.lower(
                ) not in compiled_convos[i - n + m * 2]["speaker"].lower() else compiled_convos[i - n + m * 2]["speaker"]
                dataset.append(
                    {

                        "context": "".join(
                            [
                                name
                                + ": "
                                + compiled_convos[i - n + m * 2]["text"]
                                + "\n"
                                for m in range(n - m)
                            ]
                        ),
                        "subject": subject,
                        "response": compiled_convos[i]["text"],
                    }
                )

    i += 1

for i in range(10):
    print(dataset[i])


{'context': "Person A: Yeah. The last time we talked, I had COVID, and we actually talked so long. By the time we finished, my 14-day quarantine was up. So -- so, we had a good conversation. How are you doing?\nPerson A: I'm curious because it is Election Day. And the last time we talked, you weren't president, and now, you are. And I think I've been certainly impressed by your legislative accomplishments. I think you've done things. I think you've basically kept a lot of the promises you made on the campaign trail. Do you think that that's resonated with the American people?\nPerson A: As you know, you know, we -- we inherited a -- a hell of a mess. Unemployment was sky-high. We cut it to more than half. Black unemployment was sky-high. We've cut it down to one of the lowest rates it's ever been. You know, the fundamental thing I've tried to do is change the whole dynamic how we look at the economy.\n", 'subject': 'Joe Biden', 'response': 'Well, you know, people are really hurting, an

In [58]:
test_split = 0.2
train_dataset = dataset[: int(len(dataset) * (1 - test_split))]
test_dataset = dataset[int(len(dataset) * (1 - test_split)) :]
print('train dataset size:', len(train_dataset))
print('test dataset size:', len(test_dataset))
print('total dataset size:', len(dataset))

# save the dataset
out_path = os.path.join(output_dir, output_file_prefix + "_all.json")
with open(out_path, "w") as f:
    json.dump(dataset, f)
out_path = os.path.join(output_dir, output_file_prefix + "_train.json")
with open(out_path, "w") as f:
    json.dump(train_dataset, f)
out_path = os.path.join(output_dir, output_file_prefix + "_test.json")
with open(out_path, "w") as f:
    json.dump(test_dataset, f)



train dataset size: 5320
test dataset size: 1331
total dataset size: 6651
