

In [16]:
import json
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

In [53]:
# Input/output locations and the subject whose transcripts are being compiled.
raw_html_in_dir = './raw_html/biden'
json_array_out_dir = './compiled_transcripts/biden'
subject_name = 'biden'

# Ensure the output directory exists (no error if it is already there).
os.makedirs(json_array_out_dir, exist_ok=True)

# Get the list of files in the input directory. os.listdir returns names in
# arbitrary order, so sort them to make the order of the compiled statements
# reproducible between runs, and skip anything that is not a regular file
# (e.g. a sub-directory), which could not be read with open().
file_list = sorted(
    name
    for name in os.listdir(raw_html_in_dir)
    if os.path.isfile(os.path.join(raw_html_in_dir, name))
)


In [54]:

# Parse every file in the input directory and collect its statements as
# {"speaker": ..., "text": ...} records. The list is rebuilt from scratch on
# each run, so this cell is safe to re-execute.
all_interview_data = []
for file_name in file_list:
    # Read the page with an explicit encoding: without one, open() falls back
    # to the platform default, which is not UTF-8 everywhere and can garble or
    # fail on non-ASCII characters.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm against the
    # code that wrote them.
    with open(os.path.join(raw_html_in_dir, file_name), 'r', encoding='utf-8') as f:
        html_string = f.read()

    # The whole file is in memory, so parsing happens after it is closed.
    soup = BeautifulSoup(html_string, "html.parser")

    # One <div> per transcript row, selected by its class attribute.
    media_rows = soup.find_all("div", class_="media topic-media-row mediahover")

    for row in media_rows:
        speaker_label = row.find("div", class_="speaker-label")
        transcript_text = row.find("div", class_="transcript-text-block")

        # Rows missing either the speaker or the text element are skipped.
        if speaker_label and transcript_text:
            speaker = speaker_label.get_text(strip=True)
            text = transcript_text.get_text(strip=True)
            all_interview_data.append({"speaker": speaker, "text": text})

# Serialise the combined list once, after all files are parsed, instead of
# re-serialising the growing list for every file. No cell shown here reads
# interview_json; it is kept so code that expects the name still finds it.
interview_json = json.dumps(all_interview_data, indent=2)

print("compiled {} statements".format(len(all_interview_data)))

compiled 4030 statements


In [55]:
print("compiled {} statements".format(len(all_interview_data)))
# Randomly sample up to 20 statements and print them out (used to determine
# the parsing logic). random.sample draws without replacement, so no statement
# is printed twice, and capping the sample size at the list length avoids an
# error when fewer than 20 (or zero) statements were compiled.
sample_size = min(20, len(all_interview_data))
for statement in random.sample(all_interview_data, sample_size):
    print(statement)
    

compiled 4030 statements
{'speaker': 'Joe Biden', 'text': "I'll tell you a real quick story. One of the things my dad -- when he was up in Philadelphia, he was in his late 70s, he was still working full time. I went up to see him, I was a US senator, and I was -- I had to be in Philly and we're going to go to a restaurant to lunch. And I had two of my staff with me, were heading to lunch, who knew my dad well."}
{'speaker': 'Joe Biden', 'text': "More women than men in our administration by 50-some -- 52 or whatever percent. More African American judges has been appointed, not just Supreme Court, but all through the lower courts as well. When you have 24 out of every 100 students in the grade schools in America speaking Spanish, Spanish speakers, how in God's name is it not in the interest of the United States to reach down and lift those people up, just like Irish immigrants in the 1800s or just like Italians after World War II?"}
{'speaker': 'Joe Biden', 'text': "Especially in the bus

In [56]:
# Statement count before cleaning, used for the summary printed at the end.
inital_size = len(all_interview_data)

# Matches one "[...]" span; non-greedy, so each bracket pair is removed on its
# own rather than everything between the first "[" and the last "]".
bracketed_span = re.compile(r'\[.*?\]')

cleaned_interview_data = []
for statement in all_interview_data:
    # Remove all text in brackets, then trim the whitespace the removal leaves
    # at either end so it is neither counted by the length check below nor
    # written to the output.
    text = bracketed_span.sub('', statement['text']).strip()

    # Drop statements of 5 characters or fewer; after stripping, this also
    # covers empty and whitespace-only statements.
    if len(text) > 5:
        # Build a new record instead of mutating the parsed one in place.
        cleaned_interview_data.append(dict(statement, text=text))

all_interview_data = cleaned_interview_data

print("removed {} statements".format(inital_size - len(all_interview_data)))


In [57]:
# Write the compiled statements to "<subject_name>.json" inside the output
# directory, pretty-printed with a two-space indent.
output_path = os.path.join(json_array_out_dir, subject_name + '.json')
with open(output_path, 'w') as json_file:
    json.dump(all_interview_data, json_file, indent=2)
    