In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import re

In [2]:
def scrape_transcript_to_csv(url, csv_file_path, debate_number, timeout=30):
    """Scrape one debate transcript page and write its speaker turns to a CSV.

    Every ``<p>`` whose text looks like ``Speaker: (time) text`` starts a new
    row. Any other non-empty paragraph is treated as a continuation and is
    appended (space-separated) to the text of the most recent row.

    Args:
        url: Transcript page to download.
        csv_file_path: Destination CSV. Truncated and rewritten when
            ``debate_number == 1``; appended to otherwise.
        debate_number: Value stored in the ``debate`` column of every row.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. Success and failure are reported with ``print``; nothing is
        written when the page is not a 200 or no turn could be parsed.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts) is
        propagated unchanged.
    """
    import os  # local import: only needed for the header check below

    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    rows = []
    current_speaker = None

    for paragraph in soup.find_all('p'):
        # NOTE(review): `.` does not cross line breaks, so only the first line
        # after the time stamp is captured if a paragraph spans several lines.
        match = re.match(r'^(.*?):\s*\((.*?)\)\s*(.*)', paragraph.text)
        if match:
            current_speaker, time_stamp, text = match.groups()
            rows.append([current_speaker.strip(), time_stamp.strip(), text.strip(), debate_number])
            continue

        continuation = paragraph.text.strip()
        # Skip empty paragraphs so they do not leave trailing spaces behind.
        if current_speaker and rows and continuation:
            rows[-1][2] += ' ' + continuation

    if not rows:
        print("Failed to extract data correctly.")
        return

    overwrite = debate_number == 1
    # When appending, still emit the header if the target is missing or empty
    # (e.g. the first debate failed), otherwise the file has no column names.
    write_header = (
        overwrite
        or not os.path.exists(csv_file_path)
        or os.path.getsize(csv_file_path) == 0
    )

    with open(csv_file_path, 'w' if overwrite else 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        if write_header:
            csv_writer.writerow(['speaker', 'time', 'text', 'debate'])
        csv_writer.writerows(rows)

    print(f"Data from {url} appended to CSV file: {csv_file_path}")

In [3]:
def scrape_transcript_to_dataframe(url, debate_number, csv_file_path, timeout=30):
    """Scrape one debate transcript page into a DataFrame and save it as CSV.

    Every ``<p>`` whose text looks like ``Speaker: (time) text`` starts a new
    row. Any other non-empty paragraph is treated as a continuation and is
    appended (space-separated) to the text of the most recent row.

    Args:
        url: Transcript page to download.
        debate_number: Value stored in the ``debate`` column of every row.
        csv_file_path: Destination CSV. Truncated and rewritten when
            ``debate_number == 1``; appended to otherwise.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The DataFrame that was written (columns ``speaker``, ``time``,
        ``text``, ``debate``), or ``None`` when the page is not a 200 or no
        turn could be parsed. Both failure cases are reported with ``print``.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts) is
        propagated unchanged.
    """
    import os  # local import: only needed for the header check below

    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    data = {'speaker': [], 'time': [], 'text': [], 'debate': []}
    current_speaker = None

    for paragraph in soup.find_all('p'):
        # NOTE(review): `.` does not cross line breaks, so only the first line
        # after the time stamp is captured if a paragraph spans several lines.
        match = re.match(r'^(.*?):\s*\((.*?)\)\s*(.*)', paragraph.text)
        if match:
            current_speaker, time_stamp, text = match.groups()
            data['speaker'].append(current_speaker.strip())
            data['time'].append(time_stamp.strip())
            data['text'].append(text.strip())
            data['debate'].append(debate_number)
            continue

        continuation = paragraph.text.strip()
        # Skip empty paragraphs so they do not leave trailing spaces behind.
        if current_speaker and data['text'] and continuation:
            data['text'][-1] += ' ' + continuation

    # All four lists grow together, so checking one of them is enough.
    if not data['speaker']:
        print("Failed to extract data correctly.")
        return None

    df = pd.DataFrame(data)

    overwrite = debate_number == 1
    # When appending, still emit the header if the target is missing or empty
    # (e.g. the first debate failed), otherwise the file has no column names.
    write_header = (
        overwrite
        or not os.path.exists(csv_file_path)
        or os.path.getsize(csv_file_path) == 0
    )
    df.to_csv(
        csv_file_path,
        mode='w' if overwrite else 'a',
        index=False,
        header=write_header,
        encoding='utf-8',
    )
    print(f"Data from {url} appended to CSV file: {csv_file_path}")
    return df

In [4]:
# Transcript pages in debate order; position in this list (1-based) becomes
# the value of the `debate` column.
urls = [
    "https://www.rev.com/blog/transcripts/donald-trump-joe-biden-1st-presidential-debate-transcript-2020",
    "https://www.rev.com/blog/transcripts/donald-trump-joe-biden-final-presidential-debate-transcript-2020",
]
# Single CSV shared by all debates: debate 1 rewrites it, later ones append.
csv_file_path = 'debate_transcripts.csv'

for i, url in enumerate(urls, 1):
    scrape_transcript_to_dataframe(url, i, csv_file_path)

Data from https://www.rev.com/blog/transcripts/donald-trump-joe-biden-1st-presidential-debate-transcript-2020 appended to CSV file: debate_transcripts.csv
Data from https://www.rev.com/blog/transcripts/donald-trump-joe-biden-final-presidential-debate-transcript-2020 appended to CSV file: debate_transcripts.csv


In [5]:
# Reload the scraped rows from disk, then show how many rows carry each
# speaker label exactly as it was written in the transcripts.
df = pd.read_csv(csv_file_path)
df['speaker'].value_counts()

speaker
President Donald J. Trump    314
Vice President Joe Biden     250
Chris Wallace                225
Donald Trump                 193
Kristen Welker               188
Joe Biden                    131
Name: count, dtype: int64

In [6]:
# Merge the two spellings of each candidate and fold both moderators into a
# single 'Moderator' label.
replacement = {
    'President Donald J. Trump': 'Donald Trump',
    'Vice President Joe Biden': 'Joe Biden',
    'Chris Wallace': 'Moderator',
    'Kristen Welker': 'Moderator',
}
# Labels that are not keys of `replacement` are left exactly as they are.
df['speaker'] = df['speaker'].replace(replacement)
df['speaker'].value_counts()

speaker
Donald Trump    507
Moderator       413
Joe Biden       381
Name: count, dtype: int64

In [7]:
# df_filtered = df[df['speaker'] != "Moderator"].reset_index(drop=True)

In [8]:
def time_format(t):
    """Pad a 'minutes:seconds' stamp with a zero hour so it has three fields.

    Args:
        t: Time stamp string such as ``'01:20'`` or ``'01:02:03'``. Values
            that are not strings (e.g. ``None`` or NaN from a missing cell)
            are accepted and returned untouched.

    Returns:
        ``'00:' + t`` when ``t`` is a string with exactly one colon,
        otherwise ``t`` unchanged.
    """
    if not isinstance(t, str):
        return t
    # Count the separators instead of the characters, so a stamp without
    # zero-padding such as '1:20' is recognised as minutes:seconds as well.
    if t.count(':') == 1:
        t = '00:' + t
    return t

# Bring every stamp to three fields with `time_format`, parse it as
# hours:minutes:seconds, and keep only the time-of-day part of the result.
df['time'] = pd.to_datetime(
    df['time'].map(time_format),
    format='%H:%M:%S',
).dt.time

In [9]:
# Overwrite the scraped file with the cleaned frame. index=False keeps the
# same four columns the file was created with; the default would also write
# the row index as an extra unnamed first column.
df.to_csv(csv_file_path, index=False)