In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas' SettingWithCopyWarning,
# which is the usual signal that an assignment went to a temporary copy.
# Consider narrowing it to specific categories once the notebook is clean.
warnings.filterwarnings("ignore")

In [2]:
def change_speaker_names(x):
    """Replace a formal speaker title with the short speaker name.

    'Vice President Joe Biden' maps to 'Joe Biden' and
    'President Donald J. Trump' maps to 'Donald Trump'. Any other value
    is returned untouched.
    """
    renames = (
        ('Vice President Joe Biden', 'Joe Biden'),
        ('President Donald J. Trump', 'Donald Trump'),
    )
    for formal_title, short_name in renames:
        if x == formal_title:
            return short_name
    return x

def add_hour(x):
    """Prefix '00:' to a timestamp string that has exactly two ':'-separated fields.

    A value such as '01:20' becomes '00:01:20'; strings with any other
    number of fields are returned as they are.
    """
    is_two_field = x.count(':') == 1
    return f'00:{x}' if is_two_field else x

def overflow(x):
    """Carry excess seconds into minutes and excess minutes into hours.

    Parameters
    ----------
    x : mapping
        Anything indexable by 'hour', 'minute' and 'second' (a DataFrame
        row or a plain dict).

    Returns
    -------
    tuple
        ``(hour, minute, second)`` where seconds >= 60 have been carried
        into the minutes and minutes >= 60 into the hours. Negative
        values are left as they are; borrowing is not handled here.
    """
    hour, minute, second = x['hour'], x['minute'], x['second']
    if second >= 60:
        minute += second // 60
        second = second % 60
    if minute >= 60:
        # Add the carry to the existing hour. Plain assignment here used to
        # throw away whatever hour value the row already had.
        hour += minute // 60
        minute = minute % 60
    return hour, minute, second

def underflow(x):
    """Borrow from minutes and hours to remove negative seconds and minutes.

    Parameters
    ----------
    x : mapping
        Anything indexable by 'hour', 'minute' and 'second' (a DataFrame
        row or a plain dict).

    Returns
    -------
    tuple
        ``(hour, minute, second)`` with seconds and minutes brought back
        into the 0-59 range when they were negative. If the overall time
        is negative, the returned hour is negative.
    """
    hour, minute, second = x['hour'], x['minute'], x['second']
    if second < 0:
        # Floor division makes the quotient negative, so adding it borrows
        # the required minutes while the remainder lands in 0..59.
        borrowed_minutes, second = divmod(second, 60)
        minute += borrowed_minutes
    if minute < 0:
        # Borrow from the existing hour. The earlier form replaced the hour
        # with abs(minute // 60), discarding the row's real hour value.
        borrowed_hours, minute = divmod(minute, 60)
        hour += borrowed_hours
    return hour, minute, second

def convert_time(x):
    """Format the 'hour', 'minute' and 'second' entries of ``x`` as 'HH:MM:SS'.

    Each field is converted with ``str`` and left-padded with zeros to at
    least two characters.
    """
    padded_fields = [str(x[field]).zfill(2) for field in ('hour', 'minute', 'second')]
    return ':'.join(padded_fields)
        
def time_in_seconds(x):
    """Return the total number of seconds for the 'hour', 'minute' and 'second' entries of ``x``."""
    seconds_per_unit = (('hour', 3600), ('minute', 60), ('second', 1))
    return sum(factor * x[unit] for unit, factor in seconds_per_unit)

In [3]:
# The three transcripts are read with the same column types: speaker, the raw
# timestamp (stored in the 'minute' column) and the spoken text, all as strings.
_transcript_dtypes = {'speaker' : str, 'minute' : str, 'text' : str}

debate1 = pd.read_csv(
    './data/kaggle_debate/us_election_2020_1st_presidential_debate.csv',
    dtype=_transcript_dtypes,
)
debate2 = pd.read_csv(
    './data/kaggle_debate/us_election_2020_2nd_presidential_debate.csv',
    dtype=_transcript_dtypes,
)
debate_vp = pd.read_csv(
    './data/kaggle_debate/us_election_2020_vice_presidential_debate.csv',
    dtype=_transcript_dtypes,
)

In [4]:
# A little preprocessing
####################################
## Debate #1
# The raw timestamp lives in the 'minute' column as text. Normalise it to
# HH:MM:SS, then split it into integer hour / minute / second columns.
# NOTE: 'minute' is overwritten with integers here, so this cell cannot be
# re-run without first re-running the loading cell.
debate1['time'] = debate1['minute'].apply(add_hour)
debate1['second'] = debate1['time'].apply(lambda x: int(x.split(':')[2]))
debate1['minute'] = debate1['time'].apply(lambda x: int(x.split(':')[1]))
debate1['hour'] = debate1['time'].apply(lambda x: int(x.split(':')[0]))
# Explicit copy so the column assignments below act on an independent frame.
debate1 = debate1[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Change speaker names
debate1['speaker'] = debate1['speaker'].apply(change_speaker_names)

# Fixes timing issues (of resets): rows from position 179 onwards are offset
# by the timestamp of row 178. Writes go through a single .loc call; chained
# indexing (df.iloc[a:][col] = ...) assigns to a temporary object and is not
# guaranteed to reach the original frame.
hour, minute, second = debate1.iloc[[178]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
after_reset = debate1.index[179:]
debate1.loc[after_reset, 'hour'] += hour
debate1.loc[after_reset, 'minute'] += minute
debate1.loc[after_reset, 'second'] += second

# Fixes potential overflow
temp = debate1.loc[after_reset, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate1.loc[after_reset, 'hour'] = temp[0]
debate1.loc[after_reset, 'minute'] = temp[1]
debate1.loc[after_reset, 'second'] = temp[2]

# Shift everything so we start at 00:00
hour, minute, second = debate1.iloc[0][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate1['hour'] = debate1['hour'] - hour
debate1['minute'] = debate1['minute'] - minute
debate1['second'] = debate1['second'] - second

# Fixes potential underflow
temp = debate1[['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate1['hour'] = temp[0]
debate1['minute'] = temp[1]
debate1['second'] = temp[2]

# Fix overall time: rebuild the display string and add a total-seconds column.
debate1['time'] = debate1[['hour', 'minute', 'second']].apply(convert_time, axis=1)
debate1['time_seconds'] = debate1[['hour', 'minute', 'second']].apply(time_in_seconds, axis=1)

In [5]:
####################################
## Debate #2
# The raw timestamp lives in the 'minute' column as text. Normalise it to
# HH:MM:SS, then split it into integer hour / minute / second columns.
# NOTE: 'minute' is overwritten with integers here, so this cell cannot be
# re-run without first re-running the loading cell.
debate2['time'] = debate2['minute'].apply(add_hour)
debate2['second'] = debate2['time'].apply(lambda x: int(x.split(':')[2]))
debate2['minute'] = debate2['time'].apply(lambda x: int(x.split(':')[1]))
debate2['hour'] = debate2['time'].apply(lambda x: int(x.split(':')[0]))
# Explicit copy so the column assignments below act on an independent frame.
debate2 = debate2[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Fixes first timing issue (of resets): rows at positions 89..336 are offset
# by the timestamp of row 88. Writes go through a single .loc call; chained
# indexing (df.iloc[a:b][col] = ...) assigns to a temporary object and is not
# guaranteed to reach the original frame.
hour, minute, second = debate2.iloc[[88]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
first_segment = debate2.index[89:337]
debate2.loc[first_segment, 'hour'] += hour
debate2.loc[first_segment, 'minute'] += minute
debate2.loc[first_segment, 'second'] += second

# Fixes potential overflow
temp = debate2.loc[first_segment, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate2.loc[first_segment, 'hour'] = temp[0]
debate2.loc[first_segment, 'minute'] = temp[1]
debate2.loc[first_segment, 'second'] = temp[2]

# Fixes second timing issue (of resets): rows from position 337 onwards are
# offset by the already-corrected timestamp of row 336.
hour, minute, second = debate2.iloc[[336]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
second_segment = debate2.index[337:]
debate2.loc[second_segment, 'hour'] += hour
debate2.loc[second_segment, 'minute'] += minute
debate2.loc[second_segment, 'second'] += second

# Fixes potential overflow
temp = debate2.loc[second_segment, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate2.loc[second_segment, 'hour'] = temp[0]
debate2.loc[second_segment, 'minute'] = temp[1]
debate2.loc[second_segment, 'second'] = temp[2]

# Shift everything so we start at 00:00
hour, minute, second = debate2.iloc[0][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate2['hour'] = debate2['hour'] - hour
debate2['minute'] = debate2['minute'] - minute
debate2['second'] = debate2['second'] - second

# Fixes potential underflow
temp = debate2[['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate2['hour'] = temp[0]
debate2['minute'] = temp[1]
debate2['second'] = temp[2]

# Fix overall time: rebuild the display string and add a total-seconds column.
debate2['time'] = debate2[['hour', 'minute', 'second']].apply(convert_time, axis=1)
debate2['time_seconds'] = debate2[['hour', 'minute', 'second']].apply(time_in_seconds, axis=1)

In [6]:
####################################
## Debate VP
# The raw timestamp lives in the 'minute' column as text. Normalise it to
# HH:MM:SS, then split it into integer hour / minute / second columns.
# NOTE: 'minute' is overwritten with integers here, so this cell cannot be
# re-run without first re-running the loading cell.
debate_vp['time'] = debate_vp['minute'].apply(add_hour)
debate_vp['second'] = debate_vp['time'].apply(lambda x: int(x.split(':')[2]))
debate_vp['minute'] = debate_vp['time'].apply(lambda x: int(x.split(':')[1]))
debate_vp['hour'] = debate_vp['time'].apply(lambda x: int(x.split(':')[0]))
# Explicit copy so the column assignments below act on an independent frame.
debate_vp = debate_vp[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Rows from position 135 onwards are adjusted in two steps. All writes go
# through a single .loc call; chained indexing (df.iloc[a:][col] = ...)
# assigns to a temporary object and is not guaranteed to reach the frame.
reset_rows = debate_vp.index[135:]

# Adjust the reset to 00:00: subtract row 135's own timestamp from the segment.
hour, minute, second = debate_vp.iloc[[135]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate_vp.loc[reset_rows, 'hour'] -= hour
debate_vp.loc[reset_rows, 'minute'] -= minute
debate_vp.loc[reset_rows, 'second'] -= second

# Fixes potential underflow
temp = debate_vp.loc[reset_rows, ['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate_vp.loc[reset_rows, 'hour'] = temp[0]
debate_vp.loc[reset_rows, 'minute'] = temp[1]
debate_vp.loc[reset_rows, 'second'] = temp[2]

# Fixes first timing issue (of resets): offset the segment by row 134's timestamp.
hour, minute, second = debate_vp.iloc[[134]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate_vp.loc[reset_rows, 'hour'] += hour
debate_vp.loc[reset_rows, 'minute'] += minute
debate_vp.loc[reset_rows, 'second'] += second

# Fixes potential overflow
temp = debate_vp.loc[reset_rows, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate_vp.loc[reset_rows, 'hour'] = temp[0]
debate_vp.loc[reset_rows, 'minute'] = temp[1]
debate_vp.loc[reset_rows, 'second'] = temp[2]

# Shift everything so we start at 00:00
hour, minute, second = debate_vp.iloc[0][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate_vp['hour'] = debate_vp['hour'] - hour
debate_vp['minute'] = debate_vp['minute'] - minute
debate_vp['second'] = debate_vp['second'] - second

# Fixes potential underflow
temp = debate_vp[['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate_vp['hour'] = temp[0]
debate_vp['minute'] = temp[1]
debate_vp['second'] = temp[2]

# Fix overall time: rebuild the display string and add a total-seconds column.
debate_vp['time'] = debate_vp[['hour', 'minute', 'second']].apply(convert_time, axis=1)
debate_vp['time_seconds'] = debate_vp[['hour', 'minute', 'second']].apply(time_in_seconds, axis=1)

In [7]:
# Rows of the first debate where the speaker is Joe Biden.
debate1.loc[debate1['speaker'].eq('Joe Biden')]

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
2,Joe Biden,00:01:29,0,1,29,"How you doing, man?",89
4,Joe Biden,00:01:31,0,1,31,I’m well.,91
9,Joe Biden,00:04:09,0,4,9,"Well, first of all, thank you for doing this a...",249
11,Joe Biden,00:04:16,0,4,16,The American people have a right to have a say...,256
12,Joe Biden,00:04:52,0,4,52,"Now, what’s at stake here is the President’s m...",292
...,...,...,...,...,...,...,...
751,Joe Biden,01:28:42,1,28,42,Five states have had mail-in ballots for the l...,5322
756,Joe Biden,01:29:41,1,29,41,I am concerned that any court would settle thi...,5381
761,Joe Biden,01:30:24,1,30,24,Mail service delivers [crosstalk 01:07:21] 185...,5424
779,Joe Biden,01:32:35,1,32,35,Yes. And here’s the deal. We count the ballots...,5555


In [8]:
# Rows of the first debate where the speaker is Donald Trump.
debate1.loc[debate1['speaker'].eq('Donald Trump')]

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
3,Donald Trump,00:01:31,0,1,31,How are you doing?,91
6,Donald Trump,00:02:41,0,2,41,"Thank you very much, Chris. I will tell you ve...",161
7,Donald Trump,00:03:33,0,3,33,And we won the election and therefore we have ...,213
10,Donald Trump,00:04:14,0,4,14,"Thank you, Joe.",254
14,Donald Trump,00:06:14,0,6,14,There aren’t a hundred million people with pre...,374
...,...,...,...,...,...,...,...
777,Donald Trump,01:32:22,1,32,22,You think that’s good?,5542
780,Donald Trump,01:33:12,1,33,12,It’s already been established. Take a look at ...,5592
783,Donald Trump,01:33:46,1,33,46,I want to see an honest ballot cut-,5626
785,Donald Trump,01:33:52,1,33,52,I want to see an honest ballot count.,5632


In [9]:
# Rows of the second debate where the speaker is Joe Biden.
debate2.loc[debate2['speaker'].eq('Joe Biden')]

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
9,Joe Biden,00:10:55,0,10,55,"220,000 Americans dead. You hear nothing else ...",655
10,Joe Biden,00:11:38,0,11,38,"The expectation is we’ll have another 200,000 ...",698
11,Joe Biden,00:12:01,0,12,1,What I would do is make sure we have everyone ...,721
12,Joe Biden,00:12:23,0,12,23,We’re in a situation now where the New England...,743
20,Joe Biden,00:14:08,0,14,8,Make sure it’s totally transparent. Have the s...,848
...,...,...,...,...,...,...,...
495,Joe Biden,01:35:25,1,35,25,"He takes everything out of context, but the po...",5725
501,Joe Biden,01:35:43,1,35,43,"No, I’m going to rejoin Paris Accord and make ...",5743
508,Joe Biden,01:37:15,1,37,15,"I will say, I’m an American President. I repre...",5835
509,Joe Biden,01:37:37,1,37,37,"We can grow this economy, we can deal with the...",5857


In [10]:
# Rows of the second debate where the speaker is Donald Trump.
debate2.loc[debate2['speaker'].eq('Donald Trump')]

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
1,Donald Trump,00:07:19,0,7,19,How are you doing? How are you?,439
5,Donald Trump,00:08:46,0,8,46,"So as you know, 2.2 million people modeled out...",526
6,Donald Trump,00:09:23,0,9,23,There was a very big spike in Texas. It’s now ...,563
7,Donald Trump,00:09:46,0,9,46,"I can tell you from personal experience, I was...",586
14,Donald Trump,00:12:55,0,12,55,"No, it’s not a guarantee, but it will be by th...",775
...,...,...,...,...,...,...,...
499,Donald Trump,01:35:41,1,35,41,Is he going to get China to do it?,5741
502,Donald Trump,01:35:48,1,35,48,[Crosstalk 00:24:31].,5748
504,Donald Trump,01:36:05,1,36,5,We have to make our country totally successful...,5765
505,Donald Trump,01:36:19,1,36,19,"Before the plague came in, just before, I was ...",5779


In [11]:
# Rows of the vice-presidential debate where the speaker is Kamala Harris.
debate_vp.loc[debate_vp['speaker'].eq('Kamala Harris')]

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
2,Kamala Harris,00:02:13,0,2,13,"Thank you, Susan. Well, the American people ha...",133
3,Kamala Harris,00:03:21,0,3,21,"Can you imagine if you knew on January 28th, a...",201
5,Kamala Harris,00:04:18,0,4,18,… right to reelection based on this.,258
14,Kamala Harris,00:07:10,0,7,10,Absolutely. Whatever the vice president is cla...,430
16,Kamala Harris,00:07:53,0,7,53,"No. But Susan, this is important. And I want t...",473
...,...,...,...,...,...,...,...
312,Kamala Harris,01:19:51,1,19,51,We have it within our power in these next 27 d...,4791
321,Kamala Harris,01:26:05,1,26,5,"First of all, I love hearing from our young le...",5165
322,Kamala Harris,01:26:24,1,26,24,"And brings me to Joe, Joe Biden. One of the re...",5184
323,Kamala Harris,01:26:51,1,26,51,Joe has a longstanding reputation of working a...,5211


In [12]:
# Rows of the vice-presidential debate where the speaker is Mike Pence.
debate_vp.loc[debate_vp['speaker'].eq('Mike Pence')]

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
7,Mike Pence,00:04:55,0,4,55,"Susan, thank you. And I want to thank the Comm...",295
8,Mike Pence,00:05:57,0,5,57,And I believe it saved hundreds of thousands o...,357
10,Mike Pence,00:06:59,0,6,59,"… of America first. And the American people, I...",419
12,Mike Pence,00:07:04,0,7,4,… of the sacrifices they have made. It’s saved...,424
17,Mike Pence,00:07:55,0,7,55,"Susan, I have to weigh in here-",475
...,...,...,...,...,...,...,...
314,Mike Pence,01:21:02,1,21,2,"Well, Susan, first and foremost, I think we’re...",4862
315,Mike Pence,01:21:51,1,21,51,But when you talk about accepting the outcome ...,4911
316,Mike Pence,01:22:53,1,22,53,"So let me just say, I think we’re going to win...",4973
318,Mike Pence,01:24:20,1,24,20,"Brecklin, it’s a wonderful question. And let m...",5060


Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
0,Susan Page,00:00:00,0,0,0,Good evening. From the University of Utah in S...,0
1,Susan Page,00:01:21,0,1,21,"These are tumultuous times, but we can and wil...",81
2,Kamala Harris,00:02:13,0,2,13,"Thank you, Susan. Well, the American people ha...",133
3,Kamala Harris,00:03:21,0,3,21,"Can you imagine if you knew on January 28th, a...",201
4,Susan Page,00:04:17,0,4,17,"Thank you, Senator Harris-",257
...,...,...,...,...,...,...,...
322,Kamala Harris,01:26:24,1,26,24,"And brings me to Joe, Joe Biden. One of the re...",5184
323,Kamala Harris,01:26:51,1,26,51,Joe has a longstanding reputation of working a...,5211
324,Kamala Harris,01:27:17,1,27,17,"Brecklin, when you think about the future, I d...",5237
325,Susan Page,01:27:44,1,27,44,"Thank you, Senator Harris. Thank you, vice pre...",5264
