In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import string
import re

In [2]:
def change_speaker_names(x):
    """Shorten the two long-form candidate names.

    'Vice President Joe Biden' becomes 'Joe Biden' and
    'President Donald J. Trump' becomes 'Donald Trump'. Any other value is
    returned unchanged.
    """
    short_names = {
        'Vice President Joe Biden': 'Joe Biden',
        'President Donald J. Trump': 'Donald Trump',
    }
    # Fall back to the input itself when there is no replacement for it.
    return short_names.get(x, x)

def add_hour(x):
    """Prefix '00:' to a string that has exactly two ':'-separated fields.

    Strings with any other number of fields are returned unchanged.
    """
    # Exactly one ':' means exactly two fields.
    has_two_fields = x.count(':') == 1
    return f'00:{x}' if has_two_fields else x

def overflow(x):
    """Carry excess seconds into minutes and excess minutes into hours.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'hour', 'minute' and 'second'.

    Returns
    -------
    tuple
        (hour, minute, second) after carrying; for non-negative inputs both
        minute and second end up below 60.
    """
    hour, minute, second = x['hour'], x['minute'], x['second']
    if second >= 60:
        minute += second // 60
        second = second % 60
    if minute >= 60:
        # Add the carry to the existing hour; assigning it would discard
        # whatever hour value the row already had.
        hour += minute // 60
        minute = minute % 60
    return hour, minute, second

def underflow(x):
    """Borrow from minutes/hours so negative seconds/minutes become non-negative.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'hour', 'minute' and 'second'.

    Returns
    -------
    tuple
        (hour, minute, second) with second and minute in the range 0..59.
        The hour can come out negative if the input denotes a negative time.
    """
    hour, minute, second = x['hour'], x['minute'], x['second']
    if second < 0:
        # Floor division is negative here, so adding it borrows whole minutes;
        # the modulo of a negative value by 60 is already in 0..59.
        minute += second // 60
        second = second % 60
    if minute < 0:
        # Borrow from the existing hour; replacing the hour with the size of
        # the borrow would turn 1h -5m into 1h 55m instead of 0h 55m.
        hour += minute // 60
        minute = minute % 60
    return hour, minute, second

def convert_time(x):
    """Format the 'hour', 'minute' and 'second' entries of x as 'HH:MM:SS'.

    Each field is converted with str() and left-padded with zeros to at least
    two characters.
    """
    fields = (x['hour'], x['minute'], x['second'])
    return ':'.join(str(value).zfill(2) for value in fields)
        
def time_in_seconds(x):
    """Return the total number of seconds for x['hour'], x['minute'], x['second']."""
    from_hours = 3600 * x['hour']
    from_minutes = 60 * x['minute']
    return from_hours + from_minutes + x['second']

def format_time(x):
    """Format a number of seconds as 'HH:MM:SS'.

    Fractional seconds are truncated; each field is left-padded with zeros to
    at least two characters.
    """
    hours, remainder = divmod(x, 3600)
    minutes, secs = divmod(remainder, 60)
    parts = (int(hours), int(minutes), int(secs))
    return ':'.join(str(part).zfill(2) for part in parts)

In [3]:
# The three transcripts are read with the same dtype mapping: 'speaker',
# 'minute' and 'text' are all loaded as strings.
debate_dtypes = {'speaker' : str, 'minute' : str, 'text' : str}
debate1 = pd.read_csv(
    './data/kaggle_debate/us_election_2020_1st_presidential_debate.csv',
    dtype=debate_dtypes,
)
debate2 = pd.read_csv(
    './data/kaggle_debate/us_election_2020_2nd_presidential_debate.csv',
    dtype=debate_dtypes,
)
debate_vp = pd.read_csv(
    './data/kaggle_debate/us_election_2020_vice_presidential_debate.csv',
    dtype=debate_dtypes,
)

In [4]:
# A little preprocessing
####################################
## Debate #1
# Keep the raw clock string as 'time', prefix a missing hour field, then split
# it into integer hour / minute / second columns.
debate1['time'] = debate1['minute']
debate1['time'] = debate1['time'].apply(add_hour)
debate1['second'] = debate1['time'].apply(lambda x: int(x.split(':')[2]))
debate1['minute'] = debate1['time'].apply(lambda x: int(x.split(':')[1]))
debate1['hour'] = debate1['time'].apply(lambda x: int(x.split(':')[0]))
# Explicit copy so the assignments below write to an independent frame.
debate1 = debate1[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Change speaker names
debate1['speaker'] = debate1['speaker'].apply(change_speaker_names)

# Fixes timing issues (of resets)
# Every row from position 179 onward is offset by the timestamp of row 178.
# The rows are updated with a single .loc assignment: the chained form
# `frame.iloc[179:]['hour'] = ...` assigns into a temporary object and is not
# guaranteed to modify the frame itself.
rows_after_reset = debate1.index[179:]
hour, minute, second = debate1.iloc[[178]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate1.loc[rows_after_reset, 'hour'] = debate1.loc[rows_after_reset, 'hour'] + hour
debate1.loc[rows_after_reset, 'minute'] = debate1.loc[rows_after_reset, 'minute'] + minute
debate1.loc[rows_after_reset, 'second'] = debate1.loc[rows_after_reset, 'second'] + second

# Fixes potential overflow
# `temp` has one column per tuple element (0 = hour, 1 = minute, 2 = second)
# and keeps the row labels, so the assignments align on the index.
temp = debate1.loc[rows_after_reset, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate1.loc[rows_after_reset, 'hour'] = temp[0]
debate1.loc[rows_after_reset, 'minute'] = temp[1]
debate1.loc[rows_after_reset, 'second'] = temp[2]

# Shift everything so we start at 00:00
hour, minute, second = debate1.iloc[0][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate1['hour'] = debate1['hour'] - hour
debate1['minute'] = debate1['minute'] - minute
debate1['second'] = debate1['second'] - second

# Fixes potential underflow
temp = debate1[['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate1['hour'] = temp[0]
debate1['minute'] = temp[1]
debate1['second'] = temp[2]

# Fix overall time
debate1['time'] = debate1[['hour', 'minute', 'second']].apply(convert_time, axis=1)
debate1['time_seconds'] = debate1[['hour', 'minute', 'second']].apply(time_in_seconds, axis=1)

In [5]:
####################################
## Debate #2
# Keep the raw clock string as 'time', prefix a missing hour field, then split
# it into integer hour / minute / second columns.
debate2['time'] = debate2['minute']
debate2['time'] = debate2['time'].apply(add_hour)
debate2['second'] = debate2['time'].apply(lambda x: int(x.split(':')[2]))
debate2['minute'] = debate2['time'].apply(lambda x: int(x.split(':')[1]))
debate2['hour'] = debate2['time'].apply(lambda x: int(x.split(':')[0]))
# Explicit copy so the assignments below write to an independent frame.
debate2 = debate2[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Row labels of the two stretches that get an offset added. They are updated
# with single .loc assignments: the chained form
# `frame.iloc[89:337]['hour'] = ...` assigns into a temporary object and is
# not guaranteed to modify the frame itself.
first_segment = debate2.index[89:337]
second_segment = debate2.index[337:]

# Fixes first timing issue (of resets)
hour, minute, second = debate2.iloc[[88]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate2.loc[first_segment, 'hour'] = debate2.loc[first_segment, 'hour'] + hour
debate2.loc[first_segment, 'minute'] = debate2.loc[first_segment, 'minute'] + minute
debate2.loc[first_segment, 'second'] = debate2.loc[first_segment, 'second'] + second

# Fixes potential overflow
# `temp` has one column per tuple element (0 = hour, 1 = minute, 2 = second)
# and keeps the row labels, so the assignments align on the index.
temp = debate2.loc[first_segment, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate2.loc[first_segment, 'hour'] = temp[0]
debate2.loc[first_segment, 'minute'] = temp[1]
debate2.loc[first_segment, 'second'] = temp[2]

# Fixes second timing issue (of resets)
# Row 336 belongs to the first segment, so it is read only after that segment
# has been corrected above.
hour, minute, second = debate2.iloc[[336]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate2.loc[second_segment, 'hour'] = debate2.loc[second_segment, 'hour'] + hour
debate2.loc[second_segment, 'minute'] = debate2.loc[second_segment, 'minute'] + minute
debate2.loc[second_segment, 'second'] = debate2.loc[second_segment, 'second'] + second

# Fixes potential overflow
temp = debate2.loc[second_segment, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate2.loc[second_segment, 'hour'] = temp[0]
debate2.loc[second_segment, 'minute'] = temp[1]
debate2.loc[second_segment, 'second'] = temp[2]

# Shift everything so we start at 00:00
hour, minute, second = debate2.iloc[0][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate2['hour'] = debate2['hour'] - hour
debate2['minute'] = debate2['minute'] - minute
debate2['second'] = debate2['second'] - second

# Fixes potential underflow
temp = debate2[['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate2['hour'] = temp[0]
debate2['minute'] = temp[1]
debate2['second'] = temp[2]

# Fix overall time
debate2['time'] = debate2[['hour', 'minute', 'second']].apply(convert_time, axis=1)
debate2['time_seconds'] = debate2[['hour', 'minute', 'second']].apply(time_in_seconds, axis=1)

In [6]:
####################################
## Debate VP
# Keep the raw clock string as 'time', prefix a missing hour field, then split
# it into integer hour / minute / second columns.
debate_vp['time'] = debate_vp['minute']
debate_vp['time'] = debate_vp['time'].apply(add_hour)
debate_vp['second'] = debate_vp['time'].apply(lambda x: int(x.split(':')[2]))
debate_vp['minute'] = debate_vp['time'].apply(lambda x: int(x.split(':')[1]))
debate_vp['hour'] = debate_vp['time'].apply(lambda x: int(x.split(':')[0]))
# Explicit copy so the assignments below write to an independent frame.
debate_vp = debate_vp[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Row labels from position 135 onward. They are updated with single .loc
# assignments: the chained form `frame.iloc[135:]['hour'] = ...` assigns into
# a temporary object and is not guaranteed to modify the frame itself.
rows_after_reset = debate_vp.index[135:]

# Adjust the reset to 00:00
# Subtract the timestamp of row 135 from that row and every row after it.
hour, minute, second = debate_vp.iloc[[135]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate_vp.loc[rows_after_reset, 'hour'] = debate_vp.loc[rows_after_reset, 'hour'] - hour
debate_vp.loc[rows_after_reset, 'minute'] = debate_vp.loc[rows_after_reset, 'minute'] - minute
debate_vp.loc[rows_after_reset, 'second'] = debate_vp.loc[rows_after_reset, 'second'] - second

# Fixes potential underflow
# `temp` has one column per tuple element (0 = hour, 1 = minute, 2 = second)
# and keeps the row labels, so the assignments align on the index.
temp = debate_vp.loc[rows_after_reset, ['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate_vp.loc[rows_after_reset, 'hour'] = temp[0]
debate_vp.loc[rows_after_reset, 'minute'] = temp[1]
debate_vp.loc[rows_after_reset, 'second'] = temp[2]

# Fixes first timing issue (of resets)
# Add the timestamp of row 134 to every row from position 135 onward.
hour, minute, second = debate_vp.iloc[[134]][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate_vp.loc[rows_after_reset, 'hour'] = debate_vp.loc[rows_after_reset, 'hour'] + hour
debate_vp.loc[rows_after_reset, 'minute'] = debate_vp.loc[rows_after_reset, 'minute'] + minute
debate_vp.loc[rows_after_reset, 'second'] = debate_vp.loc[rows_after_reset, 'second'] + second

# Fixes potential overflow
temp = debate_vp.loc[rows_after_reset, ['hour', 'minute', 'second']].apply(overflow, axis=1).apply(pd.Series)
debate_vp.loc[rows_after_reset, 'hour'] = temp[0]
debate_vp.loc[rows_after_reset, 'minute'] = temp[1]
debate_vp.loc[rows_after_reset, 'second'] = temp[2]

# Shift everything so we start at 00:00
hour, minute, second = debate_vp.iloc[0][['hour', 'minute', 'second']].to_numpy().squeeze().astype(int)
debate_vp['hour'] = debate_vp['hour'] - hour
debate_vp['minute'] = debate_vp['minute'] - minute
debate_vp['second'] = debate_vp['second'] - second

# Fixes potential underflow
temp = debate_vp[['hour', 'minute', 'second']].apply(underflow, axis=1).apply(pd.Series)
debate_vp['hour'] = temp[0]
debate_vp['minute'] = temp[1]
debate_vp['second'] = temp[2]

# Fix overall time
debate_vp['time'] = debate_vp[['hour', 'minute', 'second']].apply(convert_time, axis=1)
debate_vp['time_seconds'] = debate_vp[['hour', 'minute', 'second']].apply(time_in_seconds, axis=1)

In [7]:
# Rows of the first debate whose speaker is 'Joe Biden'.
is_joe = debate1['speaker'].eq('Joe Biden')
joe = debate1.loc[is_joe]

In [8]:
# Rows of the first debate whose speaker is 'Donald Trump'.
is_donald = debate1['speaker'].eq('Donald Trump')
donald = debate1.loc[is_donald]

In [9]:
# Element-wise version of format_time, so it can be applied to a whole array.
vectorized_format_time = np.vectorize(pyfunc=format_time)

In [10]:
# Sample 90 evenly spaced timestamps between the first and last 'time_seconds'
# values of debate1, rounded to whole seconds.
low, high = debate1['time_seconds'].iloc[[0, -1]].to_numpy()
seconds, step_size = np.linspace(low, high, num=90, retstep=True)
seconds = np.round(seconds)

# Position in the sample -> timestamp in seconds.
index_to_seconds = dict(enumerate(seconds))

# The same sample points rendered as 'HH:MM:SS' strings.
times = vectorized_format_time(seconds)

In [11]:
# Display the rounded sample points (in seconds).
seconds

array([   0.,   63.,  127.,  190.,  253.,  317.,  380.,  443.,  507.,
        570.,  633.,  696.,  760.,  823.,  886.,  950., 1013., 1076.,
       1140., 1203., 1266., 1330., 1393., 1456., 1520., 1583., 1646.,
       1709., 1773., 1836., 1899., 1963., 2026., 2089., 2153., 2216.,
       2279., 2343., 2406., 2469., 2533., 2596., 2659., 2723., 2786.,
       2849., 2912., 2976., 3039., 3102., 3166., 3229., 3292., 3356.,
       3419., 3482., 3546., 3609., 3672., 3736., 3799., 3862., 3926.,
       3989., 4052., 4115., 4179., 4242., 4305., 4369., 4432., 4495.,
       4559., 4622., 4685., 4749., 4812., 4875., 4939., 5002., 5065.,
       5128., 5192., 5255., 5318., 5382., 5445., 5508., 5572., 5635.])

In [12]:
# Display the position -> seconds lookup for the sample points.
index_to_seconds

{0: 0.0,
 1: 63.0,
 2: 127.0,
 3: 190.0,
 4: 253.0,
 5: 317.0,
 6: 380.0,
 7: 443.0,
 8: 507.0,
 9: 570.0,
 10: 633.0,
 11: 696.0,
 12: 760.0,
 13: 823.0,
 14: 886.0,
 15: 950.0,
 16: 1013.0,
 17: 1076.0,
 18: 1140.0,
 19: 1203.0,
 20: 1266.0,
 21: 1330.0,
 22: 1393.0,
 23: 1456.0,
 24: 1520.0,
 25: 1583.0,
 26: 1646.0,
 27: 1709.0,
 28: 1773.0,
 29: 1836.0,
 30: 1899.0,
 31: 1963.0,
 32: 2026.0,
 33: 2089.0,
 34: 2153.0,
 35: 2216.0,
 36: 2279.0,
 37: 2343.0,
 38: 2406.0,
 39: 2469.0,
 40: 2533.0,
 41: 2596.0,
 42: 2659.0,
 43: 2723.0,
 44: 2786.0,
 45: 2849.0,
 46: 2912.0,
 47: 2976.0,
 48: 3039.0,
 49: 3102.0,
 50: 3166.0,
 51: 3229.0,
 52: 3292.0,
 53: 3356.0,
 54: 3419.0,
 55: 3482.0,
 56: 3546.0,
 57: 3609.0,
 58: 3672.0,
 59: 3736.0,
 60: 3799.0,
 61: 3862.0,
 62: 3926.0,
 63: 3989.0,
 64: 4052.0,
 65: 4115.0,
 66: 4179.0,
 67: 4242.0,
 68: 4305.0,
 69: 4369.0,
 70: 4432.0,
 71: 4495.0,
 72: 4559.0,
 73: 4622.0,
 74: 4685.0,
 75: 4749.0,
 76: 4812.0,
 77: 4875.0,
 78: 4939.0,
 7

In [13]:
# Display the rows of debate1 whose speaker is 'Joe Biden'.
joe

Unnamed: 0,speaker,time,hour,minute,second,text,time_seconds
2,Joe Biden,00:01:29,0,1,29,"How you doing, man?",89
4,Joe Biden,00:01:31,0,1,31,I’m well.,91
9,Joe Biden,00:04:09,0,4,9,"Well, first of all, thank you for doing this a...",249
11,Joe Biden,00:04:16,0,4,16,The American people have a right to have a say...,256
12,Joe Biden,00:04:52,0,4,52,"Now, what’s at stake here is the President’s m...",292
...,...,...,...,...,...,...,...
751,Joe Biden,01:28:42,1,28,42,Five states have had mail-in ballots for the l...,5322
756,Joe Biden,01:29:41,1,29,41,I am concerned that any court would settle thi...,5381
761,Joe Biden,01:30:24,1,30,24,Mail service delivers [crosstalk 01:07:21] 185...,5424
779,Joe Biden,01:32:35,1,32,35,Yes. And here’s the deal. We count the ballots...,5555


In [14]:
# Keep the rows whose timestamp lies in the closed interval [200, 600] seconds.
in_window = joe['time_seconds'].between(200, 600)
parsed = joe[in_window]

In [15]:
# Words that are dropped before counting.
custom_remove_string = ['the', 'is', 'of', 'that', 'to']

# Join all selected utterances, strip every character that is neither a word
# character nor whitespace, and lower-case the result.
joined_text = ' '.join(parsed['text'])
cleaned_text = re.sub(r'[^\w\s]','',joined_text).lower()

# Split on whitespace and drop the excluded words.
text = np.array([token for token in cleaned_text.split() if token not in custom_remove_string])

# Distinct words (sorted) and how many times each one occurs.
words, frequencies = np.unique(text, return_counts=True)

In [16]:
# Word -> number of occurrences.
hist = dict(zip(words, frequencies))

In [17]:
# Display the word -> count mapping.
hist

{'20': 1,
 '200000': 1,
 '23': 1,
 'a': 9,
 'able': 2,
 'about': 1,
 'act': 5,
 'affordable': 5,
 'again': 1,
 'all': 3,
 'allow': 1,
 'already': 3,
 'also': 2,
 'am': 2,
 'american': 2,
 'an': 1,
 'and': 20,
 'any': 1,
 'appropriate': 1,
 'approved': 2,
 'are': 2,
 'as': 4,
 'at': 2,
 'away': 1,
 'ballot': 2,
 'be': 3,
 'because': 3,
 'been': 2,
 'before': 2,
 'bench': 1,
 'biden': 1,
 'big': 1,
 'but': 1,
 'by': 2,
 'can': 1,
 'care': 5,
 'chance': 1,
 'changed': 1,
 'charge': 1,
 'clear': 1,
 'colleagues': 1,
 'companies': 1,
 'condition': 1,
 'conditions': 3,
 'constitutional': 1,
 'contracted': 1,
 'could': 1,
 'court': 8,
 'covid': 1,
 'deal': 2,
 'debates': 1,
 'democratic': 3,
 'died': 1,
 'discussion': 1,
 'do': 3,
 'does': 1,
 'doing': 1,
 'donald': 1,
 'down': 2,
 'elect': 2,
 'elected': 1,
 'election': 7,
 'ended': 1,
 'exact': 1,
 'expand': 1,
 'express': 1,
 'fact': 2,
 'february': 1,
 'fine': 1,
 'finish': 1,
 'first': 1,
 'for': 6,
 'forward': 3,
 'from': 1,
 'fundament

In [18]:
import math

# Two fractional positions; each one is split into the integers just below
# and just above it plus the fractional part above the lower integer.
low, high = 11.25, 29.5

low_low, low_high = math.floor(low), math.ceil(low)
low_dec = low - low_low

high_low, high_high = math.floor(high), math.ceil(high)
high_dec = high - high_low

In [19]:
# Show the lower integer, the upper integer and the fractional part of `low`.
print(f'{low_low} {low_high} {low_dec}')

11 12 0.25


In [20]:
# Linear interpolation: move the fraction `low_dec` of the way from 65 to 77.
# NOTE(review): the endpoints 65 and 77 are hard-coded; presumably they are the
# values at positions low_low and low_high of some series - confirm.
65 + (77 - 65) * low_dec

68.0