In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import string
import re

In [2]:
def change_speaker_names(x):
    """Map a titled speaker label to the plain name used in the rest of the notebook.

    Parameters
    ----------
    x : object
        Speaker label as it appears in the transcript.

    Returns
    -------
    object
        'Joe Biden' or 'Donald Trump' for the two titled labels handled here;
        any other value is returned unchanged.
    """
    renames = (
        ('Vice President Joe Biden', 'Joe Biden'),
        ('President Donald J. Trump', 'Donald Trump'),
    )
    for titled, plain in renames:
        if x == titled:
            return plain
    return x

def add_hour(x):
    """Prefix a two-field timestamp with a zero hour field.

    A string with exactly one ':' (two fields) becomes '00:' + x; every other
    string is returned unchanged.
    """
    has_two_fields = len(x.split(':')) == 2
    return f'00:{x}' if has_two_fields else x

def overflow(x):
    """Carry excess seconds into minutes and excess minutes into hours.

    Parameters
    ----------
    x : mapping
        Anything indexable by 'hour', 'minute' and 'second' (a dict or a
        DataFrame row).

    Returns
    -------
    tuple
        (hour, minute, second) with second and minute reduced below 60 when
        they were 60 or more. Values below 60 (including negatives) are left
        untouched.
    """
    hour, minute, second = x['hour'], x['minute'], x['second']
    if second >= 60:
        carry, second = divmod(second, 60)
        minute += carry
    if minute >= 60:
        carry, minute = divmod(minute, 60)
        # Add the carry to the existing hour; assigning it would drop the
        # hours that were already there.
        hour += carry
    return hour, minute, second

def underflow(x):
    """Borrow from minutes/hours so negative seconds and minutes become non-negative.

    Parameters
    ----------
    x : mapping
        Anything indexable by 'hour', 'minute' and 'second' (a dict or a
        DataFrame row).

    Returns
    -------
    tuple
        (hour, minute, second) where a negative second borrows from minute and
        a negative minute borrows from hour. Non-negative fields are left
        untouched. The hour itself is not normalised and may come out negative
        if the input represents a time before zero.
    """
    hour, minute, second = x['hour'], x['minute'], x['second']
    if second < 0:
        # Floor division yields a negative borrow and a remainder in [0, 60).
        borrow, second = divmod(second, 60)
        minute += borrow
    if minute < 0:
        borrow, minute = divmod(minute, 60)
        # Subtract the borrow from the existing hour; assigning abs(borrow)
        # would discard the hour and flip the sign.
        hour += borrow
    return hour, minute, second

def convert_time(x):
    """Render the 'hour', 'minute' and 'second' entries of x as 'HH:MM:SS'.

    Each field is converted with str() and left-padded with zeros to at least
    two characters.
    """
    fields = (x['hour'], x['minute'], x['second'])
    return ':'.join(str(field).zfill(2) for field in fields)
        
def time_in_seconds(x):
    """Return the total number of seconds for x's 'hour', 'minute' and 'second' entries."""
    from_hours = 3600 * x['hour']
    from_minutes = 60 * x['minute']
    return from_hours + from_minutes + x['second']

def format_time(x):
    """Format a number of seconds as an 'HH:MM:SS' string.

    The value is split into whole hours, whole minutes and remaining seconds;
    each part is truncated with int() and zero-padded to at least two
    characters.
    """
    whole_hours, remainder = divmod(x, 3600)
    whole_minutes, remainder = divmod(remainder, 60)
    parts = (whole_hours, whole_minutes, remainder)
    return ':'.join(str(int(part)).zfill(2) for part in parts)

In [3]:
# The three transcript CSVs are read with the same column types: everything stays a
# string so the 'minute' timestamps can be parsed by hand further down.
transcript_dtypes = {'speaker' : str, 'minute' : str, 'text' : str}
debate1 = pd.read_csv(
    './data/kaggle_debate/us_election_2020_1st_presidential_debate.csv',
    dtype=transcript_dtypes,
)
debate2 = pd.read_csv(
    './data/kaggle_debate/us_election_2020_2nd_presidential_debate.csv',
    dtype=transcript_dtypes,
)
debate_vp = pd.read_csv(
    './data/kaggle_debate/us_election_2020_vice_presidential_debate.csv',
    dtype=transcript_dtypes,
)

# A little preprocessing
####################################
## Debate #1
# The raw 'minute' column holds a 'mm:ss' or 'hh:mm:ss' string. Normalise it to
# 'hh:mm:ss' in 'time', then split it into integer hour / minute / second columns.
debate1['time'] = debate1['minute'].apply(add_hour)
debate1['second'] = debate1['time'].apply(lambda x: int(x.split(':')[2]))
debate1['minute'] = debate1['time'].apply(lambda x: int(x.split(':')[1]))
debate1['hour'] = debate1['time'].apply(lambda x: int(x.split(':')[0]))
# .copy() makes the column subset an independent frame, so the writes below
# modify debate1 itself rather than a possible view of the loaded frame.
debate1 = debate1[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

# Change speaker names
debate1['speaker'] = debate1['speaker'].apply(change_speaker_names)

hms_cols = ['hour', 'minute', 'second']

# Fixes timing issues (of resets): rows from position 179 onward are offset by the
# timestamp of row 178. A single .loc[rows, column] assignment is used because
# chained indexing (df.iloc[179:]['hour'] = ...) writes to a temporary object and is
# not guaranteed to reach debate1.
rows_after_reset = debate1.index[179:]
hour, minute, second = debate1[hms_cols].iloc[178].to_numpy().astype(int)
debate1.loc[rows_after_reset, 'hour'] += hour
debate1.loc[rows_after_reset, 'minute'] += minute
debate1.loc[rows_after_reset, 'second'] += second

# Fixes potential overflow
temp = debate1.loc[rows_after_reset, hms_cols].apply(overflow, axis=1, result_type='expand')
debate1.loc[rows_after_reset, hms_cols] = temp.to_numpy()

# Shift everything so we start at 00:00
hour, minute, second = debate1[hms_cols].iloc[0].to_numpy().astype(int)
debate1['hour'] = debate1['hour'] - hour
debate1['minute'] = debate1['minute'] - minute
debate1['second'] = debate1['second'] - second

# Fixes potential underflow
temp = debate1[hms_cols].apply(underflow, axis=1, result_type='expand')
debate1[hms_cols] = temp.to_numpy()

# Fix overall time
debate1['time'] = debate1[hms_cols].apply(convert_time, axis=1)
debate1['time_seconds'] = debate1[hms_cols].apply(time_in_seconds, axis=1)

####################################
## Debate #2
# The raw 'minute' column holds a 'mm:ss' or 'hh:mm:ss' string. Normalise it to
# 'hh:mm:ss' in 'time', then split it into integer hour / minute / second columns.
debate2['time'] = debate2['minute'].apply(add_hour)
debate2['second'] = debate2['time'].apply(lambda x: int(x.split(':')[2]))
debate2['minute'] = debate2['time'].apply(lambda x: int(x.split(':')[1]))
debate2['hour'] = debate2['time'].apply(lambda x: int(x.split(':')[0]))
# .copy() makes the column subset an independent frame, so the writes below
# modify debate2 itself rather than a possible view of the loaded frame.
debate2 = debate2[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

hms_cols = ['hour', 'minute', 'second']

# Fixes first timing issue (of resets): rows at positions 89..336 are offset by the
# timestamp of row 88. A single .loc[rows, column] assignment is used because
# chained indexing (df.iloc[89:337]['hour'] = ...) writes to a temporary object and
# is not guaranteed to reach debate2.
rows_first_reset = debate2.index[89:337]
hour, minute, second = debate2[hms_cols].iloc[88].to_numpy().astype(int)
debate2.loc[rows_first_reset, 'hour'] += hour
debate2.loc[rows_first_reset, 'minute'] += minute
debate2.loc[rows_first_reset, 'second'] += second

# Fixes potential overflow
temp = debate2.loc[rows_first_reset, hms_cols].apply(overflow, axis=1, result_type='expand')
debate2.loc[rows_first_reset, hms_cols] = temp.to_numpy()

# Fixes second timing issue (of resets): rows from position 337 onward are offset by
# the already-corrected timestamp of row 336, so this must run after the first fix.
rows_second_reset = debate2.index[337:]
hour, minute, second = debate2[hms_cols].iloc[336].to_numpy().astype(int)
debate2.loc[rows_second_reset, 'hour'] += hour
debate2.loc[rows_second_reset, 'minute'] += minute
debate2.loc[rows_second_reset, 'second'] += second

# Fixes potential overflow
temp = debate2.loc[rows_second_reset, hms_cols].apply(overflow, axis=1, result_type='expand')
debate2.loc[rows_second_reset, hms_cols] = temp.to_numpy()

# Shift everything so we start at 00:00
hour, minute, second = debate2[hms_cols].iloc[0].to_numpy().astype(int)
debate2['hour'] = debate2['hour'] - hour
debate2['minute'] = debate2['minute'] - minute
debate2['second'] = debate2['second'] - second

# Fixes potential underflow
temp = debate2[hms_cols].apply(underflow, axis=1, result_type='expand')
debate2[hms_cols] = temp.to_numpy()

# Fix overall time
debate2['time'] = debate2[hms_cols].apply(convert_time, axis=1)
debate2['time_seconds'] = debate2[hms_cols].apply(time_in_seconds, axis=1)

####################################
## Debate VP
# The raw 'minute' column holds a 'mm:ss' or 'hh:mm:ss' string. Normalise it to
# 'hh:mm:ss' in 'time', then split it into integer hour / minute / second columns.
debate_vp['time'] = debate_vp['minute'].apply(add_hour)
debate_vp['second'] = debate_vp['time'].apply(lambda x: int(x.split(':')[2]))
debate_vp['minute'] = debate_vp['time'].apply(lambda x: int(x.split(':')[1]))
debate_vp['hour'] = debate_vp['time'].apply(lambda x: int(x.split(':')[0]))
# .copy() makes the column subset an independent frame, so the writes below
# modify debate_vp itself rather than a possible view of the loaded frame.
debate_vp = debate_vp[['speaker', 'time', 'hour', 'minute', 'second', 'text']].copy()

hms_cols = ['hour', 'minute', 'second']
rows_after_reset = debate_vp.index[135:]

# Adjust the reset to 00:00: rows from position 135 onward are made relative to
# row 135's own timestamp. The offset is read (and copied by astype) before row 135
# is modified. A single .loc[rows, column] assignment is used because chained
# indexing (df.iloc[135:]['hour'] = ...) writes to a temporary object and is not
# guaranteed to reach debate_vp.
hour, minute, second = debate_vp[hms_cols].iloc[135].to_numpy().astype(int)
debate_vp.loc[rows_after_reset, 'hour'] -= hour
debate_vp.loc[rows_after_reset, 'minute'] -= minute
debate_vp.loc[rows_after_reset, 'second'] -= second

# Fixes potential underflow
temp = debate_vp.loc[rows_after_reset, hms_cols].apply(underflow, axis=1, result_type='expand')
debate_vp.loc[rows_after_reset, hms_cols] = temp.to_numpy()

# Fixes first timing issue (of resets): continue the clock from row 134's timestamp.
hour, minute, second = debate_vp[hms_cols].iloc[134].to_numpy().astype(int)
debate_vp.loc[rows_after_reset, 'hour'] += hour
debate_vp.loc[rows_after_reset, 'minute'] += minute
debate_vp.loc[rows_after_reset, 'second'] += second

# Fixes potential overflow
temp = debate_vp.loc[rows_after_reset, hms_cols].apply(overflow, axis=1, result_type='expand')
debate_vp.loc[rows_after_reset, hms_cols] = temp.to_numpy()

# Shift everything so we start at 00:00
hour, minute, second = debate_vp[hms_cols].iloc[0].to_numpy().astype(int)
debate_vp['hour'] = debate_vp['hour'] - hour
debate_vp['minute'] = debate_vp['minute'] - minute
debate_vp['second'] = debate_vp['second'] - second

# Fixes potential underflow
temp = debate_vp[hms_cols].apply(underflow, axis=1, result_type='expand')
debate_vp[hms_cols] = temp.to_numpy()

# Fix overall time
debate_vp['time'] = debate_vp[hms_cols].apply(convert_time, axis=1)
debate_vp['time_seconds'] = debate_vp[hms_cols].apply(time_in_seconds, axis=1)

# Element-wise version of format_time for arrays of second counts
vectorized_format_time = np.vectorize(format_time)

# 90 evenly spaced, whole-second marks between the first and last timestamp of debate 1
first_and_last = debate1['time_seconds'].iloc[[0, -1]]
low, high = first_and_last.to_numpy()
seconds, step_size = np.linspace(low, high, num=90, retstep=True)
seconds = seconds.round()
# Position on the grid -> second count at that position
index_to_seconds = dict(enumerate(seconds))
# The same grid rendered as 'HH:MM:SS' labels
times = vectorized_format_time(seconds)

# Everything Joe Biden said in debate 1 ...
is_biden = debate1['speaker'] == 'Joe Biden'
joe = debate1[is_biden]

# ... restricted to the 200 s - 600 s window (both ends inclusive)
in_window = (joe['time_seconds'] >= 200) & (joe['time_seconds'] <= 600)
parsed = joe[in_window]

# Join the utterances, strip punctuation and lowercase
text = ' '.join(parsed['text'])
text = re.sub(r'[^\w\s]','',text).lower()

# Drop a few hand-picked filler words, then count the remaining ones
custom_remove_string = ['the', 'is', 'of', 'that', 'to']
text = np.array([token for token in text.split() if token not in custom_remove_string])
words, frequencies = np.unique(text, return_counts=True)

# word -> number of occurrences
hist = dict(zip(words, frequencies))
    
import math

# Split two fractional positions into the integers on either side of them
# plus the fractional remainder above the lower integer.
low, high = 11.25, 29.5

low_low, low_high = math.floor(low), math.ceil(low)
low_dec = low - low_low

high_low, high_high = math.floor(high), math.ceil(high)
high_dec = high - high_low

print(low_low, low_high, low_dec)
# Linear interpolation between 65 and 77 at the fractional part of `low`
print(65 + (77 - 65) * low_dec)

11 12 0.25
68.0


## Working on Google Trends data

In [4]:
from pytrends.request import TrendReq
import os
import sys
import time

In [5]:
# Google Trends client reused by every request below.
# NOTE(review): hl looks like the interface language and tz a timezone offset
# (presumably minutes, i.e. 360 = US Central) — confirm against the pytrends docs.
pytrends = TrendReq(hl='en-US', tz=360)

In [6]:
# Trial request: two sample search terms, US only, 2020-01-01 through 2020-12-08.
# NOTE(review): cat=0 presumably means "all categories" — confirm against the pytrends docs.
keyword_list = ['covid', 'us']
pytrends.build_payload(keyword_list, cat=0, timeframe='2020-01-01 2020-12-08', geo='US')

In [7]:
# Fetch the interest-over-time table for the most recently built payload and
# display it as the cell output.
pytrends.interest_over_time()

Unnamed: 0_level_0,covid,us,isPartial
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-05,0,31,False
2020-01-12,0,26,False
2020-01-19,0,26,False
2020-01-26,0,29,False
2020-02-02,0,27,False
2020-02-09,0,26,False
2020-02-16,0,26,False
2020-02-23,2,32,False
2020-03-01,7,35,False
2020-03-08,29,46,False


In [8]:
def chunker(seq, size):
    """Lazily yield consecutive slices of seq, each holding at most size items.

    Parameters
    ----------
    seq : sequence
        Any object supporting len() and slicing.
    size : int
        Maximum length of each slice; the final slice may be shorter.

    Returns
    -------
    generator
        Slices seq[0:size], seq[size:2*size], ... in order.
    """
    chunk_starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in chunk_starts)

In [9]:
related_words_dir = './data/google_trends/'

# Collect one interest-over-time frame per keyword and concatenate once at the end,
# instead of re-copying the growing frame with pd.concat on every iteration.
trend_frames = []
# os.listdir returns names in arbitrary order; sorting keeps the column order
# of the final frame reproducible between runs.
for topic_file in sorted(os.listdir(related_words_dir)):
    topic_file_path = os.path.join(related_words_dir, topic_file)
    if not os.path.isfile(topic_file_path):
        # Sub-directories cannot be opened as keyword files.
        continue
    with open(topic_file_path, 'r') as f:
        file_contents = [content.strip(' \n') for content in f]
    # Blank lines would otherwise be sent to Google Trends as empty keywords.
    file_contents = [content for content in file_contents if content]
    # One keyword per request (chunks of size 1).
    for keyword_list in chunker(file_contents, 1):
        pytrends.build_payload(keyword_list, cat=0, timeframe='2020-01-01 2020-12-08', geo='US')
        trend_frames.append(pytrends.interest_over_time())

# Stays None when no keyword was found, as before.
keyword_list_dataframe = pd.concat(trend_frames, axis=1, sort=False) if trend_frames else None

In [10]:
# Display the combined trends table: one value column per keyword, each followed
# by that request's own isPartial column.
keyword_list_dataframe

Unnamed: 0_level_0,free college,isPartial,free community college,isPartial,free college courses,isPartial,tuition free college,isPartial,free college education,isPartial,...,womens rights,isPartial,trump womens rights,isPartial,elizabeth cady stanton,isPartial,womens rights riots,isPartial,abortion,isPartial
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-05,73,False,70,False,27,False,40,False,84,False,...,40,False,4,False,51,False,0,False,58,False
2020-01-12,100,False,100,False,33,False,45,False,45,False,...,52,False,0,False,79,False,0,False,52,False
2020-01-19,76,False,93,False,28,False,69,False,74,False,...,48,False,0,False,66,False,0,False,61,False
2020-01-26,83,False,82,False,14,False,68,False,69,False,...,49,False,17,False,77,False,0,False,54,False
2020-02-02,77,False,85,False,23,False,60,False,81,False,...,44,False,8,False,79,False,0,False,62,False
2020-02-09,77,False,72,False,24,False,65,False,75,False,...,71,False,8,False,100,False,12,False,58,False
2020-02-16,79,False,80,False,27,False,64,False,61,False,...,54,False,4,False,82,False,0,False,59,False
2020-02-23,89,False,87,False,22,False,100,False,98,False,...,75,False,0,False,82,False,0,False,62,False
2020-03-01,85,False,87,False,13,False,67,False,100,False,...,65,False,4,False,86,False,0,False,70,False
2020-03-08,63,False,54,False,19,False,49,False,69,False,...,66,False,8,False,97,False,0,False,66,False
