In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import glob
import re
from tqdm import tqdm

In [2]:
def convert_int(s):
  """Return ``s`` as an ``int`` when it is a run of digits, else ``s`` unchanged.

  Args:
    s: A string (typically one chunk of a name split on digit runs).

  Returns:
    ``int(s)`` if ``s`` consists only of digit characters that ``int`` can
    parse; otherwise the original string.
  """
  if not s.isdigit():
    return s
  try:
    return int(s)
  except ValueError:
    # str.isdigit() is also true for characters such as superscripts ('²')
    # that int() cannot parse; treat those as ordinary text instead of failing.
    return s


def alphanum_key(s):
  """Build a natural-sort key for the string ``s``.

  The string is split on runs of ASCII digits; each digit run becomes an
  ``int`` and the text in between stays a ``str``, so that e.g. ``'T10'``
  sorts after ``'T2'``.

  Args:
    s: The string to build a key for (e.g. a file path).

  Returns:
    A list alternating ``str, int, str, ...`` (always starting and ending
    with a ``str``, possibly empty).
  """
  chunks = re.split('([0-9]+)', s)
  # With a single capturing group, re.split alternates text / digit run / text,
  # so the odd positions are exactly the ASCII digit runs. Deciding by position
  # (rather than by str.isdigit) keeps non-ASCII digit characters such as '²'
  # as text, so keys never mix int and str at the same position.
  return [int(chunk) if idx % 2 else chunk for idx, chunk in enumerate(chunks)]


def sort_nicely(l):
  """Reorder the list ``l`` in place using ``alphanum_key`` as the sort key.

  Nothing is returned; the caller's list object itself is modified.
  """
  l[:] = sorted(l, key=alphanum_key)

In [48]:
# Collect every .txt file under transcripts/20190502/, order the paths in place
# with sort_nicely, and display the resulting list.
text_files = list(glob.iglob('transcripts/20190502/*.txt'))
sort_nicely(text_files)
text_files

['transcripts/20190502/text_T0-2_mins.txt',
 'transcripts/20190502/text_T2-4_mins.txt',
 'transcripts/20190502/text_T4-6_mins.txt',
 'transcripts/20190502/text_T6-8_mins.txt',
 'transcripts/20190502/text_T8-10_mins.txt',
 'transcripts/20190502/text_T10-12_mins.txt',
 'transcripts/20190502/text_T12-14_mins.txt',
 'transcripts/20190502/text_T14-16_mins.txt',
 'transcripts/20190502/text_T16-18_mins.txt',
 'transcripts/20190502/text_T18-20_mins.txt',
 'transcripts/20190502/text_T20-22_mins.txt',
 'transcripts/20190502/text_T22-24_mins.txt',
 'transcripts/20190502/text_T24-26_mins.txt',
 'transcripts/20190502/text_T26-28_mins.txt',
 'transcripts/20190502/text_T28-30_mins.txt',
 'transcripts/20190502/text_T30-32_mins.txt',
 'transcripts/20190502/text_T32-34_mins.txt',
 'transcripts/20190502/text_T34-36_mins.txt',
 'transcripts/20190502/text_T36-38_mins.txt',
 'transcripts/20190502/text_T38-40_mins.txt',
 'transcripts/20190502/text_T40-42_mins.txt',
 'transcripts/20190502/text_T42-44_mins.txt

In [49]:
def _read_transcript_chunk(path):
    """Parse one transcript chunk file into a ``timestamp``/``transcript`` frame.

    The start minute of the chunk is taken from the ``T<start>-<end>`` part of
    the file name and added to every timestamp in the file. Each non-blank line
    is split at its first ``]``: the part before it (minus ``[`` and anything
    after the first comma) is the ``HH:MM:SS`` timestamp, the rest is the text.

    Args:
        path: Path of the chunk file, e.g. ``.../text_T10-12_mins.txt``.

    Returns:
        DataFrame with string columns ``timestamp`` (``HH:MM:SS``, shifted by
        the chunk's start minute) and ``transcript``. Empty if the file has no
        non-blank lines.

    Raises:
        ValueError: If the file name has no ``T<start>-`` marker or a line's
            timestamp is missing or not in ``%H:%M:%S`` form.
    """
    columns = ['timestamp', 'transcript']
    file_name = Path(path).name
    marker = re.search(r'T(\d+)-', file_name)
    if marker is None:
        raise ValueError('no "T<start>-" minute marker in {!r}'.format(file_name))
    offset = pd.Timedelta(minutes=int(marker.group(1)))

    # Same open() call as before: text mode with the platform default encoding.
    with open(path, 'r') as f:
        lines = f.read().split('\n')

    rows = []
    for line in lines:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line is not a record; skip it rather
            # than letting it invalidate the whole file.
            continue
        # Split at the FIRST ']' only, so text that itself contains ']' is kept
        # whole. A line without ']' yields an empty transcript string.
        stamp, _, text = line.partition(']')
        rows.append({
            'timestamp': stamp.replace('[', '').split(',')[0].strip(),
            'transcript': text.strip(),
        })
    if not rows:
        return pd.DataFrame(columns=columns)

    chunk = pd.DataFrame(rows, columns=columns)
    # Parse once and do the shift on real datetimes; formatting back with
    # strftime always yields zero-padded HH:MM:SS, which the later string
    # comparisons on this column depend on.
    clock = pd.to_datetime(chunk['timestamp'], format='%H:%M:%S')
    if clock.isna().any():
        raise ValueError('missing timestamp in {!r}'.format(file_name))
    chunk['timestamp'] = (clock + offset).dt.strftime('%H:%M:%S')
    return chunk


# Read every chunk, keep going past unparseable files (best effort, as before),
# but record and report them instead of silently dropping them.
frames = []
skipped_files = []
# Wrapping the list itself (not enumerate(...)) lets tqdm know the total.
for text_file in tqdm(text_files):
    try:
        chunk = _read_transcript_chunk(text_file)
    except ValueError as err:
        skipped_files.append(text_file)
        # tqdm.write prints without corrupting the progress bar.
        tqdm.write('Skipping {}: {}'.format(text_file, err))
        continue
    if not chunk.empty:
        frames.append(chunk)

# Concatenate once at the end; df_all is always defined, even if nothing parsed.
if frames:
    df_all = pd.concat(frames, axis=0).reset_index(drop=True)
else:
    df_all = pd.DataFrame(columns=['timestamp', 'transcript'])

40it [00:00, 699.19it/s]


In [50]:
# Preview the first five rows of the combined transcript table.
df_all.head(n=5)

Unnamed: 0,timestamp,transcript
0,00:00:30,"I don't know if you can see it, but we have a ..."
1,00:00:32,that are trying to have a conversation
2,00:00:34,where they can't see the question.
3,00:00:36,And they're trying to learn what they're looki...
4,00:00:38,So that would be a problem.


In [51]:
# Write the combined transcript table to a CSV file in the working directory,
# leaving out the integer row index.
df_all.to_csv(
    '20190502_transcript_all.csv',
    index=False,
)

In [57]:
# Two HH:MM:SS strings; the following cell uses index 0 as the lower bound and
# index 1 as the upper bound when filtering df_all.timestamp.
confusion_time_points = [
    '00:10:00',
    '00:12:00',
]

In [58]:
# Join, with single spaces, the transcript text of every row whose timestamp is
# strictly between the two confusion_time_points (string comparison).
' '.join(
    df_all.loc[
        df_all['timestamp'].gt(confusion_time_points[0])
        & df_all['timestamp'].lt(confusion_time_points[1]),
        'transcript',
    ].tolist()
)

'Покладіть тісто на тісто і дайте йому відпочити на 20 хвилин. Покладіть тісто на тісто і дайте йому відпочити на 20 хвилин. Покладіть тісто на тісто і дайте йому відпочити на 20 хвилин. Покладіть тісто на тісто і дайте йому відпочити на 20 хвилин. Дякую за перегляд!'

In [63]:
df_all

Unnamed: 0,timestamp,transcript
0,00:00:30,"I don't know if you can see it, but we have a ..."
1,00:00:32,that are trying to have a conversation
2,00:00:34,where they can't see the question.
3,00:00:36,And they're trying to learn what they're looki...
4,00:00:38,So that would be a problem.
...,...,...
1269,01:19:38,"Yeah, yeah, we're all here."
1270,01:19:40,This is Nathan Alfred.
1271,01:19:45,Well like this is the 40th anniversary of Nath...
1272,01:19:49,So you might as well do the song and dance bef...
