In [51]:
from bs4 import BeautifulSoup
import numpy as np
import os
import re

In [45]:
def html_to_txt(file):
    
    # read file
    with open(file, encoding='utf-8', errors='ignore') as f: html_content = f.read()
    
    # remove credits paragraph (first p tag)
    first_p_start_index = html_content.find('<p>')
    first_p_end_index = html_content.find('</p>', first_p_start_index) + len('</p>')
    html_content = html_content[:first_p_start_index] + html_content[first_p_end_index:]

    html_content = re.sub('<font.*?>', '', html_content, flags=re.DOTALL)
    html_content = re.sub('</font>', '', html_content)
    html_content = re.sub('<b>', '', html_content)
    html_content = re.sub('</b>', '', html_content)
    html_content = re.sub('</b>', '', html_content)
   
    # initialize soup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # remove title
    for title in soup.find_all('title'): title.extract()
        
    # replace newlines in dialogous with spaces
    for p in soup.find_all('p'):
        p.string = p.get_text().replace('\n', ' ')
        while p.get_text().count('  ') > 0: p.string = p.get_text().replace('  ', ' ')
    
    # get script, lowercase it
    text = soup.get_text()

    # splitlines, join all lines, lowercase, remove weird space, replace . . . with ... for consistency
    text = '\n'.join(line.strip() for line in text.splitlines() if line.strip()).lower().replace('   ', ' ').replace('. . .', '...')
    # remove "end" from end, remove credits
    text = text.rstrip('the end').rstrip('end').replace('opening credits', '').replace('closing credits', '').replace('ending credits', '').replace('end credits.', '')
    # remove transcriber's note
    text = re.sub('{transcriber.+}', '', text)

    if text.count('\n')<50: 
        # add newlines before dialogue start (issue in some files)
        text = re.sub(r' (?=[a-z]+:)', r'\n', text)
        # add newlines before scene descriptions (issue in some files)
        s = re.sub(r' +(\[.*?\])\n', r'\n\1\n', text)
        while s != text:
            text = s
            s = re.sub(r' (\[.*?\])\n', r'\n\1\n', text)
    # add start and end tokens
    text = f'<EPISODE_START>\n{text}\n<EPISODE_END>'
    # remove unnecesary new lines
    text = re.sub(r'\n\n+', 'r\n', text)
    # remove unnecessary spaces
    text = re.sub(r'  +', r' ', text)
    # remove everything between start token and episode name
    text = re.sub(r'<EPISODE_START>\n.*?the one', r'<EPISODE_START>\nthe one', text, flags=re.DOTALL)
    # replace shorter names which are in some files
    text = text.replace('chan: ', 'chandler: ').replace('mnca: ', 'monica: ').replace('phoe: ', 'phoebe: ').replace('rach: ', 'rachel: ')
    
    return text

In [46]:
# convert all .html to .txt
read_dir = './data/raw/scripts'
write_dir = './data/interim/scripts'
for filename in os.listdir(read_dir):
    with open(f'{write_dir}/{filename.replace('.html','.txt')}', 'w', encoding='utf-8') as f: f.write(html_to_txt(f'{read_dir}/{filename}'))

In [57]:
# fix these manually
l,l1,l2,l3,l4 = [],[],[],[],[]
for filename in os.listdir(write_dir):
    with open(f'{write_dir}/{filename}', encoding='utf-8') as f:
        t = f.read()
        if t.count("credits")>0: l.append(filename)
        if t.count(" ]")>0: l1.append(filename)
        if t.count("[\n")>0: l2.append(filename)
        if t.count("\n]")>0: l2.append(filename)
        if t.count(" ]")>0: l2.append(filename)
# remove text between start token and episode name: s10e1718
l,l1,l2,l3,l4

(['0210.txt', '0602.txt', '0613.txt', '0704.txt'],
 ['0909.txt', '1004.txt', '1007.txt'],
 ['0909.txt', '1004.txt', '1007.txt'],
 [],
 [])

In [55]:
import numpy as np
d = {}
for filename in os.listdir(write_dir):
    with open(f'{write_dir}/{filename}', encoding='utf-8') as f: 
        d[filename] = len(f.read().split())
sorted(d.items(), key=lambda item: item[1], reverse=True)[:20]



[('0423.txt', 12193),
 ('0923-0924.txt', 8494),
 ('0523.txt', 8292),
 ('0913.txt', 8031),
 ('0624.txt', 7669),
 ('1017-1018.txt', 7321),
 ('0615-0616.txt', 7287),
 ('0723.txt', 6984),
 ('0911.txt', 6979),
 ('0823.txt', 6739),
 ('0212-0213.txt', 6316),
 ('0906.txt', 5496),
 ('1012.txt', 5122),
 ('0716.txt', 4941),
 ('0117.txt', 4900),
 ('1001.txt', 4701),
 ('0522.txt', 4636),
 ('1011.txt', 4635),
 ('0715.txt', 4630),
 ('0713.txt', 4625)]

In [9]:
outfilename = './data/interim/all.txt'
with open(outfilename, 'w', encoding='utf-8') as outfile:
    for filename in os.listdir(write_dir):
        if filename != 'all.txt':
            with open(f'{write_dir}/{filename}', encoding='utf-8') as infile:
                outfile.write(infile.read())
                outfile.write('\n')