### Extra en Espanol Transcript Extraction

In [1]:
# Installing package to extract text from a doc file
!pip install textract



In [2]:
import textract
import re

In [3]:
files_dir = "C:/Users/dasha/Documents/Education/UBC/Capstone/Extra en Espanol/"
filenames = ['01 La llegada de Sam.docx',
              '02 Sam va de compras.doc',
              '03 Sam aprende a ligar.doc',
              '04 Sam busca un trabajo.doc',
              '05 Ha nacido una estrella.docx',
              '06 El día de la primitiva.doc',
              '07 La gemela.doc',
              '08 La prima de la dueña.doc',
              '09 Trabajos para los chicos.doc',
              '10 ANA PROTESTA.doc',
              '11 Tiempo de vacaciones.doc',
              '12 Fanáticos del fútbol.doc',
              '13 Boda en el aire.doc']

#### Some functions to aid in preprocessing

In [4]:
# A set of functions that help clean up the text

def remove_upto(unwanted_string, text):
    '''This function removes all lines in the text up to and including the line 
       that contains unwanted_string. 
    '''
    unwanted_idx = text.index(unwanted_string)
    end_of_unwanted_line = text[unwanted_idx+len(unwanted_string):].index('\n') + unwanted_idx+len(unwanted_string)
    
    return text[end_of_unwanted_line+1:]

def remove_all_lines(unwanted_string, text):
    ''' This function removes all lines that contain unwanted_string. '''
    # Find the unwanted lines
    while(unwanted_string in text):
        unwanted_idx = text.index(unwanted_string)
        end_of_unwanted_line = text[unwanted_idx+len(unwanted_string):].index('\n') + unwanted_idx+len(unwanted_string)
        text = text[:unwanted_idx] + text[end_of_unwanted_line+1:]
        
    return text

#### Read in and Clean Up Texts

In [5]:
text_dir = 'C:/Users/dasha/Documents/Education/UBC/Capstone/capstone_FHIS/text/'
text_list = []
for idx, filename in enumerate(filenames):
    text = textract.process(files_dir+filename).decode('utf-8')
    # Extract title from filename
    title_start = filenames[idx].index(' ')+1
    title_end = filenames[idx].index('.')
    title = filenames[idx][title_start:title_end]
    
    # Remove extra space in Episode 5
    if idx == 4: #Episode 5
        text_lines = text.split('\n')
        stripped_lines = []
        for line in text_lines:
            line = line.strip()
            stripped_lines.append(line)
        text = "\n".join(stripped_lines)
    
    # Clean up issues common to all texts
    # First, remove the word 'Transcripts' from the text
    if 'Transcripts' in text:
        text = remove_upto('Transcripts', text)
        
    # Remove vertical lines
    if '|' in text:
        text = text.replace('|', '')
        
    # Remove horizontal lines
    if '_________________________' in text:
        text = text.replace('_________________________\n', '')
    if '—–' in text:
        text = text.replace('—–\n', '')
    
    # Remove episode name from top - sometimes this comes after the word 'Episode' or 'Episodio',
    # sometimes it is on a line by itself
    # We need to search for both because in some files, the title after 'Episodio' is not the same as the one in the filename
    
    if 'Episodio' in text:
        text = remove_upto('Episodio', text)
    if 'Episode' in text:
        text = remove_upto('Episode', text)
    if title in text:
        text = remove_upto(title, text)
        
    if 'SCENE' in text:
        text = remove_all_lines('SCENE', text)
    if 'INTERTITIAL' in text:
        text = remove_all_lines('INTERTITIAL', text)
    if 'INTERSTITIAL' in text:
        text = remove_all_lines('INTERSTITIAL', text)
    if 'INTERSITIAL' in text:
        text = remove_all_lines('INTERSITIAL', text)    
        
    for word in re.findall('\n[0-9]{1,2}', text):
        text = text.replace(word, '')
        
    text = text.strip()
    with open(text_dir + filename[:filename.index('.')] + ".txt", 'w') as text_file:
        text_file.write(text)
    text_list.append(text)

In [13]:
# Check number of sentences and number of tokens
import spacy

nlp = spacy.load("es_core_news_md")
total_sents = 0
total_tokens = 0
for text in text_list:
    doc = nlp(text)
    sents_list = []
    for sent in doc.sents:
        sents_list.append(sent.text)
    total_sents += len(sents_list)
    total_tokens += len(doc)
    
print(total_sents)
print(total_tokens)

2878
41221


#### Write to JSON

In [69]:
# Create JSON files
from collections import defaultdict
dict_list = []
author = "Channel 4 Learning" #the author is the producer in this case
level = "A1"
source_list = ['https://www.dropbox.com/s/8ia5r3wfsmobg07/01%20La%20llegada%20de%20Sam.docx?dl=0',
              'https://www.dropbox.com/s/x92adnk4cp3u93p/02%20Sam%20va%20de%20compras.doc',
              'https://www.dropbox.com/s/fzf6avuqttlj5kr/03%20Sam%20aprende%20a%20ligar.doc',
              'https://www.dropbox.com/s/uah40240xw7fxnr/04%20Sam%20busca%20un%20trabajo.doc',
              'https://www.dropbox.com/s/c290fxn2kyaiyc0/05%20Ha%20nacido%20una%20estrella.docx?dl=0',
              'https://www.dropbox.com/s/07ro7dqj3qlj69r/06%20El%20d%C3%ADa%20de%20la%20primitiva.doc',
              'https://www.dropbox.com/s/u0o32nmnxxdje3l/07%20La%20gemela.doc',
              'https://www.dropbox.com/s/4socaaai36edz9n/08%20La%20prima%20de%20la%20due%C3%B1a.doc',
              'https://www.dropbox.com/s/nyyv08m6amnxh39/09%20Trabajos%20para%20los%20chicos.doc',
              'https://www.dropbox.com/s/lgr4jod4sap55ej/10%20ANA%20PROTESTA.doc',
              'https://www.dropbox.com/s/xsk5dzi48qseez4/11%20Tiempo%20de%20vacaciones.doc',
              'https://www.dropbox.com/s/lkpbqgzj8ba3zan/12%20Fan%C3%A1ticos%20del%20f%C3%BAtbol.doc',
              'https://www.dropbox.com/s/59g7w7r30xnuczn/13%20Boda%20en%20el%20aire.doc']

for idx, text in enumerate(text_list):
    # metadata
    # source
    source = source_list[idx]
    
    # Title
    title_start = filenames[idx].index(' ')+1
    title_end = filenames[idx].index('.')
    title = filenames[idx][title_start:title_end]
    
    # content
    content = text
    # make dictionary
    text_dict = defaultdict(str)
    text_dict['author'] = author
    text_dict['source'] = source
    text_dict['level'] = level
    text_dict['title'] = title
    text_dict['content'] = content
    # append dictionary to list of dictionaries
    dict_list.append(text_dict)

In [70]:
import json
json_dir = "C:/Users/dasha/Documents/Education/UBC/Capstone/capstone_FHIS/corpus/"
with open(json_dir + 'extra_en_espanol.json', 'w') as outfile:
    json.dump(dict_list, outfile)