# Conference Calls - Parse txt files

In [1]:
# Notebook identifier; used below to name this notebook's folder under 2_pipeline/
NAME = '02-01_conference_calls_parse_txt'
# Project folder name; used below to locate the working directory inside os.getcwd()
PROJECT = 'conference-calls-sentiment'
# Python version kept for reference (not read by any code visible in this notebook)
PYTHON_VERSION = '3.7.0'

### Imports  

In [2]:
import os
import re
import logging
from tqdm import tqdm
import numpy as np
import pandas as pd

### Settings

In [3]:
# Move to the project root: keep the current path up to and including the
# PROJECT folder name and drop whatever follows it.
# The raw string avoids invalid-escape warnings for \w / \W, and re.escape keeps
# any regex metacharacters in PROJECT literal.
workdir = re.sub(r"(?<={})[\w\W]*".format(re.escape(PROJECT)), "", os.getcwd())
os.chdir(workdir)

# Output location of this notebook. exist_ok=True makes the cell safe to re-run
# and also recreates sub-folders missing from an already existing pipeline folder.
pipeline = os.path.join('2_pipeline', NAME)
for folder in ['out', 'store', 'tmp']:
    os.makedirs(os.path.join(pipeline, folder), exist_ok=True)

---
# Main Code 

## Read transcript metadata

In [None]:
# Feather file located in the 'out' folder of the 01-02_sample_sp500_metadata pipeline folder
METADATA_PATH = os.path.join('2_pipeline', '01-02_sample_sp500_metadata', 'out', 'sample_metadata.feather')
# The 'filepath' column of this frame is used later to locate transcripts and as the merge key
sp500_transcripts = pd.read_feather(METADATA_PATH)
# Preview the first rows
sp500_transcripts.head()

## Parsing functions

In [5]:
TRANSCRIPTS_PATH = os.path.join('0_data', 'conference_calls_transcripts')

def read_transcript(file_path):
    """Return the full text of one transcript file.

    `file_path` is joined onto TRANSCRIPTS_PATH and the resulting file is
    read as UTF-8 in a single call.
    """
    full_path = os.path.join(TRANSCRIPTS_PATH, file_path)
    with open(full_path, encoding='utf-8') as handle:
        return handle.read()

In [6]:
def parse_transcript(transcript):
    """Split a raw transcript into its presentation and Q&A sections.

    Parameters
    ----------
    transcript : str
        Full text of one transcript.

    Returns
    -------
    tuple of (str, str)
        The text between "Presentation" and the first following
        "Questions and Answers", and the text between "Questions and Answers"
        and the first following "Definitions".

    Raises
    ------
    AttributeError
        With message 'No Questions and Answers' when no
        "Presentation ... Questions and Answers" span exists, or
        'No Definitions' when no "Questions and Answers ... Definitions"
        span exists. The exception type is unchanged so existing handlers
        that skip unparseable transcripts keep working.
    """
    # DOTALL lets '.' cross line breaks; the lazy quantifier stops at the first closing marker.
    pres_match = re.search(r"Presentation(.*?)Questions and Answers", transcript, flags=re.DOTALL)
    if pres_match is None:
        raise AttributeError('No Questions and Answers')

    ques_match = re.search(r"Questions and Answers(.*?)Definitions", transcript, flags=re.DOTALL)
    if ques_match is None:
        # Previously this surfaced as "'NoneType' object has no attribute 'group'".
        raise AttributeError('No Definitions')

    return pres_match.group(1), ques_match.group(1)

In [7]:
def parse_speakers(transcript):
    """Extract alternating speaker lines and spoken text from a section.

    Every captured chunk is the text found after a "--" + newline and before
    the next newline that is followed by two '-' or '=' characters. Chunks at
    even positions are returned as speakers, chunks at odd positions as the
    text they spoke.

    Returns
    -------
    tuple of (list of str, list of str)
        (speakers, spoken_text); speakers has one extra element when the
        number of chunks is odd.
    """
    chunks = re.findall("--\n(.*?)\n[-=]{2}", transcript, flags=re.DOTALL)
    return chunks[::2], chunks[1::2]

In [8]:
def get_speaker_id(speaker):
    """Return the digits of the first "[<digits>]" marker in a speaker line.

    Parameters
    ----------
    speaker : str
        Speaker header line.

    Returns
    -------
    str
        The digits inside the brackets (not converted to int).

    Raises
    ------
    AttributeError
        When the line has no "[<digits>]" marker. The type matches what the
        failed `.group(1)` call used to raise, so the handler in
        parse_from_filepaths still applies, but the message now names the
        offending line.
    """
    match = re.search(r"\[([0-9]+)\]", speaker)
    if match is None:
        raise AttributeError(f'No speaker id in {speaker!r}')
    return match.group(1)

def get_speaker_name(speaker):
    """Return the stripped text before the first comma of a speaker line.

    Lines without any comma yield NaN.
    """
    name, comma, _ = speaker.partition(',')
    if not comma:
        return np.nan
    return name.strip()

def get_speaker_firm(speaker):
    """Return the firm part of a "Name, Firm - Role [id]" speaker line.

    The firm is read from the text between the first and second comma, up to
    the firm/role delimiter. The spaced delimiter " - " is preferred so that
    hyphens inside the firm name (e.g. "Hewlett-Packard") are kept; when it is
    absent the first bare '-' is used, as before.

    Returns
    -------
    str or float
        The stripped firm name, or NaN when the line contains no comma.
    """
    if ',' not in speaker:
        return np.nan

    after_name = speaker.split(',')[1]
    delimiter = ' - ' if ' - ' in after_name else '-'
    return after_name.split(delimiter)[0].strip()

def get_speaker_role(speaker):
    """Return the role part of a "Name, Firm - Role [id]" speaker line.

    The role is the text after the last firm/role delimiter and before the
    first '[' that follows it. The spaced delimiter " - " is preferred so that
    hyphens inside the role (e.g. "Co-CEO") are kept; when it is absent the
    last bare '-' is used, as before. A line without any hyphen is treated as
    being the role itself (e.g. "Operator   [1]" gives "Operator").

    Returns
    -------
    str
        The stripped role text.
    """
    if ' - ' in speaker:
        role_part = speaker.rsplit(' - ', 1)[-1]
    elif '-' in speaker:
        role_part = speaker.rsplit('-', 1)[-1]
    else:
        role_part = speaker
    # Drop the trailing "[id]" marker.
    return role_part.split('[')[0].strip()

In [9]:
def process_transcript(filepath):
    """Parse one transcript file into a DataFrame with one row per speech.

    The file is read with read_transcript, split into the presentation and
    Q&A sections with parse_transcript, and each section is split into
    (speaker, text) pairs with parse_speakers.

    Parameters
    ----------
    filepath : str
        Path handed to read_transcript; also stored in the 'filepath' column.

    Returns
    -------
    pandas.DataFrame
        Columns: speaker_id, speaker_name, speaker_firm, speaker_role,
        transcript, section_name ('Presentation' or 'Questions and Answers')
        and filepath. Rows are labelled 0..n-1 in order of appearance.

    Raises
    ------
    AttributeError
        Propagated from parse_transcript / get_speaker_id when the expected
        markers are missing.
    """
    columns = ['speaker_id', 'speaker_name', 'speaker_firm', 'speaker_role',
               'transcript', 'section_name', 'filepath']

    transcript = read_transcript(filepath)
    presentation, questions = parse_transcript(transcript)
    sections = [('Presentation', presentation), ('Questions and Answers', questions)]

    rows = []
    for section_name, section in sections:
        speakers, spoken_text = parse_speakers(section)

        # zip stops at the shorter list, so a trailing speaker line without text is dropped.
        for speaker, text in zip(speakers, spoken_text):
            rows.append([
                get_speaker_id(speaker),
                get_speaker_name(speaker),
                get_speaker_firm(speaker),
                get_speaker_role(speaker),
                text,
                section_name,
                filepath,
            ])

    # Build the frame once instead of appending row by row (each .loc append
    # copies the frame). dtype=object keeps every column as object dtype, even
    # one that holds only NaN, so frames from different files stay type-consistent.
    return pd.DataFrame(rows, columns=columns, dtype=object)


## Parse all transcripts

In [10]:
def parse_from_filepaths(filepaths):
    """Parse many transcript files and stack the results into one DataFrame.

    Parameters
    ----------
    filepaths : iterable of str
        Paths passed one by one to process_transcript.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames in input order (each file keeps
        its own 0..n-1 row labels), or an empty DataFrame when no file could
        be parsed.

    Notes
    -----
    A file whose parsing raises AttributeError is skipped and reported as a
    warning in '<pipeline>/store/errors.log'; any other exception propagates.
    """
    # Log errors
    logging.basicConfig(filename=os.path.join(pipeline, 'store', 'errors.log'), filemode='w')

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows on every file.
    frames = []
    for filepath in tqdm(filepaths):
        try:
            frames.append(process_transcript(filepath))
        except AttributeError as e:
            logging.warning(f'{filepath} - {e}')

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames)

In [11]:
# Parse every transcript listed in the metadata 'filepath' column.
# The recorded run below took about 3h50m for 26,651 files.
parsed_transcripts = parse_from_filepaths(sp500_transcripts['filepath'])

100%|██████████| 26651/26651 [3:49:57<00:00,  1.93it/s]


## Add metadata

In [None]:
# Attach the metadata columns to every parsed row. validate='m:1' makes pandas
# raise if 'filepath' is not unique in sp500_transcripts.
parsed_transcripts_metadata = parsed_transcripts.merge(sp500_transcripts, on='filepath', validate='m:1')
# Preview the first rows
parsed_transcripts_metadata.head()

## Save

In [13]:
# Write the merged result to this notebook's 'out' folder as a feather file
parsed_transcripts_metadata.to_feather(os.path.join(pipeline, 'out', 'cc_transcripts_parsed.feather'))