# Conference Calls - Clean Sentences
*Convert parsed transcripts to cleaned sentences*

In [1]:
# Identifier of this notebook; used below as the name of its folder under '2_pipeline'
NAME = '02-02_conference_calls_preprocess'
# Project folder name; used below to locate the project root inside the current working directory
PROJECT = 'conference-calls-sentiment'
# Python version noted for reference (not read anywhere in the visible code)
PYTHON_VERSION = '3.7.0'

### Imports

In [2]:
import os
import re
import pandas as pd
import numpy as np

# NLP
from nltk.tokenize import sent_tokenize

# Logging
from utils import log_step 

### Settings

In [3]:
# Switch to the project root: drop everything that follows the first occurrence
# of the project folder name in the current working directory.
# re.escape keeps characters in PROJECT from being read as regex syntax, and the
# raw string avoids invalid-escape warnings for \w / \W.
workdir = re.sub(r"(?<={})[\w\W]*".format(re.escape(PROJECT)), "", os.getcwd())
os.chdir(workdir)

# Output location of this notebook. exist_ok=True makes the cell safe to re-run
# and also restores any sub-folder that is missing from an existing pipeline folder.
pipeline = os.path.join('2_pipeline', NAME)
for folder in ['out', 'store', 'tmp']:
    os.makedirs(os.path.join(pipeline, folder), exist_ok=True)

---
# Main Code 

In [None]:
# Feather file with the parsed transcripts (presumably the output of the 02-01 parsing notebook).
# NOTE(review): the folder name ends with a trailing dot ('..._parse_txt.'), which looks like a
# typo - confirm it matches the actual directory name on disk.
PARSED_TRANSCRIPTS_PATH = os.path.join('2_pipeline', '02-01_conference_calls_parse_txt.', 'out', 'cc_transcripts_parsed.feather')
parsed_transcripts = pd.read_feather(PARSED_TRANSCRIPTS_PATH)
# Preview the first rows
parsed_transcripts.head()

In [5]:
@log_step
def start_pipeline(transcripts):
    '''First pipeline step: hand back a copy of the input frame.

    Later steps then operate on the copy rather than on the frame
    that was passed in.
    '''
    working_copy = transcripts.copy(deep=True)
    return working_copy

In [6]:
@log_step
def filter_qa(transcripts):
    '''Keep only the rows whose section_name is "Questions and Answers".'''
    is_qa = transcripts['section_name'].eq('Questions and Answers')
    return transcripts.loc[is_qa]

In [7]:
@log_step
def select_speaker_roles(transcripts):
    '''Classify speakers as Management or Analyst and drop everyone else.

    The raw 'speaker_role' text is matched case-insensitively against the
    patterns in `mapping`; the label is written to 'speaker_role_clean'.
    Patterns are applied in order, so a role matching both patterns ends
    up with the later label ('Analyst'). Rows with a missing role or a role
    that matches no pattern are removed.

    Returns a new DataFrame; the input frame is left untouched.
    '''
    mapping = {r'\bC[A-Z]O\b|chief|officer': 'Management',
               r'analyst': 'Analyst'}

    # Work on a copy: the input is typically a filtered slice of another frame,
    # and assigning into it would modify the caller's data / trigger
    # SettingWithCopyWarning.
    transcripts = transcripts.copy()
    if 'speaker_role_clean' not in transcripts.columns:
        transcripts['speaker_role_clean'] = None

    for role, clean_role in mapping.items():
        # na=False: a missing speaker_role counts as "no match" instead of
        # producing NaN entries that cannot be used as a boolean mask.
        matches = transcripts['speaker_role'].str.contains(role, case=False, na=False)
        transcripts.loc[matches, 'speaker_role_clean'] = clean_role

    return transcripts.dropna(subset=['speaker_role_clean'])

In [8]:
@log_step
def tokenize_sentences(transcripts):
    '''Split every transcript into sentences, one sentence per output row.

    Rows without a transcript are discarded first. Each remaining transcript
    is passed to sent_tokenize and the resulting list is exploded into rows;
    rows left without a sentence after exploding are dropped as well.
    The returned frame has a fresh 0..n-1 index.
    '''
    has_text = transcripts['transcript'].notna()
    with_text = transcripts[has_text].reset_index(drop=True)

    # Replace each transcript by its list of sentences, then unfold the lists
    sentence_lists = with_text['transcript'].apply(sent_tokenize)
    one_per_row = with_text.assign(transcript=sentence_lists).explode('transcript')

    non_empty = one_per_row.dropna(subset=['transcript'])
    return non_empty.reset_index(drop=True)

In [9]:
# Text enclosed in <...>, (...) or [...] (non-greedy, within a single line)
TAGS_PATTERN = re.compile(r'[<\(\[].*?[>\)\]]')
# Any token that contains at least one digit
WORDS_WITH_NUMBERS = re.compile(r'\w*\d\w*')
# Punctuation that separates words and therefore becomes a space
REPLACE_BY_SPACE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except lowercase ASCII letters and spaces
BAD_SYMBOLS = re.compile(r'[^a-z ]')
# Runs of any whitespace (spaces, tabs, newlines, ...)
WHITESPACE = re.compile(r'\s+')

def process_transcript(transcript):
    '''Return a cleaned, lowercase, letters-only version of `transcript`.

    Steps, in order: lowercase; remove bracketed/parenthesised tags; remove
    tokens containing digits; turn separator punctuation into spaces; turn
    all whitespace into single spaces; delete every remaining character that
    is not a-z or a space; collapse repeated spaces and strip the ends.
    '''
    transcript_clean = transcript.lower()
    transcript_clean = TAGS_PATTERN.sub('', transcript_clean)
    transcript_clean = WORDS_WITH_NUMBERS.sub('', transcript_clean)
    transcript_clean = REPLACE_BY_SPACE.sub(' ', transcript_clean)
    # Convert tabs/newlines to spaces BEFORE stripping bad symbols: BAD_SYMBOLS
    # deletes every non-space character, so words separated only by a line
    # break or tab would otherwise be merged into one word.
    transcript_clean = WHITESPACE.sub(' ', transcript_clean)
    transcript_clean = BAD_SYMBOLS.sub('', transcript_clean)
    # Removing symbols can leave consecutive spaces behind; collapse them
    transcript_clean = WHITESPACE.sub(' ', transcript_clean).strip()
    return transcript_clean

def convert_name(speaker_name):
    '''Converts name to I/B/E/S format

    The result is the upper-cased last word of the name followed by the
    first letter of the first word, e.g. "John Smith" -> "SMITH J".

    A blank name yields an empty string, and a non-string value (such as a
    missing value in a DataFrame column) is returned unchanged, so that one
    bad row does not abort a column-wide apply.
    '''
    if not isinstance(speaker_name, str):
        return speaker_name

    parts = speaker_name.upper().split()
    if not parts:
        return ''

    surname, first_name = parts[-1], parts[0]
    return ' '.join([surname, first_name[0]])

@log_step
def process_text(transcripts):
    '''Normalise the text columns of the sentence-level frame.

    - transcript:   cleaned with process_transcript
    - speaker_name: reformatted with convert_name
    - speaker_firm: upper-cased

    A new DataFrame is returned; all other columns are carried over as-is.
    '''
    cleaned_sentences = transcripts['transcript'].apply(process_transcript)
    converted_names = transcripts['speaker_name'].apply(convert_name)
    upper_firms = transcripts['speaker_firm'].str.upper()

    return transcripts.assign(transcript=cleaned_sentences,
                              speaker_name=converted_names,
                              speaker_firm=upper_firms)

In [10]:
@log_step
def remove_short_sentences(transcripts, min_words=5):
    '''Add a word count per sentence and drop sentences that are too short.

    Parameters
    ----------
    transcripts : DataFrame with a string column 'transcript'.
    min_words : minimum number of whitespace-separated words a sentence
        must have to be kept (default 5, the previously hard-coded value).

    Returns
    -------
    DataFrame with an added 'num_words' column, restricted to rows where
    num_words >= min_words.
    '''
    with_counts = transcripts.assign(
        num_words=lambda x: x['transcript'].str.split().str.len())
    return with_counts.loc[with_counts['num_words'] >= min_words]

In [11]:
@log_step
def clean_data(transcripts):
    '''Produce the final table: add the quarter, select and order the output columns.

    The calendar quarter is derived from 'event_date'. Only the columns listed
    in `output_columns` are kept, 'speaker_role_clean' is renamed to
    'speaker_role', and rows are sorted by gvkey, event_date and speaker_name
    before the index is reset.
    '''
    output_columns = [
        'gvkey', 'ticker', 'event_date', 'coname', 'speaker_role_clean',
        'speaker_name', 'speaker_firm', 'transcript', 'num_words',
        'transcript_id', 'quarter', 'year'
    ]

    quarters = transcripts['event_date'].dt.to_period('Q')
    with_quarter = transcripts.assign(quarter=quarters)

    selected = with_quarter.filter(items=output_columns)
    renamed = selected.rename(columns={'speaker_role_clean': 'speaker_role'})
    ordered = renamed.sort_values(by=['gvkey', 'event_date', 'speaker_name'])
    return ordered.reset_index(drop=True)

In [None]:
# Run the cleaning steps one after another; each step receives the previous
# step's DataFrame and returns a new one.
cleaned_transcripts = start_pipeline(parsed_transcripts)
cleaned_transcripts = filter_qa(cleaned_transcripts)
cleaned_transcripts = select_speaker_roles(cleaned_transcripts)
cleaned_transcripts = tokenize_sentences(cleaned_transcripts)
cleaned_transcripts = process_text(cleaned_transcripts)
cleaned_transcripts = remove_short_sentences(cleaned_transcripts)
cleaned_transcripts = clean_data(cleaned_transcripts)

# Preview the first rows of the result
cleaned_transcripts.head()

In [13]:
# Write the cleaned sentences to this notebook's 'out' folder as a Feather file
cleaned_transcripts.to_feather(os.path.join(pipeline, 'out', 'cc_transcripts.feather'))