In [29]:
from nltk.corpus import stopwords
import nltk
import spacy
from string import punctuation
import numpy as np
from collections import Counter
import os
import tensorflow_hub as hub

In [2]:
# nltk.download('stopwords')
nlp = spacy.load('en_core_web_md')
# TODO: Continue to remove more stopwords (i.e. company name, 'quarter', 'year', '%', etc.)
new_stopwords = ['thank', 'quarter', 'year', '’s',
                 '%']  # People say 'thank you' way too much in earnings calls!

In [3]:
def add_stopwords(stopwords, new_stopwords):
    """
    Method that adds new stopwords to list of spacy default stopwords

    Args:
    stopwords - spacy default stopwords
    new_stopwords - new stopwords

    Returns: new list of stopwords
    """

    for new_stopword in new_stopwords:
        stopwords.add(new_stopword)
    return stopwords

In [41]:
def tokenize_and_clean_transcript(local_file, new_stopwords):
    """
    Method that implements part of NLP pipeline of cleaning text:
    1. Tokenization
    2. Removing stopwords (commonly known stopwords, pronouns, keywords with little info, words less than 2 chars)
    3. Lemmatization

    Args: 
    local_file - name of earnings call transcript file
    new_stopwords - new stop words to filter doc with

    Returns: list of tokens (spacy doc) and list of lemmatized words (list of strings)
    """

    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stopwords = add_stopwords(stopwords, new_stopwords)

    transcript_read = open(local_file, 'r')
    transcript_doc = nlp(transcript_read.read())
    
    cleaned_transcript_doc = []
    for sent in transcript_doc.sents:
        cleaned_transcript_sent = []
        for token in sent:
            if token.text not in stopwords and token.text not in punctuation and len(token.text) > 2 and token.lemma_ != '-PRON-':
                cleaned_transcript_sent.append(token)
                print('Token text before: ', token.text)
                token.text = token.lemma_.lower().strip()
                print('Token text after: ', token.text)
        cleaned_transcript_doc.append(cleaned_transcript_sent)
        
    return cleaned_transcript_doc

'\ndef lemmatize_transcript(cleaned_transcript_doc):\n    """\n    Method that implements third part of NLP pipline of cleaning text: \n    3. Lemmatization\n    \n    Args:\n    cleaned_transcript_doc - cleaned 2D array of tokens from spacy doc\n    \n    Returns: cleaned 1D list of transcript text\n    """\n    \n    transcript_text = [tok.lemma_.lower().strip() for tok in transcript_doc]\n    return transcript_text\n'

In [42]:
def get_freq_terms(transcript_doc, k):
    """
    Method that get k most frequent words of earnings call transcript

    Args: 
    transcript_doc - spacy doc of transcript
    transcript_text - text of transcript
    k - k most common frequent terms

    Returns: list of k most frequent words
    """

    # five most common tokens
    t
    transcript_text = [token.lemma_.lower().strip() for token in np.array(transcript_doc).flatten()]
    word_freq = Counter(transcript_text)
    common_words = word_freq.most_common(k)

    # five most common noun tokens
    nouns = [token.lemma_ for token in transcript_doc if token.pos_ == 'NOUN']
    noun_freq = Counter(nouns)
    common_nouns = noun_freq.most_common(k)

    # five most common verb tokens
    verbs = [token.lemma_ for token in transcript_doc if token.pos_ == 'VERB']
    verb_freq = Counter(verbs)
    common_verbs = verb_freq.most_common(k)

    return (common_words, common_nouns, common_verbs)

In [43]:
def write_freq_words_to_file(analysis_dir, local_file, common_terms):
    """
    Method that writes the most frequent terms from the document to a file

    Args:
    analysis_dir - directory of frequent terms of earnings transcripts
    local_file - name of file to write frequent terms to
    common_terms - list of frequent terms

    Returns: None
    """

    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)
    freq_words_file_path = analysis_dir + '/' + local_file
    freq_words_file = open(freq_words_file_path, 'w+')

    for terms in common_terms:
        for term in terms:
            freq_words_file.write(str(term) + ' ')
        freq_words_file.write('\n')

    freq_words_file.close()

In [14]:
def generate_deep_word_embeddings(transcript_doc, transcript_text, embedding_type):
    """
    Method that generates deep word embeddings of tokens in transcript
    
    Args: 
    transcript_doc - spacy doc of transcript
    transcript_text - text of transcript
    embedding_type - embedding framework used (e.g. word2vec, Elmo)

    Returns: list of word embeddings
    """
    
    if embedding_type == 'elmo':
        for sent in transcript_doc.sents:
            print("SENTENCE")
            print(sent.text)
        # elmo_embeddings_url = "https://tfhub.dev/google/elmo/2"
        # embed = hub.Module(elmo_embeddings_url, trainable=True)

transcript_path = '../earnings_call_transcripts/tndm/tndm_earnings_transcript_q3_2018.txt'
transcript_doc, transcript_text = tokenize_and_clean_transcript(transcript_path, new_stopwords)
generate_deep_word_embeddings(transcript_doc, transcript_text, 'elmo')

SENTENCE
*
SENTENCE
*
SENTENCE
Executives
SENTENCE
**

Susan Morrison - CAO


SENTENCE
Kim Blickenstaff - President and CEO

John Sheridan - EVP and COO


SENTENCE
Leigh Vosseller - CFO


SENTENCE
*
SENTENCE
*
SENTENCE
Analysts
SENTENCE
**

Travis Steed - Bank of America Merrill Lynch


SENTENCE
Brooks O'Neil - Lake Street Capital

Alex Nowak - Craig-Hallum Capital Group

Matthew Blackman - Stifel

Ravi Misra - Berenberg Capital Markets


SENTENCE
Kyle Bauser - Dougherty & Co.

J.P. McKim - Piper Jaffray

Jeff Johnson - Robert W. Baird


SENTENCE
Steven Lichtman - Oppenheimer & Company


SENTENCE
*
SENTENCE
*Operator
SENTENCE
**


SENTENCE
Good day, ladies and gentlemen, and thank you for your patience.
SENTENCE
You have
joined Tandem's Third Quarter 2018 Earnings Conference Call.
SENTENCE
At this time, all
participants are in a listen-only mode.
SENTENCE
Later we will conduct a question-and-
answer session and instructions will be given at that time.
SENTENCE
[Operator
Instructions]
S

In [44]:
def exec_transcript_analysis(transcript_dir, analysis_dir, transcript_filename):
    """
    Method that executes the entire transcript analysis to get most frequent words

    Args: 
    transcript_dir - directory of transcripts
    analysis_dir - directory of transcript analysis files
    transcript_filename - transcript filename

    Returns: None
    """

    transcript_path = transcript_dir + transcript_filename
    transcript_doc, transcript_text = tokenize_and_clean_transcript(
        transcript_path, new_stopwords)
    common_terms = get_freq_terms(transcript_doc, transcript_text, 10)
    write_freq_words_to_file(
        analysis_dir,
        transcript_filename.split('.')[0] + '_freq_words.txt', common_terms)

In [45]:
transcript_dir = '../earnings_call_transcripts/tndm/'
analysis_dir = '../nlp_analyis'
transcript_filename = 'tndm_earnings_transcript_q3_2018.txt'
exec_transcript_analysis(transcript_dir, analysis_dir, transcript_filename)

Token text before:  Executives


AttributeError: attribute 'text' of 'spacy.tokens.token.Token' objects is not writable