In [3]:
from nltk.corpus import stopwords
import nltk
import spacy
from string import punctuation
import numpy as np
from collections import Counter
import os
import tensorflow as tf

In [4]:
# One-time setup, left disabled: uncomment to fetch the NLTK stopword corpus.
# nltk.download('stopwords')
# Shared spaCy pipeline, loaded once here and used by tokenize_and_clean_transcript.
nlp = spacy.load('en_core_web_md')
# Extra stopwords merged with spaCy's defaults before filtering transcript tokens.
# TODO: Continue to remove more stopwords (i.e. company name, 'quarter', 'year', '%', etc.)
new_stopwords = ['thank', 'thanks', 'think', 'quarter', 'year', '’s',
                 '%']  # People say 'thank you' way too much in earnings calls!

In [5]:
def add_stopwords(stopwords, new_stopwords):
    """
    Method that combines a base collection of stopwords with additional ones

    Neither input is modified: a fresh set is built instead of calling
    ``add`` on the ``stopwords`` argument, so handing in a shared set (such as
    spaCy's module-level default stopwords) does not alter it for the rest
    of the session. Any iterable of strings is accepted for both arguments.

    Args:
    stopwords - base stopwords (e.g. spacy default stopwords)
    new_stopwords - new stopwords to include

    Returns: new set holding every word from both inputs
    """

    combined_stopwords = set(stopwords)
    combined_stopwords.update(new_stopwords)
    return combined_stopwords

In [6]:
def tokenize_and_clean_transcript(local_file, new_stopwords):
    """
    Method that implements part of NLP pipeline of cleaning text:
    1. Tokenization (sentence by sentence, using the shared spacy pipeline `nlp`)
    2. Removing tokens whose normalized lemma is a stopword, is found in
       string.punctuation, is 2 characters or shorter, or whose lemma is '-PRON-'
    3. Lemmatization is only used for the filtering decision; the original
       spacy tokens are what gets returned

    Args: 
    local_file - path of earnings call transcript file
    new_stopwords - new stop words to filter doc with

    Returns: list of sentences, each a list of the spacy tokens that survived
    filtering; sentences with no surviving tokens are left out
    """

    # Pass a copy so spacy's module-level stopword set is never modified,
    # whatever add_stopwords does with its first argument
    stopwords = add_stopwords(set(spacy.lang.en.stop_words.STOP_WORDS), new_stopwords)

    # Context manager releases the file handle even if reading fails
    with open(local_file, 'r') as transcript_read:
        transcript_text = transcript_read.read()
    transcript_doc = nlp(transcript_text)

    # Clean spacy doc and structure as a list of sentences (lists of spacy tokens)
    cleaned_transcript_doc = []
    for sent in transcript_doc.sents:
        cleaned_transcript_sent = []
        for token in sent:
            token_text = token.lemma_.lower().strip()
            # `punctuation` is a plain string, so this is a substring check
            is_informative = (
                token_text not in stopwords
                and token_text not in punctuation
                and len(token_text) > 2
                and token.lemma_ != '-PRON-'
            )
            if is_informative:
                cleaned_transcript_sent.append(token)

        if cleaned_transcript_sent:
            cleaned_transcript_doc.append(cleaned_transcript_sent)

    return cleaned_transcript_doc

def flatten_doc(transcript_doc):
    """
    Method that collapses a nested transcript into one flat list

    Args:
    transcript_doc - iterable of sentences, each an iterable of tokens

    Returns: list of all tokens, in their original order
    """

    flat_tokens = []
    for sentence in transcript_doc:
        flat_tokens.extend(sentence)
    return flat_tokens

In [7]:
def get_freq_terms(transcript_doc, k):
    """
    Method that gets the k most frequent terms of an earnings call transcript,
    overall and restricted to nouns and to verbs

    Args: 
    transcript_doc - cleaned transcript (list of sentences of spacy tokens)
    k - number of most frequent terms to keep in each ranking

    Returns: tuple (common_words, common_nouns, common_verbs), each a list of
    (term, count) pairs as produced by Counter.most_common
    """

    overall_counts = Counter()
    noun_counts = Counter()
    verb_counts = Counter()

    for token in flatten_doc(transcript_doc):
        # Lemmatize and normalize once, then tally into every ranking that applies
        term = token.lemma_.lower().strip()
        overall_counts[term] += 1

        part_of_speech = token.pos_
        if part_of_speech == 'NOUN':
            noun_counts[term] += 1
        elif part_of_speech == 'VERB':
            verb_counts[term] += 1

    return (
        overall_counts.most_common(k),
        noun_counts.most_common(k),
        verb_counts.most_common(k),
    )

In [8]:
def write_freq_words_to_file(analysis_dir, local_file, common_terms):
    """
    Method that writes the most frequent terms from the document to a file

    One line is written per group in common_terms; every term in a group is
    written as str(term) followed by a single space. An existing file with
    the same name is overwritten.

    Args:
    analysis_dir - directory of frequent terms of earnings transcripts
                   (created if missing; empty string means current directory)
    local_file - name of file to write frequent terms to
    common_terms - iterable of groups of frequent terms

    Returns: None
    """

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    if analysis_dir:
        os.makedirs(analysis_dir, exist_ok=True)
    freq_words_file_path = os.path.join(analysis_dir, local_file)

    # Context manager closes the file even if a write raises part-way through
    with open(freq_words_file_path, 'w') as freq_words_file:
        for terms in common_terms:
            for term in terms:
                freq_words_file.write(str(term) + ' ')
            freq_words_file.write('\n')

In [9]:
def exec_transcript_analysis(transcript_dir, analysis_dir, transcript_filename, k=10):
    """
    Method that executes the entire transcript analysis to get most frequent words

    The results are written to '<transcript name without extension>_freq_words.txt'
    inside analysis_dir. The module-level `new_stopwords` list is used for filtering.

    Args: 
    transcript_dir - directory of transcripts (with or without trailing separator)
    analysis_dir - directory of transcript analysis files
    transcript_filename - transcript filename
    k - number of most frequent terms to keep per ranking (defaults to 10)

    Returns: None
    """

    # os.path.join works whether or not transcript_dir ends with a separator
    transcript_path = os.path.join(transcript_dir, transcript_filename)
    transcript_doc = tokenize_and_clean_transcript(
        transcript_path, new_stopwords)
    common_terms = get_freq_terms(transcript_doc, k)

    # splitext strips only the final extension, so dotted names are not truncated
    transcript_stem = os.path.splitext(transcript_filename)[0]
    write_freq_words_to_file(
        analysis_dir, transcript_stem + '_freq_words.txt', common_terms)

In [10]:
# Run the frequent-term analysis on a single transcript.
# The trailing '/' matters: exec_transcript_analysis builds the path from this value.
transcript_dir = '../earnings_call_transcripts/tndm/'
# NOTE(review): 'nlp_analyis' looks like a typo of 'nlp_analysis' - confirm before renaming,
# since existing output may already live under this directory name.
analysis_dir = '../nlp_analyis'
transcript_filename = 'tndm_earnings_transcript_q3_2018.txt'
exec_transcript_analysis(transcript_dir, analysis_dir, transcript_filename)

In [11]:
# Load embeddings:
# 1. Elmo (tensorflow_hub doesn't work on this MacOS, need to find another way to load these embeddings)
# Generated via bidirectional RNNs by feeding word into this NN
# The snippet below is disabled by keeping it inside a string literal.
# NOTE(review): it refers to `hub`, which is not imported in the visible cells.
'''
elmo_embeddings_url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(elmo_embeddings_url, trainable=True)
'''
# 2. Word2Vec
# TODO: document how these embeddings are generated
from gensim.models import Word2Vec

In [None]:
def generate_deep_word_embeddings(transcript_doc, embedding_type):
    """
    Method that generates deep word embeddings of tokens in transcript
    These embeddings will be used in various NLP tasks:
    1. Sentiment analysis
    2. Named entity extraction
    3. Summarization
    4. Question answering
    5. Document classification
    
    Args: 
    transcript_doc - cleaned transcript (list of sentences of spacy tokens)
    embedding_type - embedding framework used ('word2vec'; 'elmo' is planned)

    Returns: trained gensim Word2Vec model (vectors are reached through its `wv` attribute)

    Raises:
    NotImplementedError - embedding_type is 'elmo', which has no implementation yet
    ValueError - embedding_type is not a recognized framework
    """

    if embedding_type == 'word2vec':
        # Word2Vec expects a list of sentences, each a list of normalized lemma strings
        transcript_text = [[token.lemma_.lower().strip() for token in sent] for sent in transcript_doc]
        # min_count=2 drops words that occur only once in the transcript
        return Word2Vec(transcript_text, min_count=2)

    # Fail loudly instead of returning None, which only surfaced later as an
    # AttributeError when the caller accessed `.wv`
    if embedding_type == 'elmo':
        raise NotImplementedError("Embedding type 'elmo' is not implemented yet")
    raise ValueError("Unsupported embedding type: " + repr(embedding_type))

# Tokenize and clean the TNDM Q3 2018 transcript again (sentence structure is kept,
# which Word2Vec needs), then train word2vec embeddings on it.
transcript_path = '../earnings_call_transcripts/tndm/tndm_earnings_transcript_q3_2018.txt'
transcript_doc = tokenize_and_clean_transcript(transcript_path, new_stopwords)
word2vec_embeddings = generate_deep_word_embeddings(transcript_doc, 'word2vec')

In [None]:
# Build a plain {word: vector} dict from the trained model.
# gensim 4 replaced `wv.vocab` with `wv.key_to_index`; fall back to `vocab` on gensim 3.x
# so the cell runs on either version. Only the keys (words) are used below.
word_vectors = word2vec_embeddings.wv
if hasattr(word_vectors, 'key_to_index'):
    embeddings_vocab = word_vectors.key_to_index
else:
    embeddings_vocab = word_vectors.vocab
transcript_word2vec_embeddings = {word: word_vectors[word] for word in embeddings_vocab}
print(transcript_word2vec_embeddings)

In [65]:
# Visualize these embeddings on 2D plane using PCA and tSNE
sorted_words = sorted(transcript_word2vec_embeddings.keys())
sorted_embeddings = []
for word in sorted_words:
    sorted_embeddings.append(transcript_word2vec_embeddings[word])
sorted_embeddings = np.array(sorted_embeddings)

# Reduce using PCA first b/c tSNE is quadratic runtime and space (inefficient for datasets with many features)
from sklearn.decomposition import PCA
pca = PCA(n_components=50) # Reduce down to 50 dim
compressed_embeddings = pca.fit_transform(sorted_embeddings)

from sklearn.manifold import TSNE
compressed_embeddings = TSNE(n_components=2).fit_transform(compressed_embeddings) # Further reduce to 2 dim using t-SNE

In [64]:
# Use Plotly to generate interactive chart that plots all (dimension-reduced) words in 2D vector space

# NOTE(review): `py`, `download_plotlyjs` and `iplot` are not used in this cell, and
# plotly 4+ moved `plotly.plotly` into the separate `chart_studio` package - confirm
# nothing else relies on these names before dropping the imports.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

# One marker per word; hovering shows the word itself
data = [
    go.Scatter(
        x=[point[0] for point in compressed_embeddings],
        y=[point[1] for point in compressed_embeddings],
        mode='markers',
        text=list(sorted_words),
        marker=dict(
            size=16,
            color=[len(word) for word in sorted_words],  # Colored by word length
            opacity=0.8,
            colorscale='Viridis',
            showscale=False,
        ),
    )
]

# Hide the zero lines on both axes
layout = dict(
    yaxis=dict(zeroline=False),
    xaxis=dict(zeroline=False),
)
fig = go.Figure(data=data, layout=layout)
# Writes a standalone HTML file via plotly.offline.plot and keeps its return value
file = plot(fig, filename='TNDM_Q3_2018_Transcript_Word_Encodings.html')

'''
import matplotlib.pyplot as plt

print(y)
plt.figure(figsize=(50,50))
plt.scatter(x=y[:,0], y=y[:,1], label=sorted_words)
plt.show()
'''

'\nimport matplotlib.pyplot as plt\n\nprint(y)\nplt.figure(figsize=(50,50))\nplt.scatter(x=y[:,0], y=y[:,1], label=sorted_words)\nplt.show()\n'

In [None]:
# TODO:
# 1. Find out how word2vec embeddings are generated (via context or definition or sentence placement or ...?)
# 2. Generate document embeddings next