## Read Call Pickle and Build Meta-data and Text DataFrames

In [0]:
from IPython.display import clear_output

# import the call dataset
import pandas as pd
import numpy as np
import pickle
import os

from google.colab import drive
drive.mount('/content/gdrive')

# Location of the pickled call data on the mounted Google Drive.
calls_pickle_path = "/content/gdrive/My Drive/School/DATS 6450 - NLP/Project/ProcessedCallsPickleUpdated_v2"

# Open unconditionally: a missing file now raises FileNotFoundError right here
# instead of leaving `calls` undefined and failing below with a NameError.
# The `with` block also guarantees the handle is closed if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(calls_pickle_path, "rb") as pfile:
    calls = pickle.load(pfile)

# The pickle holds a mapping with a "metadata" entry and a "text" entry.
df_metadata = calls["metadata"]
df_text = calls["text"]

print("Metadata Shape:",df_metadata.shape, "Text Shape:",df_text.shape)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Metadata Shape: (5101, 6) Text Shape: (640132, 11)


#### Mean Num Prepared Remarks Paragraphs per Call

In [0]:
# Count the df_text rows that belong to the "Prepared Remarks" section and
# were not spoken by the "Operator", then divide by the number of rows in
# df_metadata to get the mean per call.
in_prepared_section = df_text["Call Section"]=="Prepared Remarks"
spoken_by_non_operator = df_text["Speaker"]!="Operator"
prepared_remarks_count = len(df_text[in_prepared_section & spoken_by_non_operator])
prepared_remarks_count/len(df_metadata)

40.87433836502647

## Tools to Build Corpus

In [0]:
# Install pyLDAvis through the shell, then clear the installer's output from the cell.
! pip install pyLDAvis
clear_output()

In [0]:
import numpy as np
import nltk
# Download the 'wordnet' resource through nltk's downloader.
nltk.download('wordnet')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
# Root logger configured at ERROR level with a "time : level : message" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
# Silence DeprecationWarning messages.
warnings.filterwarnings("ignore",category=DeprecationWarning)

import time

# Clear whatever this cell printed (e.g. download messages) from the output area.
clear_output()

## Build Corpus

#### Get a List of Tickers to Analyze

In [0]:
# Tickers of the calls to analyze. Each ticker is listed once; the original
# list repeated "BH".
# NOTE(review): if the second "BH" was meant to be a different ticker, add it here.
ticker_list = ["DNKN","SBUX","MCD","QSR","YUM","BLMN","EAT","CMG","CBRL","DRI","DPZ","JACK","PZZA","TXRH","CAKE",
              "WEN","BH","BJRI","BOJA","TAST","CHUY","PLAY","DFRG","TACO","DENN","DIN","LOCO","FRGI","PBPB",
              "RRGB","RUTH","SHAK","SONC","HABT","WING","ZOES","ARKR","BURG","SAUC","DAVE","FAT","BDL","GTIM",
              "KONA","JAX","JMBA","LUB","NATH","NDLS","FRSH","RAVE","STKS"]

In [0]:
# Paragraph-level outputs (corpus, call_names, paragraph_nums) are parallel
# lists with one entry per paragraph; the *_whole_calls lists have one entry
# per call.
corpus, call_names, paragraph_nums, corpus_whole_calls, call_names_whole_calls = [], [], [], [], []

# Restrict the analysis to calls whose ticker is in ticker_list, ordered by ticker.
calls_to_use = df_metadata[(df_metadata["ticker"].isin(ticker_list))].sort_values(by=["ticker"])

# The section/speaker filter does not depend on the call, so apply it once
# here instead of re-scanning all of df_text on every loop iteration.
all_prepared_remarks = df_text[(df_text["Call Section"]=="Prepared Remarks")&(df_text["Speaker"]!="Operator")]

for _, row in calls_to_use.iterrows():
    # Non-operator prepared-remarks rows belonging to this call.
    prepared_remarks = all_prepared_remarks[all_prepared_remarks["Link"]==row["Link"]]
    call_paragraphs = list(prepared_remarks["Cleaned Text"])

    # Paragraph-level corpus: text, row label and paragraph number stay aligned.
    corpus += call_paragraphs
    call_names += list(prepared_remarks.index)
    paragraph_nums += list(prepared_remarks["Call Paragraph Num"])

    # Call-level corpus: all of the call's paragraphs joined into one document.
    corpus_whole_calls.append(" ".join(call_paragraphs))
    call_names_whole_calls.append(row["ticker"] + " " + row["period"])

## Vectors

#### Find Commonly Used Phrases at Whole Call Level
I'm looking for phrases said on nearly every call (two-word phrases that appear in at least 80% of the calls). In particular, I want phrases like "forward looking statements" that might signal that a paragraph is "mandatory".

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Bigrams only, kept when they occur in at least 80% of the whole-call
# documents; binary=True records presence rather than counts.
vectorizer = CountVectorizer(ngram_range=(2,2), min_df=0.8, binary=True)
X = vectorizer.fit_transform(corpus_whole_calls)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old method on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# One row per call, one 0/1 column per retained bigram.
df = pd.DataFrame(data=X.toarray(), index=call_names_whole_calls, columns=feature_names)

keywords_by_doc = set(df.columns)

print(keywords_by_doc)

{'long term', 'basis point', 'forward looking', 'looking statement', 'turn call', 'tax rate', 'last year', 'thank you', 'full year', 'per share'}


#### Find Phrases at Paragraph Level
I'm looking to find phrases that typically occur at least once, but no more than a few times during a call (like "forward looking statements").

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Unigrams to trigrams, kept when they occur in between 2% and 7.5% of the
# paragraphs; binary=True records presence rather than counts.
vectorizer = CountVectorizer(ngram_range=(1,3), min_df=0.02, max_df=0.075, binary=True)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old method on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# One row per paragraph, one 0/1 column per retained n-gram.
df = pd.DataFrame(data=X.toarray(), index=call_names, columns=feature_names)

# Capture the keyword set BEFORE adding the metadata column, so that
# "Paragraph Location" is not mistaken for a keyword.
keywords_by_paragraph = set(df.columns)

df["Paragraph Location"] = paragraph_nums

print(df.shape, df.columns)

(2940, 455) Index(['10', '100', '11', '12', '13', '14', '15', '16', '17', '18',
       ...
       'work', 'working', 'world', 'would', 'year ago', 'year over',
       'year over year', 'year period', 'you', 'Paragraph Location'],
      dtype='object', length=455)


In [0]:
# Phrases retained by both the whole-call and the paragraph-level vectorizers.
keywords_by_doc & keywords_by_paragraph

{'forward looking',
 'full year',
 'long term',
 'looking statement',
 'per share',
 'tax rate',
 'thank you',
 'turn call'}

## KMeans Clustering to Classify Compliance Paragraphs

https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py

In [0]:
from sklearn.cluster import KMeans

# Fixed seed so the cluster labels and sizes are reproducible across runs.
km = KMeans(n_clusters=3, random_state=0)

# Cluster only on the phrases found at both the whole-call and the paragraph
# level. Sorted because set iteration order is not stable between sessions.
cols = sorted(keywords_by_doc.intersection(keywords_by_paragraph))
km.fit(df.loc[:,cols].values)

df["Prediction"] = km.labels_

# The cluster with the fewest paragraphs is taken as the compliance class.
low_occurence_class = df["Prediction"].value_counts().sort_values().index[0]
compliance_paragraphs = df[df["Prediction"]==low_occurence_class].index

print(df["Prediction"].value_counts().sort_values())

2      90
1     186
0    2664
Name: Prediction, dtype: int64


In [0]:
# Display the last 100 entries (or fewer, if there are not that many) of compliance_paragraphs.
compliance_paragraphs[-100:]

Index(['BJRI Q4 2018 2', 'BJRI Q4 2018 3', 'BJRI Q4 2018 26', 'BJRI Q3 2018 4',
       'BJRI Q3 2018 5', 'BJRI Q3 2018 6', 'BJRI Q3 2018 31', 'BLMN Q4 2018 3',
       'BLMN Q3 2018 2', 'CAKE Q3 2018 3', 'CAKE Q4 2018 3', 'CAKE Q4 2018 4',
       'CAKE Q4 2017 2', 'CBRL Q2 2019 5', 'CBRL Q1 2019 3', 'CBRL Q1 2019 4',
       'CBRL Q4 2018 4', 'CHUY Q3 2018 3', 'CHUY Q1 2018 2', 'CHUY Q4 2018 2',
       'CMG Q1 2018 1', 'CMG Q1 2018 2', 'CMG Q4 2018 3', 'CMG Q4 2018 4',
       'CMG Q3 2018 3', 'CMG Q3 2018 4', 'CMG Q2 2018 2', 'DENN Q4 2018 3',
       'DFRG Q4 2018 3', 'DIN Q4 2017 3', 'DIN Q4 2017 4', 'DNKN Q4 2018 1',
       'DNKN Q3 2018 3', 'DPZ Q4 2018 3', 'DPZ Q3 2018 5', 'DRI Q2 2019 1',
       'DRI Q1 2019 1', 'DRI Q4 2018 1', 'EAT Q2 2019 4', 'EAT Q1 2019 5',
       'FRGI Q3 2018 4', 'FRGI Q4 2018 2', 'FRGI Q4 2018 3', 'JACK Q1 2019 5',
       'JACK Q4 2018 3', 'LOCO Q4 2018 3', 'MCD Q2 2018 3', 'MCD Q4 2018 2',
       'MCD Q3 2018 3', 'MCD Q4 2017 2', 'NDLS Q2 2018 3', 'NDLS Q2 

## Call Links

In [0]:
# Print the index values of calls_to_use (transcript URLs, per the output below).
print(calls_to_use.index.values)

['https://www.fool.com/earnings/call-transcripts/2019/02/22/bjs-restaurants-inc-bjri-q4-2018-earnings-conferen.aspx'
 'https://www.fool.com/earnings/call-transcripts/2018/10/30/bjs-restaurants-inc-bjri-q3-2018-earnings-conferen.aspx'
 'https://www.fool.com/earnings/call-transcripts/2019/02/14/bloomin-brands-inc-blmn-q4-2018-earnings-conferenc.aspx'
 'https://www.fool.com/earnings/call-transcripts/2018/10/30/bloomin-brands-blmn-q3-2018-earnings-conference-ca.aspx'
 'https://www.fool.com/earnings/call-transcripts/2018/10/30/cheesecake-factory-inc-cake-q3-2018-earnings-confe.aspx'
 'https://www.fool.com/earnings/call-transcripts/2019/02/20/cheesecake-factory-inc-cake-q4-2018-earnings-confe.aspx'
 'https://www.fool.com/earnings/call-transcripts/2018/02/21/the-cheesecake-factory-incorporated-cake-q4-2017-e.aspx'
 'https://www.fool.com/earnings/call-transcripts/2019/02/26/cracker-barrel-old-country-store-inc-cbrl-q2-2019.aspx'
 'https://www.fool.com/earnings/call-transcripts/2018/11/27/crack