<a href="https://colab.research.google.com/github/mikelabadie/Earnings_Call_Transcripts/blob/master/analysis_compliance_paragraphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Read Call Pickle and Build Meta-data and Text DataFrames

In [0]:
from IPython.display import clear_output

# import the call dataset
import pandas as pd
import pickle
import os

from google.colab import drive
drive.mount('/content/gdrive')

# Location of the processed-calls pickle on the mounted Google Drive.
CALLS_PICKLE_PATH = "/content/gdrive/My Drive/School/DATS 6450 - NLP/Project/ProcessedCallsPickleUpdated"

# Fail fast with an explicit message: previously a missing file was skipped
# silently and the cell died later with a NameError on `calls`.
if not os.path.exists(CALLS_PICKLE_PATH):
    raise FileNotFoundError(
        "Processed calls pickle not found at " + repr(CALLS_PICKLE_PATH)
        + "; check that Google Drive is mounted and the path is correct."
    )

# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
# The context manager closes the handle even if unpickling raises.
with open(CALLS_PICKLE_PATH, "rb") as pfile:
    calls = pickle.load(pfile)

# `calls` maps a transcript link to a record holding "metadata" and "text" entries.
metadata = {link: data["metadata"] for link, data in calls.items()}
df_metadata = pd.DataFrame.from_dict(metadata, orient="index")

# Per-call participant table restricted to the Speaker and Title columns,
# for calls whose metadata has a "call participants" entry.
# (presumably a DataFrame -- it is indexed with a list of column names)
speakers = {}
for key, value in metadata.items():
    participants = value.get("call participants")
    if participants is not None:
        speakers[key] = participants[["Speaker", "Title"]]

# Per-call text records, keyed by the same transcript link.
text = {link: data["text"] for link, data in calls.items()}
df_text = pd.DataFrame.from_dict(text, orient="index")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Mean Number of Prepared-Remarks Paragraphs per Call

In [0]:
# numpy is imported here because this cell runs before the notebook's main
# import cell; without it a fresh "Restart & Run All" fails with NameError on `np`.
import numpy as np

# Only calls that have a ticker, period and call date are counted.
complete_calls = df_metadata.dropna(subset=["ticker", "period", "call date"])

# Number of prepared-remarks paragraphs (operator lines excluded) for each call.
num_prep_para = []
for link in complete_calls.index:
    prepared_remarks = text.get(link)
    is_prepared = prepared_remarks["Call Section"] == "Prepared Remarks"
    not_operator = prepared_remarks["Speaker"] != "Operator"
    prepared_remarks = prepared_remarks[is_prepared & not_operator]
    num_prep_para.append(prepared_remarks.shape[0])

print(np.mean(num_prep_para))

40.869980506822614


## Tools to Build Corpus

In [0]:
# Install pyLDAvis into the running environment via a shell command.
# NOTE(review): the version is unpinned. A later cell runs `import pyLDAvis.gensim`;
# newer pyLDAvis releases reportedly renamed that module to `pyLDAvis.gensim_models`
# -- confirm and pin a compatible version if the import fails.
! pip install pyLDAvis
# Wipe the installer log from the cell output.
clear_output()

In [0]:
# Numerical and NLP libraries.
import numpy as np
import nltk
# Fetch the WordNet corpus through nltk's downloader.
nltk.download('wordnet')

# gensim: dictionary/corpus helpers, tokenisation and coherence scoring.
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # imported explicitly as a submodule; presumably not loaded by `import pyLDAvis` alone -- keep this line
import matplotlib.pyplot as plt
%matplotlib inline

# Configure root logging (optional, intended for gensim): timestamped format,
# and only messages at ERROR level or above are emitted.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Silence DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import time

# Wipe download/import chatter from the cell output.
clear_output()

## Build Corpus

#### Get a List of Tickers to Analyze

In [0]:
# Tickers of interest for a narrower analysis.
# "BH" appears twice in the original list; the duplicate is kept so the list is unchanged.
ticker_list = [
    "DNKN", "SBUX", "MCD", "QSR", "YUM", "BLMN", "EAT", "CMG",
    "CBRL", "DRI", "DPZ", "JACK", "PZZA", "TXRH", "CAKE", "WEN",
    "BH", "BH", "BJRI", "BOJA", "TAST", "CHUY", "PLAY", "DFRG",
    "TACO", "DENN", "DIN", "LOCO", "FRGI", "PBPB", "RRGB", "RUTH",
    "SHAK", "SONC", "HABT", "WING", "ZOES", "ARKR", "BURG", "SAUC",
    "DAVE", "FAT", "BDL", "GTIM", "KONA", "JAX", "JMBA", "LUB",
    "NATH", "NDLS", "FRSH", "RAVE", "STKS",
]

In [0]:
# Paragraph-level accumulators: one entry per prepared-remarks paragraph.
corpus = []
call_keys = []
call_names = []
paragraph_nums = []
speakers = []
# Call-level accumulators: one entry per call.
corpus_whole = []
call_names_whole = []

# First 5000 calls that have a ticker, period and call date.
calls_to_use = df_metadata.dropna(subset=["ticker","period","call date"])[0:5000]
# Alternative selection, restricted to the tickers in ticker_list:
#calls_to_use = df_metadata[(df_metadata["ticker"].isin(ticker_list))].dropna(subset=["ticker","period","call date"]).sort_values(by=["ticker"])

# Walk every selected call and collect its prepared remarks.
for _, row in calls_to_use.iterrows():
    call_text = text.get(row.name)
    in_prepared = call_text["Call Section"]=="Prepared Remarks"
    not_operator = call_text["Speaker"]!="Operator"
    prepared_remarks = call_text[in_prepared & not_operator]

    cleaned_paragraphs = list(prepared_remarks["Cleaned Text"])
    paragraph_ids = prepared_remarks.index.values
    n_paragraphs = prepared_remarks.shape[0]
    call_label = row["ticker"] + " " + row["period"]

    # Paragraph-level records.
    corpus.extend(cleaned_paragraphs)
    call_keys.extend((row.name, x) for x in paragraph_ids)
    call_names.extend(call_label + " " + str(x) for x in paragraph_ids)
    # Paragraph index label divided by the number of kept paragraphs in the call.
    paragraph_nums.extend(x/n_paragraphs for x in paragraph_ids)
    speakers.extend(prepared_remarks["Speaker"])

    # Call-level records: all kept paragraphs joined into one document.
    corpus_whole.append(" ".join(cleaned_paragraphs))
    call_names_whole.append(call_label)

## Vectors

#### Find Commonly Used Phrases at the Whole-Call Level
I'm looking for phrases said on nearly every call (here, bigrams that appear in at least 80% of calls). In particular, phrases like "forward looking statements" can help identify whether a paragraph is "mandatory".

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Binary bigram indicators per call; min_df=0.8 keeps only bigrams that
# appear in at least 80% of the call-level documents.
vectorizer = CountVectorizer(ngram_range=(2,2), min_df=0.8, binary=True)
X = vectorizer.fit_transform(corpus_whole)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# One row per call, one 0/1 column per retained bigram.
df = pd.DataFrame(data=X.toarray(), index=call_names_whole, columns=feature_names)

# Bigrams common across calls.
keywords_by_doc = set(df.columns)

print(keywords_by_doc)

{'turn call', 'thank you', 'forward looking', 'looking statement'}


#### Find Phrases at Paragraph Level
I'm looking for phrases that typically occur at least once, but no more than a few times, during a call (like "forward looking statements"). The vectorizer below keeps 1- to 3-word phrases that appear in between 2% and 7.5% of paragraphs.

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Binary 1- to 3-gram indicators per paragraph; keep phrases found in at
# least 2% and at most 7.5% of paragraphs.
vectorizer = CountVectorizer(ngram_range=(1,3), min_df=0.02, max_df=0.075, binary=True)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# One row per paragraph, one 0/1 column per retained phrase.
df = pd.DataFrame(data=X.toarray(), index=call_names, columns=feature_names)

# Take the keyword set from the vocabulary itself, before the location column
# is appended, so "Paragraph Location" is not mistaken for a phrase.
keywords_by_paragraph = set(feature_names)

# Extra (non-phrase) column holding each paragraph's location value.
df["Paragraph Location"] = paragraph_nums

print(df.shape, df.columns)

(204238, 391) Index(['10', '100', '11', '12', '13', '14', '15', '16', '17', '18',
       ...
       'working', 'world', 'would', 'would like', 'year 2018', 'year ago',
       'year over', 'year over year', 'you', 'Paragraph Location'],
      dtype='object', length=391)


In [0]:
# Phrases that are common at both the whole-call and the paragraph level.
keywords_by_doc & keywords_by_paragraph

{'forward looking', 'looking statement', 'thank you', 'turn call'}

## KMeans Clustering to Classify Compliance Paragraphs

Reference: [scikit-learn example: Clustering text documents using k-means](https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py)

In [0]:
from sklearn.cluster import KMeans
# A fixed seed makes cluster labels and sizes reproducible across runs.
# n_init is pinned to 10 (the long-standing default) so the result does not
# depend on which scikit-learn release is installed.
km = KMeans(n_clusters=3, n_init=10, random_state=42)

# Cluster only on the phrases common at both the call and paragraph level.
# Sorted so the feature column order is stable (set iteration order is not).
cols = sorted(keywords_by_doc.intersection(keywords_by_paragraph))
km.fit(df.loc[:,cols].values)

# Attach each paragraph's cluster label.
df["Prediction"] = km.labels_

# Heuristic: the smallest cluster is treated as the compliance paragraphs.
cluster_sizes = df["Prediction"].value_counts().sort_values()
low_occurence_class = cluster_sizes.index[0]
compliance_paragraphs = df.loc[df["Prediction"]==low_occurence_class,["Prediction","Paragraph Location"]].index

print(cluster_sizes)

2      5886
1     10145
0    188207
Name: Prediction, dtype: int64


In [0]:
# Show the last 100 row labels among the paragraphs flagged as compliance.
compliance_paragraphs[-100:]

Index(['SSW Q4 2018 2', 'SSW Q4 2018 3', 'MCRB Q4 2018 2', 'MCRB Q4 2018 3',
       'SINA Q4 2018 3', 'SCM Q4 2018 3', 'SCM Q4 2018 6', 'TPVG Q4 2018 3',
       'TPVG Q4 2018 4', 'WB Q4 2018 4', 'WB Q4 2018 5', 'WMC Q4 2018 5',
       'WMC Q4 2018 6', 'WEYS Q4 2018 3', 'YEXT Q4 2018 3', 'YEXT Q4 2018 4',
       'YEXT Q4 2018 5', 'ABM Q1 2019 3', 'ALBO Q4 2018 2', 'ALBO Q4 2018 3',
       'ALBO Q4 2018 4', 'AEO Q4 2018 2', 'AEO Q4 2018 3', 'AEO Q4 2018 7',
       'AOBC Q3 2019 3', 'ABUS Q4 2018 3', 'ARQL Q4 2018 3', 'ARQL Q4 2018 4',
       'BLDP Q4 2018 1', 'BKCC Q4 2018 4', 'BKCC Q4 2018 5', 'BURL Q4 2018 4',
       'WHD Q4 2018 2', 'CRCM Q4 2018 3', 'CRCM Q4 2018 5', 'CECE Q4 2018 2',
       'CECE Q4 2018 3', 'CLDX Q4 2018 2', 'CLDX Q4 2018 3', 'CBPO Q4 2018 4',
       'CBPO Q4 2018 5', 'CHUY Q4 2018 2', 'CWEN Q4 2018 4', 'CLPR Q4 2018 4',
       'CMTL Q2 2019 4', 'CTK Q4 2018 3', 'CTK Q4 2018 4', 'CTK Q4 2018 5',
       'COST Q2 2019 2', 'COST Q2 2019 3', 'BREW Q4 2018 4', 'CYRX Q4 

## Call Links

In [0]:
# Print the index values of the calls used in the analysis (the transcript links).
print(calls_to_use.index.values)

['https://www.fool.com/earnings/call-transcripts/2017/10/23/citizens-financial-group-q3-2017-earnings-conferen.aspx'
 'https://www.fool.com/earnings/call-transcripts/2017/10/24/illinois-tool-works-itw-q3-2017-earnings-conferenc.aspx'
 'https://www.fool.com/earnings/call-transcripts/2017/10/25/3m-mmm-q3-2017-earnings-conference-call-transcript.aspx'
 ...
 'https://www.fool.com/earnings/call-transcripts/2019/01/25/oceanfirst-financial-corp-ocfc-q4-2018-earnings-co.aspx'
 'https://www.fool.com/earnings/call-transcripts/2019/01/25/qcr-holdings-inc-qcrh-q4-2018-earnings-conference.aspx'
 'https://www.fool.com/earnings/call-transcripts/2019/01/25/resmed-rmd-q2-2019-earnings-conference-call-transc.aspx']
