## Bring In Data and Libraries

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pprint
import nltk
# nltk.word_tokenize (used later in this notebook) needs the Punkt tokenizer data.
# Newer NLTK releases look the data up under the name 'punkt_tab' instead of
# 'punkt', so request both; a resource that is already present is not re-downloaded.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [0]:
data_directory = "/content/gdrive/My Drive/DATS6450 - Labadie - Data/"

# import the call dataset
import pandas as pd
import numpy as np
import pickle
import os
from IPython.display import clear_output

from google.colab import drive
drive.mount('/content/gdrive')


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling fails. A missing file raises ``FileNotFoundError`` here, at the
    load step, instead of being skipped and surfacing later as a ``NameError``
    (or silently reusing a stale variable from an earlier kernel run).

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only point this at pickles that you produced yourself or otherwise trust.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# One pickle per dataset, all stored under ``data_directory``.
df_metadata = _load_pickle(data_directory+"Pickles/metadata.pkl")
df_text_prepremarks = _load_pickle(data_directory+"Pickles/text_paragraph.pkl")
df_finstmts = _load_pickle(data_directory+"Pickles/finstmts.pkl")

# The processed-calls pickle is indexed by key; only its "text" entry is used here.
calls = _load_pickle(data_directory+"ProcessedCallsPickleUpdated_v2")
df_text_all = calls["text"]

# Hide the Drive-mount chatter before printing the summary.
clear_output()

print("Metadata Shape:",df_metadata.shape, 
      "All Paragraphs Shape:",df_text_all.shape,
      "Prepared Remarks Shape:",df_text_prepremarks.shape,
      "Fin Statements Shape:",df_finstmts.shape)

<br>
<br>
<hr>

## Find Most Common Words at Whole Call Level

In [0]:
docs = df_metadata["Prepared Remarks Text"]

# (label, min_df) pairs to report, in print order. ``None`` means "leave
# min_df at CountVectorizer's default", i.e. no document-frequency filter.
call_min_df_levels = [
    ("No Filter", None),
    ("2.5% Filter", 0.025),
    ("5% Filter", 0.05),
    ("10% Filter", 0.10),
    ("25% Filter", 0.25),
    ("50% Filter", 0.5),
    ("75% Filter", 0.75),
    ("90% Filter", 0.9),
    ("99.5% Filter", 0.995),
]

print("Unique Words")
for level_label, level_min_df in call_min_df_levels:
    filter_kwargs = {} if level_min_df is None else {"min_df": level_min_df}
    counter = CountVectorizer(binary=False, lowercase=True, **filter_kwargs)
    # Number of columns of the document-term matrix == vocabulary size.
    print(level_label, counter.fit_transform(docs).shape[1])

Unique Words
No Filter 65354
2.5% Filter 4270
5% Filter 2826
10% Filter 1851
25% Filter 885
50% Filter 389
75% Filter 161
90% Filter 72
99.5% Filter 18


In [0]:
# Binary (present/absent) document-term matrix over ``docs`` as set by the
# preceding cell, restricted to terms found in at least 50% of the documents.
vectorizer = CountVectorizer(binary=True, lowercase=True, min_df=0.5)
vectors = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
# list(...) keeps ``feature_names`` a plain list of str either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

results = pd.DataFrame(data=vectors.toarray(), index=docs.index, columns=feature_names)

# Column sums of a 0/1 matrix: for each term, the number of documents containing it.
call_stopwords = results.sum(axis=0)

21             0.501569
initiatives    0.501765
want           0.501962
currently      0.502158
experience     0.502746
items          0.503138
risk           0.503727
four           0.505885
increases      0.506277
large          0.507454
use            0.508239
expansion      0.509023
leading        0.510200
mix            0.511181
delivered      0.511573
even           0.512554
potential      0.516281
base           0.517066
still          0.517262
ago            0.518242
dtype: float64

In [0]:
# Keep only the terms themselves (the Series index) and drop the per-term counts.
# Note: this rebinds ``call_stopwords`` from a Series to a list, so the cell
# cannot be re-run without first re-running the cell that builds the Series.
call_stopwords = call_stopwords.index.tolist()
print(call_stopwords)

['000', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '2017', '2018', '2019', '21', '25', '30', '40', '50', 'ability', 'about', 'acquisition', 'across', 'actual', 'addition', 'additional', 'adjusted', 'after', 'again', 'ago', 'ahead', 'all', 'also', 'an', 'and', 'announced', 'annual', 'another', 'any', 'approximately', 'are', 'around', 'as', 'at', 'available', 'average', 'back', 'balance', 'base', 'based', 'basis', 'be', 'been', 'before', 'begin', 'being', 'believe', 'benefit', 'best', 'better', 'between', 'billion', 'both', 'business', 'but', 'by', 'call', 'can', 'capital', 'cash', 'cause', 'certain', 'change', 'chief', 'com', 'company', 'compared', 'conference', 'consistent', 'continue', 'continued', 'continues', 'continuing', 'core', 'cost', 'costs', 'could', 'current', 'currently', 'customer', 'customers', 'date', 'debt', 'deliver', 'delivered', 'demand', 'development', 'differ', 'discuss', 'discussed', 'do', 'down', 'drive', 'driven', 'due', 'during', 'e

<br>
<br>
<hr>

## Find Most Common Words at Paragraph Level

In [0]:
docs = df_text_prepremarks["Text"]

# (label, min_df) pairs to report, in print order. ``None`` means "leave
# min_df at CountVectorizer's default", i.e. no document-frequency filter.
paragraph_min_df_levels = [
    ("No Filter", None),
    ("0.5% Filter", 0.005),
    ("1.0% Filter", 0.01),
    ("2.5% Filter", 0.025),
    ("5% Filter", 0.05),
    ("10% Filter", 0.10),
    ("25% Filter", 0.25),
    ("50% Filter", 0.5),
    ("75% Filter", 0.75),
    ("90% Filter", 0.9),
]

print("Unique Words")
for level_label, level_min_df in paragraph_min_df_levels:
    filter_kwargs = {} if level_min_df is None else {"min_df": level_min_df}
    counter = CountVectorizer(binary=False, lowercase=True, **filter_kwargs)
    # Number of columns of the document-term matrix == vocabulary size.
    print(level_label, counter.fit_transform(docs).shape[1])

Unique Words
No Filter 65354
0.5% Filter 1438
1.0% Filter 889
2.5% Filter 381
5% Filter 164
10% Filter 68
25% Filter 21
50% Filter 8
75% Filter 5
90% Filter 1


In [0]:
# Binary (present/absent) document-term matrix over ``docs`` as set by the
# preceding cell, restricted to terms found in at least 2.5% of the documents.
vectorizer = CountVectorizer(binary=True, lowercase=True, min_df=0.025)
vectors = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
# list(...) keeps ``feature_names`` a plain list of str either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

results = pd.DataFrame(data=vectors.toarray(), index=docs.index, columns=feature_names)

# Column sums of a 0/1 matrix: for each term, the number of documents containing it.
paragraph_doc_counts = results.sum(axis=0)
#(paragraph_doc_counts.sort_values(ascending=True)/max(paragraph_doc_counts)).iloc[0:50]

# Only the terms are needed downstream, as a plain list of str.
paragraph_stopwords = list(paragraph_doc_counts.index)
#print(np.array(paragraph_stopwords))

In [0]:
def _count_tokens(row):
    """Return the number of NLTK word tokens in the row's prepared-remarks text."""
    return len(nltk.word_tokenize(row["Prepared Remarks Text"]))


# Per-call token counts, then the grand total across all calls.
test = df_metadata.apply(_count_tokens, axis=1)
test.sum()

17976040

In [0]:
docs = df_metadata["Prepared Remarks Text"]

# vectorizer = CountVectorizer(binary=False, lowercase=True)
# vectors = vectorizer.fit_transform(docs)
# feature_names = vectorizer.get_feature_names()
# results = pd.DataFrame(data=vectors.toarray(), index=docs.index, columns=vectorizer.get_feature_names())
# print("Sum of Words:", results.values.sum())

# Whole-call document-term matrix with the paragraph-level stop words removed.
# NOTE(review): binary=True records presence/absence, so the total printed below
# counts distinct (call, term) pairs rather than word occurrences - confirm this
# is the intended comparison against the raw token count.
vectorizer = CountVectorizer(binary=True, lowercase=True, stop_words=paragraph_stopwords)
vectors = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
# list(...) keeps ``feature_names`` a plain list of str either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

results = pd.DataFrame(data=vectors.toarray(), index=docs.index, columns=feature_names)
print("Sum of Words Removing Stopwords:", results.values.sum())

Sum of Words Removing Stopwords: 2833620


In [0]:
# num_words = 15
# print(results.head().apply(lambda s: s.nlargest(num_words).index.tolist(), axis=1))
# print(results.head().apply(lambda s: s.nlargest(num_words).tolist(), axis=1))

<br>
<br>
<hr>

## Compare Stopwords

In [0]:
# Terms that cleared the call-level threshold (min_df=0.5) but not the
# paragraph-level one (min_df=0.025). The message states the thresholds that
# were actually used: min_df means "at least", and 0.025 is 2.5%, not 2%.
print("These occurred in at least 50% of the calls, but in fewer than 2.5% of the paragraphs.")
print(set(call_stopwords)-set(paragraph_stopwords))

These occurred in 50% of the calls, but not in 2% of the paragraphs.
{'recently', 'officer', 'executive', 'ability', 'com', 'materially', 'risks', 'remarks', 'uncertainties', 'begin', 'filings', 'increasing', 'chief', 'ended', 'quarterly', 'leading', 'currently', 'four', 'actual', 'sec', 'operator', 'measures', 'however', '21', 'change', 'risk', 'increases', 'use', 'effective', 'welcome', 'differ', 'note', 'making', 'discussed', 'cause', 'president', 'continuing'}


In [0]:
docs = df_text_prepremarks["Text"]

# Terms that are call-level stop words but not paragraph-level stop words.
# Sorting makes the column order of ``results`` the same on every run; iterating
# a set of strings directly gives an order that can change between kernels.
compliance_vocabulary = sorted(set(call_stopwords)-set(paragraph_stopwords))

# Binary presence/absence of each of those terms in every paragraph.
vectorizer = CountVectorizer(binary=True, lowercase=True, vocabulary=compliance_vocabulary)
vectors = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
# list(...) keeps ``feature_names`` a plain list of str either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

results = pd.DataFrame(data=vectors.toarray(), index=docs.index, columns=feature_names)

<br>
<br>
<hr>

## Identify Compliance



In [0]:
# Rows per (Link, Speaker) pair. count() tallies non-null "ticker" values, so this
# equals the paragraph count only if "ticker" is never missing - TODO confirm.
sum_paragraphs_by_speaker = df_text_prepremarks[["Link","Speaker","ticker"]].groupby(by=["Link","Speaker"]).count()

# Each speaker's share (in percent) of the rows for their Link.
# transform("sum") broadcasts the per-Link total back onto every row and keeps the
# original (Link, Speaker) index. This replaces groupby.apply with
# float(x.sum()): calling float() on a one-element Series is deprecated in recent
# pandas, and apply() prepends the group key to the index since pandas 2.0.
paragraphs_per_link = sum_paragraphs_by_speaker.groupby(level=0).transform("sum")
pct_call_by_speaker = 100 * sum_paragraphs_by_speaker / paragraphs_per_link

In [0]:
# kmeans
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit(results.values)

df=results.copy()
df["Prediction"] = km.labels_
low_occurence_class = df["Prediction"].value_counts().sort_values().index[0]
compliance_paragraphs = df.loc[df["Prediction"]==low_occurence_class,].index
print(df["Prediction"].value_counts().sort_values())

1      4337
0    204163
Name: Prediction, dtype: int64


In [0]:
# Show the first five entries of ``compliance_paragraphs``.
compliance_paragraphs[:5]

Index(['CFG Q3 2017 3', 'ITW Q3 2017 2', 'MMM Q3 2017 6', 'ESRX Q3 2017 2',
       'NDAQ Q3 2017 1'],
      dtype='object', name='Paragraph Identifier')

In [0]:
# Alternative inspections, kept for reference:
#df_text_prepremarks.loc[compliance_paragraphs][["Link","Call Paragraph Num"]]
#df_text_prepremarks.loc[compliance_paragraphs][["Call Paragraph Num"]].max()

# Independent copy of the rows selected by ``compliance_paragraphs``, so later
# edits to ``test`` cannot write through to ``df_text_prepremarks``.
selected_rows = df_text_prepremarks.loc[compliance_paragraphs]
test = selected_rows.copy()
test.head()

Unnamed: 0_level_0,Link,Speaker,Call Section,Text,Cleaned Text,Call Paragraph Num,company,ticker,period,period_end_date,call date
Paragraph Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CFG Q3 2017 3,https://www.fool.com/earnings/call-transcripts...,Ellen Taylor,Prepared Remarks,We're really pleased to have with us today Bra...,were really pleased u today brad conner head c...,3,Citizens Financial Group,CFG,Q3 2017,2017-09-30,2017-10-20
ITW Q3 2017 2,https://www.fool.com/earnings/call-transcripts...,Michael Larsen,Prepared Remarks,"During today's call, we will discuss our third...",today call discus third quarter financial resu...,2,Illinois Tool Works,ITW,Q3 2017,2017-09-30,2017-10-23
MMM Q3 2017 6,https://www.fool.com/earnings/call-transcripts...,Bruce Jermeland,Prepared Remarks,Item 1A of our most recent form 10-K lists som...,item 1a recent form 10 k list important risk f...,6,3M,MMM,Q3 2017,2017-09-30,2017-10-24
ESRX Q3 2017 2,https://www.fool.com/earnings/call-transcripts...,Ben Bier,Prepared Remarks,These matters involve certain risk and uncerta...,matter involve certain risk uncertainty compan...,2,Express Scripts,ESRX,Q3 2017,2017-09-30,2017-10-25
NDAQ Q3 2017 1,https://www.fool.com/earnings/call-transcripts...,Ed Ditmire,Prepared Remarks,"Good morning everyone, and thank you for joini...",good morning everyone thank joining u today di...,1,NASDAQ Inc.,NDAQ,Q3 2017,2017-09-30,2017-10-25
