<a href="https://colab.research.google.com/github/mikelabadie/Earnings_Call_Transcripts/blob/master/8_Keywords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bring In Data and Libraries

*   Segments calls by quarter
*   Builds a vocabulary of every word seen on any call



#### Bring in Data

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pprint
import nltk
nltk.download('punkt')

import re
import pandas as pd
import numpy as np
import pickle
import os
from IPython.display import clear_output
clear_output()

In [0]:
data_directory = "/content/gdrive/My Drive/DATS6450 - Labadie - Data/"

from google.colab import drive
drive.mount('/content/gdrive')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.

# Call-level metadata (one row per call). Skipped when the pickle is absent,
# in which case df_metadata is left undefined for the cells below.
metadata_path = data_directory+"Pickles/metadata.pkl"
if os.path.exists(metadata_path):
    # `with` closes the handle even if unpickling raises
    with open(metadata_path, "rb") as pfile:
        df_metadata = pickle.load(pfile)

# Paragraph-level text of the prepared remarks, same skip-if-missing behaviour.
paragraph_path = data_directory+"Pickles/text_paragraph.pkl"
if os.path.exists(paragraph_path):
    with open(paragraph_path, "rb") as pfile:
        df_text_prepremarks = pickle.load(pfile)

clear_output()

#### Segment Calls by Quarter

In [0]:
from datetime import datetime
from dateutil.parser import parse

# `today` is midnight of the day the notebook is run, so every window that
# ends at `today` (used by later cells) depends on the run date.
now = datetime.now()
today = datetime(now.year, now.month, now.day)

# Calendar-quarter boundaries used to bucket calls by their call date.
apr_2019 = datetime(2019,4,1)
jan_2019 = datetime(2019,1,1)
oct_2018 = datetime(2018,10,1)
jul_2018 = datetime(2018,7,1)
apr_2018 = datetime(2018,4,1)
jan_2018 = datetime(2018,1,1)
oct_2017 = datetime(2017,10,1)

# Vectorised comparisons on the date column replace the row-by-row apply;
# every window is half-open: start <= call date < end.
call_dates = df_metadata["call date"]
q4_2017_calls = df_metadata[(call_dates >= oct_2017) & (call_dates < jan_2018)]
q1_2018_calls = df_metadata[(call_dates >= jan_2018) & (call_dates < apr_2018)]
q2_2018_calls = df_metadata[(call_dates >= apr_2018) & (call_dates < jul_2018)]
q3_2018_calls = df_metadata[(call_dates >= jul_2018) & (call_dates < oct_2018)]
q4_2018_calls = df_metadata[(call_dates >= oct_2018) & (call_dates < jan_2019)]
# No upper bound: this bucket holds every call dated on or after 1 Jan 2019.
q1_2019_calls = df_metadata[call_dates >= jan_2019]

print("Q4 2017 Calls:",q4_2017_calls.shape)
print("Q1 2018 Calls:",q1_2018_calls.shape)
print("Q2 2018 Calls:",q2_2018_calls.shape)
print("Q3 2018 Calls:",q3_2018_calls.shape)
print("Q4 2018 Calls:",q4_2018_calls.shape)
print("Q1 2019 Calls:",q1_2019_calls.shape)

Q4 2017 Calls: (137, 16)
Q1 2018 Calls: (306, 16)
Q2 2018 Calls: (355, 16)
Q3 2018 Calls: (546, 16)
Q4 2018 Calls: (1404, 16)
Q1 2019 Calls: (2328, 16)


#### Get list of all unique words used across all calls

In [0]:
docs = df_metadata["Prepared Remarks Text"]
def get_vocab(docs):
    """Return the list of distinct lowercase tokens found in `docs`.

    Parameters
    ----------
    docs : iterable of str
        Documents to tokenize with CountVectorizer's default tokenizer.

    Returns
    -------
    list of str
        The fitted vocabulary, in the vectorizer's feature order.
    """
    vectorizer = CountVectorizer(binary=True, lowercase=True)
    # Only the fitted vocabulary is needed, so the count matrix is not built.
    vectorizer.fit(docs)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when present and fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()

full_vocab = get_vocab(docs)
len(full_vocab)

65354

<br>
## Build Generic Earnings Call Stop Word Sets at Call/Paragraph Levels
* Words that appear in at least 75% of calls
* Words that appear in at least 7.5% of paragraphs

In [0]:
def _terms_with_min_doc_freq(docs, min_df):
    """Return the terms that occur in at least the fraction `min_df` of `docs`.

    The terms come back in the vectorizer's feature order. Only the fitted
    vocabulary is read, so no dense document-by-term matrix is materialised.
    """
    vectorizer = CountVectorizer(binary=True, lowercase=True, min_df=min_df)
    vectorizer.fit(docs)
    # get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return list(vectorizer.get_feature_names())

# build high-level stop word list at the entire call level
call_stopwords = _terms_with_min_doc_freq(df_metadata["Prepared Remarks Text"], 0.75)
print(len(call_stopwords))
print(np.array(call_stopwords))
print()

# build high-level stop word list at the paragraph level
paragraph_stopwords = _terms_with_min_doc_freq(df_text_prepremarks["Text"], 0.075)
print(len(paragraph_stopwords))
print(np.array(paragraph_stopwords))

# combine stopwords from both levels
combined_stopwords = set(paragraph_stopwords).union(set(call_stopwords))

161
['10' '12' '20' '2017' '2018' '2019' 'about' 'across' 'actual' 'addition'
 'additional' 'all' 'also' 'an' 'and' 'any' 'approximately' 'are' 'as'
 'at' 'back' 'based' 'basis' 'be' 'been' 'before' 'believe' 'both'
 'business' 'but' 'by' 'call' 'can' 'capital' 'cash' 'company' 'compared'
 'continue' 'continued' 'cost' 'current' 'driven' 'due' 'during'
 'earnings' 'end' 'everyone' 'expect' 'expected' 'financial' 'first' 'for'
 'forward' 'fourth' 'from' 'full' 'future' 'gaap' 'good' 'growth' 'had'
 'has' 'have' 'high' 'higher' 'impact' 'in' 'including' 'income'
 'increase' 'increased' 'into' 'is' 'it' 'joining' 'last' 'like' 'line'
 'll' 'long' 'looking' 'lower' 'made' 'margin' 'market' 'may' 'me'
 'million' 'more' 'most' 'net' 'new' 'non' 'not' 'now' 'of' 'on' 'one'
 'operating' 'or' 'other' 'our' 'over' 'per' 'performance' 'primarily'
 'prior' 'provide' 'quarter' 'questions' 'range' 'rate' 're' 'related'
 'release' 'result' 'results' 'revenue' 'sales' 'second' 'see' 'share'
 'signific

<br>
## Word Frequency Functions
*   These functions do the core work: they compare word usage in a set of calls of interest against a reference set of calls.



#### Helpers
Many of these are not currently used, but were at one point useful during development.

In [0]:
def get_calls(from_date, to_date, category=None, category_filter=None):
    """Select rows of `df_metadata` whose call date lies in [from_date, to_date).

    Parameters
    ----------
    from_date, to_date
        Inclusive start and exclusive end of the window, compared against the
        "call date" column.
    category : str, optional
        Column name to filter on. No category filter is applied when None.
    category_filter
        Value the `category` column must equal; ignored when `category` is None.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df_metadata`.
    """
    # Column-wise comparison instead of a per-row lambda: same half-open
    # window, and it also returns an empty frame cleanly when nothing matches.
    call_dates = df_metadata["call date"]
    calls = df_metadata[(call_dates >= from_date) & (call_dates < to_date)]
    if category is not None:
        calls = calls[calls[category]==category_filter]
    return calls

def get_calls_for_shared_tickers(calls1, calls2):
    """Keep only the rows of each frame whose "ticker" appears in both frames.

    Returns the two filtered frames in the same order they were passed in.
    """
    common_tickers = list(set(calls1["ticker"]) & set(calls2["ticker"]))
    keep_first = calls1["ticker"].isin(common_tickers)
    keep_second = calls2["ticker"].isin(common_tickers)
    return calls1[keep_first], calls2[keep_second]

def get_calls_interest_reference(interest_date_from, interest_date_to, reference_date_from, referece_date_to, cat_type, cat_filter, shared=False):
    """Fetch the interest-period and reference-period calls for one category value.

    Both periods are filtered with the same `cat_type` / `cat_filter` through
    `get_calls`. When `shared` is true, both sets are further limited to the
    tickers present in each.

    Returns
    -------
    tuple
        (interest_calls, reference_calls)
    """
    interest_calls = get_calls(interest_date_from, interest_date_to, cat_type, cat_filter)
    reference_calls = get_calls(reference_date_from, referece_date_to, cat_type, cat_filter)
    if not shared:
        return interest_calls, reference_calls

    reference_calls, interest_calls = get_calls_for_shared_tickers(reference_calls, interest_calls)
    return interest_calls, reference_calls
  
def get_norm_count_vector(docs, index, vocab, stopwords):
    """Build a per-document relative term-frequency table.

    Parameters
    ----------
    docs : iterable of str
        Documents to count terms in.
    index
        Row labels for the returned frame, one per document.
    vocab
        Fixed vocabulary passed to CountVectorizer; it defines the columns.
    stopwords
        Stop words passed to CountVectorizer. A set or other iterable is
        converted to a list; None or a string is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Term counts divided by each row's total count. Rows whose total is
        zero are filled with 0 instead of NaN.
    """
    # Newer scikit-learn validates stop_words and rejects a set, so normalise to a list.
    if stopwords is not None and not isinstance(stopwords, str):
        stopwords = list(stopwords)
    vectorizer = CountVectorizer(binary=False, lowercase=True, vocabulary=vocab, stop_words=stopwords)
    vectors = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    results = pd.DataFrame(data=vectors.toarray(), index=index, columns=feature_names)
    # Dividing by a zero row total yields NaN; report those rows as all zeros.
    results = results.div(results.sum(axis=1), axis=0)
    return results.fillna(0)
  
def get_tfidf_vector(docs, index, vocab, stopwords):
    """Build a per-document TF-IDF table over a fixed vocabulary.

    Parameters
    ----------
    docs : iterable of str
        Documents to vectorize.
    index
        Row labels for the returned frame, one per document.
    vocab
        Fixed vocabulary passed to TfidfVectorizer; it defines the columns.
    stopwords
        Stop words passed to TfidfVectorizer. A set or other iterable is
        converted to a list; None or a string is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        TF-IDF weights, one row per document and one column per vocabulary term.
    """
    # Newer scikit-learn validates stop_words and rejects a set, so normalise to a list.
    if stopwords is not None and not isinstance(stopwords, str):
        stopwords = list(stopwords)
    vectorizer = TfidfVectorizer(binary=False, lowercase=True, vocabulary=vocab, stop_words=stopwords)
    vectors = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    results = pd.DataFrame(data=vectors.toarray(), index=index, columns=feature_names)
    return results.fillna(0)
  
# get top 20 changes up and down
def get_keywords_norm_count(first_qtr_calls, second_qtr_calls):
    """Return the 20 largest decreases and 20 largest increases in mean relative frequency.

    Each call's prepared remarks are turned into relative term frequencies
    with `get_norm_count_vector`, averaged per period, and the first-period
    mean is subtracted from the second-period mean.

    Returns
    -------
    pandas.Series
        A single series sorted ascending: the 20 most negative differences
        followed by the 20 most positive.
    """
    # build relative frequency word vector for first period
    docs = first_qtr_calls["Prepared Remarks Text"]
    first_qtr_results = get_norm_count_vector(docs,first_qtr_calls["ticker"], full_vocab, combined_stopwords)

    # build relative frequency word vector for second period
    docs = second_qtr_calls["Prepared Remarks Text"]
    second_qtr_results = get_norm_count_vector(docs,second_qtr_calls["ticker"], full_vocab, combined_stopwords)

    # get the difference in mean relative frequency by word
    diff = (second_qtr_results.mean() - first_qtr_results.mean()).sort_values()
    # Series.append was removed in pandas 2.0; pd.concat gives the same result.
    return pd.concat([diff.head(20), diff.tail(20)])
  
# top 20 TF-IDF movers in each direction between two periods
def get_keywords_tfidf(first_qtr_calls, second_qtr_calls):
    """Compare mean TF-IDF weights between two sets of calls.

    Returns
    -------
    tuple of pandas.Series
        (the 20 words with the lowest second-minus-first difference in
        ascending order, the 20 words with the highest difference in
        descending order)
    """
    def _mean_tfidf(calls):
        # average TF-IDF weight of every vocabulary word across the calls
        weights = get_tfidf_vector(calls["Prepared Remarks Text"], calls["ticker"],
                                   full_vocab, combined_stopwords)
        return weights.mean()

    first_mean = _mean_tfidf(first_qtr_calls)
    second_mean = _mean_tfidf(second_qtr_calls)

    change = (second_mean - first_mean).sort_values()
    biggest_drops = change.head(20)
    biggest_gains = change.tail(20).sort_values(ascending=False)
    return biggest_drops, biggest_gains


def get_count_vector(docs, index, vocab, stopwords):
    """Build a per-document raw term-count table over a fixed vocabulary.

    Parameters
    ----------
    docs : iterable of str
        Documents to count terms in.
    index
        Row labels for the returned frame, one per document.
    vocab
        Fixed vocabulary passed to CountVectorizer; it defines the columns.
    stopwords
        Stop words passed to CountVectorizer. A set or other iterable is
        converted to a list; None or a string is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Term counts, one row per document and one column per vocabulary term.
    """
    # Newer scikit-learn validates stop_words and rejects a set, so normalise to a list.
    if stopwords is not None and not isinstance(stopwords, str):
        stopwords = list(stopwords)
    vectorizer = CountVectorizer(binary=False, lowercase=True, vocabulary=vocab, stop_words=stopwords)
    vectors = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    results = pd.DataFrame(data=vectors.toarray(), index=index, columns=feature_names)
    return results.fillna(0)

# get top 20 changes up and down
def get_keywords_count(first_qtr_calls, second_qtr_calls, additional_stopwords=None):
    """Compare total raw word counts between two sets of calls.

    Parameters
    ----------
    first_qtr_calls, second_qtr_calls : pandas.DataFrame
        Call frames with "Prepared Remarks Text" and "ticker" columns.
    additional_stopwords : iterable of str, optional
        Extra words to exclude on top of `combined_stopwords`.

    Returns
    -------
    tuple of pandas.Series
        (the 20 words with the lowest second-minus-first count difference in
        ascending order, the 20 words with the highest difference in
        descending order)
    """
    # None replaces the former mutable `[]` default; both mean "no extra stop words".
    extra_stopwords = list(additional_stopwords) if additional_stopwords else []
    stopwords = list(combined_stopwords) + extra_stopwords

    # build raw count word vector for first period
    docs = first_qtr_calls["Prepared Remarks Text"]
    first_qtr_results = get_count_vector(docs,first_qtr_calls["ticker"],
                                         full_vocab, stopwords)

    # build raw count word vector for second period
    docs = second_qtr_calls["Prepared Remarks Text"]
    second_qtr_results = get_count_vector(docs,second_qtr_calls["ticker"],
                                          full_vocab, stopwords)

    # get the difference in total count by word
    diff = (second_qtr_results.sum() - first_qtr_results.sum()).sort_values()
    return diff.head(20), diff.tail(20).sort_values(ascending=False)

#### Primary Functions

In [0]:
def get_stopwords_for_all_calls_during_time_period(interest_date_from, interest_date_to,relative_freq_to_max=0.5):
    """List the words that are very frequent across all calls in a period.

    Counts every vocabulary word over all calls in [interest_date_from,
    interest_date_to) and keeps the words whose total count is strictly
    greater than `relative_freq_to_max` times the count of the most frequent word.

    Returns
    -------
    list of str
        The qualifying words, most frequent first.
    """
    calls = get_calls(interest_date_from, interest_date_to)
    docs=calls["Prepared Remarks Text"]
    totals = get_count_vector(docs,calls["ticker"], full_vocab, list(combined_stopwords))
    totals = totals.sum().sort_values(ascending=False)
    if totals.empty:
        return []
    # .iloc[0] is the top count by position; totals[0] on a word-labelled
    # Series relied on a deprecated positional fallback.
    threshold = totals.iloc[0]*relative_freq_to_max
    return list(totals[totals>threshold].index)
    
def get_keywords_by_comparison(interest_date_from, interest_date_to, reference_date_from, referece_date_to, 
                               cat_type, cat_filter, 
                               filter_reference=True, shared=False, additional_stopwords=None, 
                               method="counts", number_words_to_return=None):
    """Rank words by how much more they are used in the interest calls than in the reference calls.

    Parameters
    ----------
    interest_date_from, interest_date_to
        Half-open date window for the calls of interest.
    reference_date_from, referece_date_to
        Half-open date window for the reference calls.
    cat_type, cat_filter
        Column name and value used to filter the interest calls (see `get_calls`).
    filter_reference : bool
        When true the reference calls use the same category filter; otherwise
        every call in the reference window is used.
    shared : bool
        When true both sets are limited to tickers present in each.
    additional_stopwords : iterable of str, optional
        Words dropped from the result (missing words are ignored).
    method : {"counts", "tfidf"}
        "counts" compares each word's share of the total word count; "tfidf"
        treats each set as one concatenated document and compares TF-IDF weights.
    number_words_to_return : int, optional
        Keep only this many top words; None returns all.

    Returns
    -------
    tuple
        (pandas.Series of interest-minus-reference scores sorted descending,
        the interest calls frame)

    Raises
    ------
    ValueError
        If `method` is not "counts" or "tfidf".
    """
    # Fail before any work is done; previously an unknown method fell through
    # to an unbound `diff`.
    if method not in ("counts", "tfidf"):
        raise ValueError("method must be 'counts' or 'tfidf', got %r" % (method,))

    # scikit-learn expects stop words as a list, not a set
    stopword_list = list(combined_stopwords)

    # build the sets of calls to compare
    interest_calls = get_calls(interest_date_from, interest_date_to, cat_type, cat_filter)

    # should the reference period use the same filter
    if filter_reference:
        reference_calls = get_calls(reference_date_from, referece_date_to, cat_type, cat_filter)
    else:
        reference_calls = get_calls(reference_date_from, referece_date_to, None, None)

    # remove interest calls from the reference calls
    reference_calls = reference_calls[~(reference_calls["Link"].isin(interest_calls["Link"]))]

    # filter to calls with tickers in both reference and interest
    if shared:
        reference_calls, interest_calls = get_calls_for_shared_tickers(reference_calls, interest_calls)

    if method=="counts":
        reference_counts = get_count_vector(reference_calls["Prepared Remarks Text"],
                                            reference_calls["ticker"], full_vocab, stopword_list).sum()
        interest_counts = get_count_vector(interest_calls["Prepared Remarks Text"],
                                           interest_calls["ticker"], full_vocab, stopword_list).sum()

        # convert counts to percent of total and get difference
        interest_results_pct = interest_counts.div(interest_counts.sum())
        reference_results_pct = reference_counts.div(reference_counts.sum())
        diff = interest_results_pct - reference_results_pct
    else:
        # one concatenated document per set: row 0 = reference, row 1 = interest
        reference_doc = " ".join(reference_calls["Prepared Remarks Text"])
        interest_doc = " ".join(interest_calls["Prepared Remarks Text"])
        docs = [reference_doc, interest_doc]

        vectorizer = TfidfVectorizer(binary=False, lowercase=True, stop_words=stopword_list)
        vectors = vectorizer.fit_transform(docs)
        # get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()
        results = pd.DataFrame(data=vectors.toarray(), columns=feature_names).fillna(0)
        diff = results.loc[1]-results.loc[0]

    # drop words from results
    if additional_stopwords is not None:
        diff = diff.drop(additional_stopwords, errors="ignore")

    diff = diff.sort_values(ascending=False)
    if number_words_to_return is not None:
        diff = diff.head(number_words_to_return)

    return diff, interest_calls

<br>
## Compare Q1 2019 to Q4 2018 for All Calls

In [0]:
# Interest window: 1 Jan 2019 up to the run date. Reference window: Q4 2018.
interest_date_from = jan_2019
interest_date_to = today
reference_date_from = oct_2018
referece_date_to = jan_2019

# No category filter on either side; only tickers present in both windows are compared.
words, interest_calls = get_keywords_by_comparison(
    interest_date_from, interest_date_to, reference_date_from, referece_date_to,
    None, None,
    filter_reference=False,
    shared=True,
    additional_stopwords=None,
    method="tfidf",
    number_words_to_return=30,
)

words

december        0.040954
q4              0.036710
billion         0.029940
january         0.020006
half            0.018602
q1              0.016975
2020            0.014725
investments     0.013667
31              0.013646
throughout      0.012652
record          0.010387
tax             0.010301
march           0.008797
february        0.008317
flow            0.008055
31st            0.008029
dividend        0.007910
annual          0.007830
program         0.007180
decline         0.006507
shareholders    0.006431
investment      0.006339
grow            0.006318
plan            0.005985
off             0.005692
strategy        0.005466
ended           0.005289
assets          0.005234
expense         0.005219
position        0.005168
dtype: float64

<br>
## Compare Q1 2019 to Q1 2018 for All Calls

In [0]:
# Interest window: 1 Jan 2019 up to the run date. Reference window: Q1 2018.
interest_date_from = jan_2019
interest_date_to = today
reference_date_from = jan_2018
referece_date_to = apr_2018

# No category filter on either side; only tickers present in both windows are compared.
words, interest_calls = get_keywords_by_comparison(
    interest_date_from, interest_date_to, reference_date_from, referece_date_to,
    None, None,
    filter_reference=False,
    shared=True,
    additional_stopwords=None,
    method="tfidf",
    number_words_to_return=30,
)

words

2020         0.021440
program      0.019953
half         0.016679
customer     0.016639
industry     0.016146
within       0.016068
free         0.016003
ebitda       0.014628
product      0.012544
demand       0.012261
headwinds    0.012172
flow         0.011835
ph           0.011657
points       0.011558
mid          0.011459
markets      0.011369
growing      0.011359
grew         0.011321
digit        0.011168
price        0.011101
better       0.011084
gross        0.010986
down         0.010921
19           0.010861
billion      0.010808
top          0.010682
saw          0.010653
number       0.010617
progress     0.010601
focus        0.010277
dtype: float64

<br>
## Compare Q1 2018 to Q1 2019 for All Calls
What were people talking about last year that is not as important today?

In [0]:
# Roles reversed: Q1 2018 is the interest window, 2019-to-date is the reference.
interest_date_from = jan_2018
interest_date_to = apr_2018
reference_date_from = jan_2019
referece_date_to = today

# No category filter on either side; only tickers present in both windows are compared.
words, interest_calls = get_keywords_by_comparison(
    interest_date_from, interest_date_to, reference_date_from, referece_date_to,
    None, None,
    filter_reference=False,
    shared=True,
    additional_stopwords=None,
    method="tfidf",
    number_words_to_return=30,
)

words

tax            0.178153
2016           0.081244
reform         0.053703
deferred       0.021256
benefit        0.021103
just           0.020860
think          0.020291
do             0.015603
act            0.014758
if             0.014520
law            0.012966
change         0.012826
federal        0.012483
positive       0.011652
what           0.011556
legislation    0.011144
there          0.011123
corporate      0.010793
taxes          0.010387
loans          0.010310
charge         0.010130
30             0.009956
18             0.009892
jobs           0.009418
still          0.009277
benefits       0.009102
going          0.008748
little         0.008694
00             0.008499
17             0.008214
dtype: float64

<br>
## Compare a Group to All Other Calls during Same Period
What do companies in this industry talk about more frequently than companies in other industries?

In [0]:
cat_type = "Industry"
cat_filter = "Clothing/Shoe/Accessory Stores"
interest_date_from = jan_2018
interest_date_to = jan_2019

# The same window is passed for both sides. With filter_reference=False the
# reference set is every call in that window minus the industry's own calls.
industry_specific_words, interest_calls = get_keywords_by_comparison(
    interest_date_from, interest_date_to, interest_date_from, interest_date_to,
    cat_type, cat_filter,
    filter_reference=False,
    shared=False,
    additional_stopwords=None,
    method="tfidf",
    number_words_to_return=30,
)

industry_specific_words

store          0.218441
stores         0.187049
comp           0.181613
brand          0.130975
customer       0.104196
retail         0.101432
week           0.101072
inventory      0.089128
digital        0.083127
brands         0.076595
comps          0.074900
merchandise    0.068320
single         0.065107
apparel        0.062251
men            0.054929
comparable     0.054685
positive       0.054571
footwear       0.051531
traffic        0.051505
digits         0.045357
digit          0.045254
categories     0.041658
sg             0.040637
experience     0.038962
commerce       0.037892
assortment     0.035704
points         0.035445
online         0.035318
shift          0.034195
53rd           0.032501
dtype: float64

<br>
## Compare Company's Quarter to Previous Calls
What is this company talking about more frequently than they have in the past?

In [0]:
cat_type = "ticker"
cat_filter = "JCP"
# Interest window: 2019 to the run date. Reference: the company's calls from Oct 2017 through 2018.
interest_date_from = jan_2019
interest_date_to = today
reference_date_from = oct_2017
referece_date_to = jan_2019

# Words whose total count across all calls this period exceeds 5% of the
# most frequent word's count are removed from the result.
period_stopwords = get_stopwords_for_all_calls_during_time_period(interest_date_from, interest_date_to, 0.05)

new_topics, interest_calls = get_keywords_by_comparison(
    interest_date_from, interest_date_to, reference_date_from, referece_date_to,
    cat_type, cat_filter,
    filter_reference=True,
    shared=True,
    additional_stopwords=period_stopwords,
    method="tfidf",
    number_words_to_return=25,
)

new_topics

apparel         0.155052
jcpenney        0.095216
michelle        0.082313
join            0.054875
shrink          0.043943
merchandise     0.043326
thoughtfully    0.042709
served          0.041156
unproductive    0.041156
protection      0.041156
meaningfully    0.041156
gap             0.041156
reestablish     0.041156
journey         0.041156
jill            0.037847
swiftly         0.035996
furniture       0.033565
immediate       0.032948
wants           0.029900
training        0.027438
showing         0.027438
monday          0.027438
satisfying      0.027438
defining        0.027438
impressed       0.027438
dtype: float64

<br>
## Testing Keywords

<br>
### Build Topics


In [0]:
# Paragraphs from calls inside the current interest window (half-open), selected
# with a column-wise comparison instead of a per-row lambda.
paragraph_dates = df_text_prepremarks["call date"]
this_period_paragraphs = df_text_prepremarks[(paragraph_dates >= interest_date_from) & (paragraph_dates < interest_date_to)]
docs=list(this_period_paragraphs["Text"])

# Binary presence of each keyword in each paragraph; columns are the keywords in new_topics.
vectorizer = CountVectorizer(binary=True, lowercase=True, vocabulary=list(new_topics.index))
vectors = vectorizer.fit_transform(docs)
# get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
this_period_vector = pd.DataFrame(data=vectors.toarray(), index=list(this_period_paragraphs["Text"].index), columns=feature_names)

In [0]:
# Show where each of the first two keywords is used: first within the
# interest group, then across every call of the period.
for word in new_topics.index[:2]:
    print("Word:",word)

    # calls of the interest group whose prepared remarks match the word (case-insensitive regex)
    word_in_remarks = interest_calls["Prepared Remarks Text"].str.contains(word, flags=re.IGNORECASE, regex=True)
    paragraphs_with_word = interest_calls[word_in_remarks]
    unique_calls_with_word_industry = paragraphs_with_word[["company","ticker","Link"]].drop_duplicates()
    print("    Number of Calls with Word in Group This Period:",len(unique_calls_with_word_industry))
    print("        Tickers with Word in Group This Period:",set(unique_calls_with_word_industry["ticker"].drop_duplicates()))

    # paragraphs of any call this period in which the vectorizer flagged the word
    flagged_index = this_period_vector.index[this_period_vector[word]==1]
    paragraphs_with_word_all_calls_this_period = this_period_paragraphs.loc[flagged_index]
    unique_calls_with_word_this_period = paragraphs_with_word_all_calls_this_period[["company","ticker","Link"]].drop_duplicates()
    print("    Number of Calls with Word This Period (All Calls):",len(unique_calls_with_word_this_period))
    print("        Tickers with Word This Period (All Calls):",set(unique_calls_with_word_this_period["ticker"].drop_duplicates()))

    print()
    # per call: company, link, then the matching paragraphs
    paragraph_links = paragraphs_with_word_all_calls_this_period["Link"]
    for _, row in unique_calls_with_word_this_period.iterrows():
        print(row["company"])
        print(row["Link"])
        matching_text = paragraphs_with_word_all_calls_this_period.loc[paragraph_links==row["Link"], "Text"]
        pprint.pprint(list(matching_text))
        print()

    print()
    print()

<br>
### Sector and Industry Lists

In [0]:
# NOTE(review): `sectors` is not assigned in any cell visible in this notebook,
# so this cell fails on a fresh kernel. Presumably it holds the unique sector
# labels taken from df_metadata — confirm and define it before this cell.
print(sectors)

['Finance' 'Technology' 'Health Care' 'Consumer Services'
 'Public Utilities' 'Consumer Non-Durables' 'Energy' 'Capital Goods'
 'Basic Industries' 'Miscellaneous' 'Consumer Durables' 'Transportation']


In [0]:
# Names of the 10 industries in the "Consumer Non-Durables" sector with the largest "ticker" value.
# NOTE(review): `sectors_industries` is not assigned in any cell visible in this
# notebook; presumably a sector/industry aggregation of df_metadata — confirm
# and define it before this cell.
sectors_industries.loc["Consumer Non-Durables"].sort_values(by="ticker",ascending=False).head(10).index

Index(['Packaged Foods', 'Apparel', 'Beverages (Production/Distribution)',
       'Farming/Seeds/Milling', 'Recreational Products/Toys',
       'Plastic Products', 'Food Distributors', 'Shoe Manufacturing',
       'Package Goods/Cosmetics', 'Meat/Poultry/Fish'],
      dtype='object', name='Industry')

<br>
### Identify Set of Keywords of Interest

In [0]:
# The 10 "Consumer Non-Durables" industries with the largest "ticker" value.
consumer_non_durables = sectors_industries.loc["Consumer Non-Durables"]
industries_to_analyze = consumer_non_durables.sort_values(by="ticker",ascending=False).head(10).index

cat_type = "Industry"
# Interest window: 2019 to the run date. Reference window: Q4 2018.
interest_date_from = jan_2019
interest_date_to = today
reference_date_from = oct_2018
referece_date_to = jan_2019
period_stopwords = get_stopwords_for_all_calls_during_time_period(interest_date_from, interest_date_to, 0.05)

# One keyword list per industry, each compared against the same industry's reference-window calls.
for cat_filter in industries_to_analyze:
    new_topics, interest_calls = get_keywords_by_comparison(
        interest_date_from, interest_date_to, reference_date_from, referece_date_to,
        cat_type, cat_filter,
        filter_reference=True,
        shared=True,
        additional_stopwords=period_stopwords,
        method="tfidf",
        number_words_to_return=25,
    )

    call_count_label = str(interest_calls.shape[0])+")"
    print(cat_filter,"(Num Calls:",call_count_label)
    pprint.pprint("   ".join(new_topics.index))
    print("#######################################################################################")
    print()

<br>
### Search for Keyword Usage

In [0]:
word = "jostens"
cat_filter = "Plastic Products"

##############################################################################
cat_type="Industry"
interest_date_from, interest_date_to = jan_2019, today
reference_date_from, referece_date_to = oct_2018, jan_2019

period_stopwords = get_stopwords_for_all_calls_during_time_period(interest_date_from, interest_date_to, 0.05)

new_topics, interest_calls = get_keywords_by_comparison(interest_date_from, interest_date_to, reference_date_from, referece_date_to, 
                                                        cat_type, cat_filter, filter_reference=True, shared=True, 
                                                        additional_stopwords=period_stopwords, method="tfidf", number_words_to_return=25)

# The vectorizer lowercases the paragraphs, so look the word up in lowercase.
search_term = word.lower()
# Make sure the searched word has its own column even when it is not among the
# returned keywords; otherwise the lookup below raises KeyError.
search_vocab = list(new_topics.index)
if search_term not in search_vocab:
    search_vocab.append(search_term)

# Paragraphs from calls inside the interest window (half-open), selected with
# a column-wise comparison instead of a per-row lambda.
paragraph_dates = df_text_prepremarks["call date"]
this_period_paragraphs = df_text_prepremarks[(paragraph_dates >= interest_date_from) & (paragraph_dates < interest_date_to)]
docs=list(this_period_paragraphs["Text"])
vectorizer = CountVectorizer(binary=True, lowercase=True, vocabulary=search_vocab)
vectors = vectorizer.fit_transform(docs)
# get_feature_names() was removed in scikit-learn 1.2; fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
this_period_vector = pd.DataFrame(data=vectors.toarray(), index=list(this_period_paragraphs["Text"].index), columns=feature_names)


print("Word:",word)
    
# calls of the interest group whose prepared remarks match the word (case-insensitive regex)
paragraphs_with_word = interest_calls[(interest_calls["Prepared Remarks Text"].str.contains(word, flags=re.IGNORECASE, regex=True))]
unique_calls_with_word_industry = paragraphs_with_word[["company","ticker","Link"]].drop_duplicates()
print("    Number of Calls with Word in Group This Period:",len(unique_calls_with_word_industry))
print("        Tickers with Word in Group This Period:",set(unique_calls_with_word_industry["ticker"].drop_duplicates()))

# paragraphs of any call this period in which the vectorizer flagged the word
paragraphs_with_word_all_calls_this_period = this_period_paragraphs.loc[this_period_vector[this_period_vector[search_term]==1].index]
unique_calls_with_word_this_period = paragraphs_with_word_all_calls_this_period[["company","ticker","Link"]].drop_duplicates()
print("    Number of Calls with Word This Period (All Calls):",len(unique_calls_with_word_this_period))
print("        Tickers with Word This Period (All Calls):",set(unique_calls_with_word_this_period["ticker"].drop_duplicates()))

#     paragraphs_with_word_all_calls = df_text_prepremarks[(df_text_prepremarks["Text"].str.contains(word, flags=re.IGNORECASE, regex=True))]    
#     unique_calls_with_word = paragraphs_with_word_all_calls[["company","ticker","Link"]].drop_duplicates()
#     print("    Number of Calls with Word (All Calls, All Time):",len(unique_calls_with_word))
#     print("        Tickers with Word (All Calls, All Time):",set(unique_calls_with_word["ticker"].drop_duplicates()))
   

print()
# per call: company, link, then the matching paragraphs
for _, row in unique_calls_with_word_this_period.iterrows():
    print(row["company"])
    print(row["Link"])
    pprint.pprint(list(paragraphs_with_word_all_calls_this_period[paragraphs_with_word_all_calls_this_period["Link"]==row["Link"]]["Text"]))
    print()

print()
print()

Word: jostens
    Number of Calls with Word in Group This Period: 1
        Tickers with Word in Group This Period: {'NWL'}
    Number of Calls with Word This Period (All Calls): 1
        Tickers with Word This Period (All Calls): {'NWL'}

Newell Brands Inc.
https://www.fool.com/earnings/call-transcripts/2019/02/15/newell-brands-inc-nwl-q4-2018-earnings-conference.aspx
['On cash, we delivered about $500 million of operating cash flow, bringing '
 'our second half operating cash flow to nearly $1.1 billion. This result was '
 'less than what we forecast, driven in part by the completion -- the '
 'completion timing of the Jostens and Pure Fishing yields, higher cash taxes '
 'and transaction related costs, as well as lower accounts payable balance. We '
 'expect to deliver more sustainable performance on payables going forward as '
 "we're making very good progress, integrating extended payment terms into our "
 'sourced finished goods contracts. This work began in earnest in late 2017