In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Goal
The goal of this project is to identify negative sentiment in CEO's messages based on earnings call transcripts. We know that CEOs are unlikely to report bad news outright and thus this notebook strives to understand how positive can CEO frame a negative news during earnings call?

## Imports

In [0]:
import os

# change to path
PATH='/content/drive/My Drive/Colab Notebooks/earnings'
os.chdir(PATH)

In [0]:
!pip install yfinance

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import re
import json

from datetime import datetime
from pandas_datareader import data as pdr
import yfinance as yf

from tqdm import tqdm_notebook as tqdm

import warnings
warnings.simplefilter(action='ignore')

import spacy
from textblob import TextBlob

## Load data

In [0]:
fn = '../crawler/fool/data.json'

with open(fn) as f:
    data = json.load(f)

In [0]:
class Inc:
    def __init__(self, d):
        self.name = d['name']
        self.ticker = d['ticker']
        self.quarter = d['quarter']
        self.reporting_date = d['reporting_date']
        self.conversations = d['conversations']
        
    def inc_name(self):
        return self.name
    
    def inc_ticker(self):
        return self.ticker
        
    def ceo_messages(self):
        for i, speaker in enumerate(self.conversations):
            if speaker.find('Executive') != -1:
                ceo_speech = self.conversations[speaker]
        
        return ' '.join(chunk for chunks in ceo_speech for chunk in chunks)
    
    def analysts_questions(self):
        analysts_questions = ''
        for i, speaker in enumerate(self.conversations):
            if speaker.find('Analyst') != -1:
                analysts_questions = analysts_questions + d[speaker][0][0] + ' '
                
        return analysts_questions
    
    def speakers_analysts_coverage(self):
        speakers = []
        analysts_coverage = []
        for i, speaker in enumerate(self.conversations):
            splits = speaker.split('--')
            
            # if the speaker is not operator,
            # get the name
            if len(splits) > 1:
                speaker = splits[0]
                
                # get only first name of speaker
                speaker = speaker.split(' ')[0]
                speakers += speaker.rstrip().lstrip(),
                
            # if the speaker has 3 titles,
            # he or she is an analyst. Hence,
            # get the bank name            
            if len(splits) == 3:
                analyst = splits[1]
                analysts_coverage += analyst.rstrip().lstrip(),
                
        return speakers, analysts_coverage

In [0]:
example = Inc(data[2])

speakers, analysts_coverage = example.speakers_analysts_coverage()

## Create label

In [0]:
tickers = []

for inc in data:
    try:
        ticks = inc['ticker']
        date = inc['reporting_date']
        
        month = date[:3]
        day = date[4:6].lstrip().rstrip()
        
        s = month + '-' + day + '-2019'
        
        dt = datetime.strptime('Jul-11-2019', '%b-%d-%Y').strftime('%Y-%m-%d')
        
        if ticks.find('NYSE') != -1 or ticks.find('NASDAQ') != -1:
            tickers.append([ticks, dt])            
    except:
        pass

In [0]:
ticker_changes = {}

for t, d in tickers:
    tick = t.split(':')[-1]
    
    info = pdr.get_data_yahoo(tick, start="2019-07-10", end="2019-07-13")
    
    try:
        price_change = (info.iloc[0]['Open'] - info.iloc[-1]['Open']) / info.iloc[0]['Open']

        ticker_changes[t] = price_change
        
    except:
        pass

If the stock price rises more than 5% in the positive direction, then the CEO is reporting positive news while falling more than 5% indicates bad news.

In [0]:
df = pd.DataFrame(ticker_changes.items(),
                  columns=['ticker', 'price_change'])

df['price_change'].fillna(0, inplace=True)
df.loc[(df['price_change'] >= -0.05) & (df['price_change'] < 0.05), 'bad'] = 0
df.loc[df['price_change'] < -0.05, 'bad'] = 1
df.loc[df['price_change'] >= 0.05, 'bad'] = -1

In [191]:
df['bad'].value_counts()

 0.0    738
-1.0     52
 1.0     32
Name: bad, dtype: int64

In [192]:
train_df = df[df['bad'] != 0]

train_df.head()

Unnamed: 0,ticker,price_change,bad
3,NASDAQ:PSMT,-0.052027,1.0
7,NASDAQ:SLP,-0.134926,1.0
8,NYSE:AIR,-0.079381,1.0
13,NYSE:LNN,0.054989,-1.0
19,NYSE:JKS,0.082774,-1.0


## spaCy
Data exploration.

In [0]:
nlp = spacy.load('en_core_web_sm')

In [0]:
txt = example.ceo_messages()

In [0]:
doc = nlp(txt)

In [214]:
txt

"Thank you, Sandeep. Good afternoon and good morning to everyone on the call. Thank you for joining us today. Infosys has delivered a strong quarter and I'm pleased with our overall performance as we continue to demonstrate our increasing relevance to clients. Our constant currency growth year on year for Q1 was 12.4%, which is the third consecutive quarter of double-digit growth. Our digital revenue growth was 41.9% and our digital revenue now accounts for 35.7% of our overall business. The large deal DCV was the highest ever at $2.7 billion. Our operating margin for Q1 was at 20.5%. We saw broad-based growth across our industry segments, service lines, and geographies. In constant currency year on year, our telco segment grew 22.6% and North America geography 13.5%. We continue to benefit from building deeper capabilities across our digital portfolio, especially in the areas of Experian's data analytics, cloud, SaaS, IoT, cybersecurity, AI, and machine learning. Our overall deal pipe

In [162]:
sentences = list(doc.sents)

len(sentences)

140

Find the useful entities.

In [165]:
entities = ['PERSON', 'ORG', 'NORP', 'FACILITY', 'ORG', 
            'GPE', 'LOC', 'PRODUCT', 'EVENT']

for ent in doc.ents:
    if ent.label_ in entities and ent.text not in speakers:
        print(ent.text, ent.label_)

Infosys PERSON
North America LOC
Experian ORG
SaaS ORG
Alexa ORG
US GPE
Infosys McCamish NORP
London GPE
Lex PERSON
Infosys ORG
SAP S/4HANA ORG
NelsonHall ORG
API ORG
Forrester ORG
Ed PERSON
Brexit GPE
UK GPE
Brexit GPE
RPP ORG
Azure PERSON
Google Cloud PERSON
SaaS ORG
Salesforce ORG
ServiceNow ORG
Microsoft ORG
Azure to Office 365 ORG
S/4HANA PERSON
Gartner PERSON


Extract sentences that has the patterns as defined.

In [0]:
from spacy.matcher import PhraseMatcher, Matcher

business_adj = [
    [{"LEMMA": {"IN": ["down", "up", "small", "big",
                       "high", "low", "strong", "weak",
                       "large", "bad", "solid"]}}],
]

b_adj_matcher = Matcher(nlp.vocab)
b_adj_matcher.add("BusinessAdj", None, *business_adj)

In [0]:
metrics = [
    [{"LEMMA": {"IN": ["revenue", "cost", "margin", 
                       "grow", "profit", "sale",
                       "guidance"]}}], # sentence contains number?
]


metrics_matcher = Matcher(nlp.vocab)
metrics_matcher.add("BusinessMetrics", None, *metrics)

In [170]:
for sent in doc.sents:
    sent_to_doc = sent.as_doc()
    
    matches = metrics_matcher(sent_to_doc)
    
    if matches is not None:
        for match_id, start, end in matches:
            
            keyword = sent_to_doc[start:end]
            
            print("Matched on:", keyword)
            
            print(sent_to_doc)
            
            print('\n')
            
            break                    
    
    matches = b_adj_matcher(sent_to_doc)
    
    if matches is not None:
        for match_id, start, end in matches:
            
            keyword = sent_to_doc[start:end]
            
            for chunk in sent_to_doc.noun_chunks:
                if keyword.text in chunk.text:
                    
                    print("Matched on:", keyword)
            
                    print(sent_to_doc.text)                        
                    
                    print("Adj describing:", chunk.text)

                    print('\n')                    

Matched on: strong
Infosys has delivered a strong quarter and I'm pleased with our overall performance as we continue to demonstrate our increasing relevance to clients. 
Adj describing: a strong quarter


Matched on: revenue
Our digital revenue growth was 41.9% and our digital revenue now accounts for 35.7% of our overall business. 


Matched on: large
The large deal DCV was the highest ever at $2.7 billion. 
Adj describing: The large deal


Matched on: margin
Our operating margin for Q1 was at 20.5%. 


Matched on: grew
In constant currency year on year, our telco segment grew 22.6% and North America geography 13.5%. 


Matched on: large
We work with a large automotive client to help them navigate their digital transformation journey, delivering for them future-ready scalable digital hybrid cloud platforms. 
Adj describing: a large automotive client


Matched on: large
We're enabling a large utility to build advanced planning and engineering systems to forecast the dynamic nature of 

In [0]:
t1 = 'Another very strong area for us is the area of data and analytics.'

In [172]:
doc = nlp(t1)

for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)

Another DET DT det
very ADV RB advmod
strong ADJ JJ amod
area NOUN NN nsubj
for ADP IN prep
us PRON PRP pobj
is VERB VBZ ROOT
the DET DT det
area NOUN NN attr
of ADP IN prep
data NOUN NNS pobj
and CCONJ CC cc
analytics NOUN NNS conj
. PUNCT . punct


Creating data for `train_df`.

In [0]:
def compile_business_sents(doc):
    keep = ''
    
    for sent in doc.sents:
        sent_to_doc = sent.as_doc()

        matches = metrics_matcher(sent_to_doc)

        if matches is not None:
            for match_id, start, end in matches:

                keep = keep + sent_to_doc.text

                break                    

        matches = b_adj_matcher(sent_to_doc)

        if matches is not None:
            for match_id, start, end in matches:

                keyword = sent_to_doc[start:end]

                for chunk in sent_to_doc.noun_chunks:
                    if keyword.text in chunk.text:

                        keep = keep + sent_to_doc.text
                        
    return keep

In [0]:
analyze = train_df['ticker'].tolist()

for inc in data:
    try:
        example = Inc(inc)    
        if example.inc_ticker() in analyze:
            txt = example.ceo_messages()
            doc = nlp(txt)
            train_df.loc[train_df['ticker'] == example.inc_ticker(), 'ceo_msgs'] = compile_business_sents(doc)
    except:
        pass

In [0]:
train_df = train_df[~train_df['ceo_msgs'].isna()]

In [220]:
train_df.head()

Unnamed: 0,ticker,price_change,bad,ceo_msgs
3,NASDAQ:PSMT,-0.052027,1.0,"Since our last call, we've been quite busy wit..."
7,NASDAQ:SLP,-0.134926,1.0,This was a very strong quarter for Simulations...
8,NYSE:AIR,-0.079381,1.0,Our sales for the full year were up 17% from $...
13,NYSE:LNN,0.054989,-1.0,The ag market conditions this past quarter con...
19,NYSE:JKS,0.082774,-1.0,"Total revenues were $867 (ph) million, an incr..."


In [224]:
train_df['bad'].value_counts()

-1.0    50
 1.0    31
Name: bad, dtype: int64

## Model
DistilBERT.

https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation

## To be continued...

Rule-base matching.

https://github.com/pmbaumgartner/binder-notebooks/blob/master/rule-based-matching-with-spacy-matcher.ipynb

Entity recognizer.

In [0]:
# for chunk in doc.noun_chunks:
#     print(chunk.root.text)

# from spacy.pipeline import EntityRecognizer

# ner = EntityRecognizer(nlp.vocab)

# processed = ner(doc)

# processed

TextBlob for sentiment analysis.

In [0]:
# testimonial = TextBlob("Maybe first off on some fundamentals for the full year, it looks like your film release number was pretty good but your revenues were down.")
# testimonial.sentiment

## Ideas
1. Get CEO's text and do sentiment analysis
2. Extract descriptive words of a business, e.g., down, up, small, big, high, low, strong...
3. Extract financial keywords, e.g, revenue, cost, margin, growth...
4. Extract change in intonation, e.g., but.
5. Extract superlatives.
6. Extract customers, relationships.
7. Extract investments, acquisitions.
8. Extract %.
9. ESG: sustainability, environment, regulatory

In [0]:
# fundamentals, metrics
# revenue, guidance
# down, up, small, big, raise

## Case study
1. NKE
> Keywords: opportunity, geography, accelerated, long-term, inventory, risk, volatility, customer, digits, double-digits, triple, focus, deliver, accelerate, brand, distribution, members, launch

2. DPZ
> Keywords: same store sales performance, long-term growth, data, service, shrink, technology, franchise, volume-driven retail sales, strong unit economics and franchisee, priorities, international, market share, pricing and promotional, partners,  

3. WMT
> Keywords: market share, guidance, ahead, partnership, omni-channel,