In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
# Connect to the MongoDB server on this machine and select the collection
# holding the earnings-call transcripts.
client = MongoClient(host='localhost', port=27017)
db = client['python_import']
collection = db['earnings_transcript']

In [4]:
# Materialise at most 20 transcript documents into a DataFrame
# (one row per MongoDB document).
transcripts = pd.DataFrame(list(collection.find(limit=20)))

In [5]:
# Peek at the first row to see which fields the loaded documents carry.
transcripts.head(1)

Unnamed: 0,_id,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,tradingSymbol,url,wordSize
0,5937dba3082789410c746a1e,1130423,"{'positiveCount': 203, 'negativeCount': 75}",2013-04-23 21:30:07,Operator [Operator Instructions] We'll go firs...,"{'positiveCount': 117, 'negativeCount': 46}",7493,US Airways Group ( LCC ) Q1 2013 Earnings Call...,213007,AAL,https://seekingalpha.com/article/1363811-us-ai...,11310


In [6]:
# Build the spaCy English pipeline that is later applied to each transcript's
# raw text.
nlp = English()

In [7]:
def tokenize(sent):
    """Return the lemmas of the content words in ``sent``.

    ``sent`` is any iterable of token objects exposing ``pos_`` and
    ``lemma_``. Only nouns, proper nouns, adjectives and verbs are kept, and
    the ``'-PRON-'`` placeholder lemma is always dropped.
    """
    kept_pos = ("NOUN", "PROPN", "ADJ", "VERB")
    lemmas = []
    for token in sent:
        if token.pos_ not in kept_pos:
            continue
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [8]:
# Disabled alternative to tokenize(): keeps the lemma of every token, with no
# part-of-speech filtering. Left commented out so the filtered version above
# stays in effect.
#def tokenize(sent):
#    return [tok.lemma_ for tok in sent]

# Load Henry Tone

In [9]:
# Load the Henry tone word list (columns 'Word' and 'Score') and lower-case
# the words -- presumably so lookups match lower-case lemmas.
henry = pd.read_excel('henry_wordlist.xlsx')
henry['Word'] = henry['Word'].str.lower()
# reset_index() returns a new frame rather than modifying in place, so the
# result has to be assigned; drop=True avoids adding an extra 'index' column.
henry = henry.reset_index(drop=True)
henry.head(5)

Unnamed: 0,Word,Score
0,below,-1
1,challenge,-1
2,challenged,-1
3,challenges,-1
4,challenging,-1


In [10]:
def get_first_value(dic, word):
    """Look up the score of ``word`` in the lexicon frame ``dic``.

    ``dic`` must have 'Word' and 'Score' columns. The score of the first row
    whose 'Word' equals ``word`` is returned; 0 is returned when no row
    matches.
    """
    matches = dic.loc[dic['Word'] == word, 'Score']
    return matches.iloc[0] if len(matches) > 0 else 0

In [11]:
# Sanity check: a positive word should come back with a positive Henry score.
get_first_value(henry, 'good')

1

In [12]:
# Sanity check: a negative word should come back with a negative Henry score.
get_first_value(henry, 'worse')

-1

# Load AFINN-111

In [13]:
# AFINN-111 is read as a tab-separated file without a header row, so the two
# columns are initially labelled 0 and 1.
afinn = pd.read_csv('AFINN-111.txt', sep='\t', header=None)

In [14]:
# Give the AFINN columns the same names as the Henry list so both frames can
# be passed to get_first_value(). Only the columns are renamed: the previous
# `index=str` argument also converted every row label to a string, and
# `inplace=True` mutated the frame instead of rebinding it.
afinn = afinn.rename(columns={0: "Word", 1: "Score"})

In [15]:
# Show the last rows to confirm the 'Word' / 'Score' column names.
afinn.tail(5)

Unnamed: 0,Word,Score
2472,yucky,-2
2473,yummy,3
2474,zealot,-2
2475,zealots,-2
2476,zealous,2


In [16]:
# Sanity check: the same lookup helper works on the AFINN frame.
get_first_value(afinn, 'shit')

-4

In [17]:
# Run the spaCy pipeline over every transcript's raw text and keep the parsed
# result in the 'NLP' column, which the tokenisation step reads.
transcripts['NLP'] = transcripts['rawText'].apply(lambda val: nlp(val))

In [18]:
# Reduce each parsed transcript to its list of content-word lemmas.
transcripts['tokens'] = transcripts['NLP'].apply(tokenize)

In [19]:
from scipy.stats import itemfreq

In [20]:
def _token_frequencies(tokens):
    """Return a 2-D array of ``[token, count]`` rows, sorted by token.

    Keeps the layout that ``scipy.stats.itemfreq`` produced (that function is
    deprecated and removed in newer SciPy): for a non-empty list of strings
    both columns are strings, and an empty list gives an empty ``(0, 2)``
    array.
    """
    if len(tokens) == 0:
        return np.empty((0, 2))
    values, counts = np.unique(tokens, return_counts=True)
    return np.column_stack((values, counts.astype(str)))


def build_score_for_tokens(tokens, dictionary):
    """Score a token list against a word/score lexicon.

    Parameters
    ----------
    tokens : iterable of str
        Tokens (lemmas) to score.
    dictionary : pandas.DataFrame
        Lexicon with 'Word' and 'Score' columns. If a word occurs more than
        once, its first row wins.

    Returns
    -------
    tuple
        ``(positive_sum, negative_sum, positive_sum + negative_sum,
        positive_counts, negative_counts)``. The two ``*_counts`` entries are
        2-D arrays of ``[token, count]`` rows for the tokens that scored above
        or below zero. Tokens missing from the lexicon, or scored 0, are
        ignored.
    """
    # Build the word -> score lookup once per call instead of filtering the
    # whole frame for every token. keep='first' preserves the
    # "first matching row" rule; .values keeps the scores as numpy scalars.
    first_rows = dictionary.drop_duplicates(subset='Word', keep='first')
    scores = dict(zip(first_rows['Word'].values, first_rows['Score'].values))

    score_pos, score_neg = 0, 0
    token_pos, token_neg = [], []
    for token in tokens:
        token_score = scores.get(token, 0)
        if token_score > 0:
            score_pos += token_score
            token_pos.append(token)
        elif token_score < 0:
            score_neg += token_score
            token_neg.append(token)
    return (
        score_pos,
        score_neg,
        score_pos + score_neg,
        _token_frequencies(token_pos),
        _token_frequencies(token_neg),
    )

In [21]:
# Score every transcript's tokens against the AFINN lexicon.
transcripts['Sentiment Score'] = transcripts['tokens'].apply(build_score_for_tokens, args=(afinn,))

In [22]:
# Score every transcript's tokens against the Henry tone word list.
transcripts['Henry Tone'] = transcripts['tokens'].apply(build_score_for_tokens, args=(henry,))

In [23]:
# Ratio of the positive AFINN total to the magnitude of the negative total.
# A transcript without any negative word has a negative total of 0, which
# would divide by zero; report inf when there are positive words and NaN when
# there are none.
transcripts['SS Ratio'] = transcripts['Sentiment Score'].apply(
    lambda row: row[0] / -row[1] if row[1] else (np.inf if row[0] else np.nan)
)

In [24]:
# Ratio of the positive Henry total to the magnitude of the negative total.
# A transcript without any negative word has a negative total of 0, which
# would divide by zero; report inf when there are positive words and NaN when
# there are none.
transcripts['HTone Ratio'] = transcripts['Henry Tone'].apply(
    lambda row: row[0] / -row[1] if row[1] else (np.inf if row[0] else np.nan)
)

In [25]:
# Side-by-side view of both lexicon scores, the tone counts stored with the
# document ('h_tone') and the two positive/negative ratios.
transcripts.loc[:, ['url', 'Sentiment Score', 'Henry Tone', 'h_tone', 'SS Ratio', 'HTone Ratio']]

Unnamed: 0,url,Sentiment Score,Henry Tone,h_tone,SS Ratio,HTone Ratio
0,https://seekingalpha.com/article/1363811-us-ai...,"(486, -167, 319, [[ability, 1], [agree, 1], [a...","(135, -44, 91, [[achieve, 2], [certain, 4], [d...","{'positiveCount': 203, 'negativeCount': 75}",2.91018,3.068182
1,https://seekingalpha.com/article/1129431-apple...,"(645, -97, 548, [[ability, 1], [accomplish, 2]...","(203, -20, 183, [[accomplish, 2], [achieve, 10...","{'positiveCount': 236, 'negativeCount': 30}",6.649485,10.15
2,https://seekingalpha.com/article/1287941-adobe...,"(626, -39, 587, [[adopt, 1], [advanced, 1], [a...","(148, -14, 134, [[achieve, 12], [beat, 2], [ce...","{'positiveCount': 170, 'negativeCount': 16}",16.051282,10.571429
3,https://seekingalpha.com/article/2711985-analo...,"(622, -103, 519, [[ability, 2], [active, 3], [...","(215, -47, 168, [[certain, 4], [deliver, 8], [...","{'positiveCount': 248, 'negativeCount': 62}",6.038835,4.574468
4,https://seekingalpha.com/article/2885106-autom...,"(1149, -135, 1014, [[ability, 7], [advanced, 1...","(347, -50, 297, [[above, 1], [achieve, 3], [ce...","{'positiveCount': 411, 'negativeCount': 59}",8.511111,6.94
5,https://seekingalpha.com/article/2425545-autod...,"(604, -63, 541, [[ability, 1], [accept, 1], [a...","(162, -15, 147, [[accomplish, 1], [certain, 1]...","{'positiveCount': 196, 'negativeCount': 23}",9.587302,10.8
6,https://seekingalpha.com/article/2700495-autod...,"(500, -59, 441, [[accept, 1], [active, 2], [ad...","(146, -20, 126, [[best, 1], [certain, 2], [del...","{'positiveCount': 202, 'negativeCount': 29}",8.474576,7.3
7,https://seekingalpha.com/article/2957556-autod...,"(649, -63, 586, [[ability, 1], [accept, 1], [a...","(182, -19, 163, [[beat, 1], [certain, 3], [del...","{'positiveCount': 238, 'negativeCount': 30}",10.301587,9.578947
8,https://seekingalpha.com/article/3197526-autod...,"(591, -77, 514, [[accept, 1], [adopt, 3], [all...","(147, -23, 124, [[achieve, 1], [beat, 1], [cer...","{'positiveCount': 191, 'negativeCount': 38}",7.675325,6.391304
9,https://seekingalpha.com/article/3473396-autod...,"(618, -95, 523, [[accept, 1], [active, 1], [ad...","(151, -26, 125, [[achieve, 1], [certain, 1], [...","{'positiveCount': 198, 'negativeCount': 31}",6.505263,5.807692


### Transcripts whose AFINN sentiment ratio is smaller than their Henry tone ratio

In [26]:
# URLs of the transcripts where the AFINN ratio is below the Henry ratio.
transcripts.loc[transcripts['SS Ratio'] < transcripts['HTone Ratio'], 'url']

0     https://seekingalpha.com/article/1363811-us-ai...
1     https://seekingalpha.com/article/1129431-apple...
5     https://seekingalpha.com/article/2425545-autod...
15    https://seekingalpha.com/article/80219-ross-st...
17    https://seekingalpha.com/article/4051716-autod...
Name: url, dtype: object

### Transcripts whose AFINN sentiment ratio is more than twice their Henry tone ratio

In [27]:
# URLs of the transcripts where the AFINN ratio exceeds twice the Henry ratio.
transcripts.loc[(transcripts['SS Ratio'] / transcripts['HTone Ratio']) > 2, 'url']

10    https://seekingalpha.com/article/3700786-autod...
12    https://seekingalpha.com/article/3976697-autod...
16    https://seekingalpha.com/article/4027140-autod...
18    https://seekingalpha.com/article/4074589-autod...
Name: url, dtype: object

### Outlier discovery

In [28]:
def discovery(id_):
    """Print the scored words of one transcript, most frequent first.

    Reads the row labelled ``id_`` from the global ``transcripts`` frame and
    prints four listings in order: positive and negative words from the
    'Sentiment Score' column, then positive and negative words from the
    'Henry Tone' column. Each listing is a ``[word, count]`` array sorted by
    descending count.
    """
    reports = (
        ('Sentiment Score',
         '========================\nSentiment positive words with counts:\n',
         '========================\nSentiment negative words with counts:\n'),
        ('Henry Tone',
         '========================\nHenry positive words with counts:\n',
         '========================\nHenry negative words with counts:\n'),
    )
    for column, positive_banner, negative_banner in reports:
        scores = transcripts.loc[id_][column]
        # Positions 3 and 4 of a score tuple hold the positive and negative
        # word-count arrays.
        histograms = (scores[3], scores[4])
        for banner, histogram in zip((positive_banner, negative_banner), histograms):
            print(banner)
            by_count = sorted(histogram, key=lambda pair: int(pair[1]), reverse=True)
            print(np.array(by_count))

In [30]:
# Drill into the transcript at row label 18, one of the rows returned by the
# "AFINN ratio more than twice the Henry ratio" filter.
discovery(18)

Sentiment positive words with counts:

[['thank' '36']
 ['increase' '26']
 ['good' '24']
 ['want' '19']
 ['growth' '18']
 ['strong' '15']
 ['great' '13']
 ['opportunity' '12']
 ['big' '11']
 ['kind' '8']
 ['prepared' '8']
 ['share' '8']
 ['adopt' '7']
 ['clear' '6']
 ['like' '6']
 ['strength' '6']
 ['ability' '5']
 ['expand' '5']
 ['help' '5']
 ['positive' '5']
 ['significant' '5']
 ['competitive' '4']
 ['confident' '4']
 ['grant' '4']
 ['important' '4']
 ['progress' '4']
 ['solution' '4']
 ['success' '4']
 ['successful' '4']
 ['win' '4']
 ['benefit' '3']
 ['confidence' '3']
 ['ensure' '3']
 ['excellent' '3']
 ['interesting' '3']
 ['mature' '3']
 ['reach' '3']
 ['true' '3']
 ['achievable' '2']
 ['agree' '2']
 ['boost' '2']
 ['comfortable' '2']
 ['committed' '2']
 ['encourage' '2']
 ['engage' '2']
 ['extend' '2']
 ['highlight' '2']
 ['huge' '2']
 ['impressive' '2']
 ['loyalty' '2']
 ['meaningful' '2']
 ['nice' '2']
 ['perfect' '2']
 ['pleased' '2']
 ['yeah' '2']
 ['accept' '1']
 ['accom