In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
# Open a client for the MongoDB server on localhost:27017, then select the
# `python_import` database and its `earnings_transcript` collection.
client = MongoClient(host='localhost', port=27017)
db = client['python_import']
collection = db['earnings_transcript']

In [4]:
# Materialise at most 20 documents from the collection into a DataFrame.
# NOTE(review): the limit of 20 is hard-coded, presumably to keep this
# exploratory run small; raise or remove it for a full analysis.
transcripts = pd.DataFrame(list(collection.find().limit(20)))

In [5]:
transcripts.head(1)

Unnamed: 0,_id,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,tradingSymbol,url,wordSize
0,5937dba3082789410c746a1e,1130423,"{'positiveCount': 203, 'negativeCount': 75}",2013-04-23 21:30:07,Operator [Operator Instructions] We'll go firs...,"{'positiveCount': 117, 'negativeCount': 46}",7493,US Airways Group ( LCC ) Q1 2013 Earnings Call...,213007,AAL,https://seekingalpha.com/article/1363811-us-ai...,11310


In [6]:
nlp = English()

In [7]:
def tokenize(sent):
    """Collect the lemmas of the content words found in `sent`.

    `sent` is iterated token by token; each token must expose `pos_` and
    `lemma_`.  A lemma is kept when the token's `pos_` is NOUN, PROPN, ADJ or
    VERB and the lemma is not the '-PRON-' placeholder.  The lemmas are
    returned as a list, in the order the tokens appear.
    """
    content_tags = ["NOUN", "PROPN", "ADJ", "VERB"]
    lemmas = []
    for token in sent:
        if token.pos_ not in content_tags:
            continue
        lemma = token.lemma_
        if lemma == '-PRON-':
            continue
        lemmas.append(lemma)
    return lemmas

In [8]:
#def tokenize(sent):
#    return [tok.lemma_ for tok in sent]

# Load Henry Tone

In [9]:
# Load the Henry word list from the Excel sheet and lower-case its 'Word'
# column.
henry = pd.read_excel('henry_wordlist.xlsx')
henry['Word'] = henry['Word'].str.lower()
# The bare `henry.reset_index()` that used to follow was dropped: it returns a
# new frame (it does not modify `henry`) and that result was thrown away, so
# the statement had no effect.
henry.head(5)

Unnamed: 0,Word,Score
0,below,-1
1,challenge,-1
2,challenged,-1
3,challenges,-1
4,challenging,-1


In [10]:
def get_first_value(dic, word):
    """Look up `word` in a word-list frame and return its score.

    The rows of `dic` whose 'Word' column equals `word` are selected and the
    'Score' of the first such row is returned.  When no row matches, the
    result is 0.
    """
    matching_scores = dic.loc[dic['Word'] == word, 'Score']
    if matching_scores.empty:
        return 0
    return matching_scores.iloc[0]

In [11]:
get_first_value(henry, 'good')

1

In [31]:
get_first_value(henry, 'worse')

-1

# Load AFINN-111

In [12]:
afinn = pd.read_csv(filepath_or_buffer='AFINN-111.txt', sep='\t', header=None)

In [13]:
# Label the two header-less columns of the AFINN frame.  Only the column
# labels are mapped: the former `index=str` argument also converted every
# integer row label to a string, and `inplace=True` mutated the frame across
# cells.  Re-assigning the renamed copy keeps the cell safe to re-run.
afinn = afinn.rename(columns={0: "Word", 1: "Score"})

In [14]:
afinn.tail(5)

Unnamed: 0,Word,Score
2472,yucky,-2
2473,yummy,3
2474,zealot,-2
2475,zealots,-2
2476,zealous,2


In [15]:
get_first_value(afinn, 'shit')

-4

In [16]:
# Run the spaCy pipeline (`nlp`) over every transcript's raw text and keep the
# parsed result in a new 'NLP' column.
transcripts['NLP'] = transcripts['rawText'].apply(lambda val: nlp(val))

In [17]:
# Reduce each parsed transcript to its list of content-word lemmas.
transcripts['tokens'] = transcripts['NLP'].apply(tokenize)

In [18]:
from scipy.stats import itemfreq

In [19]:
def _token_frequencies(tokens):
    """Return an (n, 2) string array of [token, count] rows, sorted by token.

    Replacement for ``scipy.stats.itemfreq`` (removed in SciPy 1.3) that keeps
    the same layout: one row per distinct token, with the count stored as a
    string in the second column.  An empty input yields an array of shape
    (0, 2).
    """
    values, counts = np.unique(tokens, return_counts=True)
    return np.column_stack((values.astype(str), counts.astype(str)))


def build_score_for_tokens(tokens, dictionary):
    """Score a list of tokens against a word-list frame.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to score; each one is looked up by exact match.
    dictionary : pandas.DataFrame
        Frame with a 'Word' and a 'Score' column.  When a word occurs more
        than once, the score of its first row is used (the same rule as
        `get_first_value`).  Words that are not listed score 0.

    Returns
    -------
    tuple
        ``(score_pos, score_neg, score_pos + score_neg, pos_freq, neg_freq)``
        where `score_pos` / `score_neg` are the sums of the positive /
        negative scores, and `pos_freq` / `neg_freq` are 2-D arrays of
        [token, count] rows for the tokens that scored positive / negative.
    """
    # Build the word -> score table once per call instead of scanning the
    # whole frame for every token.  `.values` keeps the scores as NumPy
    # scalars, so the sums have the same type as before.
    scores = {}
    for word, score in zip(dictionary['Word'].values, dictionary['Score'].values):
        scores.setdefault(word, score)  # first occurrence wins

    score_pos, score_neg = 0, 0
    token_pos, token_neg = [], []
    for token in tokens:
        temp_score = scores.get(token, 0)
        if temp_score > 0:
            score_pos += temp_score
            token_pos.append(token)
        elif temp_score < 0:
            score_neg += temp_score
            token_neg.append(token)
    return (
        score_pos,
        score_neg,
        score_pos + score_neg,
        _token_frequencies(token_pos),
        _token_frequencies(token_neg),
    )

In [20]:
# Score every transcript's tokens against the AFINN word list.
transcripts['Sentiment Score'] = transcripts['tokens'].apply(build_score_for_tokens, args=(afinn,))

In [21]:
# Score every transcript's tokens against the Henry word list.
transcripts['Henry Tone'] = transcripts['tokens'].apply(build_score_for_tokens, args=(henry,))

In [22]:
# Ratio of the positive score (element 0) to the magnitude of the negative
# score (element 1) of each 'Sentiment Score' tuple.  A zero negative score
# would divide by zero, so it is handled explicitly: +inf when there is some
# positive score, NaN when both scores are zero.
transcripts['SS Ratio'] = transcripts['Sentiment Score'].apply(
    lambda row: row[0] / -row[1] if row[1] else (np.inf if row[0] else np.nan)
)

In [23]:
# Ratio of the positive score (element 0) to the magnitude of the negative
# score (element 1) of each 'Henry Tone' tuple.  A zero negative score would
# divide by zero, so it is handled explicitly: +inf when there is some
# positive score, NaN when both scores are zero.
transcripts['HTone Ratio'] = transcripts['Henry Tone'].apply(
    lambda row: row[0] / -row[1] if row[1] else (np.inf if row[0] else np.nan)
)

In [24]:
transcripts[['url', 'Sentiment Score', 'Henry Tone', 'h_tone', 'SS Ratio', 'HTone Ratio']]

Unnamed: 0,url,Sentiment Score,Henry Tone,h_tone,SS Ratio,HTone Ratio
0,https://seekingalpha.com/article/1363811-us-ai...,"(548, -182, 366, [[ability, 1], [agree, 1], [a...","(196, -69, 127, [[achieve, 2], [better, 2], [c...","{'positiveCount': 203, 'negativeCount': 75}",3.010989,2.84058
1,https://seekingalpha.com/article/1129431-apple...,"(697, -113, 584, [[ability, 1], [accomplish, 2...","(234, -32, 202, [[accomplish, 2], [achieve, 10...","{'positiveCount': 236, 'negativeCount': 30}",6.168142,7.3125
2,https://seekingalpha.com/article/1287941-adobe...,"(661, -45, 616, [[adopt, 1], [advanced, 1], [a...","(174, -18, 156, [[above, 1], [achieve, 12], [b...","{'positiveCount': 170, 'negativeCount': 16}",14.688889,9.666667
3,https://seekingalpha.com/article/2711985-analo...,"(671, -113, 558, [[ability, 2], [active, 3], [...","(256, -66, 190, [[above, 2], [better, 1], [cer...","{'positiveCount': 248, 'negativeCount': 62}",5.938053,3.878788
4,https://seekingalpha.com/article/2885106-autom...,"(1244, -152, 1092, [[ability, 7], [advanced, 1...","(406, -57, 349, [[above, 1], [achieve, 3], [be...","{'positiveCount': 411, 'negativeCount': 59}",8.184211,7.122807
5,https://seekingalpha.com/article/2425545-autod...,"(683, -72, 611, [[ability, 1], [accept, 1], [a...","(202, -25, 177, [[above, 1], [accomplish, 1], ...","{'positiveCount': 196, 'negativeCount': 23}",9.486111,8.08
6,https://seekingalpha.com/article/2700495-autod...,"(593, -65, 528, [[aboard, 1], [accept, 1], [ac...","(195, -29, 166, [[best, 1], [better, 1], [cert...","{'positiveCount': 202, 'negativeCount': 29}",9.123077,6.724138
7,https://seekingalpha.com/article/2957556-autod...,"(775, -68, 707, [[ability, 1], [accept, 1], [a...","(233, -27, 206, [[beat, 1], [better, 2], [cert...","{'positiveCount': 238, 'negativeCount': 30}",11.397059,8.62963
8,https://seekingalpha.com/article/3197526-autod...,"(731, -90, 641, [[accept, 1], [adopt, 3], [all...","(193, -39, 154, [[above, 2], [achieve, 1], [be...","{'positiveCount': 191, 'negativeCount': 38}",8.122222,4.948718
9,https://seekingalpha.com/article/3473396-autod...,"(755, -120, 635, [[accept, 1], [active, 1], [a...","(196, -31, 165, [[achieve, 1], [better, 1], [c...","{'positiveCount': 198, 'negativeCount': 31}",6.291667,6.322581


### Transcripts whose sentiment score ratio is smaller than the Henry tone ratio

In [25]:
transcripts[transcripts['SS Ratio'] < transcripts['HTone Ratio']]['url']

1    https://seekingalpha.com/article/1129431-apple...
9    https://seekingalpha.com/article/3473396-autod...
Name: url, dtype: object

### Transcripts whose sentiment score ratio is more than twice the Henry tone ratio

In [33]:
transcripts[(transcripts['SS Ratio'] / transcripts['HTone Ratio']) > 2]['url']

10    https://seekingalpha.com/article/3700786-autod...
12    https://seekingalpha.com/article/3976697-autod...
13    https://seekingalpha.com/article/4002268-autod...
16    https://seekingalpha.com/article/4027140-autod...
18    https://seekingalpha.com/article/4074589-autod...
Name: url, dtype: object

### Outlier discovery

In [28]:
def discovery(id_):
    """Print the scored-word histograms of one transcript.

    The row labelled `id_` is read from the global `transcripts` frame.  For
    its 'Sentiment Score' entry and then its 'Henry Tone' entry, element 3
    (positive words) and element 4 (negative words) are printed under a
    heading, each as an array of [word, count] rows sorted by descending
    count.
    """
    record = transcripts.loc[id_]
    sections = (
        ('Sentiment Score', (
            '========================\nSentiment positive words with counts:\n',
            '========================\nSentiment negative words with counts:\n',
        )),
        ('Henry Tone', (
            '========================\nHenry positive words with counts:\n',
            '========================\nHenry negative words with counts:\n',
        )),
    )
    for column, headings in sections:
        scored = record[column]
        histograms = (scored[3], scored[4])
        for heading, histogram in zip(headings, histograms):
            ranked = sorted(histogram, key=lambda pair: int(pair[1]), reverse=True)
            print(heading)
            print(np.array(ranked))

In [30]:
discovery(16)

Sentiment positive words with counts:

[['like' '36']
 ['increase' '28']
 ['thank' '27']
 ['good' '19']
 ['yes' '14']
 ['kind' '13']
 ['growth' '12']
 ['important' '11']
 ['big' '10']
 ['strong' '10']
 ['reach' '8']
 ['significant' '8']
 ['great' '7']
 ['positive' '7']
 ['prepared' '7']
 ['expand' '6']
 ['extend' '6']
 ['fair' '6']
 ['pleased' '5']
 ['want' '5']
 ['attract' '4']
 ['benefit' '4']
 ['free' '4']
 ['help' '4']
 ['mature' '4']
 ['share' '4']
 ['worth' '4']
 ['ability' '3']
 ['agreement' '3']
 ['highlight' '3']
 ['interested' '3']
 ['natural' '3']
 ['please' '3']
 ['pretty' '3']
 ['success' '3']
 ['active' '2']
 ['certain' '2']
 ['commit' '2']
 ['confidence' '2']
 ['confident' '2']
 ['excited' '2']
 ['focused' '2']
 ['healthy' '2']
 ['helpful' '2']
 ['hopefully' '2']
 ['matter' '2']
 ['meaningful' '2']
 ['motivate' '2']
 ['nice' '2']
 ['opportunity' '2']
 ['perfect' '2']
 ['progress' '2']
 ['responsible' '2']
 ['accept' '1']
 ['accomplish' '1']
 ['adopt' '1']
 ['advanced' '1