In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
# Connect to the MongoDB server on this machine and pick the database and
# collection that hold the earnings-call transcripts.
client = MongoClient(host='localhost', port=27017)
db = client['python_import']
collection = db['earnings_transcript']

In [4]:
# Materialise at most 100 documents from the collection into a DataFrame.
transcripts = pd.DataFrame(list(collection.find().limit(100)))

In [5]:
transcripts.head(1)

Unnamed: 0,_id,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,tradingSymbol,url,wordSize
0,5937dba3082789410c746a1e,1130423,"{'positiveCount': 203, 'negativeCount': 75}",2013-04-23 21:30:07,Operator [Operator Instructions] We'll go firs...,"{'positiveCount': 117, 'negativeCount': 46}",7493,US Airways Group ( LCC ) Q1 2013 Earnings Call...,213007,AAL,https://seekingalpha.com/article/1363811-us-ai...,11310


In [6]:
# Instantiate the spaCy English pipeline; it is applied to each transcript's
# raw text further down.
nlp = English()

In [7]:
def tokenize(sent):
    """Return the lemmas of the content words in ``sent``.

    A token is kept when its ``pos_`` is NOUN, PROPN, ADJ or VERB and its
    ``lemma_`` is not the ``'-PRON-'`` placeholder.

    Parameters
    ----------
    sent : iterable
        Tokens exposing ``pos_`` and ``lemma_`` attributes.

    Returns
    -------
    list
        The ``lemma_`` of every kept token, in input order.
    """
    kept_pos = ("NOUN", "PROPN", "ADJ", "VERB")
    lemmas = []
    for tok in sent:
        if tok.pos_ not in kept_pos:
            continue
        lemma = tok.lemma_
        if lemma == '-PRON-':
            continue
        lemmas.append(lemma)
    return lemmas

In [26]:
def tokenize_2(sent):
    """Return the lower-cased string form of every token in ``sent``.

    No filtering is applied: each element is converted with ``str()`` and
    lower-cased, preserving input order.
    """
    lowered = []
    for tok in sent:
        lowered.append(str(tok).lower())
    return lowered

# Load Henry Tone

In [9]:
# Load the Henry word list and normalise every word to lower case.
henry = pd.read_excel('henry_wordlist.xlsx')
henry['Word'] = henry['Word'].str.lower()
# reset_index() returns a new frame rather than modifying in place; assign the
# result (dropping the old index) so the statement actually takes effect.
henry = henry.reset_index(drop=True)
henry.head(5)

Unnamed: 0,Word,Score
0,below,-1
1,challenge,-1
2,challenged,-1
3,challenges,-1
4,challenging,-1


In [10]:
def get_first_value(dic, word):
    """Look up the score of ``word`` in a word/score table.

    Parameters
    ----------
    dic : pandas.DataFrame
        Table with a ``'Word'`` and a ``'Score'`` column.
    word : object
        Value compared against the ``'Word'`` column.

    Returns
    -------
    The ``'Score'`` of the first matching row, or ``0`` when no row matches.
    """
    matches = dic.loc[dic['Word'] == word, 'Score']
    return 0 if matches.empty else matches.iloc[0]

In [11]:
get_first_value(henry, 'good')

1

In [12]:
get_first_value(henry, 'worse')

-1

# Load AFINN-111

In [13]:
# Read the tab-separated AFINN-111 file; it has no header row, so the columns
# are initially labelled 0 and 1.
afinn = pd.read_csv(filepath_or_buffer='AFINN-111.txt', sep='\t', header=None)

In [14]:
# Give the two columns the 'Word' / 'Score' names that get_first_value expects.
# NOTE(review): index=str also converts the row labels to strings - confirm
# that is intended; the lookups in this notebook only use the columns.
afinn.rename(index=str, columns={0: "Word", 1: "Score"}, inplace=True)

In [15]:
afinn.tail(5)

Unnamed: 0,Word,Score
2472,yucky,-2
2473,yummy,3
2474,zealot,-2
2475,zealots,-2
2476,zealous,2


In [16]:
get_first_value(afinn, 'shit')

-4

In [47]:
# For each Henry word, record the score the AFINN list assigns to the same
# word (0 when AFINN does not contain it).
henry['Score 2'] = henry['Word'].apply(lambda word: get_first_value(afinn, word))

In [17]:
# Run the spaCy pipeline over the raw text of every transcript and keep the
# parsed result in its own column.
transcripts['NLP'] = transcripts['rawText'].apply(lambda val: nlp(val))

In [18]:
# Reduce each parsed transcript to its list of content-word lemmas.
transcripts['tokens'] = transcripts['NLP'].apply(tokenize)

In [19]:
from scipy.stats import itemfreq

In [37]:
# Recompute each word's score by actually calling get_first_value. The lambda
# used to return the function object itself, which filled 'Score' with
# callables and made every later numeric comparison on the column fail.
henry['Score'] = henry['Word'].apply(lambda word: get_first_value(henry, word))

In [29]:
def build_score_for_tokens(tokens, dictionary):
    """Score a list of tokens against a word/score table.

    Each token is looked up with ``get_first_value``; positive and negative
    scores are accumulated separately and the matching tokens are counted.

    Parameters
    ----------
    tokens : iterable
        Tokens to score.
    dictionary : pandas.DataFrame
        Table with ``'Word'`` and ``'Score'`` columns, as consumed by
        ``get_first_value``.

    Returns
    -------
    tuple
        ``(positive_total, negative_total, net_total, positive_counts,
        negative_counts)``. The two ``*_counts`` entries are 2-D arrays with
        one ``[token, count]`` row per distinct matched token, both stored as
        strings (the layout ``scipy.stats.itemfreq`` produced for string
        input); they have shape ``(0, 2)`` when nothing matched.
    """
    def _item_counts(items):
        # np.unique(..., return_counts=True) is the documented replacement for
        # scipy.stats.itemfreq, which was deprecated and later removed.
        values, counts = np.unique(items, return_counts=True)
        return np.column_stack((values.astype(str), counts.astype(str)))

    score_pos, score_neg = 0, 0
    token_pos, token_neg = [], []
    for token in tokens:
        temp_score = get_first_value(dictionary, token)
        if temp_score > 0:
            score_pos += temp_score
            token_pos.append(token)
        elif temp_score < 0:
            score_neg += temp_score
            token_neg.append(token)
        # A score of exactly 0 means "not in the dictionary" and is ignored.
    return (
        score_pos,
        score_neg,
        score_pos + score_neg,
        _item_counts(token_pos),
        _item_counts(token_neg),
    )

In [38]:
# Score every transcript's tokens against the AFINN list.
transcripts['Sentiment Score'] = transcripts['tokens'].apply(
    lambda tokens: build_score_for_tokens(tokens, dictionary=afinn)
)

In [39]:
# Score every transcript's tokens against the Henry word list.
transcripts['Henry Tone'] = transcripts['tokens'].apply(
    lambda tokens: build_score_for_tokens(tokens, dictionary=henry)
)

In [40]:
# Ratio of the positive total to the magnitude of the negative total (AFINN).
# A transcript with no negative hits has a negative total of 0; return NaN
# there instead of dividing by zero.
transcripts['SS Ratio'] = transcripts['Sentiment Score'].apply(
    lambda row: row[0] / (-row[1]) if row[1] != 0 else np.nan
)

In [41]:
# Ratio of the positive total to the magnitude of the negative total (Henry).
# A transcript with no negative hits has a negative total of 0; return NaN
# there instead of dividing by zero.
transcripts['HTone Ratio'] = transcripts['Henry Tone'].apply(
    lambda row: row[0] / (-row[1]) if row[1] != 0 else np.nan
)

In [42]:
# Show only the first rows: rendering the whole frame floods the notebook
# output without adding information.
transcripts[['url', 'Sentiment Score', 'Henry Tone', 'h_tone', 'SS Ratio', 'HTone Ratio']].head(10)

Unnamed: 0,url,Sentiment Score,Henry Tone,h_tone,SS Ratio,HTone Ratio
0,https://seekingalpha.com/article/1363811-us-ai...,"(486, -167, 319, [[ability -> 2, 1], [agree ->...","(135, -44, 91, [[achieve -> 1, 2], [certain ->...","{'positiveCount': 203, 'negativeCount': 75}",2.910180,3.068182
1,https://seekingalpha.com/article/1129431-apple...,"(645, -97, 548, [[ability -> 2, 1], [accomplis...","(203, -20, 183, [[accomplish -> 1, 2], [achiev...","{'positiveCount': 236, 'negativeCount': 30}",6.649485,10.150000
2,https://seekingalpha.com/article/1287941-adobe...,"(626, -39, 587, [[adopt -> 1, 1], [advanced ->...","(148, -14, 134, [[achieve -> 1, 12], [beat -> ...","{'positiveCount': 170, 'negativeCount': 16}",16.051282,10.571429
3,https://seekingalpha.com/article/2711985-analo...,"(622, -103, 519, [[ability -> 2, 2], [active -...","(215, -47, 168, [[certain -> 1, 4], [deliver -...","{'positiveCount': 248, 'negativeCount': 62}",6.038835,4.574468
4,https://seekingalpha.com/article/2885106-autom...,"(1149, -135, 1014, [[ability -> 2, 7], [advanc...","(347, -50, 297, [[above -> 1, 1], [achieve -> ...","{'positiveCount': 411, 'negativeCount': 59}",8.511111,6.940000
5,https://seekingalpha.com/article/2425545-autod...,"(604, -63, 541, [[ability -> 2, 1], [accept ->...","(162, -15, 147, [[accomplish -> 1, 1], [certai...","{'positiveCount': 196, 'negativeCount': 23}",9.587302,10.800000
6,https://seekingalpha.com/article/2700495-autod...,"(500, -59, 441, [[accept -> 1, 1], [active -> ...","(146, -20, 126, [[best -> 1, 1], [certain -> 1...","{'positiveCount': 202, 'negativeCount': 29}",8.474576,7.300000
7,https://seekingalpha.com/article/2957556-autod...,"(649, -63, 586, [[ability -> 2, 1], [accept ->...","(182, -19, 163, [[beat -> 1, 1], [certain -> 1...","{'positiveCount': 238, 'negativeCount': 30}",10.301587,9.578947
8,https://seekingalpha.com/article/3197526-autod...,"(591, -77, 514, [[accept -> 1, 1], [adopt -> 1...","(147, -23, 124, [[achieve -> 1, 1], [beat -> 1...","{'positiveCount': 191, 'negativeCount': 38}",7.675325,6.391304
9,https://seekingalpha.com/article/3473396-autod...,"(618, -95, 523, [[accept -> 1, 1], [active -> ...","(151, -26, 125, [[achieve -> 1, 1], [certain -...","{'positiveCount': 198, 'negativeCount': 31}",6.505263,5.807692


### Smaller sentiment score ratio

In [43]:
# URLs of transcripts whose AFINN ratio is below 75% of their Henry ratio.
transcripts.loc[(transcripts['SS Ratio'] / transcripts['HTone Ratio']) < 0.75, 'url']

1     https://seekingalpha.com/article/1129431-apple...
24    https://seekingalpha.com/article/288456-autode...
33    https://seekingalpha.com/article/1856531-autod...
35    https://seekingalpha.com/article/2221833-autod...
90    https://seekingalpha.com/article/185247-amazon...
Name: url, dtype: object

### Much bigger sentiment score ratio

In [44]:
# URLs of transcripts whose AFINN ratio is more than 2.5x their Henry ratio.
transcripts.loc[(transcripts['SS Ratio'] / transcripts['HTone Ratio']) > 2.5, 'url']

16    https://seekingalpha.com/article/4027140-autod...
51    https://seekingalpha.com/article/171250-automa...
59    https://seekingalpha.com/article/123193-autode...
70    https://seekingalpha.com/article/156040-autode...
95    https://seekingalpha.com/article/1509222-adobe...
Name: url, dtype: object

### Outlier discovery

In [45]:
def discovery(id_):
    """Print the matched words of one transcript, most frequent first.

    Reads the row labelled ``id_`` from the module-level ``transcripts`` frame
    and prints four listings: positive and negative matches from the
    'Sentiment Score' column, then positive and negative matches from the
    'Henry Tone' column. Each listing is sorted by descending count.

    Parameters
    ----------
    id_ : index label
        Label of the row in ``transcripts`` to inspect.
    """
    def _by_count_desc(histo):
        # Each entry is an (item, count) pair; the count is passed through
        # int() before comparing.
        return np.array(sorted(histo, key=lambda entry: int(entry[1]), reverse=True))

    record = transcripts.loc[id_]

    sentiment = record['Sentiment Score']
    sentiment_pos, sentiment_neg = sentiment[3], sentiment[4]
    print('========================\nSentiment positive words with counts:\n')
    print(_by_count_desc(sentiment_pos))
    print('========================\nSentiment negative words with counts:\n')
    print(_by_count_desc(sentiment_neg))

    henry_tone = record['Henry Tone']
    henry_pos, henry_neg = henry_tone[3], henry_tone[4]
    print('========================\nHenry positive words with counts:\n')
    print(_by_count_desc(henry_pos))
    print('========================\nHenry negative words with counts:\n')
    print(_by_count_desc(henry_neg))

In [46]:
discovery(16)

Sentiment positive words with counts:

[['increase -> 1' '28']
 ['thank -> 2' '27']
 ['good -> 3' '19']
 ['like -> 2' '14']
 ['growth -> 2' '12']
 ['important -> 2' '11']
 ['big -> 1' '10']
 ['strong -> 2' '10']
 ['kind -> 2' '8']
 ['reach -> 1' '8']
 ['significant -> 1' '8']
 ['great -> 3' '7']
 ['positive -> 2' '7']
 ['prepared -> 1' '7']
 ['expand -> 1' '6']
 ['extend -> 1' '6']
 ['fair -> 2' '6']
 ['pleased -> 3' '5']
 ['want -> 1' '5']
 ['attract -> 1' '4']
 ['benefit -> 2' '4']
 ['free -> 1' '4']
 ['help -> 2' '4']
 ['mature -> 2' '4']
 ['share -> 1' '4']
 ['worth -> 2' '4']
 ['ability -> 2' '3']
 ['agreement -> 1' '3']
 ['highlight -> 2' '3']
 ['interested -> 2' '3']
 ['natural -> 1' '3']
 ['success -> 2' '3']
 ['active -> 1' '2']
 ['certain -> 1' '2']
 ['commit -> 1' '2']
 ['confidence -> 2' '2']
 ['confident -> 2' '2']
 ['excited -> 3' '2']
 ['focused -> 2' '2']
 ['healthy -> 2' '2']
 ['helpful -> 2' '2']
 ['matter -> 1' '2']
 ['meaningful -> 2' '2']
 ['motivate -> 1' '2']
 ['