In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
# Connect to the MongoDB server on localhost:27017 and select the
# `earnings_transcript` collection of the `python_import` database.
client = MongoClient('localhost', 27017)
db = client.python_import
collection = db.earnings_transcript

In [4]:
# Materialise at most 20 documents from the collection into a DataFrame
# (one row per document).
transcripts = pd.DataFrame(list(collection.find().limit(20)))

In [5]:
# Show the first record to inspect the available columns.
transcripts.head(1)

Unnamed: 0,_id,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,tradingSymbol,url,wordSize
0,5937dba3082789410c746a1e,1130423,"{'positiveCount': 203, 'negativeCount': 75}",2013-04-23 21:30:07,Operator [Operator Instructions] We'll go firs...,"{'positiveCount': 117, 'negativeCount': 46}",7493,US Airways Group ( LCC ) Q1 2013 Earnings Call...,213007,AAL,https://seekingalpha.com/article/1363811-us-ai...,11310


In [6]:
# Instantiate the spaCy English pipeline; it is applied to each transcript's
# raw text further down.
nlp = English()

In [7]:
def tokenize(sent):
    """Return the lemmas of the content words in `sent`.

    A token is kept when its `pos_` is one of NOUN, PROPN, ADJ or VERB and
    its `lemma_` is not the '-PRON-' placeholder.

    Parameters
    ----------
    sent : iterable of tokens exposing `pos_` and `lemma_` attributes.

    Returns
    -------
    list
        The `lemma_` of every kept token, in input order.
    """
    kept_pos = ["NOUN", "PROPN", "ADJ", "VERB"]
    lemmas = []
    for tok in sent:
        if tok.pos_ not in kept_pos:
            continue
        if tok.lemma_ == '-PRON-':
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [8]:
#def tokenize(sent):
#    return [tok.lemma_ for tok in sent]

# Load Henry Tone

In [9]:
# Load the Henry word list; the rest of the notebook reads its 'Word' and
# 'Score' columns.
henry = pd.read_excel('henry_wordlist.xlsx')
# Lower-case every entry of the 'Word' column.
henry['Word'] = henry['Word'].str.lower()
# reset_index() returns a new frame rather than modifying in place, so the
# result has to be assigned; drop=True discards the old labels instead of
# inserting them as an extra column.
henry = henry.reset_index(drop=True)
henry.head(5)

Unnamed: 0,Word,Score
0,below,-1
1,challenge,-1
2,challenged,-1
3,challenges,-1
4,challenging,-1


In [10]:
def get_first_value(dic, word):
    """Look up the score of `word` in a word-list DataFrame.

    Parameters
    ----------
    dic : DataFrame with 'Word' and 'Score' columns.
    word : value compared for equality against the 'Word' column.

    Returns
    -------
    The 'Score' of the first row whose 'Word' equals `word`, or 0 when no
    row matches.
    """
    matching_scores = dic.loc[dic['Word'] == word, 'Score']
    return matching_scores.iloc[0] if len(matching_scores) > 0 else 0

In [11]:
# Spot check: look up the Henry score of 'good'.
get_first_value(henry, 'good')

1

In [12]:
# Spot check: look up the Henry score of 'worse'.
get_first_value(henry, 'worse')

-1

# Load AFINN-111

In [13]:
# AFINN-111 is read as a tab-separated file without a header row, so the
# columns are initially labelled 0 and 1.
afinn = pd.read_csv(filepath_or_buffer='AFINN-111.txt', sep='\t', header=None)

In [14]:
# Rename columns 0/1 to 'Word'/'Score' so the frame has the same layout as
# the Henry list and works with get_first_value. `index=str` additionally
# maps every row label through str().
# NOTE(review): nothing visible in this notebook needs string row labels —
# confirm whether `index=str` is intentional.
afinn.rename(index=str, columns={0: "Word", 1: "Score"}, inplace=True)

In [15]:
# Show the last five AFINN entries to check the renamed columns.
afinn.tail(5)

Unnamed: 0,Word,Score
2472,yucky,-2
2473,yummy,3
2474,zealot,-2
2475,zealots,-2
2476,zealous,2


In [16]:
# Spot check: the same lookup helper also works on the AFINN frame.
get_first_value(afinn, 'shit')

-4

In [17]:
# Run the spaCy pipeline over each transcript's raw text and store the
# result per row in the 'NLP' column.
transcripts['NLP'] = transcripts['rawText'].apply(lambda val: nlp(val))

In [18]:
# Reduce every parsed transcript to its list of content-word lemmas.
transcripts['tokens'] = transcripts['NLP'].apply(tokenize)

In [19]:
from scipy.stats import itemfreq

In [None]:
def _token_frequencies(tokens):
    """Return a (k, 2) string array of [token, count] rows, sorted by token."""
    from collections import Counter  # local import keeps this cell self-contained

    rows = [[word, str(count)] for word, count in sorted(Counter(tokens).items())]
    # reshape keeps the two-column layout even when there are no tokens.
    return np.array(rows, dtype=str).reshape(-1, 2)


def build_score_for_tokens(tokens, dictionary):
    """Score a token list against a word-list DataFrame.

    Parameters
    ----------
    tokens : iterable of hashable tokens (strings in this notebook).
    dictionary : DataFrame with 'Word' and 'Score' columns. When a word is
        listed more than once, the score of its first row is used.

    Returns
    -------
    tuple
        (sum of positive scores,
         sum of negative scores,
         their total,
         [token, count] array of the positively scored tokens,
         [token, count] array of the negatively scored tokens)

        Tokens that are not in the dictionary, or whose score is 0, are
        ignored. Counts in the frequency arrays are stored as strings and
        the rows are sorted by token.
    """
    # Build the word -> score lookup once instead of scanning the whole
    # DataFrame for every token. Iterating the underlying arrays keeps the
    # scores as NumPy scalars.
    first_entries = dictionary.drop_duplicates(subset='Word', keep='first')
    lookup = dict(zip(first_entries['Word'].values, first_entries['Score'].values))

    score_pos, score_neg = 0, 0
    token_pos, token_neg = [], []
    for token in tokens:
        temp_score = lookup.get(token, 0)
        if temp_score > 0:
            score_pos += temp_score
            token_pos.append(token)
        elif temp_score < 0:
            score_neg += temp_score
            token_neg.append(token)
    # Frequencies are counted locally rather than with the deprecated
    # scipy.stats.itemfreq.
    return (score_pos, score_neg, score_pos + score_neg,
            _token_frequencies(token_pos), _token_frequencies(token_neg))

In [None]:
# Score every transcript's tokens against the AFINN word list.
transcripts['Sentiment Score'] = transcripts['tokens'].apply(build_score_for_tokens, args=(afinn,))

In [None]:
# Score every transcript's tokens against the Henry word list.
transcripts['Henry Tone'] = transcripts['tokens'].apply(build_score_for_tokens, args=(henry,))

In [None]:
# Ratio of the positive to the (negated) negative AFINN score per transcript.
# Both operands are cast to float so the division is a true division on any
# Python version, and a transcript without negative hits yields inf (nan when
# it has no positive hits either) instead of raising ZeroDivisionError.
with np.errstate(divide='ignore', invalid='ignore'):
    transcripts['SS Ratio'] = transcripts['Sentiment Score'].apply(
        lambda row: np.float64(row[0]) / np.float64(-row[1])
    )

In [None]:
# Ratio of the positive to the (negated) negative Henry score per transcript.
# Both operands are cast to float so the division is a true division on any
# Python version, and a transcript without negative hits yields inf (nan when
# it has no positive hits either) instead of raising ZeroDivisionError.
with np.errstate(divide='ignore', invalid='ignore'):
    transcripts['HTone Ratio'] = transcripts['Henry Tone'].apply(
        lambda row: np.float64(row[0]) / np.float64(-row[1])
    )

In [None]:
# Side-by-side view of the computed scores and ratios next to the `h_tone`
# value that came with each stored document.
transcripts[['url', 'Sentiment Score', 'Henry Tone', 'h_tone', 'SS Ratio', 'HTone Ratio']]

### Transcripts whose AFINN sentiment ratio is smaller than their Henry tone ratio

In [None]:
# URLs of the transcripts whose 'SS Ratio' is below their 'HTone Ratio'.
transcripts.loc[transcripts['SS Ratio'].lt(transcripts['HTone Ratio']), 'url']

### Transcripts whose AFINN sentiment ratio is more than twice their Henry tone ratio

In [None]:
# URLs of the transcripts whose 'SS Ratio' exceeds twice their 'HTone Ratio'.
transcripts.loc[transcripts['SS Ratio'].div(transcripts['HTone Ratio']).gt(2), 'url']

### Outlier discovery

In [None]:
def _print_words_by_count(header, histogram):
    """Print `header`, then the rows of `histogram` by descending count."""
    print(header)
    # Counts go through int() so that rows are ranked numerically even when
    # the histogram stores its counts as strings.
    print(np.array(sorted(histogram, key=lambda row: int(row[1]), reverse=True)))


def discovery(id_, frame=None):
    """Print the scored words of one transcript, most frequent first.

    For the 'Sentiment Score' column and then the 'Henry Tone' column, the
    positive-word histogram (element 3 of the stored tuple) and the
    negative-word histogram (element 4) are printed, each sorted by
    descending count.

    Parameters
    ----------
    id_ : row label, resolved with `.loc`.
    frame : optional object to read the row from; defaults to the notebook's
        global `transcripts` DataFrame.

    Returns
    -------
    None. The function only prints.
    """
    if frame is None:
        frame = transcripts
    # Fetch the row once and reuse it for both scoring columns.
    record = frame.loc[id_]
    for column, label in (('Sentiment Score', 'Sentiment'), ('Henry Tone', 'Henry')):
        scores = record[column]
        _print_words_by_count(
            '========================\n' + label + ' positive words with counts:\n',
            scores[3])
        _print_words_by_count(
            '========================\n' + label + ' negative words with counts:\n',
            scores[4])

In [None]:
# Inspect the scored words of the transcript stored under row label 16.
discovery(16)