In [1]:
import spacy
nlp = spacy.load('en')

In [2]:
import pymongo
from pymongo import MongoClient

In [3]:
client = MongoClient('localhost', 27017)
db = client.python_import

In [5]:
collection = db.earnings_transcript

In [6]:
transcript = collection.find_one()

In [7]:
transcript

{'_id': ObjectId('58e57a773740f515244494fa'),
 'date_number': 1141117,
 'h_tone': {'negativeCount': 0, 'positiveCount': 0},
 'publishDate': '2014-11-17T22:25:00Z',
 'qAndAText': '',
 'q_and_a_h_tone': {'negativeCount': 0, 'positiveCount': 0},
 'q_and_a_wordSize': 0,
 'rawText': 'The following audio is from a conference call that will begin on November 17, 2014 at 17:30 PM ET. The audio will stream live while the call is active, and can be replayed upon its completion.',
 'time_number': 222500,
 'tradingSymbol': 'A',
 'url': 'https://seekingalpha.com/article/2675895-agilent-technologies-a-q1-2014-results-earnings-call-webcast',
 'wordSize': 40}

In [12]:
doc = nlp(transcript['rawText'])

In [16]:
[str(word.lemma_) for word in doc]

['the',
 'follow',
 'audio',
 'be',
 'from',
 'a',
 'conference',
 'call',
 'that',
 'will',
 'begin',
 'on',
 'november',
 '17',
 ',',
 '2014',
 'at',
 '17:30',
 'pm',
 'et',
 '.',
 'the',
 'audio',
 'will',
 'stream',
 'live',
 'while',
 'the',
 'call',
 'be',
 'active',
 ',',
 'and',
 'can',
 'be',
 'replay',
 'upon',
 '-PRON-',
 'completion',
 '.']

In [14]:
for sent in doc.sents:
    for token in sent:
        print(token)

The
following
audio
is
from
a
conference
call
that
will
begin
on
November
17
,
2014
at
17:30
PM
ET
.
The
audio
will
stream
live
while
the
call
is
active
,
and
can
be
replayed
upon
its
completion
.


In [9]:
for sent in doc.sents:
    print('=> ', sent)

=>  The following audio is from a conference call that will begin on November 17, 2014 at 17:30 PM ET.
=>  The audio will stream live while the call is active, and can be replayed upon its completion.


In [10]:
token = doc[0]
print(token)

The


In [11]:
i = 0
for sent in doc.sents:
    if i == 3:
        for token in sent:
            print('{} - {}'.format(token, token.pos_))
        break
    i+=1

In [33]:
# Write a function that walks up the syntactic tree of the given token and collects all tokens to the root token (including root token).

def tokens_to_root(token):
    """
    Walk up the syntactic tree, collecting tokens to the root of the given `token`.
    :param token: Spacy token
    :return: list of Spacy tokens
    """
    tokens_to_r = []
    while token.head is not token:
        tokens_to_r.append(token)
        token = token.head
        tokens_to_r.append(token)

    return tokens_to_r

# For every token in document, print it's tokens to the root
# Print dependency labels of the tokens
i = 0
for sent in doc.sents:
    if i == 3:
        for token in sent:
            print('{} --> {}'.format(token, tokens_to_root(token)))
            print(' -> '.join(['{} - {}'.format(dependent_token, dependent_token.dep_) for dependent_token in tokens_to_root(token)]))
            print('token probability: ', token.prob)
            print('==================')
        break
    i+=1

Today --> [Today, recorded]
Today - nsubjpass -> recorded - ROOT
token probability:  -11.045459747314453
’s --> [’s, conference, conference, recorded]
’s - compound -> conference - nsubjpass -> conference - nsubjpass -> recorded - ROOT
token probability:  -17.959850311279297
conference --> [conference, recorded]
conference - nsubjpass -> recorded - ROOT
token probability:  -11.2960786819458
is --> [is, recorded]
is - aux -> recorded - ROOT
token probability:  -4.457748889923096
being --> [being, recorded]
being - auxpass -> recorded - ROOT
token probability:  -6.845808029174805
recorded --> []

token probability:  -11.22569751739502
. --> [., recorded]
. - punct -> recorded - ROOT
token probability:  -3.0678977966308594


In [35]:
import pandas as pd

In [37]:
henry_words = pd.read_excel('henry_wordlist.xlsx')

In [49]:
henry_words.head()

Unnamed: 0,Word,Positive tone,Negative tone
0,Below,,1.0
1,Challenge,,1.0
2,Challenged,,1.0
3,Challenges,,1.0
4,Challenging,,1.0


In [50]:
db.henry_words.insert_many(henry_words.to_dict('records'))

<pymongo.results.InsertManyResult at 0x20079cc9f30>

In [57]:
henry_words_db = pd.DataFrame(list(db.henry_words.find()))

In [60]:
henry_words_db.head()

Unnamed: 0,Negative tone,Positive tone,Word,_id
0,1.0,,Below,58aaef9d3740f52d8cddccce
1,1.0,,Challenge,58aaef9d3740f52d8cddcccf
2,1.0,,Challenged,58aaef9d3740f52d8cddccd0
3,1.0,,Challenges,58aaef9d3740f52d8cddccd1
4,1.0,,Challenging,58aaef9d3740f52d8cddccd2


In [79]:
henry_words_db['Lower word'] = henry_words_db.apply(lambda x: x['Word'].lower(), axis=1)

In [81]:
henry_words_db.head()

Unnamed: 0,Negative tone,Positive tone,Word,_id,Lower word
0,1.0,,Below,58aaef9d3740f52d8cddccce,below
1,1.0,,Challenge,58aaef9d3740f52d8cddcccf,challenge
2,1.0,,Challenged,58aaef9d3740f52d8cddccd0,challenged
3,1.0,,Challenges,58aaef9d3740f52d8cddccd1,challenges
4,1.0,,Challenging,58aaef9d3740f52d8cddccd2,challenging


In [88]:
((henry_words_db['Lower word'] == 'below') & (henry_words_db['Negative tone'] == 1.0)).any()

True

In [90]:
((henry_words_db['Lower word'] == 'below') & (henry_words_db['Positive tone'] == 1.0)).any()

False

In [98]:
def get_words(transcript):
    doc = nlp(transcript['rawText'])
    return [str(word).lower() for word in doc]

def get_words_q_and_a(transcript):
    doc = nlp(transcript['qAndAText'])
    return [str(word).lower() for word in doc]

def process_words(words):
    pos_count, neg_count = 0, 0
    for word in words:
        if ((henry_words_db['Lower word'] == word) & (henry_words_db['Negative tone'] == 1.0)).any():
            neg_count += 1
        elif ((henry_words_db['Lower word'] == word) & (henry_words_db['Positive tone'] == 1.0)).any():
            pos_count += 1
    return {'positiveCount' : pos_count, 'negativeCount' : neg_count}

for transcript in collection.find():
    words = get_words(transcript)
    q_and_a_words = get_words_q_and_a(transcript)
    
    h_tone = process_words(words)
    q_and_a_h_tone = process_words(q_and_a_words)
    collection.update_one({'_id': transcript['_id']}, {'$set': {'h_tone': h_tone, 'q_and_a_h_tone' : q_and_a_h_tone}})