In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
import textacy

In [4]:
# Connect to the MongoDB server on localhost and select the collection
# holding the earnings-call transcripts.
client = MongoClient(host='localhost', port=27017)
db = client['python_import']
collection = db['earnings_transcript']

In [44]:
# Load up to 200 AAPL transcripts, ordered by `publishDate` descending,
# into a DataFrame (one row per MongoDB document).
transcripts = pd.DataFrame(
    list(
        collection
        .find({'tradingSymbol': 'AAPL'})
        .sort('publishDate', -1)
        .limit(200)
    )
)

In [49]:
# Preview the first three transcript rows.
transcripts.head(3)

Unnamed: 0,_id,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,tradingSymbol,url,wordSize
0,5959f6f03740f52944a686e6,1130124,"{'positiveCount': 236, 'negativeCount': 30}",2017-07-02 09:49:04.500,,"{'positiveCount': 0, 'negativeCount': 0}",0,Apple Inc. (NASDAQ: AAPL ) F1Q 2013 Earnings C...,20404,AAPL,https://seekingalpha.com/article/1129431-apple...,10631
1,5937e53f082789410c746adf,1170503,"{'positiveCount': 246, 'negativeCount': 32}",2017-05-03 00:13:31.000,Operator First we'll hear from Katy Huberty wi...,"{'positiveCount': 103, 'negativeCount': 22}",5359,"Apple, Inc. (NASDAQ: AAPL ) Q2 2017 Earnings C...",1331,AAPL,https://seekingalpha.com/article/4068153-apple...,9418
2,5937e539082789410c746ade,1170201,"{'positiveCount': 266, 'negativeCount': 32}",2017-02-01 01:27:17.000,Operator Your first question will come from Ka...,"{'positiveCount': 150, 'negativeCount': 22}",5830,"Apple, Inc. (NASDAQ: AAPL ) Q1 2017 Earnings C...",12717,AAPL,https://seekingalpha.com/article/4041266-apple...,9452


In [50]:
# Instantiate spaCy's `English` language class.
# NOTE(review): `nlp` is not referenced by any later cell visible in this
# notebook — confirm it is still needed.
nlp = English()

In [51]:
# Wrap the raw text of the first transcript row in a textacy Doc.
# Rows are ordered by `publishDate` descending, so row 0 is not necessarily
# the most recent call: in the preview output above it is the F1Q 2013
# transcript.
doc = textacy.Doc(transcripts.iloc[0]['rawText'])

In [23]:
def tokenize(doc):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``doc``.

    ``doc`` is any iterable of token-like objects exposing ``lemma_``,
    ``is_alpha`` and ``is_stop``. The lemmas are returned as a list, in the
    order the tokens are iterated.
    """
    lemmas = []
    for token in doc:
        # Skip anything that is not purely alphabetic or that is a stop word.
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Textacy discovery

In [65]:
# Trigrams that occur at least twice, extracted with textacy's stop-word and
# punctuation filters switched on and its number filter switched off.
ngrams = list(
    textacy.extract.ngrams(
        doc,
        3,
        filter_stops=True,
        filter_punct=True,
        filter_nums=False,
        min_freq=2,
    )
)

In [66]:
# Text of every extracted trigram. Built as a list rather than with `map`:
# in Python 3 `map` returns a one-shot iterator, so re-running the Counter
# cell after its first run would silently count nothing.
ngrams_str = [str(ngram) for ngram in ngrams]

In [67]:
from collections import Counter

In [68]:
# Tally how often each trigram string occurs.
c = Counter(ngrams_str)

In [69]:
# Show the 30 most frequent entries of the counter.
c.most_common(30)

[('year ago quarter', 16),
 ('Nancy Paxton Thanks', 6),
 ('Cross Research Group', 5),
 ('Bank of Montreal', 5),
 ('ended the quarter', 5),
 ('Nancy Paxton Thank', 4),
 ('current year quarter', 4),
 ('range of guidance', 4),
 ('Bank of America', 3),
 ('like to turn', 3),
 ('based compensation expense', 3),
 ('new product pipeline', 3),
 ('iPhone and iPad', 3),
 ('year quarter compared', 3),
 ('IDC’s latest', 3),
 ('’s latest published', 3),
 ('store per week', 3),
 ('mix of iPhone', 3),
 ('stock based compensation', 3),
 ('Peter Oppenheimer Sure', 3),
 ('2.6 million units', 3),
 ('supply demand balance', 3),
 ("APPLICABLE COMPANY'S", 3),
 ('pleased to report', 2),
 ('records for iPhone', 2),
 ('Apple retail stores', 2),
 ('operations and sales', 2),
 ('teams to deliver', 2),
 ('beautifully designed products', 2),
 ('’re very pleased', 2)]

In [71]:
# First ten named entities textacy extracts from the transcript, with
# determiners dropped (`drop_determiners=True`).
list(textacy.extract.named_entities(doc, drop_determiners=True))[:10]

[Apple Inc.,
 NASDAQ,
 AAPL,
 2013,
 2013,
 17:00 ET,
 Nancy Paxton - Senior,
 Tim Cook - Chief,
 Peter Oppenheimer - Chief,
 Gary Wipfler - Treasurer Analysts]

In [72]:
# textacy's built-in English POS-tag regex registered under the key 'NP'
# (presumably noun phrases — confirm against textacy.constants).
pattern = textacy.constants.POS_REGEX_PATTERNS['en']['NP']

In [75]:
# Matches of `pattern` in the transcript, showing items 40 through 49.
list(textacy.extract.pos_regex_matches(doc, pattern))[40:50]

[today,
 Apple’s CEO,
 Tim Cook,
 CFO,
 Peter Oppenheimer,
 Treasurer,
 Gary Wipfler,
 the Q&A session,
 the analysts,
 the information]

In [76]:
# Ten key terms from textacy's TextRank implementation, as (term, score) pairs.
textacy.keyterms.textrank(doc, n_keyterms=10)

[('quarter', 0.017455774513649963),
 ('iphone', 0.012834341420196968),
 ('product', 0.01155272750229087),
 ('year', 0.01154287607658789),
 ('ipad', 0.011526452566039992),
 ('apple', 0.009111002096329104),
 ('tim', 0.007418784217395101),
 ('customer', 0.007243849735931341),
 ('revenue', 0.007237940167867038),
 ('sale', 0.007143482403027271)]

In [78]:
# Build textacy's text-statistics object for the transcript; the cells below
# read its counts and readability scores.
ts = textacy.text_stats.TextStats(doc)

In [79]:
# Number of unique words reported by TextStats.
ts.n_unique_words

1709

In [80]:
# Dictionary of basic counts (characters, words, sentences, syllables, ...).
ts.basic_counts

{'n_chars': 43974,
 'n_long_words': 2075,
 'n_monosyllable_words': 6921,
 'n_polysyllable_words': 881,
 'n_sents': 446,
 'n_syllables': 13719,
 'n_unique_words': 1709,
 'n_words': 9690}

In [81]:
# Dictionary of readability indices computed by TextStats.
ts.readability_stats

{'automated_readability_index': 10.807587832678507,
 'coleman_liau_index': 9.52062437915377,
 'flesch_kincaid_grade_level': 9.589634175123908,
 'flesch_readability_ease': 65.00685626622612,
 'gulpease_index': 57.42724458204334,
 'gunning_fog_index': 12.327321865730008,
 'lix': 43.14028608847363,
 'smog_index': 11.158174000464356,
 'wiener_sachtextformel': 4.958740044519107}

In [119]:
# Bag of terms keyed by term string, requested with 3- and 4-grams.
# NOTE(review): the output also contains shorter terms (e.g. 'third',
# 'march'), presumably because textacy includes named entities by default —
# confirm.
bot = doc.to_bag_of_terms(ngrams={3, 4}, as_strings=True)

# Rank the (term, count) pairs by descending count and show ranks 31-50.
sorted(
    bot.items(),
    key=lambda term_and_count: term_and_count[1],
    reverse=True,
)[30:50]

[('bank of montreal', 5),
 ('end the quarter', 5),
 ('million in the year', 5),
 ('keith bachman - bank', 4),
 ('mark moskowitz - jpmorgan', 4),
 ('apple ’s', 4),
 ('peter oppenheimer', 4),
 ('current year quarter', 4),
 ('third', 4),
 ('toni sacconaghi - sanford bernstein', 4),
 ('march', 4),
 ('previous year', 4),
 ('two', 4),
 ('last quarter', 4),
 ('range of guidance', 4),
 ('2013', 3),
 ('ben reitzes - barclays', 3),
 ('steve', 3),
 ('quarterly', 3),
 ('13 week', 3)]

In [88]:
# Number of occurrences of 'great' in the transcript.
doc.count('great')

11

In [95]:
# Number of occurrences of 'loss' in the transcript.
doc.count('loss')

1

In [100]:
# Every (subject, verb, object) triple textacy extracts from the transcript.
# NOTE(review): this displays the full list; consider slicing it to keep the
# cell output short.
list(textacy.extract.subject_verb_object_triples(doc))

[(I, would like, to turn),
 (Speaking, are, ’s CEO),
 (Apple, assumes, obligation),
 (I, like, to turn),
 (technology company, reported, kinds),
 (we, have introduced, products),
 (one, reflects, dedication),
 (we, are unwilling, to cut),
 (commitment, is, reason),
 (customers, choose, to buy),
 (this, be, force),
 (Apple, is, people),
 (we, are, team),
 (Part, is, to preserve),
 (Part, is, drive),
 (Part, is, remain),
 (we, remain, company),
 (Apple, to do, work),
 (there, is, reward),
 (customers, love, to use),
 (We, continue, to believe),
 (people, will serve, us),
 (Tomorrow, marks, anniversary),
 (Steve, introduced, Macintosh),
 (it, was supposed, to be),
 (you, have, to touch),
 (that, brought, Mac),
 (that, brought, products),
 (they, are, way),
 (we, measure, success),
 (customers, love, product),
 (We, sold, devices),
 (We, couldn’t have achieved, milestones),
 (Everyone, has, eyes),
 (who, share, purpose),
 (Peter, will review, highlights),
 (We, established, records),
 (qua

In [105]:
# Adjective tokens from the transcript, extracted with textacy's stop-word
# and punctuation filters switched on.
adjectives = list(textacy.extract.words(doc, include_pos='ADJ', filter_stops=True, filter_punct=True))
# Text of each adjective. Built as a list rather than with `map`: in Python 3
# `map` returns a one-shot iterator, so re-running the Counter cell after its
# first run would silently count nothing.
adjectives_str = [str(adjective) for adjective in adjectives]

In [106]:
# Tally the adjective strings and show the 30 most frequent.
# Note: this rebinds `c`, which earlier held the trigram counts.
c = Counter(adjectives_str)
c.most_common(30)

[('new', 27),
 ('gross', 12),
 ('great', 11),
 ('higher', 10),
 ('high', 9),
 ('little', 9),
 ('best', 8),
 ('pleased', 7),
 ('strong', 7),
 ('clear', 7),
 ('larger', 7),
 ('previous', 7),
 ('second', 6),
 ('latest', 6),
 ('average', 6),
 ('significant', 6),
 ('single', 6),
 ('sequential', 6),
 ('confident', 6),
 ('good', 5),
 ('important', 5),
 ('fiscal', 5),
 ('current', 5),
 ('overall', 5),
 ('tremendous', 5),
 ('total', 5),
 ('short', 5),
 ('reasonable', 5),
 ('different', 5),
 ('specific', 5)]

In [108]:
# Tokens extracted with include_pos='NOUN', showing items 50 through 59.
list(textacy.extract.words(doc, include_pos='NOUN'))[50:60]

[sales,
 technology,
 company,
 kinds,
 results,
 periods,
 innovation,
 products,
 history,
 months]

In [110]:
# Tokens extracted with include_pos='VERB', showing items 50 through 59.
list(textacy.extract.words(doc, include_pos='VERB'))[50:60]

[serve, coming, marks, introduced, said, supposed, use, touch, come, rely]

In [120]:
# Thirty key terms from textacy's SGRank, restricted to 2- and 3-grams,
# as (term, score) pairs.
list(textacy.keyterms.sgrank(doc, ngrams={2,3}, n_keyterms=30))

[('cross research group', 0.17684877217419395),
 ('tim cook', 0.10202934484296118),
 ('current year quarter', 0.08375020000302394),
 ('peter oppenheimer', 0.04851380757560747),
 ('nancy paxton', 0.0403578043680476),
 ('gross margin', 0.033427043249452955),
 ('good afternoon', 0.02327419821385837),
 ('new product', 0.021413604457510997),
 ('nancy paxton thank', 0.020798328274365653),
 ('december quarter', 0.019487315495107932),
 ('ipad sale', 0.01860707334354553),
 ('keith bachman', 0.01802272971618052),
 ('steve milunovich', 0.017699459739349565),
 ('chris whitmore', 0.017503291370776733),
 ('deutsche bank', 0.017178583803285755),
 ('shannon cross', 0.017119959549262185),
 ('katy huberty', 0.01710729208050091),
 ('morgan stanley', 0.01708876749472913),
 ('march quarter', 0.01589504154019362),
 ('channel inventory', 0.015844439968861625),
 ('ipad mini', 0.015683625550617437),
 ('bill shope', 0.015363309275401848),
 ('goldman sachs', 0.015361800522869938),
 ('mark moskowitz', 0.015339307

In [113]:
# Thirty key terms from textacy's SingleRank, as (term, score) pairs.
list(textacy.keyterms.singlerank(doc, n_keyterms=30))

[('quarter fiscal year', 0.05435460795208541),
 ('current year quarter', 0.053978429056079844),
 ('week quarter', 0.036488464316690214),
 ('december quarter', 0.035459552082040664),
 ('march quarter', 0.03425826182632119),
 ('iphone market share growth', 0.033604784168682976),
 ('sequential basis iphone sale', 0.03174332086408786),
 ('previous quarter', 0.031644633305077226),
 ('fiscal quarter', 0.031504117394321025),
 ('year sale', 0.03136525135313889),
 ('september quarter', 0.03097410454927005),
 ('year unit growth', 0.030087816842896808),
 ('quarter', 0.02982483243245281),
 ('year ’s', 0.029657888496306767),
 ('increase iphone point', 0.02887852161265135),
 ('year growth', 0.027606786514304784),
 ('growth year', 0.027606786514304784),
 ('experience strong iphone growth', 0.02727208172439098),
 ('iphone sale', 0.02684952220839724),
 ('year increase', 0.026849365765950747),
 ('increase year', 0.026849365765950747),
 ('large year', 0.026298596798386125),
 ('year result', 0.02627967536