In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English
import spacy

In [3]:
import textacy

In [4]:
client = MongoClient('localhost', 27017)
db = client.python_import
collection = db.earnings_transcript_NAS100

In [5]:
transcripts = pd.DataFrame(list(collection.find({'tradingSymbol':'AAPL'})))

# Build, save, load corpus

In [6]:
corp = textacy.corpus.Corpus(lang='en')

In [7]:
[corp.add_doc(doc) for doc in transcripts['rawText'].apply(lambda row: textacy.Doc(textacy.preprocess.fix_bad_unicode(row)))]
corp

Corpus(43 docs; 430986 tokens)

In [8]:
corp.save(path='./data', name='apple-corpus', compression=None)

In [9]:
corp = corp.load(path='./data', name='apple-corpus')
corp

Corpus(43 docs; 430986 tokens)

In [10]:
freqs = corp.word_freqs(as_strings=True)

In [11]:
import collections

In [12]:
collections.Counter(freqs).most_common(20)

[('-PRON-', 30294),
 ('quarter', 3887),
 ('year', 2758),
 ('iphone', 2227),
 ('$', 1813),
 ('think', 1589),
 ('product', 1351),
 ('apple', 1350),
 ('thank', 1345),
 ('million', 1264),
 ('question', 1260),
 ('revenue', 1241),
 ('billion', 1233),
 ('market', 1229),
 ('ipad', 1208),
 ('new', 1187),
 ('store', 1120),
 ('sale', 999),
 ('customer', 999),
 ('look', 985)]

In [13]:
dm = textacy.data.load_depechemood(download_if_missing=True)

In [14]:
words_of_first_doc = list([textacy.extract.words(doc) for doc in corp][1])
words_of_first_doc[0:20]

[Apple,
 Inc.,
 NASDAQ,
 AAPL,
 Q4,
 2016,
 Earnings,
 October,
 25,
 2016,
 5:00,
 pm,
 ET,
 Executives,
 Nancy,
 Paxton,
 Apple,
 Inc.,
 Timothy,
 Donald]

In [15]:
textacy.lexicon_methods.emotional_valence(words_of_first_doc)

defaultdict(float,
            {'AFRAID': 0.1172795748547371,
             'AMUSED': 0.13616051265908996,
             'ANGRY': 0.10357801449735822,
             'ANNOYED': 0.1165875946666659,
             'DONT_CARE': 0.13534531386317208,
             'HAPPY': 0.1367107607250905,
             'INSPIRED': 0.14145133531040147,
             'SAD': 0.11541316142221529})

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
corp[0][0].similarity(corp[0][3])

0.074094095347881986

In [18]:
corp.vectors.shape

(43, 300)

In [19]:
corp.word_doc_freqs(as_strings=True)['imply']

22

In [20]:
corp.word_freqs(as_strings=True)['imply']

38

In [64]:
[key for key in freqs.keys()][0:20]

['imply',
 'metric',
 'unique',
 'withhold',
 'productive',
 '201',
 'appeal',
 'discuss',
 'remark',
 'operator',
 'app',
 'fresh',
 'mix',
 'opening',
 'momentum',
 'fantastic',
 'upgrader',
 'developer',
 'usage',
 'radical']

In [77]:
def most_similar(word: spacy.tokens.Token, top_n=10):
    queries = [w for w in corp.spacy_vocab if w.prob >= -15]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return [item.lower_ for item in by_similarity[:top_n]]

In [78]:
most_similar(corp.spacy_vocab['design'], 30)

['design',
 'designs',
 'designed',
 'designers',
 'designer',
 'layout',
 'architecture',
 'graphic',
 'style',
 'construction',
 'concept',
 'engineering',
 'custom',
 'creative',
 'unique',
 'modern',
 'art',
 'art',
 'development',
 'technology',
 'logo',
 'project',
 'project',
 'fashion',
 'concepts',
 'shape',
 'projects',
 'inspired',
 'detail',
 'create']

In [94]:
test_corp = textacy.corpus.Corpus(lang='en')

In [95]:
test_corp.add_text("Best team in the world. It is really great soccer.")

In [96]:
test_corp.add_text("Such a great player in a very small team. He should earn more.")

In [97]:
test_corp.add_text("It was a good game tonight, every player scored a goal. The stadion was full.")

In [98]:
words = [textacy.extract.words(doc) for doc in test_corp]
flat_words = [item for sublist in words for item in sublist]

In [102]:
def most_similar_2(word: spacy.tokens.Token, top_n=10):
    queries = [w for w in flat_words if w.prob >= -30]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return [item.lower_ for item in by_similarity[:top_n]]

In [103]:
test_corp[2][16]

full

In [109]:
most_similar_2(test_corp[2][16],50)

['good',
 'great',
 'great',
 'small',
 'best',
 'world',
 'game',
 'team',
 'team',
 'player',
 'player',
 'earn',
 'tonight',
 'goal',
 'soccer',
 'scored',
 'stadion']

In [86]:
nlp=English()
nlp('Great soccer and balls').similarity(nlp('The net was protected'))

0.53450516123609348