In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
import textacy

In [4]:
client = MongoClient('localhost', 27017)
db = client.python_import
collection = db.earnings_transcript_NAS100

In [5]:
transcripts = pd.DataFrame(list(collection.find({'tradingSymbol':'AAPL'})))

# Countvectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
nlp = English()
def tokenize(sent):
    return [tok.lemma_ for tok in sent if tok.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"] and tok.is_alpha and not tok.lemma_ == '-PRON-']

In [8]:
count_model = CountVectorizer(ngram_range=(1,1), tokenizer=tokenize, lowercase=False, min_df=3, max_df=0.5)

In [9]:
corpus = transcripts['rawText'].apply(lambda row: list(nlp(row).sents))

In [10]:
corpus_flattened = corpus.apply(pd.Series).stack().reset_index(drop=True)

In [11]:
counted_transcripts = count_model.fit_transform(corpus_flattened)

In [12]:
counted_transcripts

<19932x2833 sparse matrix of type '<class 'numpy.int64'>'
	with 178980 stored elements in Compressed Sparse Row format>

In [13]:
list(count_model.vocabulary_.keys())[0:10]

['apple',
 'nasdaq',
 'aapl',
 'earning',
 'call',
 'january',
 'pm',
 'et',
 'executives',
 'nancy']

# Co-occurrance

In [14]:
transposed = counted_transcripts.T

In [15]:
co_occurrence = transposed * counted_transcripts
co_occurrence.setdiag(0)
co_occurrence = co_occurrence.todense()

# Computing term probabilites
Document Frequency (DF) of a term as the number of documents where the term occurs.

In [16]:
document_count = counted_transcripts.shape[0]
document_count

19932

In [17]:
document_frequency = np.sum(counted_transcripts.todense(), axis=0)

### Probabilities

In [18]:
prob_term = document_frequency/document_count
prob_term.shape

(1, 2833)

In [19]:
prob_term_coocc = co_occurrence/document_count
prob_term_coocc.shape

(2833, 2833)

# Computing PMI and the semantic orientation

In [20]:
positive_words = pd.read_csv('positive-words.txt')
negative_words = pd.read_csv('negative-words.txt')

In [21]:
vocabulary = count_model.vocabulary_

In [22]:
PMI = np.log(prob_term_coocc/(prob_term.T*prob_term))

  """Entry point for launching an IPython kernel.


In [23]:
PMI[PMI == -np.inf] = 0

In [24]:
PMI = np.array(PMI)

In [25]:
PMI.shape

(2833, 2833)

In [32]:
def calc_assoc(words, vocabulary_index):
    assoc = 0
    for pos_word in words:
        if pos_word in vocabulary:
            assoc += PMI[vocabulary_index][vocabulary[pos_word]]
    return assoc

semantic_orientation = {}
            
for vocabulary_item, vocabulary_index in vocabulary.items():
    if vocabulary_item not in positive_words.Word and vocabulary_item not in negative_words.Word:
        positive_assoc = calc_assoc(positive_words.Word, vocabulary_index)
        negative_assoc = calc_assoc(negative_words.Word, vocabulary_index)
        semantic_orientation[vocabulary_item] = positive_assoc - negative_assoc

In [33]:
semantic_sorted = sorted(semantic_orientation, 
                         key=semantic_orientation.get, 
                         reverse=True)

In [37]:
top_pos = semantic_sorted[:30]
top_neg = semantic_sorted[-30:]

In [38]:
top_pos

['customer',
 'design',
 'performance',
 'app',
 'experience',
 'new',
 'feature',
 'developer',
 'work',
 'ios',
 'team',
 'continue',
 'apps',
 'make',
 'world',
 'great',
 'way',
 'product',
 'powerful',
 'innovative',
 'user',
 'platform',
 'introduce',
 'ecosystem',
 'light',
 'display',
 'deliver',
 'process',
 'bring',
 'apple']

In [39]:
top_neg

['regard',
 'pete',
 'deviation',
 'recover',
 'worry',
 'settle',
 'maynard',
 'signing',
 'vary',
 'confirm',
 'apologize',
 'spending',
 'worried',
 'issue',
 'piece',
 'dollar',
 'margin',
 'cause',
 'nuclear',
 'disruption',
 'subsequent',
 'require',
 'subsidy',
 'revenue',
 'budget',
 'expect',
 'expense',
 'currency',
 'alleviate',
 'question']