In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

In [3]:
import textacy

In [4]:
client = MongoClient('localhost', 27017)
db = client.python_import
collection = db.earnings_transcript_NAS100

In [5]:
transcripts = pd.DataFrame(list(collection.find({'tradingSymbol':'AAPL'})))

# Countvectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
nlp = English()
def tokenize(sent):
    return [tok.lemma_ for tok in sent if not tok.pos_ == 'PUNCT' and tok.is_alpha]

In [8]:
#test = nlp('Hello you 2-faced god. Let me show you some a+ hotels.')

In [9]:
#for sent in test.sents:
#    print([tok for tok in sent if not tok.pos_ == 'PUNCT'])

In [10]:
count_model = CountVectorizer(ngram_range=(1,1), tokenizer=tokenize, lowercase=False, min_df=3, max_df=0.5)

In [11]:
corpus = transcripts['rawText'].apply(lambda row: list(nlp(row).sents))

In [12]:
corpus_flattened = corpus.apply(pd.Series).stack().reset_index(drop=True)

In [13]:
counted_transcripts = count_model.fit_transform(corpus_flattened)

In [14]:
counted_transcripts

<19932x3132 sparse matrix of type '<class 'numpy.int64'>'
	with 277186 stored elements in Compressed Sparse Row format>

In [15]:
list(count_model.vocabulary_.keys())[0:10]

['apple',
 'nasdaq',
 'aapl',
 'earning',
 'call',
 'january',
 'pm',
 'et',
 'executives',
 'nancy']

# Co-occurrance

In [16]:
transposed = counted_transcripts.T

In [17]:
co_occurrence = transposed * counted_transcripts
co_occurrence.setdiag(0)
co_occurrence = co_occurrence.todense()

# Computing term probabilites
Document Frequency (DF) of a term as the number of documents where the term occurs.

In [18]:
document_count = counted_transcripts.shape[0]
document_count

19932

In [19]:
document_frequency = np.count_nonzero(counted_transcripts.todense(), axis=0)

### Probabilities

In [20]:
prob_term = document_frequency/document_count
prob_term.shape

(1, 3132)

In [21]:
prob_term_coocc = co_occurrence/document_count
prob_term_coocc.shape

(3132, 3132)

# Computing PMI and the semantic orientation

In [22]:
positive_words = pd.read_csv('positive-words.txt')
negative_words = pd.read_csv('negative-words.txt')

In [23]:
vocabulary = count_model.vocabulary_

In [24]:
PMI = np.log(prob_term_coocc/(prob_term.T*prob_term))

  """Entry point for launching an IPython kernel.


In [25]:
PMI[PMI == -np.inf] = 0

In [26]:
PMI = np.array(PMI)

In [27]:
PMI.shape

(3132, 3132)

In [28]:
def calc_assoc(words, vocabulary_index):
    assoc = 0
    for pos_word in words:
        if pos_word in vocabulary:
            assoc += PMI[vocabulary_index][vocabulary[pos_word]]
    return assoc

semantic_orientation = {}
            
for vocabulary_item, vocabulary_index in vocabulary.items():
    positive_assoc = calc_assoc(positive_words.Word, vocabulary_index)
    negative_assoc = calc_assoc(negative_words.Word, vocabulary_index)
    semantic_orientation[vocabulary_item] = positive_assoc - negative_assoc

In [32]:
semantic_sorted = sorted(semantic_orientation, 
                         key=semantic_orientation.get, 
                         reverse=True)

In [37]:
top_pos = semantic_sorted[:20]
top_neg = semantic_sorted[-20:]

In [38]:
top_pos

['customer',
 'and',
 'app',
 'new',
 'experience',
 'design',
 'performance',
 'feature',
 'ios',
 'work',
 'developer',
 'team',
 'apps',
 'product',
 'continue',
 'most',
 'very',
 'to',
 'these',
 'more']

In [39]:
top_neg

['oh',
 'signing',
 'um',
 'expect',
 'thank',
 'require',
 'piece',
 'currency',
 'apologize',
 'expense',
 'cause',
 'budget',
 'subsidy',
 'nuclear',
 'disruption',
 'subsequent',
 'alleviate',
 'any',
 'please',
 'question']