In [35]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import glob
import json 
import re
import nltk
from nltk.tokenize import sent_tokenize
import spacy
nlp=spacy.load('en_core_web_sm')

In [2]:
filepaths = glob.glob('data/*.htm*')

In [3]:
ten_ks = {}

for filepath in filepaths:
    with open(filepath, 'r', encoding= "utf-8") as f:
        content = f.read()
        soup = BeautifulSoup(content, 'html5lib')
        
        # Kill all script and style elements
        for script in soup(['script', 'style']):
            script.extract()

        # Get text    
        text = soup.get_text()

        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())

        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

        # Drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)

        # Append to dictionary
        ten_ks[filepath] = text


In [4]:
with open("data/annotations.json", encoding='utf-8') as file:
    annotation = json.load(file)

In [5]:
def sorter(filepath):
    labels = ""
    for i in annotation:
        if i['filing'] == filepath[5:]:
            labels = labels + ' ' + (i['text'])
    #
    text = ten_ks[filepath]
    text = str.lower(text)
    #
    text = re.sub('\xa0', ' ', text)
    #
    sent_doc = [sentence.lower() for sentence \
             in sent_tokenize(text)]
    #
    labels = [sentence.lower().strip() for sentence \
             in sent_tokenize(labels)]
    #
    y = []
    for sentence in sent_doc:
        if any(label in sentence for label in labels):
            y.append(1)
        else:
            y.append(0)
    return sent_doc, y

In [6]:
sent_doc = []
y=[]
for filepath in ten_ks.keys():
    new_sentences, new_y = sorter(filepath)
    #extend concatentates existing lists into one big list
    sent_doc.extend(new_sentences)
    y.extend(new_y)

In [7]:
sum(y)

199

In [8]:
sent_doc[8]

'☑\nindicate by check mark whether the registrant (1) has filed all reports required to be filed by section 13 or 15(d) of the securities exchange act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days.'

In [9]:
# filepath = list(ten_ks.keys())[0]
# filepath[5:]

In [10]:
# #sort through annotation to find relevant text 
# labels = ""
# for i in annotation:
#     if i['filing'] == filepath[5:]:
#         labels = labels + ' ' + (i['text'])
    

In [11]:
# text = ten_ks[filepath]
# text = str.lower(text)

In [12]:
# text = re.sub('\xa0', ' ', text)

In [13]:
# sent_doc = [sentence.lower() for sentence \
#              in sent_tokenize(text)]

In [14]:
# labels = [sentence.lower().strip() for sentence \
#              in sent_tokenize(labels)]

In [15]:
# #for label in labels - check all of the labels
# #label in sentence - check if label is in sentence
# #if any - is label 1, 2, 3, 4, in there?
# y = []
# for sentence in sent_doc:
#     if any(label in sentence for label in labels):
#         y.append(1)
#     else:
#         y.append(0)
        

In [16]:
# sentences = []
# stemmer = PorterStemmer()
# stock_stem = stemmer.stem('stock')
# shares_stem = stemmer.stem('shares')

# for idx, sentence in enumerate(sent_doc):
#     if 'repurchase program' in sentence:
#         chunk = sent_doc[idx-5:idx+6]
#         for x in chunk:
#             if stock_stem in x or shares_stem in x:
#                 sentences.append(chunk)
#                 break

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

vect = CountVectorizer(
        token_pattern = r"[a-z]+",
        ngram_range = (1,1),
        lowercase = True,
        min_df = 4,
        max_df = 1.0)

In [18]:
vect.fit(sent_doc)

CountVectorizer(min_df=4, token_pattern='[a-z]+')

In [19]:
sent_train, sent_test, y_train, y_test = train_test_split(sent_doc, y, test_size = 0.33, random_state = 432, stratify = y)

In [20]:
sent_test[652]

'dispositions 643. earnings per share, share repurchases and dividends on common stock654.'

In [21]:
count_train = vect.transform(sent_train)
count_test = vect.transform(sent_test)

In [22]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB 

nb_classifier = MultinomialNB(alpha=1)
nb_classifier.fit(count_train, y_train)
pred = nb_classifier.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred, labels=[0, 1])
cm

array([[12292,    98],
       [   26,    40]], dtype=int64)

In [33]:
sent_pred = list(np.array(sent_test)[np.where(pred == 1)])

In [37]:
val_doc = []

for value in sent_pred:
    val_doc.append(nlp(str(value)))


In [41]:
from spacy.matcher import Matcher

In [42]:
matcher = Matcher(nlp.vocab)
repur_1 = [{'LEMMA': 'repurchase'}]
matcher.add('REPURCHASE_PATTERN', [repur_1])
repur_2 = [{'LEMMA': 'authoriz'}]
matcher.add('AUTH_PATTERN', [repur_2])
repur_3 = [{'LEMMA': 'share'}]
matcher.add('SHARE_PATTERN', [repur_3])


In [43]:
ent_match_cont = []
for doc in val_doc:
    ent_match_list = []
    for ent in doc.ents:
        if ent.label_ == "CARDINAL":
            ent_match_list.append((ent.text, \
                                   ent.start_char, ent.end_char, \
                                   ent.label_))
        if ent.label_ == "DATE":
            ent_match_list.append((ent.text, \
                                   ent.start_char, ent.end_char, \
                                   ent.label_))
        if ent.label_ == 'MONEY':
            ent_match_list.append((ent.text, \
                                   ent.start_char, ent.end_char, \
                                   ent.label_))
        if ent.label_ == 'SHARES':
            ent_match_list.append((ent.text, \
                                   ent.start_char, ent.end_char, \
                                   ent.label_))
    ent_match_cont.append({'TEXT': doc.text,
                           'MATCHER': matcher(doc),
                           'ENTITIES': ent_match_list})

In [44]:
ent_match_cont

[{'TEXT': 'on july 16, 2008, the company’s\nboard of directors increased the authorization by an additional\n3,000,000 vail shares, and on december 4, 2015, the company’s board\nof directors increased the authorization by an additional 1,500,000\nvail shares for a total authorization to repurchase up to 7,500,000\nvail shares.',
  'MATCHER': [(3387861381980900341, 22, 23),
   (3387861381980900341, 47, 48),
   (7634494319942164669, 53, 54),
   (3387861381980900341, 59, 60)],
  'ENTITIES': [('july 16, 2008', 3, 16, 'DATE'),
   ('3,000,000', 96, 105, 'CARDINAL'),
   ('december 4, 2015', 126, 142, 'DATE'),
   ('an additional 1,500,000', 208, 231, 'CARDINAL'),
   ('up to 7,500,000', 284, 299, 'CARDINAL')]},
 {'TEXT': 'under the program, we are authorized to repurchase shares for cash on the open market, from time to time, in privately negotiated transactions or block transactions, or through an accelerated repurchase agreement.',
  'MATCHER': [(7634494319942164669, 8, 9),
   (33878613819809

In [23]:
np.where((np.array(y_test)==0) & (pred==1))[0]

array([   51,    78,   199,   266,   325,   431,   558,   604,   635,
         652,   743,  1191,  1621,  1732,  2026,  2271,  2833,  3026,
        3222,  3242,  3270,  3328,  3337,  3362,  3552,  3645,  3699,
        3751,  3776,  4309,  4439,  4443,  4533,  4581,  4687,  4772,
        4780,  4841,  4862,  5155,  5368,  5469,  5642,  5700,  5767,
        5948,  5998,  6083,  6146,  6163,  6263,  6346,  6412,  6522,
        6945,  6956,  7004,  7108,  7114,  7238,  7279,  7408,  7410,
        7527,  7641,  7719,  7942,  8104,  8373,  8477,  8504,  8548,
        8554,  8591,  8708,  8992,  9148,  9477,  9490,  9591,  9725,
        9765,  9867, 10591, 10703, 10709, 10785, 11260, 11421, 11424,
       11428, 11581, 11744, 11917, 12009, 12219, 12276, 12342],
      dtype=int64)

In [24]:
prior_not_repo, prior_repo = np.exp(nb_classifier.class_log_prior_)
prior_repo

0.005259411578614359

In [25]:
sanity_check = pd.DataFrame({'word': vect.get_feature_names(),
                             'probability_given_repurchase': np.exp(nb_classifier.feature_log_prob_[1]),
                            'probability_given_not_repo': np.exp(nb_classifier.feature_log_prob_[0])})

sanity_check["prob_ratio"] = sanity_check.probability_given_repurchase / sanity_check.probability_given_not_repo
sanity_check['conditional_prob'] = sanity_check.probability_given_repurchase * prior_repo / \
(sanity_check.probability_given_repurchase * prior_repo + sanity_check.probability_given_not_repo * prior_not_repo)
sanity_check.sort_values('conditional_prob', ascending=False).head(50)

Unnamed: 0,word,probability_given_repurchase,probability_given_not_repo,prob_ratio,conditional_prob
6434,repurchasesin,0.000286,9.802576e-07,291.885551,0.606804
3389,goldman,0.000501,5.881546e-06,85.133286,0.310401
6690,sachs,0.000501,5.881546e-06,85.133286,0.310401
538,asr,0.000358,4.901288e-06,72.971388,0.278403
5549,philadelphia,0.000143,1.960515e-06,72.971388,0.278403
2662,equates,0.000143,1.960515e-06,72.971388,0.278403
4486,lossthe,0.000143,1.960515e-06,72.971388,0.278403
5334,overallotment,0.000215,2.940773e-06,72.971388,0.278403
8071,valuedue,7.2e-05,9.802576e-07,72.971388,0.278403
3472,guild,7.2e-05,9.802576e-07,72.971388,0.278403


In [26]:
P(repurchase|word) = P(word|repurchase) * P(repurchase) / P(word) 

approximating P(word) (B)

P(word|repurchase) * P(repurchase) + P(word|not_repurchase) * P(not_repurchase)

SyntaxError: invalid syntax (<ipython-input-26-2627b02e7c6d>, line 3)

In [None]:
np.exp(nb_classifier.feature_log_prob_[1]) 

In [None]:
# def alphanumeric_tokenize(ten_ks):
#     return re.findall(r'[a-zA-Z0-9]+', ten_ks)

# token_dict = {}
# for 