In [1]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher, Matcher
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import pickle

In [2]:
# loading in dictionary of 10Ks
with open('../data/10k_processed.pkl', 'rb') as f:
    sec_10ks = pickle.load(f)

In [3]:
def find_stock_repurchase(string):
    nlp = spacy.load("en_core_web_md")
    
    matcher = Matcher(nlp.vocab)

    corpus = string
    doc = nlp(corpus)
    pattern = [{'LEMMA': 'share'},
               {'IS_ALPHA': True, 'OP': '*'},
               {'LEMMA': 'repurchase'}]
    matcher.add('REPURCHASE', [pattern]) 

    matches = matcher(doc)
    text_matches = []
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        text_matches.append("Match ID: {}\nString ID: {}\nStart: {}\nEnd: {}\nText: {}\nSentence: {}".format(
          match_id, string_id, start, end, span.text, span.sent))
    return text_matches

In [4]:
def one_string_to_rule_them_all(list):
    """
    Takes a list of strings and joins them into one string
    """
    string = ''.join(list)
    return string

In [5]:
def roberta_extract(question, context):
    """
    Takes in a question and a string and uses the roberta model from Huggingface to pull out the question items
    
    question = A question in a string format
    context = a string that has the answers to the question asked - hopefully
    """
    model_name = "deepset/roberta-base-squad2"

    # a) Get predictions
    nlp = pipeline('question-answering', model = model_name, tokenizer = model_name)
    QA_input = {
        'question': question,
        'context': context
    }
    res = nlp(QA_input)

    # b) Load model & tokenizer
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    return res

In [6]:
text_keys = list(sec_10ks.keys())

In [7]:
text_keys[0]

'amrk-10k_20210630.htm'

In [8]:
# test on amrk-10k_20210630.htm

text_10k = sec_10ks[text_keys[0]]

# finding relevant sentences in 10K with spaCy Matcher
matches_10k = find_stock_repurchase(text_10k)

# turning the list of matches into a single string
matches_10k = one_string_to_rule_them_all(matches_10k)

# feeding the spaCy matches into roberta to answer question
question = 'When were share repurcahses approved?'
approval = roberta_extract(question, matches_10k)

In [9]:
approval

{'score': 0.7636703252792358, 'start': 986, 'end': 996, 'answer': 'April 2018'}

In [10]:
roberta_extract(question, text_10k)

  tensor = as_tensor(value)
  p_mask = np.asarray(


{'score': 0.9157719612121582,
 'start': 486854,
 'end': 486864,
 'answer': 'April 2018'}

In [11]:
def combined_model(text, question):
    matches_10k = find_stock_repurchase(text)
    matches_10k = one_string_to_rule_them_all(matches_10k)
    answer = roberta_extract(question, matches_10k)
    
    return answer

In [12]:
text_keys[1]

'cnxc-20211130'

## Testing the various questions with the combined model

In [13]:
text = sec_10ks[text_keys[1]]
question = 'When were repurchases authorized?'

combined_model(text, question)

{'score': 0.901553213596344,
 'start': 2690,
 'end': 2704,
 'answer': 'September 2021'}

In [14]:
text = sec_10ks[text_keys[1]]
question = 'When were shares purchased?'

combined_model(text, question)

{'score': 0.6095575094223022,
 'start': 2690,
 'end': 2704,
 'answer': 'September 2021'}

In [15]:
text = sec_10ks[text_keys[1]]
question = 'How many shares were repurchased?'

combined_model(text, question)

{'score': 0.2499254047870636, 'start': 153, 'end': 160, 'answer': '138,455'}

In [16]:
text = sec_10ks[text_keys[1]]
question = 'What was the authorization amount?'

combined_model(text, question)

{'score': 0.42985936999320984,
 'start': 674,
 'end': 688,
 'answer': '$474.9 million'}

In [17]:
text = sec_10ks[text_keys[1]]
question = 'What was the purchase amount?'

combined_model(text, question)

{'score': 0.3907131552696228,
 'start': 3147,
 'end': 3159,
 'answer': '$500 million'}

## Testing the various questions using just Roberta

In [18]:
text = sec_10ks[text_keys[1]]
question = 'When were repurchases authorized?'

roberta_extract(question, text)

{'score': 0.9213380813598633,
 'start': 162667,
 'end': 162681,
 'answer': 'September 2021'}

In [19]:
text = sec_10ks[text_keys[1]]
question = 'When were shares purchased?'

roberta_extract(question, text)

{'score': 0.9353886246681213,
 'start': 94767,
 'end': 94780,
 'answer': 'December 2021'}

In [20]:
text = sec_10ks[text_keys[1]]
question = 'How many shares were repurchased?'

roberta_extract(question, text)

{'score': 0.9722062945365906,
 'start': 163340,
 'end': 163347,
 'answer': '138,455'}

In [21]:
text = sec_10ks[text_keys[1]]
question = 'What was the authorization amount?'

roberta_extract(question, text)

{'score': 0.6396191716194153,
 'start': 316680,
 'end': 316688,
 'answer': '$500,000'}

In [22]:
text = sec_10ks[text_keys[1]]
question = 'What was the purchase cost?'

roberta_extract(question, text)

{'score': 0.9680351614952087,
 'start': 317118,
 'end': 317125,
 'answer': '$25,100'}