In [2]:
import nltk
import spacy
import re

import requests
from bs4 import BeautifulSoup as bs

from collections import OrderedDict

In [3]:
# Read the URL list: one URL per line; the first line is dropped
# (presumably the CSV header row -- verify against the data file).

URLS = '../data/ffiec_urls.csv'

with open(URLS, mode='r') as f:
    # Strip the line terminator and any stray spaces from each line.
    url_lines = [line.replace('\n', '').replace(' ', '') for line in f]

# Slicing (rather than pop(0)) tolerates an empty file, and filtering out
# blank lines keeps empty strings from reaching requests.get() later on.
pages = [url for url in url_lines[1:] if url]

['https://bsaaml.ffiec.gov/manual/ComplianceProgram/01',
 'https://bsaaml.ffiec.gov/manual/ComplianceProgram/02',
 'https://bsaaml.ffiec.gov/manual/ComplianceProgram/03',
 'https://bsaaml.ffiec.gov/manual/ComplianceProgram/04',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/01',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/02',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/03',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/04',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/05',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/06',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/07',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/08',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/09',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/10',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/11',
 'https://bsaaml.ffiec.gov/manual/RegulatoryRequirements/12',
 'https://bsaaml.ffiec.gov/m

In [5]:
# Download every page and parse it into a BeautifulSoup tree

REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the cell forever

list_of_soup = []

for url in pages:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines. 'html.parser' ships with Python; switch to 'lxml'
    # here if that is what the original run relied on.
    list_of_soup.append(bs(response.content, 'html.parser'))

In [6]:
# Extract content and append to one list

def get_content_list(soup):
    """Return the cleaned text of every content element on one parsed page.

    Looks up ``<section id="content">`` in ``soup`` and collects the text of
    each ``h3``, ``p``, ``h4``, ``h6`` and ``li`` element inside it, in the
    order the parser returns them. Line breaks and runs of seven spaces are
    removed from each text.

    Returns an empty list when the page has no ``<section id="content">``
    (previously this raised ``AttributeError`` on ``None``).
    """
    content = soup.find('section', id="content")
    if content is None:
        # Nothing to extract from this page; let the caller carry on.
        return []

    # find_all is the current spelling of the deprecated findAll alias.
    content_container = content.find_all(['h3', 'p', 'h4', 'h6', 'li'])

    return [
        element.text.replace('\r\n', '').replace('\n', '').replace('       ', '')
        for element in content_container
    ]


# One list of cleaned text items per downloaded page.
extracted_content = [get_content_list(page_soup) for page_soup in list_of_soup]

# Total number of text items across all pages (checked again after flattening).
count = sum(len(section_items) for section_items in extracted_content)

print(len(extracted_content), "   =  Number of extracted sections")
print(count, " =  Number of extracted objects")

72    =  Number of extracted sections
4925  =  Number of extracted objects


In [8]:
# Flatten the per-page lists into one list of text items

from itertools import chain

ffiec_content = list(chain.from_iterable(extracted_content))

# Sanity check: flattening must not lose or duplicate any item.
if len(ffiec_content) != count:
    print("ERROR: ITEM COUNT MISMATCH\nMISSING DATA")
else:
    print("PASS: ITEM COUNTS MATCH")

PASS: ITEM COUNTS MATCH


In [9]:
# NLTK Sentence Tokenizer

import functools


@functools.lru_cache(maxsize=128)
def sentence_tokenizer(content):
    """Split one text fragment into sentences with NLTK's ``sent_tokenize``.

    Results are memoized for up to 128 distinct inputs, so ``content`` must
    be hashable and a repeated input returns the same cached list object.
    """
    return nltk.tokenize.sent_tokenize(content)



# One list of sentences per extracted text item, in page order.
sentences = [
    sentence_tokenizer(fragment)
    for section in extracted_content
    for fragment in section
]

# Flatten into a single list of sentences.
nltk_sentences = list(chain.from_iterable(sentences))


In [14]:
# Number of extracted text items, then number of NLTK sentences, one per line.
print(len(ffiec_content), len(nltk_sentences), sep='\n')

4925
7437


In [15]:
# Inverted index datastructure
import nltk
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer  # Assuming we're working with English
 
class Index:
    """Inverted index mapping normalized tokens to the documents containing them.

    Each added document receives an integer id, assigned sequentially from 0.
    ``index`` maps a token to the list of document ids containing it (each id
    at most once), and ``documents`` maps an id back to the original string.
    """

    def __init__(self, tokenizer, stemmer=None, stopwords=None):
        """
        tokenizer   -- NLTK compatible tokenizer function
        stemmer     -- NLTK compatible stemmer
        stopwords   -- list of ignored words
        """
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.index = defaultdict(list)
        self.documents = {}
        self.__unique_id = 0
        self.stopwords = set(stopwords) if stopwords else set()

    def lookup(self, word):
        """
        Lookup a word in the index

        The word is lower-cased (and stemmed when a stemmer is configured)
        before the lookup. Returns the list of matching document strings, or
        an empty list when the word is not indexed.
        """
        word = word.lower()
        if self.stemmer:
            word = self.stemmer.stem(word)

        # .get() with a default: unknown words yield [] instead of iterating
        # over None, and no empty posting list is created in the defaultdict.
        return [self.documents.get(doc_id) for doc_id in self.index.get(word, [])]

    def add(self, document):
        """
        Add a document string to the index

        Tokens are lower-cased, stopwords are skipped (checked before
        stemming), and the remaining tokens are stemmed when a stemmer is
        configured.
        """
        doc_id = self.__unique_id

        # Use the tokenizer injected at construction time; fall back to the
        # previously hard-coded NLTK tokenizer if None was passed.
        tokenize = self.tokenizer if self.tokenizer is not None else nltk.word_tokenize

        for token in tokenize(document):
            token = token.lower()
            if token in self.stopwords:
                continue

            if self.stemmer:
                token = self.stemmer.stem(token)

            postings = self.index[token]
            # Ids are appended in increasing order, so the current document
            # can only be the last entry; this avoids a scan of the whole list.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)

        self.documents[doc_id] = document
        self.__unique_id += 1
 
 
# Build the index with NLTK's word tokenizer, the Snowball English stemmer
# and NLTK's English stopword list.
index = Index(
    tokenizer=nltk.word_tokenize,
    stemmer=EnglishStemmer(),
    stopwords=nltk.corpus.stopwords.words('english'),
)

In [16]:
# Index every NLTK sentence; each call stores the sentence under a new id,
# so running this cell twice indexes every sentence twice.
for sentence in nltk_sentences:
    index.add(sentence)

In [17]:
# Create search function
def search(search_term):
    """Return the indexed sentences matching ``search_term``.

    Thin wrapper that delegates to ``lookup`` on the module-level ``index``.
    """
    return index.lookup(search_term)

In [18]:
# Test search function: show the indexed sentences matched by 'responsible'.
# Lookup works on the stemmed token, which is why the output below also lists
# sentences containing e.g. 'Response' or 'responsibilities'.
search('responsible')

['Response to request letter items.',
 'Designate an individual or individuals responsible for managing BSA compliance (BSA compliance officer).',
 'The board of directors, acting through senior management, is ultimately responsible for ensuring that the bank maintains an effective BSA/AML internal control structure, including suspicious activity monitoring and reporting.',
 'Identify a person or persons responsible for BSA/AML compliance.',
 'Meet all regulatory recordkeeping and reporting requirements, meet recommendations for BSA/AML compliance, and provide for timely updates in response to changes in regulations.33Refer to Appendix P ("BSA Record Retention Requirements") for guidance.',
 'For example, employees that complete the reporting forms (such as SARs, CTRs, and CTR exemptions) generally should not also be responsible for the decision to file the reports or grant the exemptions.',
 'Train employees to be aware of their responsibilities under the BSA regulations and internal 

In [19]:
# Define requirement terms
search_terms = ['shall', 'must', 'require', 'need', 'make', 'ensure', 'responsible']

# Search for every term and merge the hits into one set, so a sentence that
# matches several terms is kept once. Driving this from the list means that
# adding or removing a term only requires editing `search_terms`.
all_results = {
    sentence
    for term in search_terms
    for sentence in search(term)
}

In [20]:
print(len(all_results)," =  Number of extracted requirements")

1079  =  Number of extracted requirements


In [21]:
# Prepare rows for export: [item number, requirement sentence]
# `all_results` is a set of strings, whose iteration order changes between
# kernel runs; sorting first keeps the item numbers stable across re-runs.
req_export = [
    [item_num, item_req]
    for item_num, item_req in enumerate(sorted(all_results), start=1)
]

In [22]:
# Export list to CSV

import csv

# 'w' rather than 'a': item numbers restart at 1 on every run, so appending
# would duplicate rows and numbers each time this cell is re-executed.
# newline='' lets the csv writer control line endings itself, and the explicit
# encoding avoids platform-dependent failures on non-ASCII characters such as
# the curly quotes present in the scraped text.
with open('../data/ffiec_requirements.csv', 'w', newline='', encoding='utf-8') as outcsv:
    # configure writer to write standard csv file
    writer = csv.writer(outcsv, lineterminator='\n')
    # One row per [item number, requirement] pair.
    writer.writerows(req_export)

In [None]:
# TEST AND COMPARE SPACY RESULTS TO NLTK

In [7]:
# Instantiate the spaCy pipeline from the installed `en_core_web_sm` model package.
# `nlp` is used below to split text into sentences via `doc.sents`.
import en_core_web_sm
nlp = en_core_web_sm.load()

In [8]:
# Run the pipeline on one extracted text item and list its sentences by position.
doc = nlp(ffiec_content[22])
for position, sentence in enumerate(doc.sents):
    print(f"[{position}] {sentence.text}")

[0]     The examiner should develop and document an initial examination plan commensurate with the overall BSA/AML risk profile of the bank.
[1] This plan may change during the examination as a result of on-site findings, and any changes to the plan should likewise be documented.
[2] The examiner should prepare a request letter to the bank.
[3] Suggested request letter items are detailed in    Appendix H (“Request Letter Items (Core and Expanded)”).
[4] On the basis of the risk profile, quality of audit, previous examination findings, and initial examination work, examiners should complete additional core and expanded examination procedures, as appropriate.
[5] The examiner must include an evaluation of the BSA/AML compliance program within the supervisory plan or cycle.
[6] At larger, more complex banking organizations, examiners may complete various types of examinations throughout the supervisory plan or cycle to assess BSA/AML compliance.
[7] These reviews may focus on one or more 

In [9]:
# Create a standalone spaCy Sentencizer component.
# NOTE(review): no visible cell adds `sentencizer` to `nlp` or calls it, so the
# sentence splits below come from `nlp` alone -- confirm whether this is intended.
from spacy.pipeline import Sentencizer
sentencizer = Sentencizer()

In [10]:
# Sentence text for every extracted item, split by the spaCy pipeline,
# flattened into one list in item order.
spacy_sentences = [
    span.text
    for fragment in ffiec_content
    for span in nlp(fragment).sents
]

In [11]:
# Compare how many sentences each splitter produced.
spacy_total = len(spacy_sentences)
nltk_total = len(nltk_sentences)

print(f"Spacy: {spacy_total}")
print(f"NLTK : {nltk_total}")
print(f"Diff : {spacy_total - nltk_total}")

Spacy: 8260
NLTK : 7437
Diff : 823
