In [15]:
import requests
from lxml import etree
import re



# Define function to load XML content
def request_content(url):
    page = requests.get(url)
    print(page)
    page_content = page.content
    return page_content


# Define functions to strip new line tags
def remove_new_lines(page_content):
    cleaned_page_content = page_content.replace(b'\n', b'')
    print("Removed new-line tags.")
    return cleaned_page_content


# Define functions to parse content with lxml
def parse_to_tree(cleaned_page_content):
    parser = etree.XMLParser()
    tree = etree.fromstring(cleaned_page_content, parser)
    print("Parse complete.")
    return tree


# Define functions to strip <I> tags so each <P> tag pulls in full
def strip_tags(tree, tags_to_strip):
    etree.strip_tags(tree, tags_to_strip)
    print("Tags removed.")
    return tree


# Define function to parse tree to list
def parse_tree_to_list(tree):
    header = str(tree.xpath('//TITLESTMT/TITLE/text()')[0])

    ECFR = []
    
    ECFR.append(header)

    chapters = tree.xpath('//DIV3')

    for chapter in chapters:
        chapter_content = chapter.xpath('.//*')
        for i in chapter_content:
            if i.text:
                ECFR.append(i.text)
            
    print("List created.")
            
    return ECFR

In [23]:
# Define ECFR url to request content from
url = 'https://www.govinfo.gov/bulkdata/ECFR/title-12/ECFR-title12.xml'      


# Run functions
page_content = request_content(url)

cleaned_page_content = remove_new_lines(page_content)

tree = parse_to_tree(cleaned_page_content)

tags_to_strip = ['HED', 'PSPACE', 'I']
strip_tags(tree, tags_to_strip)

title12 = parse_tree_to_list(tree)

<Response [200]>
Removed new-line tags.
Parse complete.
Tags removed.
List created.


In [24]:
title12

['Title 12: Banks and Banking',
 ' CHAPTER I - COMPTROLLER OF THE CURRENCY, DEPARTMENT OF THE TREASURY',
 'PART 1 - INVESTMENT SECURITIES',
 'Authority:12 U.S.C. 1 et seq., 24 (Seventh), and 93a. ',
 'Source:61 FR 63982, Dec. 2, 1996, unless otherwise noted.',
 '§ 1.1   Authority, purpose, scope, and reservation of authority.',
 '(a) Authority. This part is issued pursuant to 12 U.S.C. 1 et seq., 12 U.S.C. 24 (Seventh), and 12 U.S.C. 93a. ',
 '(b) Purpose This part prescribes standards under which national banks may purchase, sell, deal in, underwrite, and hold securities, consistent with the authority contained in 12 U.S.C. 24 (Seventh) and safe and sound banking practices. ',
 '(c) Scope. The standards set forth in this part apply to national banks and Federal branches of foreign banks.Further, pursuant to 12 U.S.C. 335, State banks that are members of the Federal Reserve System are subject to the same limitations and conditions that apply to national banks in connection with purchas

In [25]:
# Define ECFR url to request content from
url = 'https://www.govinfo.gov/bulkdata/ECFR/title-31/ECFR-title31.xml'

# Run functions
page_content = request_content(url)

cleaned_page_content = remove_new_lines(page_content)

tree = parse_to_tree(cleaned_page_content)

tags_to_strip = ['HED', 'PSPACE', 'I']
strip_tags(tree, tags_to_strip)

title31 = parse_tree_to_list(tree)

<Response [200]>
Removed new-line tags.
Parse complete.
Tags removed.
List created.


In [26]:
title31

['Title 31: Money and Finance: Treasury',
 ' CHAPTER I - MONETARY OFFICES, DEPARTMENT OF THE TREASURY',
 'PARTS 51-55 [RESERVED]',
 'PART 56 - DOMESTIC GOLD AND SILVER OPERATIONS SALE OF SILVER',
 'Authority:Sec. 209, 79 Stat. 257; 31 U.S.C. 405a-1.',
 '§ 56.1   Conditions upon which silver will be sold.',
 'The General Services Administration, as agent for the Treasury Department, will conduct periodic sales of silver as agreed upon between GSA and the Treasury Department. Sales will be under competitive bidding procedures established by agreement between GSA and the Treasury Department. Details of the bidding and selling procedures are obtainable by telephone or by writing to General Services Administration, Property Management and Disposal Service, Industry Materials Division, Metals Project, Washington, DC 20405.',
 '[32 FR 13380, Sept. 22, 1967]',
 '§ 56.2   Sales price.',
 'Sales of silver will be at prices offered through the competitive bidding procedures referred to in § 56.1,

In [28]:
banking_urls = ['https://www.govinfo.gov/bulkdata/ECFR/title-12/ECFR-title12.xml', 
                'https://www.govinfo.gov/bulkdata/ECFR/title-31/ECFR-title31.xml']

banking_content = []



for url in banking_urls:
    page_content = request_content(url)
    cleaned_page_content = remove_new_lines(page_content)
    tree = parse_to_tree(cleaned_page_content)
    tags_to_strip = ['HED', 'PSPACE', 'I']
    strip_tags(tree, tags_to_strip)
    title_content = parse_tree_to_list(tree)
    
    banking_content.append(title_content)

<Response [200]>
Removed new-line tags.
Parse complete.
Tags removed.
List created.
<Response [200]>
Removed new-line tags.
Parse complete.
Tags removed.
List created.


In [30]:
banking_content

[['Title 12: Banks and Banking',
  ' CHAPTER I - COMPTROLLER OF THE CURRENCY, DEPARTMENT OF THE TREASURY',
  'PART 1 - INVESTMENT SECURITIES',
  'Authority:12 U.S.C. 1 et seq., 24 (Seventh), and 93a. ',
  'Source:61 FR 63982, Dec. 2, 1996, unless otherwise noted.',
  '§ 1.1   Authority, purpose, scope, and reservation of authority.',
  '(a) Authority. This part is issued pursuant to 12 U.S.C. 1 et seq., 12 U.S.C. 24 (Seventh), and 12 U.S.C. 93a. ',
  '(b) Purpose This part prescribes standards under which national banks may purchase, sell, deal in, underwrite, and hold securities, consistent with the authority contained in 12 U.S.C. 24 (Seventh) and safe and sound banking practices. ',
  '(c) Scope. The standards set forth in this part apply to national banks and Federal branches of foreign banks.Further, pursuant to 12 U.S.C. 335, State banks that are members of the Federal Reserve System are subject to the same limitations and conditions that apply to national banks in connection wit

In [31]:
len(banking_content[0]) + len(banking_content[1])

188151

In [95]:
# Inverted index datastructure
import nltk
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer  # Assuming we're working with English
 
class Index:
    """ Inverted index datastructure """
 
    def __init__(self, tokenizer, stemmer=None, stopwords=None):
        """
        tokenizer   -- NLTK compatible tokenizer function
        stemmer     -- NLTK compatible stemmer 
        stopwords   -- list of ignored words
        """
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.index = defaultdict(list)
        self.documents = {}
        self.__unique_id = 0
        if not stopwords:
            self.stopwords = set()
        else:
            self.stopwords = set(stopwords)
 
    def lookup(self, word):
        """
        Lookup a word in the index
        """
        word = word.lower()
        if self.stemmer:
            word = self.stemmer.stem(word)
 
        return [self.documents.get(id, None) for id in self.index.get(word)]
 
    def add(self, document):
        """
        Add a document string to the index
        """
        for token in [t.lower() for t in nltk.word_tokenize(document)]:
            if token in self.stopwords:
                continue
 
            if self.stemmer:
                token = self.stemmer.stem(token)
 
            if self.__unique_id not in self.index[token]:
                self.index[token].append(self.__unique_id)
 
        self.documents[self.__unique_id] = document
        self.__unique_id += 1           
 
 
index = Index(nltk.word_tokenize, 
              EnglishStemmer(), 
              nltk.corpus.stopwords.words('english'))

In [None]:
# Add sentences to index
for string in ECFR:
    index.add(string)


In [135]:
# Create search function
def search(search_term):
    lookup = index.lookup(search_term)
    return lookup

In [139]:
# Test search function
search('customer')

['(a) Authority. A national bank may provide credit life insurance to loan customers pursuant to 12 U.S.C. 24 (Seventh). ',
 '(b) It is an unsafe and unsound practice for any director, officer, employee, or principal shareholder of a national bank (including any entity in which this person owns an interest of more than ten percent), who is involved in the sale of credit life insurance to loan customers of the national bank, to take advantage of that business opportunity for personal profit. Recommendations to customers to buy insurance should be based on the benefits of the policy, not the commissions received from the sale. ',
 '(c) Except as provided in §§ 2.4 and 2.5(b), and paragraph (d) of this section, a director, officer, employee, or principal shareholder of a national bank, or an entity in which such person owns an interest of more than ten percent, may not retain commissions or other income from the sale of credit life insurance in connection with any loan made by that bank, 

In [81]:
import pandas as pd

df = pd.DataFrame(title12)
df["Definition"] = False
df["Requirement"] = False


In [82]:
df.head(30)

Unnamed: 0,0,Definition,Requirement
0,Title 12: Banks and Banking,False,False
1,"CHAPTER I - COMPTROLLER OF THE CURRENCY, DEPA...",False,False
2,PART 1 - INVESTMENT SECURITIES,False,False
3,"§ 1.1 Authority, purpose, scope, and reserva...",False,False
4,(a) Authority. This part is issued pursuant to...,False,False
5,(b) Purpose This part prescribes standards und...,False,False
6,(c) Scope. The standards set forth in this par...,False,False
7,(d) Reservation of authority. The OCC may dete...,False,False
8,§ 1.2 Definitions.,False,False
9,(a) Capital and surplus means:,False,False


In [83]:
# Save to disk

df.to_csv('../data/ECFR-title12.csv')