# Overview
- Read labelled data files and build a dataframe with one data file parsed per row

# Initialization

In [19]:
import pandas as pd
import numpy as np
import matplotlib
import os
import re
import pdfminer as pdfm

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import nltk
nltk.download("punkt")

import string
from collections import Counter

[nltk_data] Downloading package punkt to /Users/emilyng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Functions

## Create list of useless_words

In [2]:
nltk_stopwords = nltk.corpus.stopwords.words("english")
# dont_stop = [] # Put any words that we want to keep that are in nltk_stopwords here, so we can remove them from the list
punct_list = list(string.punctuation)
# dont_stop_punct = []

#mick_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
#             'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
#             '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'id', '2d', 'one', 'two', '3d', 'ibid'
#            ]

tmp = nltk_stopwords + punct_list # + mick_list
useless_words = tmp
# useless_words = tmp minus the dont* variables
#useless_words

## Text Prep Functions
- get text from pdf
- clean the text (remove garbage characters)
- filter out less useful words
- count words

In [3]:
def get_text_from_pdf(pdf_path) :
    output_string = StringIO()
    with open(pdf_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return output_string.getvalue()    

In [29]:
# This should be replaced using regular expressions and can be significantly enhanced functionally.
# Almost a placeholder function right now, just removing newlines.
def clean_text(input_string) :
    str1 = input_string.replace(" \n", "")
    str2 = str1.replace("\n", "")
    str3 = str2.replace("\\x0c", "")
    str4 = re.sub('[^A-Za-z0-9 ]+', '', str3)
    str5 = re.sub('\d+', '', str4)
    final_string = str5.lower()
    return final_string

In [14]:
def filter_words(input_words) :
    filtered_words = []
    for word in input_words :
        append_it = True
        if word in useless_words :
            #print(f"useless word {word}")
            append_it = False
        elif len(word) == 1 :
            #print(f"word length 1 {word}")
            append_it = False    
        elif word.isdigit() :
            #print(f"number {word}")
            append_it = False
        elif word[0] == chr(167) :
            #print(f"section symbol {word}")
            append_it = False
        if append_it :
            filtered_words.append(word)
    return filtered_words

In [15]:
def count_words(word_list) :
    word_counter = Counter(word_list)
    wc_rev_sort = sorted(word_counter.items(), key=lambda pair: pair[1], reverse=True)
    return wc_rev_sort

In [16]:
def build_bow2(words) :
    dict = {}
    for word in words :
        append_it = True
        if word in useless_words :
            #print(f"useless word {word}")
            append_it = False
        if len(word) == 1 :
            #print(f"word length 1 {word}")
            append_it = False
        if word.isdigit() :
            #print(f"number {word}")
            append_it = False
        if word[0] == chr(167) :
            #print(f"section symbol {word}")
            append_it = False
        if append_it :
            dict[word] = 1
    return dict

# Create DataFrame for ML Use

In [31]:
# We will populate these series
doc_num = pd.Series([], name='doc_num', dtype='int')
doc_filepath = pd.Series([], name='doc_filepath', dtype='str')
doc_text = pd.Series([], name='doc_text', dtype='str')
text_cleaned = pd.Series([], name='text_cleaned', dtype='str')
nltk_words = pd.Series([], name='nltk_words', dtype='str')
filtered_words = pd.Series([], name='filtered_words', dtype='str')
word_counts = pd.Series([], name='word_counts', dtype='str')
env_label = pd.Series([], name='env_label', dtype='str')
bow = pd.Series([], name='bow', dtype='str')

doc_ctr = 0

# Process the Environmental data
envdir = 'Data/Environmental'
for file_nm in os.listdir(envdir) :
    filepath = envdir + '/' + file_nm
    doc_str = get_text_from_pdf(filepath)
    txt_cln = clean_text(doc_str)
    nltk_wds = nltk.word_tokenize(txt_cln)
    filt_words = filter_words(nltk_wds)
    wc = count_words(filt_words)
    word_counts[doc_ctr] = wc
    bow2 = build_bow2(filt_words)
    
    # break # for debugging all text cleanups

    doc_num[doc_ctr] = doc_ctr
    doc_filepath[doc_ctr] = filepath
    doc_text[doc_ctr] = doc_str
    text_cleaned[doc_ctr] = txt_cln
    nltk_words[doc_ctr] = nltk_wds
    filtered_words[doc_ctr] = filt_words
    env_label[doc_ctr] = 'Environmental'
    bow[doc_ctr] = bow2
    doc_ctr += 1

# Process the Non-Environmental Data - same loop, should get into a function later
envdir = 'Data/NonEnvironmental'
for file_nm in os.listdir(envdir) :
    filepath = envdir + '/' + file_nm
    doc_str = get_text_from_pdf(filepath)
    txt_cln = clean_text(doc_str)
    nltk_wds = nltk.word_tokenize(txt_cln)
    filt_words = filter_words(nltk_wds)
    wc = count_words(filt_words)
    word_counts[doc_ctr] = wc
    bow2 = build_bow2(filt_words)

    doc_num[doc_ctr] = doc_ctr
    doc_filepath[doc_ctr] = filepath
    doc_text[doc_ctr] = doc_str
    text_cleaned[doc_ctr] = txt_cln
    nltk_words[doc_ctr] = nltk_wds
    filtered_words[doc_ctr] = filt_words
    env_label[doc_ctr] = 'NonEnvironmental'
    bow[doc_ctr] = bow2
    doc_ctr += 1

# Assemble the final data frame
doc_df = doc_num.to_frame().\
         join(doc_filepath).\
         join(doc_text).\
         join(text_cleaned).\
         join(nltk_words).\
         join(filtered_words).\
         join(word_counts).\
         join(bow).\
         join(env_label)
print(doc_df)

     doc_num                               doc_filepath  \
0          0      Data/Environmental/PLAW-104publ70.pdf   
1          1     Data/Environmental/PLAW-112publ177.pdf   
2          2      Data/Environmental/PLAW-116publ63.pdf   
3          3     Data/Environmental/PLAW-110publ288.pdf   
4          4     Data/Environmental/PLAW-108publ425.pdf   
..       ...                                        ...   
134      134   Data/NonEnvironmental/PLAW-114publ38.pdf   
135      135  Data/NonEnvironmental/PLAW-115publ281.pdf   
136      136  Data/NonEnvironmental/PLAW-115publ280.pdf   
137      137   Data/NonEnvironmental/PLAW-116publ52.pdf   
138      138  Data/NonEnvironmental/PLAW-116publ107.pdf   

                                              doc_text  \
0    PUBLIC LAW 104–70—DEC. 23, 1995\n\n109 STAT. 7...   
1    PUBLIC LAW 112–177—SEPT. 28, 2012 \n\n126 STAT...   
2    133 STAT. 1120 \n\nPUBLIC LAW 116–63—OCT. 4, 2...   
3    PUBLIC LAW 110–288—JULY 29, 2008 \n\nCLEAN BOA...   
4

# Validation

In [34]:
doc_str

'133 STAT. 3292 \n\nPUBLIC LAW 116–107—JAN. 17, 2020 \n\nPublic Law 116–107 \n116th Congress \n\nAn Act \n\nJan. 17, 2020 \n\n[H.R. 2385] \n\nTo permit the Secretary of Veterans Affairs to establish a grant program to conduct \ncemetery  research  and  produce  educational  materials  for  the  Veterans  Legacy \nProgram. \n\nBe  it  enacted  by  the  Senate  and  House  of  Representatives  of \n\nthe United States of America in Congress assembled, \n\n38 USC 2400 \nnote. \n\nSECTION  1.  GRANTS  FOR  CEMETERY  RESEARCH  AND  THE  PRODUC-\n\nTION OF EDUCATIONAL MATERIALS. \n\n(a) GRANTS AUTHORIZED.— \n\n(1)  IN GENERAL.—The  Secretary  of  Veterans  Affairs  may \nestablish  a  grant  program  to  conduct  cemetery  research  and \nproduce  educational  materials  for  the  Veterans  Legacy  Pro-\ngram. \n\n(2)  ELIGIBLE RECIPIENTS.—The  Secretary  may  award  a \n\ngrant under this section to any of the following entities: \n(A) An institution of higher learning. \n(B) A local educat

In [32]:
txt_cln

' stat public law jan  public law th congressan actjan  hr to permit the secretary of veterans affairs to establish a grant program to conductcemetery  research  and  produce  educational  materials  for  the  veterans  legacyprogrambe  it  enacted  by  the  senate  and  house  of  representatives  ofthe united states of america in congress assembled usc notesection    grants  for  cemetery  research  and  the  production of educational materialsa grants authorized  in generalthe  secretary  of  veterans  affairs  mayestablish  a  grant  program  to  conduct  cemetery  research  andproduce  educational  materials  for  the  veterans  legacy  program  eligible recipientsthe  secretary  may  award  agrant under this section to any of the following entitiesa an institution of higher learningb a local education agencyc  a  nonprofit  entity  that  the  secretary  determineshas a demonstrated history of community engagementd  another  recipient  the  secretary  determines  to  beappropriate

# And write it out

In [33]:
doc_df.to_csv("Data/doc_list1.csv")