# Overview
- Read csv with document data in it
- Split into train/test data
- Train model
- Test model

# Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import os
import pdfminer as pdfm

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import nltk
nltk.download("punkt")
from nltk.classify import NaiveBayesClassifier

import string
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MickC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Read the document data

In [2]:
doc_df = pd.read_csv("Data/doc_list.csv")
doc_df

Unnamed: 0.1,Unnamed: 0,doc_num,doc_filepath,doc_text,text_cleaned,nltk_words,filtered_words,word_counts,bow,env_label
0,0,0,Data/EnvironmentLabel/esa_act.pdf,ENDANGERED SPECIES ACT OF 1973\n\n\n1 \n\n2\n...,endangered species act of 197312endangered s...,"['endangered', 'species', 'act', 'of', '19731'...","['endangered', 'species', 'act', 'endangered',...","[('species', 327), ('secretary', 268), ('shall...","{'endangered': 1, 'species': 1, 'act': 1, 'ame...",Environmental
1,1,1,Data/EnvironmentLabel/PLAW-108publ160.pdf,"PUBLIC LAW 108–160—DEC. 6, 2003\n\n117 STAT. 2...","public law 108–160—dec. 6, 2003117 stat. 2013p...","['public', 'law', '108–160—dec', '.', '6', ','...","['public', 'law', '108–160—dec', 'stat', '2013...","[('environmental', 7), ('resolution', 4), ('di...","{'public': 1, 'law': 1, '108–160—dec': 1, 'sta...",Environmental
2,2,2,Data/EnvironmentLabel/PLAW-110publ255.pdf,"PUBLIC LAW 110–255—JUNE 30, 2008 \n\n122 STAT....","public law 110–255—june 30, 2008122 stat. 2423...","['public', 'law', '110–255—june', '30', ',', '...","['public', 'law', '110–255—june', 'stat', '242...","[('environmental', 8), ('law', 7), ('agency', ...","{'public': 1, 'law': 1, '110–255—june': 1, 'st...",Environmental
3,3,3,Data/EnvironmentLabel/PLAW-110publ365.pdf,"PUBLIC LAW 110–365—OCT. 8, 2008 \n\n122 STAT. ...","public law 110–365—oct. 8, 2008122 stat. 4021p...","['public', 'law', '110–365—oct', '.', '8', ','...","['public', 'law', '110–365—oct', 'stat', '4021...","[('project', 13), ('act', 11), ('non-federal',...","{'public': 1, 'law': 1, '110–365—oct': 1, 'sta...",Environmental
4,4,4,Data/EnvironmentLabel/PLAW-110publ414.pdf,"PUBLIC LAW 110–414—OCT. 14, 2008 \n\n122 STAT....","public law 110–414—oct. 14, 2008122 stat. 4341...","['public', 'law', '110–414—oct', '.', '14', ',...","['public', 'law', '110–414—oct', 'stat', '4341...","[('mercury', 51), ('elemental', 38), ('shall',...","{'public': 1, 'law': 1, '110–414—oct': 1, 'sta...",Environmental
5,5,5,Data/EnvironmentLabel/PLAW-111publ191.pdf,"124 STAT. 1278 \n\nPUBLIC LAW 111–191—JUNE 15,...","124 stat. 1278public law 111–191—june 15, 2010...","['124', 'stat', '.', '1278public', 'law', '111...","['stat', '1278public', 'law', '111–191—june', ...","[('oil', 7), ('act', 4), ('advances', 4), ('la...","{'stat': 1, '1278public': 1, 'law': 1, '111–19...",Environmental
6,6,6,Data/EnvironmentLabel/PLAW-111publ199.pdf,"PUBLIC LAW 111–199—JULY 7, 2010 \n\n124 STAT. ...","public law 111–199—july 7, 2010124 stat. 1359p...","['public', 'law', '111–199—july', '7', ',', '2...","['public', 'law', '111–199—july', 'stat', '135...","[('ii', 39), ('formaldehyde', 27), ('wood', 21...","{'public': 1, 'law': 1, '111–199—july': 1, 'st...",Environmental
7,7,7,Data/EnvironmentLabel/PLAW-111publ215.pdf,"PUBLIC LAW 111–215—JULY 30, 2010 \n\n124 STAT....","public law 111–215—july 30, 2010124 stat. 2347...","['public', 'law', '111–215—july', '30', ',', '...","['public', 'law', '111–215—july', 'stat', '234...","[('public', 3), ('law', 3), ('date', 2), ('sta...","{'public': 1, 'law': 1, '111–215—july': 1, 'st...",Environmental
8,8,8,Data/EnvironmentLabel/PLAW-111publ378.pdf,"124 STAT. 4128 \n\nPUBLIC LAW 111–378—JAN. 4, ...","124 stat. 4128public law 111–378—jan. 4, 2011p...","['124', 'stat', '.', '4128public', 'law', '111...","['stat', '4128public', 'law', '111–378—jan', '...","[('stormwater', 7), ('federal', 5), ('fee', 5)...","{'stat': 1, '4128public': 1, 'law': 1, '111–37...",Environmental
9,9,9,Data/EnvironmentLabel/PLAW-111publ380.pdf,"PUBLIC LAW 111–380—JAN. 4, 2011 \n\n124 STAT. ...","public law 111–380—jan. 4, 2011124 stat. 4131j...","['public', 'law', '111–380—jan', '.', '4', ','...","['public', 'law', '111–380—jan', 'stat', '4131...","[('lead', 13), ('drinking', 5), ('used', 5), (...","{'public': 1, 'law': 1, '111–380—jan': 1, 'sta...",Environmental


In [3]:
doc_df.shape

(46, 9)

# Split Training vs Testing Data

In [3]:
train_filter = doc_df['doc_num']%2 == 0
test_filter = ~train_filter
#train_filter
#test_filter
training_df = doc_df.loc[train_filter]
testing_df = doc_df.loc[test_filter]

In [4]:
training_df.shape

(23, 10)

In [5]:
testing_df.head()

Unnamed: 0.1,Unnamed: 0,doc_num,doc_filepath,doc_text,text_cleaned,nltk_words,filtered_words,word_counts,bow,env_label
1,1,1,Data/EnvironmentLabel/PLAW-108publ160.pdf,"PUBLIC LAW 108–160—DEC. 6, 2003\n\n117 STAT. 2...","public law 108–160—dec. 6, 2003117 stat. 2013p...","['public', 'law', '108–160—dec', '.', '6', ','...","['public', 'law', '108–160—dec', 'stat', '2013...","[('environmental', 7), ('resolution', 4), ('di...","{'public': 1, 'law': 1, '108–160—dec': 1, 'sta...",Environmental
3,3,3,Data/EnvironmentLabel/PLAW-110publ365.pdf,"PUBLIC LAW 110–365—OCT. 8, 2008 \n\n122 STAT. ...","public law 110–365—oct. 8, 2008122 stat. 4021p...","['public', 'law', '110–365—oct', '.', '8', ','...","['public', 'law', '110–365—oct', 'stat', '4021...","[('project', 13), ('act', 11), ('non-federal',...","{'public': 1, 'law': 1, '110–365—oct': 1, 'sta...",Environmental
5,5,5,Data/EnvironmentLabel/PLAW-111publ191.pdf,"124 STAT. 1278 \n\nPUBLIC LAW 111–191—JUNE 15,...","124 stat. 1278public law 111–191—june 15, 2010...","['124', 'stat', '.', '1278public', 'law', '111...","['stat', '1278public', 'law', '111–191—june', ...","[('oil', 7), ('act', 4), ('advances', 4), ('la...","{'stat': 1, '1278public': 1, 'law': 1, '111–19...",Environmental
7,7,7,Data/EnvironmentLabel/PLAW-111publ215.pdf,"PUBLIC LAW 111–215—JULY 30, 2010 \n\n124 STAT....","public law 111–215—july 30, 2010124 stat. 2347...","['public', 'law', '111–215—july', '30', ',', '...","['public', 'law', '111–215—july', 'stat', '234...","[('public', 3), ('law', 3), ('date', 2), ('sta...","{'public': 1, 'law': 1, '111–215—july': 1, 'st...",Environmental
9,9,9,Data/EnvironmentLabel/PLAW-111publ380.pdf,"PUBLIC LAW 111–380—JAN. 4, 2011 \n\n124 STAT. ...","public law 111–380—jan. 4, 2011124 stat. 4131j...","['public', 'law', '111–380—jan', '.', '4', ','...","['public', 'law', '111–380—jan', 'stat', '4131...","[('lead', 13), ('drinking', 5), ('used', 5), (...","{'public': 1, 'law': 1, '111–380—jan': 1, 'sta...",Environmental


In [6]:
training_df.head()

Unnamed: 0.1,Unnamed: 0,doc_num,doc_filepath,doc_text,text_cleaned,nltk_words,filtered_words,word_counts,bow,env_label
0,0,0,Data/EnvironmentLabel/esa_act.pdf,ENDANGERED SPECIES ACT OF 1973\n\n\n1 \n\n2\n...,endangered species act of 197312endangered s...,"['endangered', 'species', 'act', 'of', '19731'...","['endangered', 'species', 'act', 'endangered',...","[('species', 327), ('secretary', 268), ('shall...","{'endangered': 1, 'species': 1, 'act': 1, 'ame...",Environmental
2,2,2,Data/EnvironmentLabel/PLAW-110publ255.pdf,"PUBLIC LAW 110–255—JUNE 30, 2008 \n\n122 STAT....","public law 110–255—june 30, 2008122 stat. 2423...","['public', 'law', '110–255—june', '30', ',', '...","['public', 'law', '110–255—june', 'stat', '242...","[('environmental', 8), ('law', 7), ('agency', ...","{'public': 1, 'law': 1, '110–255—june': 1, 'st...",Environmental
4,4,4,Data/EnvironmentLabel/PLAW-110publ414.pdf,"PUBLIC LAW 110–414—OCT. 14, 2008 \n\n122 STAT....","public law 110–414—oct. 14, 2008122 stat. 4341...","['public', 'law', '110–414—oct', '.', '14', ',...","['public', 'law', '110–414—oct', 'stat', '4341...","[('mercury', 51), ('elemental', 38), ('shall',...","{'public': 1, 'law': 1, '110–414—oct': 1, 'sta...",Environmental
6,6,6,Data/EnvironmentLabel/PLAW-111publ199.pdf,"PUBLIC LAW 111–199—JULY 7, 2010 \n\n124 STAT. ...","public law 111–199—july 7, 2010124 stat. 1359p...","['public', 'law', '111–199—july', '7', ',', '2...","['public', 'law', '111–199—july', 'stat', '135...","[('ii', 39), ('formaldehyde', 27), ('wood', 21...","{'public': 1, 'law': 1, '111–199—july': 1, 'st...",Environmental
8,8,8,Data/EnvironmentLabel/PLAW-111publ378.pdf,"124 STAT. 4128 \n\nPUBLIC LAW 111–378—JAN. 4, ...","124 stat. 4128public law 111–378—jan. 4, 2011p...","['124', 'stat', '.', '4128public', 'law', '111...","['stat', '4128public', 'law', '111–378—jan', '...","[('stormwater', 7), ('federal', 5), ('fee', 5)...","{'stat': 1, '4128public': 1, 'law': 1, '111–37...",Environmental


# Train the Model

In [7]:
def create_nb_input(doc_df, label) :
    ctr = 0
    for word_list in doc_df['bow'].tolist() :
#        print(type(word_list))
#        print(word_list)
#        break
        if ctr == 0 :
            nb_input = [(word_list, label)]
        else :
            nb_input.append((word_list, label))
        ctr += 1
    return nb_input

In [8]:
filter = training_df['env_label'] == "Environmental"
env_training_df = training_df.loc[filter]
notenv_training_df = training_df.loc[~filter]

In [14]:
nb_env_input = create_nb_input(env_training_df, 'Environmental')
nb_notenv_input = create_nb_input(notenv_training_df, 'NotEnvironmental')
print(nb_env_input[0][0])

{'endangered': 1, 'species': 1, 'act': 1, 'amended': 1, 'p.l': 1, '108-136': 1, 'november': 1, 'provide': 1, 'conservation': 1, 'threatened': 1, 'offish': 1, 'wildlife': 1, 'plants': 1, 'purposes.be': 1, 'enacted': 1, 'senate': 1, 'house': 1, 'representatives': 1, 'theunited': 1, 'states': 1, 'america': 1, 'congress': 1, 'assembled': 1, 'maybe': 1, 'cited': 1, '.table': 1, 'contentssec': 1, '2.': 1, 'findings': 1, 'purposes': 1, 'policy.sec': 1, '3.': 1, 'definitions.sec': 1, '4.': 1, 'determination': 1, 'species.sec': 1, '5.': 1, 'land': 1, 'acquisition.sec': 1, '6.': 1, 'cooperation': 1, 'states.sec': 1, '7.': 1, 'interagency': 1, 'cooperation.sec': 1, '8.': 1, 'international': 1, '8a': 1, 'convention': 1, 'implementation.sec': 1, '9.': 1, 'prohibited': 1, 'acts.sec': 1, '10.': 1, 'exceptions.sec': 1, '11.': 1, 'penalties': 1, 'enforcement.sec': 1, '12.': 1, 'plants.sec': 1, '13.': 1, 'conforming': 1, 'amendments.sec': 1, '14.': 1, 'repealer.sec': 1, '15.': 1, 'authorization': 1, 'ap

In [12]:
print(type(nb_env_input))
print(len(nb_env_input))

<class 'list'>
12


In [15]:
legislative_subject_classifier = NaiveBayesClassifier.train(nb_env_input[0][0] + nb_notenv_input[0][0])
#legislative_subject_classifier = NaiveBayesClassifier.train(nb_env_input[0][0] + nb_notenv_input[0][0])
#legislative_subject_classifier = NaiveBayesClassifier.train(nb_env_input[0][0] + nb_notenv_input[0][0])

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
nltk.classify.util.accuracy(mr_op_classifier, nb_mr_train_input + nb_op_train_input)*100

# Test the model
- essentially just run the accuracy but pass in the testing data instead of the training data