# Exploration: Reading and preprocessing data

In [1]:
import re

import gensim
import nltk
import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

## Data source

In [3]:
# Load one organization's corpus (the NIFA reports). The records are stored as zipped JSON,
# and the compression is inferred from the file extension.
nifa_reports_path = "data/02. Data Sets/NIFA/contradictions_datasets_nifa_reports.zip"
df = pd.read_json(nifa_reports_path, orient='records', compression='infer')

In [6]:
# Preview the first two documents to see which columns are available
df.head(n=2)

Unnamed: 0,file_name,title,num,id,corpus,source_page_url,url,type,n_pages,word_count,text_by_page
0,"1890_Facilities_Program,_Section_1447,_Special...","1890 Facilities Program, Section 1447, Special...","1890 Facilities Program, Section 1447, Special...",7c447e203c43a52a74f50032f52220f621d360e6cc02a1...,nifa_reports,https://www.nifa.usda.gov/document?f%5B0%5D=re...,https://nifa.usda.gov//sites/default/files/res...,pdf,23,19103,"[September 1998 COOPERATIVE STATE RESEARCH, ED..."
1,1994_and_1862_Land-Grant_Cooperation_Progress_...,1994 and 1862 Land-Grant Cooperation Progress ...,1994 and 1862 Land-Grant Cooperation Progress ...,980c6b78c24b9183e851494cc60daa6db4ad202c3547bb...,nifa_reports,https://www.nifa.usda.gov/document?f%5B0%5D=re...,https://nifa.usda.gov//sites/default/files/res...,pdf,9,2624,[Resetting the Conversation: 1994 and 1862 Lan...


In [11]:
# One row per document: the number of distinct ids must match the number of rows
df["id"].nunique() == len(df)

True

In [13]:
# `text_by_page` holds the plain text of a document as a list with one entry per page.
# Take the first document and confirm the list length agrees with its `n_pages` field.
pdf = df.iloc[0]
pages = pdf["text_by_page"]
len(pages) == pdf["n_pages"]

True

In [14]:
# Let's look at the third page (index 2 of the zero-based list) and see what it's like
pages[2]

"3 (HVAC), and refrigeration systems; and specialized items such as cage washers, laboratory casework, some growth chambers, and certain other large, specialized equipment meeting this definition. s. Force Account means the direct performance of facility construction by a grantee's own permanent workforce (consisting of trades and crafts personnel), equipment, materials, and supplies furnished by the grantee and used under the direct control of the grantee. t. Maintenance and Operations means programmatic activities and those activities required to assure the dependable and economical function of a completed facility as follows: 1.Maintenance—preservation of the functional integri- ty and efficiency of the facility and its fixed equipment, including preventive maintenance, corrective maintenance, testing, and replacement of defective components thereof. 2.Operations—activities or processes associated with the programs to be housed in a completed facility and those processes which are n

In [15]:
# Opening the original PDF from its url lets us judge how well the text extraction performed.
# Pretty great! The page number in the footer did get shoved into the beginning of the page text,
# but fortunately that's the only thing in most of the footers for this document except the cover
# page.
pdf["url"]

'https://nifa.usda.gov//sites/default/files/resource/1890_special_terms_c_sec1447.pdf'

## Tokenization

> **NOTE:** Because sentences may span page breaks, we may wish to concatenate all of the text in a document's pages into a single string. The only issue is that we may need to strip footer content before concatenating.
>
> **TODO:** 
> * Explore if footer content is consistently placed at the beginning of the page text
> * Explore if all of the documents in the corpus only have the page number in the footer (hopefully) or if there is additional footer content that we need to detect and strip (more likely)

In [16]:
# Blank English pipeline; its vocab is what the simple tokenizer in the next cell is built on.
# `Tokenizer` and `English` are imported with the rest of the dependencies in the first cell,
# so every import lives in one place and the notebook's requirements are visible up front.
nlp = English()


In [29]:
# Simple word tokenizer built directly on the blank pipeline's vocab.
# Show the first 15 tokens of the third page, then the total token count for that page.
text = pages[2]
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(text)
print([tok.text for tok in tokens[:15]])
len(tokens)

['3', '(HVAC),', 'and', 'refrigeration', 'systems;', 'and', 'specialized', 'items', 'such', 'as', 'cage', 'washers,', 'laboratory', 'casework,', 'some']


870

### Preprocessing Pipelines

spaCy uses pipelines to do NLP: https://spacy.io/usage/processing-pipelines#pipelines

The following components are built-in: https://spacy.io/usage/processing-pipelines#built-in

In [68]:
# Sentence (or even paragraph) tokenization would let us look for contradictions between
# sentences within the same document. It also gives us a more granular unit for comparing
# documents: rather than treating an entire document as a ton of words, we can find
# contradictions between specific sentences across documents.
#
# - The spacy SentenceRecognizer is trainable https://spacy.io/api/sentencerecognizer
#     TODO: Future improvement by using model for sentence splitting
# - The spacy Sentencizer is a simple rule-based parser https://spacy.io/api/sentencizer
#
# NOTE: this rebinds `nlp`, replacing the blank pipeline created in an earlier cell with one
# that has the sentencizer attached.
nlp = English()
nlp.add_pipe('sentencizer')
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0xffff5b1c40c0>)]

In [66]:
# Calling the pipeline object on a string processes that single text.
# Split the third page into sentences and print the first eight, separated by blank lines.
text = pages[2]
doc = nlp(text)
sentences = [*doc.sents]
print(f"{len(sentences)} sentences found on this page. Examples:")
for s in sentences[:8]:
    print(s.text, end='\n\n')

33 sentences found on this page. Examples:
3 (HVAC), and refrigeration systems; and specialized items such as cage washers, laboratory casework, some growth chambers, and certain other large, specialized equipment meeting this definition.

s. Force Account means the direct performance of facility construction by a grantee's own permanent workforce (consisting of trades and crafts personnel), equipment, materials, and supplies furnished by the grantee and used under the direct control of the grantee.

t. Maintenance and Operations means programmatic activities and those activities required to assure the dependable and economical function of a completed facility as follows: 1.Maintenance—preservation of the functional integri- ty and efficiency of the facility and its fixed equipment, including preventive maintenance, corrective maintenance, testing, and replacement of defective components thereof.

2.Operations—activities or processes associated with the programs to be housed in a compl

In [70]:
# Now that we have sentences, we can use one of spacy's pre-built pipelines. The model for this
# pipeline was downloaded in our Dockerfile running `python -m spacy download en_core_web_sm`.
# Displaying `.pipeline` lists the (name, component) pairs that the loaded model runs.
nlp_core = spacy.load('en_core_web_sm')
nlp_core.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0xffff596dbca0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0xffff596dbdc0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0xffff5ae07ca0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0xffff58f7b400>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0xffff58d03b40>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0xffff5ae07c30>)]

In [94]:
# Run the full pre-built pipeline over the text of every sentence found on the page,
# keeping one parsed result per sentence.
sentence_texts = [s.text for s in sentences]
sentences_core = list(nlp_core.pipe(sentence_texts))

In [95]:
# Sanity check: we should get one parsed result back per input sentence
len(sentences_core)

33

In [96]:
# Pick the second sentence (index 1) as a worked example for the token-level cells below
sent = sentences_core[1]
sent

s. Force Account means the direct performance of facility construction by a grantee's own permanent workforce (consisting of trades and crafts personnel), equipment, materials, and supplies furnished by the grantee and used under the direct control of the grantee.

In [97]:
# Every parsed sentence is a sequence of word tokens, and each token carries many attributes,
# such as the text representing it and its part of speech
print([(tok.text, tok.pos_) for tok in sent])

[('s.', 'PROPN'), ('Force', 'PROPN'), ('Account', 'PROPN'), ('means', 'VERB'), ('the', 'DET'), ('direct', 'ADJ'), ('performance', 'NOUN'), ('of', 'ADP'), ('facility', 'NOUN'), ('construction', 'NOUN'), ('by', 'ADP'), ('a', 'DET'), ('grantee', 'NOUN'), ("'s", 'PART'), ('own', 'ADJ'), ('permanent', 'ADJ'), ('workforce', 'NOUN'), ('(', 'PUNCT'), ('consisting', 'VERB'), ('of', 'ADP'), ('trades', 'NOUN'), ('and', 'CCONJ'), ('crafts', 'NOUN'), ('personnel', 'NOUN'), (')', 'PUNCT'), (',', 'PUNCT'), ('equipment', 'NOUN'), (',', 'PUNCT'), ('materials', 'NOUN'), (',', 'PUNCT'), ('and', 'CCONJ'), ('supplies', 'NOUN'), ('furnished', 'VERB'), ('by', 'ADP'), ('the', 'DET'), ('grantee', 'NOUN'), ('and', 'CCONJ'), ('used', 'VERB'), ('under', 'ADP'), ('the', 'DET'), ('direct', 'ADJ'), ('control', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('grantee', 'NOUN'), ('.', 'PUNCT')]


In [136]:
# The pipeline attaches many more attributes to each token:
# https://spacy.io/api/token#attributes
# Collect a handful of them for the second token of the example sentence and print them
# as a labelled table.
w = sent[1]
token_attributes = {
    'text': w.text,
    'normalized form': w.norm_,
    'lemma': w.lemma_,
    'part of speech': w.pos_,
    'tag (fine pos)': w.tag_,
    'syntactic dependency': w.dep_,
    'is alpha': w.is_alpha,
    'is stopword': w.is_stop,
    'named entity type': w.ent_type_,
    'named entity id': w.ent_id_,
    'vector embedding': w.vector,
}
print(pd.Series(token_attributes))

text                                                                Force
normalized form                                                     force
lemma                                                               Force
part of speech                                                      PROPN
tag (fine pos)                                                        NNP
syntactic dependency                                             compound
is alpha                                                             True
is stopword                                                         False
named entity type                                                     ORG
named entity id                                                          
vector embedding        [-0.76734245, -1.5136529, 2.0895395, 0.6628331...
dtype: object


> **NOTE:** Looks like we need to do some n-grams here -- e.g. "Force Account" should be its own token.