In [1]:
import os

import pandas as pd
import spacy
from spacy.matcher import Matcher

In [2]:
# Root of the project checkout. Override it by setting the D4AD_ROOT
# environment variable; the default is the original development location.
# os.path.join(..., "") guarantees a trailing separator, so the plain string
# concatenations with `rootpath` used in this notebook keep working.
rootpath = os.path.join(
    os.environ.get("D4AD_ROOT", "/hdd/work/d4ad_standardization/"), ""
)
# Raw main table, relative to `rootpath`.
filepath = "./D4AD_Standardization/data/raw/etpl_all_programsJune3.xls"

# Only these columns are read from the spreadsheet (passed as `usecols`).
columns = [
    "NAME",
    "NAME_1",
    "DESCRIPTION",
    "PREREQUISITES",
    "FEATURESDESCRIPTION",
    "STREET1",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "COUNTY",
    "NONGOVAPPROVAL",
    "STATECOMMENTS",
    "CIPCODE"
]

df = pd.read_excel(rootpath + filepath, usecols=columns)

# Language pipeline shared by the rest of the notebook.
nlp = spacy.load("en_core_web_lg")
print('done')

done


In [3]:
# Development settings: work on a tiny, reproducible sample of the main table
# while the extraction logic is still being worked out.
N, random_state = 20, 42   # sample size and sampling seed
BATCH_SIZE = 50            # default batch size used by batches_of_docs

small_df = df.sample(random_state=random_state, n=N)
print('done')

done


In [40]:
def batches_of_docs(df, column_index=0, nlp=nlp, batch_size=BATCH_SIZE, disable=["parser","ner", "entity_linker"]):
    """Yield one lazy stream of docs for a single column of ``df``.

    The values of the column at position ``column_index`` are handed to
    ``nlp.pipe`` together with ``batch_size`` and ``disable``.

    NOTE: despite the plural name, this generator yields exactly once; the
    single item is whatever ``nlp.pipe`` returns for the whole column.
    ``batch_size`` is only forwarded to ``nlp.pipe``; it does not split the
    output into separate yielded batches.
    """
    texts = df.iloc[:, column_index].values
    doc_stream = nlp.pipe(texts, batch_size=batch_size, disable=disable)
    yield doc_stream

def contents_of(matches, doc, matcher_spans_content=False):
    """Yield the slices of ``doc`` that hold content, based on matcher hits.

    Args:
        matches: iterable of ``(match_id, start, end)`` triples. Only the
            ``start`` (index 1) and ``end`` (index 2) positions are read.
            They are consumed in the order given; ascending, non-overlapping
            order is assumed and not checked.
        doc: any object that supports slice indexing (``doc[a:b]``).
        matcher_spans_content: when False (default) the matches mark where
            content is *not*, so the regions between and around the matches
            are yielded. When True the matches mark the content itself, so
            each matched slice is yielded.

    Yields:
        Slices of ``doc``. In the default mode there is always one more slice
        than there are matches, and slices may be empty when matches are
        adjacent or touch either end of ``doc``.
    """
    if matcher_spans_content:
        # The matcher found the content directly: hand back each matched slice.
        for match in matches:
            yield doc[match[1]:match[2]]
        return

    # The matcher found separators: content is everything outside the matches.
    content_start = 0
    for match in matches:
        yield doc[content_start:match[1]]  # up to where the separator begins
        content_start = match[2]           # resume right after the separator
    yield doc[content_start:]              # trailing content after last match


# Single-token patterns for text that should NOT be standardized; they act as
# separators between the content phrases extracted by contents_of.
patterns = (
    # Coordinating conjunctions: break small_df.iloc[0] into unstandardized
    # tokens. (A blanket {'POS': 'PUNCT'} pattern was tried instead of the
    # literal symbols below, but it fails in later samples.)
    [[{'POS': 'CCONJ'}]]
    # Literal separators, each added after inspecting samples:
    #   '/'  modification that breaks up small_df.iloc[7]
    #   ','  modification that combines small_df.iloc[15], [1]
    #   ';'  modification seen generally past 50 or so
    + [[{'ORTH': symbol}] for symbol in ('/', ',', ';')]
    # Whitespace tokens left over after tokenization (modification seen at
    # random_state*2).
    # TODO: fix this to work; this may be special-casing too early/improperly.
    + [[{'IS_SPACE': True}]]
)

matcher = Matcher(nlp.vocab)
matcher.add("DoNotStandardize", patterns)

interimpath = "./D4AD_Standardization/data/interim/"

content_is = "prereqs.csv"

# Write one CSV of extracted prerequisite content per batch, for downstream
# analyses and class induction. Each output row is one content phrase, tagged
# with the CIPCODE / PREREQUISITES of the source row it came from.
#
# `rows_seen` is the positional index into small_df of the doc being handled.
# It runs across batches, so the doc -> row mapping stays correct however
# batches_of_docs groups the docs.
rows_seen = 0
# NOTE(review): column 10 is presumably PREREQUISITES in the spreadsheet's
# column order (the logged texts look like prerequisites) -- confirm.
for batch_number, a_batch in enumerate(batches_of_docs(small_df, column_index=10)):
    print('... on a batch ...', batch_number)
    content_path = rootpath + interimpath + "{}_{}".format(batch_number, content_is)

    batch_frames = []
    for doc_number, doc in enumerate(a_batch):
        # Calling the matcher directly gives the (match_id, start, end) triples
        # for this doc and avoids Matcher.pipe and its hard-coded batch size.
        matches = matcher(doc)
        source_row = small_df.iloc[rows_seen]
        rows_seen += 1
        batch_frames.append(
            pd.DataFrame(
                data={
                    # Store plain text rather than span objects.
                    'content': [span.text for span in contents_of(matches, doc)],
                    'CIPCODE': source_row.CIPCODE,
                    'PREREQUISITES': source_row.PREREQUISITES,
                    'batch_number': batch_number,
                    'doc_number': doc_number,
                }
            )
        )
        print('\t ... addded', doc.text[:80], '...')

    # One overwrite per batch (instead of appending per doc), so re-running
    # the cell regenerates the file rather than duplicating its rows.
    if batch_frames:
        pd.concat(batch_frames, ignore_index=True).to_csv(
            content_path,
            index=False,
            chunksize=10000,
        )

... on a batch ... 0
	 ... addded H.S. diploma or GED. If not must pass the Wonderlic test. ...
	 ... addded Windows Server 2003 and 2008 Terminal Services Microsoft SQL Server 2005 with Re ...
	 ... addded High School Diploma/GED or ATB ...
	 ... addded  High school diploma, GED, Degree, 1500 hours in completing projects from work a ...
	 ... addded None ...
	 ... addded HS Diploma/GED ...
	 ... addded High School Diploma or GED ...
	 ... addded Background check / High School Diploma / Students must be at least 18 years of a ...
	 ... addded High School Graduate ...
	 ... addded Limited english speaking ...
	 ... addded Advanced Computer Knowledge ...
	 ... addded Computer Concepts ...
	 ... addded High School Diploma ...
	 ... addded High School Diploma or GED ...
	 ... addded HS Diploma, GED, ATB test ...
	 ... addded NYS 8 hour pre-assignment certification, 18+ age, valid photo id, valid social s ...
	 ... addded H.S. Diploma, GED, 18 Years of Age or Older ...
	 ... addded Personal