In [4]:
import spacy
import pandas as pd
from spacy.matcher import Matcher
import os

In [35]:
# Root of the project checkout. Set the D4AD_ROOTPATH environment variable to
# override it; when unset (or empty) the original workstation path is used.
# os.path.join(..., "") guarantees a trailing separator, which the plain string
# concatenations with rootpath (here and in later cells) rely on.
rootpath = os.path.join(
    os.environ.get("D4AD_ROOTPATH") or "/hdd/work/d4ad_standardization/",
    "")
filepath = "./D4AD_Standardization/data/raw/etpl_all_programsJune3.xls"

# Only these columns are loaded from the raw spreadsheet.
columns = [
    "NAME",
    "NAME_1",
    "DESCRIPTION",
    "PREREQUISITES",
    "FEATURESDESCRIPTION",
    "STREET1",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "COUNTY",
    "NONGOVAPPROVAL",
    "STATECOMMENTS",
    "CIPCODE",
    "PROVIDERID",
    "APPROVINGAGENCYID"
]

df = pd.read_excel(rootpath + filepath, usecols=columns)
# Missing prerequisites become a single space so that matches match.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df.PREREQUISITES: that form mutates an intermediate
# Series, which pandas 2.x warns about and which leaves df untouched when
# Copy-on-Write is enabled.
df["PREREQUISITES"] = df["PREREQUISITES"].fillna(' ')

nlp = spacy.load("en_core_web_lg")
print('done')

done


In [3]:
# Development aid: work on a very small, reproducible sample of the table.
BATCH_SIZE = 50  # default batch size picked up by batches_of_docs

# Sample size and seed for the development subset.
N, random_state = 20, 42
small_df = df.sample(random_state=random_state, n=N)
print('done')

done


In [11]:
def batches_of_docs(df, column_index=0, nlp=nlp, batch_size=BATCH_SIZE, disable=["parser","ner", "entity_linker"]):
    """Yield the docs of one DataFrame column as lists of at most ``batch_size``.

    The values of the column at position ``column_index`` of ``df`` are
    streamed through ``nlp.pipe`` (with the ``disable`` components switched
    off) and the resulting docs are grouped, in their original order, into
    lists of ``batch_size`` docs. The last list may be shorter; nothing is
    yielded when the column produces no docs.

    The whole ``nlp.pipe`` generator used to be yielded as one single item, so
    a caller enumerating the "batches" only ever saw batch number 0.

    Note: ``disable`` is a mutable default argument; it is only read here.
    """
    docs = nlp.pipe(df.iloc[:, column_index].values,
                    batch_size=batch_size,
                    disable=disable)
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    # Flush the final, partially filled batch.
    if batch:
        yield batch

def contents_of(matches, doc, matcher_spans_content=False):
    """Yield the content slices of ``doc`` implied by ``matches``.

    Parameters
    ----------
    matches : iterable of indexable items
        For each match, index 1 is used as the start position and index 2 as
        the end position (exclusive) within ``doc``.
    doc : sliceable sequence
        The document being split; slices of it are yielded.
    matcher_spans_content : bool, default False
        * False: the matches mark where content is NOT, so the regions between
          (and around) the matches are yielded. Adjacent matches, or a match at
          position 0, therefore produce an empty slice, and the trailing slice
          after the last match is always yielded (possibly empty).
        * True: the matches themselves span the content, so each matched slice
          is yielded. This mode used to yield nothing at all.

    Yields
    ------
    Slices of ``doc``.
    """
    if matcher_spans_content:
        for match in matches:
            yield doc[match[1]:match[2]]
        return

    # The matches are delimiters: emit what lies before each one, then
    # continue from where that match ends.
    content_start = 0
    for match in matches:
        delimiter_start = match[1]
        if delimiter_start != -1:  # -1 is treated as "no position"
            yield doc[content_start:delimiter_start]
        content_start = match[2]
    if content_start != -1:
        yield doc[content_start:]


# Single-token patterns registered under "DoNotStandardize". Where each one
# came from, in list order:
#   POS CCONJ - breaks up small_df.iloc[0] into unstandardized tokens
#               (a generic [{'POS': 'PUNCT'}] pattern was also tried, but it
#               fails in later samples)
#   '/'       - modification that breaks up small_df.iloc[7]
#   ','       - modification that combines small_df.iloc[15], [1]
#   ';'       - modification seen generally past 50 or so
#   IS_SPACE  - captures spaces still present after tokenization
#               (modification seen at random_state*2)
#               TODO: fix this to work, I could be special casing too
#               early/improperly
patterns = (
    [[{'POS': 'CCONJ'}]]
    + [[{'ORTH': symbol}] for symbol in ('/', ',', ';')]
    + [[{'IS_SPACE': True}]]
)

matcher = Matcher(nlp.vocab)
matcher.add("DoNotStandardize", patterns)

# Output directory (appended to rootpath) and file-name suffix used as the
# defaults of write_df_content.
interimpath = "./D4AD_Standardization/data/interim/"
content_is = "prereqs.csv"

the_df = df # could be small_df

def write_df_content(the_df, column_index=5, matcher=matcher, interimpath=interimpath, content_is=content_is, batch_size=BATCH_SIZE):
    """Write the matcher-delimited content of one text column to per-batch CSVs.

    The column at position ``column_index`` of ``the_df`` is turned into docs
    by ``batches_of_docs``. For every doc, ``contents_of`` yields its content
    slices and one CSV row is written per slice, together with the CIPCODE and
    PREREQUISITES of the originating row and the batch / doc counters. The
    output is meant for downstream analyses and class induction.

    Parameters
    ----------
    the_df : DataFrame providing the text column plus CIPCODE and PREREQUISITES.
    column_index : positional index of the text column to process.
        NOTE(review): the PREREQUISITES value is always written next to the
        content, so this presumably should point at that column - confirm that
        5 matches the column order of the loaded spreadsheet.
    matcher : callable returning the matches of a doc.
    interimpath : output directory, appended to the global ``rootpath``.
    content_is : file-name suffix; files are named "<batch_number>_<content_is>".
    batch_size : number of docs requested per batch from ``batches_of_docs``.

    Each batch file is started afresh (with a header row) by the first doc of
    the batch and appended to by the remaining docs, so running the function
    again replaces the earlier output instead of appending duplicate rows.
    """
    # Position of the current doc's row in the_df, counted across all batches.
    # A running counter stays correct however the docs are grouped into batches.
    row_index = 0
    batches = batches_of_docs(the_df, batch_size=batch_size, column_index=column_index)
    for batch_number, a_batch in enumerate(batches):
        content_path = rootpath + interimpath + "{}_{}".format(batch_number, content_is)
        for doc_number, doc in enumerate(a_batch):
            # Calling the matcher per doc gives the same (doc, matches) pairs as
            # matcher.pipe(..., return_matches=True) without relying on that method.
            matches = matcher(doc)
            source_row = the_df.iloc[row_index]
            row_index += 1

            print('\t ... adding ', doc.text[:80], '...', 'doc_number ', doc_number, ' batch number ', batch_number)

            # The first doc of a batch (re)creates the file and writes the header.
            starts_file = doc_number == 0
            pd.DataFrame(
                data={
                    'content': contents_of(matches, doc),
                    'CIPCODE': source_row.CIPCODE,
                    'PREREQUISITES': source_row.PREREQUISITES,
                    'batch_number': batch_number,
                    'doc_number': doc_number
                }
            ).to_csv(
                content_path,
                mode='w' if starts_file else 'a',
                index=False,
                chunksize=10000,
                header=starts_file
            )
print('done2')

done2


In [40]:
# Export the rows that carry a state comment, with their identifying columns,
# to a single CSV in the interim data directory.
interimpath = "./D4AD_Standardization/data/interim/"
content_is = "state_comments.csv"

# Columns written to the CSV.
columns = ['STATECOMMENTS', 'PROVIDERID', 'APPROVINGAGENCYID', 'CIPCODE']

# Keep only the rows where STATECOMMENTS is present.
the_df = df
the_df = the_df.dropna(subset=['STATECOMMENTS'])

the_df.to_csv(
    rootpath + interimpath + "{}".format(content_is),
    columns=columns,
    chunksize=10000,
    index=False,
)
print('done')


done
