In [12]:
# Importing the libraries 
import pandas as pd
import numpy as np
import re
import os
import zipfile
from statsmodels.stats.proportion import proportion_confint

In [2]:
rootpath = "/hdd/work/d4ad_standardization/"
filepath = "./D4AD_Standardization/data/raw/etpl_all_programsJune3.xls"

columns = [
    "NAME",
    "NAME_1",
    "DESCRIPTION",
    "PREREQUISITES",
    "FEATURESDESCRIPTION",
    "STREET1",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "COUNTY",
    "NONGOVAPPROVAL",
    "STATECOMMENTS",
    "CIPCODE",
    "PROVIDERID",
    "APPROVINGAGENCYID"
]

df = pd.read_excel(rootpath + filepath, usecols=columns)
print('done')

done


In [8]:
# Set up columns to keep, fields, locations for writing
processedpath = "D4AD_Standardization/data/processed/"
externalpath = "D4AD_Standardization/data/external/"

content_is = "NAME_1_lookup.csv"

the_df = df.sample(n=100, random_state=42)

columns_to_save = ['STANDARDIZEDNAME1', 'NAME_1', 'PROVIDERID',
                    'APPROVINGAGENCYID', 'CIPCODE']
print('done')

done


In [10]:
# This should probably be put in a seperate file but the Tools and Technology of
# ONET should basically cover every possible tool/tech and 

# Yeah, and then Career One Stop has a giant list of certs, occupational licenses
# see: https://www.careeronestop.org/Developers/Data/data-downloads.aspx
# need to stand up or use a db connector to import it, though
ONET_TOOLS_TECH_URL_NAME = ("https://www.onetcenter.org/dl_files/database/db_20_1_text/Tools%20and%20Technology.txt", "onet_tools_tech.csv")
CAREERONESTOP_CERTIFICATIONS_URL_NAME = ("https://www.careeronestop.org/TridionMultimedia/tcm24-48614_CareerOnestop_Certifications_07072020.zip", "career_one_stop.zip")

filepath = rootpath + externalpath

for dataset in (ONET_TOOLS_TECH_URL_NAME, CAREERONESTOP_CERTIFICATIONS_URL_NAME):
    url, filename = dataset
    print("running ...", f'\nwget -O {filepath+filename} {url}')
    os.system(f'wget -O {filepath+filename} {url}')
    print("filetype is",  filename[-3:])

    if filename[-3:] == 'zip':
        with zipfile.ZipFile(filepath+filename,"r") as zip_ref:
            zipdir = filepath+filename[:-4]
            print("unzipping {} to ...".format(filename), zipdir)
            os.mkdir(zipdir)
            zip_ref.extractall(zipdir)

# todo: d/l giant list of acronyms off of wikipedia
# here: https://en.wikipedia.org/wiki/Lists_of_abbreviations
# todo: figure out a standardization/needs-standardization logic
#       the needs logic should basically look up into abbreviations or the certification
#           and replace, remove or annotate (like see description, place a star)

KeyboardInterrupt: 

In [4]:
pd.set_option('display.max_rows', None)

# anomalous cases to test against
"""
CXA-300-1I Advanced Administration for Citrix ...	
LEED AP Exam
Accounting Applications I/
CompTIA i-Net+ Certification
(EHR125D) Electronic Health Records Specialist
Patient Care Technician (PCT)
CELW-109-01

A.A.Degree: Liberal Arts/Ele-Sec Educ
(10 months) Some Course
Windows2000

Electrical/Electronic Control System & Transpo...
PowerPoint Level 1
"""
# Can throw into https://regex101.com/ and cook up a regex that gets them all


# quick notes:
# * so everything in parathensis can be removed and, actually, is valuable abbreviation data
# * the description is interesting, it often has some location info that isn't needed. THis might benefit from a NER type approach where we label tokens that should be removed. But that's not here nor there.
# * trailing non-alphanumerics should be removed (like "...tions I/")
# * Somethings, "CEBC-211-01" are too abbreviated (e.g. all caps, )
# * I could run the abbreviation extractor on this but I'm kinda hestitant to mess 
# with another vauge program, however, it might really simplify some of this
# * There are degree, time realated things that don't belong, "A.A.Degree: Liberal Arts/Ele-Sec Educ" or "10-MOS" (10 months), or Paralegal certificate
# * Saw one mashed together thing, Windows2000

# * loaded a giant list of certifications to ./data/external/career_one_stop, it includes canonical names, some abbreviations (and we can generate abbrevations from the canonical name)
# * yeah that list of cert names will be useful, so it looks like typically the course name describes a single thing, we don't list multiple certifications, programs, etc. This enables me to chunk up the ... thign is there aresome liek CEWL that don't show up at all

'\nCXA-300-1I Advanced Administration for Citrix ...\t\nLEED AP Exam\nAccounting Applications I/\nCompTIA i-Net+ Certification\n(EHR125D) Electronic Health Records Specialist\nPatient Care Technician (PCT)\nCELW-109-01\n\nA.A.Degree: Liberal Arts/Ele-Sec Educ\n(10 months) Some Course\nWindows2000\n\nElectrical/Electronic Control System & Transpo...\nPowerPoint Level 1\n'

In [5]:
# A) 
# The program or course name can start or end with a matching parenthesis. In these cases
# we assume that no other matching parenthesis are present and apply an appropriate
# regex for that...

# First, set up standardized column with default values
the_df["STANDARDIZEDNAME_1"] = ""

# ... then extract names for those with opening parens
open_parenthesis_index = the_df.NAME_1.str[0] == '('
open_parenthesis_regex = '''
                (?P<paren>\(.*\)) # get the first parathesis
                (?P<the_name>.*)  # then get the actual name
                '''

the_df.loc[open_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    the_df.loc[open_parenthesis_index, 'NAME_1']\
          .str\
          .extract(open_parenthesis_regex, flags=re.VERBOSE).the_name

# ... then extract names for those with closing parens
close_parenthesis_index = the_df.NAME_1.str[-1] == ')'
closing_parenthesis_regex = '''
                (?P<the_name>.*)  # get the actual name
                (?P<paren>\(.*\)) # get the last parathensis                
                '''
the_df.loc[close_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    the_df.loc[close_parenthesis_index, 'NAME_1']\
          .str\
          .extract(closing_parenthesis_regex, flags=re.VERBOSE).the_name

# ... then we copy over content that has a internal parenthesis with those
# parenthesis removed and ignore everything after, e.g. "ABC (123) DEF" --> "ABC"
internal_parenthesis_index =\
    the_df['NAME_1'].str.contains('\(|\)', regex=True) &\
        ~(close_parenthesis_index|open_parenthesis_index)

the_df.loc[internal_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    the_df.loc[internal_parenthesis_index, 'NAME_1']\
          .str\
          .extract(closing_parenthesis_regex, flags=re.VERBOSE).the_name

# ... finally, just copy over everything else
no_parenthesis_index = ~(close_parenthesis_index |\
                         open_parenthesis_index  |\
                         internal_parenthesis_index)
the_df.loc[no_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    the_df.loc[no_parenthesis_index, 'NAME_1']

the_df[['NAME_1', 'STANDARDIZEDNAME_1']]

# 2)
# So now we have silver version data that has a lot of abbreviations, acroynmns
# that need to be spelled out. We do this with bulk match and replace.

# acroynmns can be done with the career one stop credentials list
# abbreviatins are harder. Let's start on acroynmns first.

# 3) 
# Then go after odd static patterns that are common 
# ... A.A., AAS,e ends-with "/", etc etc
# "Applied Certificate in..." <--- thing is, this could really be a program
# the_df.STANDARDIZEDNAME_1 =\
#     the_df.STANDARDIZEDNAME_1.str.replace("A.A.","", case=False)



Unnamed: 0,NAME_1,STANDARDIZEDNAME_1
3494,Specialized Clinical Technician,Specialized Clinical Technician
14560,CXA-300-1I Advanced Administration for Citrix ...,CXA-300-1I Advanced Administration for Citrix ...
5535,Accounting Applications I/,Accounting Applications I/
20964,Project Management,Project Management
17769,LEED AP Exam,LEED AP Exam
22724,Medical Allied Health Office Assistant,Medical Allied Health Office Assistant
4720,Word Processing Program,Word Processing Program
12570,Certified Home Health Aide,Certified Home Health Aide
4186,Personal Trainer National Certification,Personal Trainer National Certification
2270,English as a Second Language,English as a Second Language


In [18]:
my_string =\
    """
CREATE TABLE CERTIFICATIONS
(
  CERT_ID           VARCHAR2(5 BYTE) CONSTRAINT CERT_ID_NN NOT NULL,
  CERT_NAME         VARCHAR2(200 BYTE) CONSTRAINT CERT_NAME_NN NOT NULL,
  ORG_ID            CHAR(4 BYTE),
  TRAINING          NUMBER(1),
  EXPERIENCE        NUMBER(1),
  EITHER            NUMBER(1),
  EXAM              NUMBER(1),
  RENEWAL           CHAR(2 BYTE),
  CEU               NUMBER(1),
  REEXAM            NUMBER(1),
  CPD               NUMBER(1),
  CERT_ANY          NUMBER(1),
  URL               VARCHAR2(254 BYTE),
  ACRONYM           VARCHAR2(16 BYTE),
  NSSB_URL          VARCHAR2(254 BYTE),
  CERT_URL          VARCHAR2(254 BYTE),
  CERT_LAST_UPDATE  DATE,
  KEYWORD1          VARCHAR2(64 BYTE),
  KEYWORD2          VARCHAR2(64 BYTE),
  KEYWORD3          VARCHAR2(64 BYTE),
  SUPPRESS          VARCHAR2(1 BYTE),
  DATEADDED         DATE                        DEFAULT SYSDATE,
  COMMENTS          VARCHAR2(1000 BYTE),
  VERIFIED          VARCHAR2(1 BYTE),
  UPDATEDBY         VARCHAR2(2 BYTE),
  CERT_DESCRIPTION  VARCHAR2(2000 BYTE),
  DELETED           NUMBER(1)                   DEFAULT 0,
  EXAM_DETAILS      VARCHAR2(2000 BYTE)
);

CREATE UNIQUE INDEX CERT_ID_PK ON CERTIFICATIONS
(CERT_ID);

ALTER TABLE CERTIFICATIONS ADD (
  CONSTRAINT CERT_ID_PK
  PRIMARY KEY
  (CERT_ID)
  USING INDEX CERT_ID_PK
  ENABLE VALIDATE);

ALTER TABLE CERTIFICATIONS ADD (
  CONSTRAINT CERTIFICATIONS_ORG_ID_FK 
  FOREIGN KEY (ORG_ID) 
  REFERENCES CERT_ORGS (ORG_ID)
  ENABLE VALIDATE);

SET DEFINE OFF;
Insert into CERTIFICATIONS
   (CERT_ID, CERT_NAME, ORG_ID, TRAINING, EXPERIENCE, 
    EITHER, EXAM, RENEWAL, CEU, REEXAM, 
    CPD, CERT_ANY, URL, ACRONYM, NSSB_URL, 
    CERT_URL, CERT_LAST_UPDATE, KEYWORD1, KEYWORD2, KEYWORD3, 
    SUPPRESS, DATEADDED, COMMENTS, VERIFIED, UPDATEDBY, 
    CERT_DESCRIPTION, DELETED, EXAM_DETAILS)
 Values
   ('10249', 'SAP Certified Technology Associate - System Administration (Max DB) with SAP NetWeaver 7.4', '0106', NULL, NULL, 
    NULL, 1, '  ', NULL, NULL, 
    NULL, NULL, 'https://training.sap.com/certification/c_tadm50_74-sap-certified-technology-associate---system-administration-max-db-with-sap-netweaver-74-g/', NULL, NULL, 
    NULL, TO_DATE('01/28/2019 00:00:00', 'MM/DD/YYYY HH24:MI:SS'), NULL, NULL, NULL, 
    '1', TO_DATE('01/21/2015 16:31:10', 'MM/DD/YYYY HH24:MI:SS'), NULL, '0', '28', 
    'This certification path will validate your capability as a well-trained technologist prepared to help your client or partner manage SAP systems based on IBM DB2 database. Armed with an understanding of SAP system administration, you can implement this knowledge for your projects as a technology consultant', 1, NULL);
Insert into CERTIFICATIONS
   (CERT_ID, CERT_NAME, ORG_ID, TRAINING, EXPERIENCE, 
    EITHER, EXAM, RENEWAL, CEU, REEXAM, 
    CPD, CERT_ANY, URL, ACRONYM, NSSB_URL, 
    CERT_URL, CERT_LAST_UPDATE, KEYWORD1, KEYWORD2, KEYWORD3, 
    SUPPRESS, DATEADDED, COMMENTS, VERIFIED, UPDATEDBY, 
    CERT_DESCRIPTION, DELETED, EXAM_DETAILS)
 Values
   ('10052', 'IBM Certified BPM Developer - WebSphere Lombardi Edition V7.2', '0814', 1, 1, 
    NULL, 1, '  ', NULL, NULL, 
    NULL, NULL, 'http://www-03.ibm.com/certify/certs/15010502.shtml', NULL, NULL, 
    NULL, TO_DATE('07/06/2017 00:00:00', 'MM/DD/YYYY HH24:MI:SS'), 'Business process', NULL, NULL, 
    '1', TO_DATE('10/21/2014 10:43:08', 'MM/DD/YYYY HH24:MI:SS'), NULL, '0', '28', 
    'This intermediate level certification is intended for BPM application developers who contribute to the delivery of complex level process applications, and provide supervision and guidance to entry level developers.', 1, NULL);
Insert into CERTIFICATIONS
   (CERT_ID, CERT_NAME, ORG_ID, TRAINING, EXPERIENCE, 
    EITHER, EXAM, RENEWAL, CEU, REEXAM, 
    CPD, CERT_ANY, URL, ACRONYM, NSSB_URL, 
    CERT_URL, CERT_LAST_UPDATE, KEYWORD1, KEYWORD2, KEYWORD3, 
    SUPPRESS, DATEADDED, COMMENTS, VERIFIED, UPDATEDBY, 
    CERT_DESCRIPTION, DELETED, EXAM_DETAILS)
 Values
   ('10096', 'IBM Certified Specialist - Systems Networking Sales V1', '0814', 0, 0, 
    NULL, 1, '  ', NULL, NULL, 
    NULL, NULL, 'http://www-03.ibm.com/certify/certs/57000201.shtml', NULL, NULL, 
    NULL, TO_DATE('03/20/2015 00:00:00', 'MM/DD/YYYY HH24:MI:SS'), NULL, NULL, NULL, 
    '1', TO_DATE('10/24/2014 14:24:09', 'MM/DD/YYYY HH24:MI:SS'), NULL, '0', '28', 
    'The IBM Certified Specialist - System Networking Sales V1 engages prospective customers and evaluates the IT infrastructure and understands the customer''s business environment in order to ensure the design and implementation of appropriate networking solutions. This individual performs requirements analysis, systems architecture/design, and planning/installation. Additionally, they provide ongoing support to ensure the customer is operational and that technical requirements are continually addressed.', 1, NULL);    
    """
print('done')

done


In [31]:
# Read in the acorynym list
# credential_lookup 
path = rootpath + externalpath + 'career_one_stop/'
credential_sql = 'TEST-2-CERTIFICATIONS.sql' # '2-CERTIFICATIONS.sql'

with open(path + credential_sql) as sql:
    my_string = sql.read()

header_names =\
    (
        'CERT_ID', 'CERT_NAME', 'ORG_ID', 'TRAINING', 'EXPERIENCE', 
        'EITHER', 'EXAM', 'RENEWAL', 'CEU', 'REEXAM', 
        'CPD', 'CERT_ANY', 'URL', 'ACRONYM', 'NSSB_URL', 
        'CERT_URL', 'CERT_LAST_UPDATE', 'KEYWORD1', 'KEYWORD2', 'KEYWORD3', 
        'SUPPRESS', 'DATEADDED', 'COMMENTS', 'VERIFIED', 'UPDATEDBY', 
        'CERT_DESCRIPTION', 'DELETED', 'EXAM_DETAILS'
    )

# Pandas assumes atomic python types when reading from records,
# See: https://github.com/pandas-dev/pandas/issues/9381, so we need to use
# Python types here
dtypes =\
    np.dtype(
        "str, str, float, float,"
        "float, float, float, str,"
        "float, float, float, float,"
        "str, str, str, str,"
        "str, str, str, str,"
        "str, str, str, str," 
        "str, str, float, str"
    )

flags = re.MULTILINE | re.DOTALL | re.VERBOSE
the_fields_regex =\
    """
    (?P<values>Values\n\s+\()  # Start with the word Value <newline> (
        (?P<fields>.*?)        #    Grab all the field content
    (?P<end>\);)               # ... which stops at the terminating paren, ;
    """

the_fields = re.compile(the_fields_regex, flags=flags)

a_field_regex =\
    """
    '(?P<string>.*?)'[,)]           # get a quoted string ending at comma or paran or
    |(?P<date_time>TO_DATE\(.*?\))  # get the TO_DATE, parse out actual date later or
    |(?P<num>\d),                   # get numeric or
    |(?P<null>NULL)                 # get NULL
    """

a_field = re.compile(a_field_regex, flags=flags)

def yield_certification_records(sql_file=my_string, require_field_numbers=[1]):
    # do we skip those w/o certain fields, like acronymns
    temp_data = [0]*28
    for match in the_fields.finditer(sql_file):
        break_match = False

        for index, field in enumerate(a_field.finditer( match.group('fields') )):
            grp = None
            for grp, value in field.groupdict().items():
                if value:
                    # then we transform the string value into the appropriate type, given the group name
                    if grp == 'date_time':
                        #  There is a difference between https://regex101.com/r/yphUXY/1/
                        # and what I see Python do here; if I don't capture the entire thing
                        # it gets re-raised as another potential match, even if I use ?:, etc.
                        value = value[9:28] # todo: convert to datetime
                    if grp == 'null':
                        value = None
                        if index in require_field_numbers:
                            break_match = True

                    if grp == 'num':
                        value = int(value)

                    temp_data[index] = value
                    break # only one possible match value
            if break_match: # and don't look at other fields
                break

        if not break_match:
            yield tuple(value for value in temp_data)
        
        break_match = False

df =\
    pd.DataFrame.from_records(
        yield_certification_records(),
        columns=header_names)
df

Unnamed: 0,CERT_ID,CERT_NAME,ORG_ID,TRAINING,EXPERIENCE,EITHER,EXAM,RENEWAL,CEU,REEXAM,...,KEYWORD2,KEYWORD3,SUPPRESS,DATEADDED,COMMENTS,VERIFIED,UPDATEDBY,CERT_DESCRIPTION,DELETED,EXAM_DETAILS
0,10249,SAP Certified Technology Associate - System Ad...,106,,,,1,,,,...,,,1,01/21/2015 16:31:10,,0,28,This certification path will validate your cap...,1,
1,10052,IBM Certified BPM Developer - WebSphere Lombar...,814,1.0,1.0,,1,,,,...,,,1,10/21/2014 10:43:08,,0,28,This intermediate level certification is inten...,1,
2,10096,IBM Certified Specialist - Systems Networking ...,814,0.0,0.0,,1,,,,...,,,1,10/24/2014 14:24:09,,0,28,The IBM Certified Specialist - System Networki...,1,


In [151]:
#the_record = np.empty((28,), dtype = dtypes)
dt=np.dtype('O,U10,f,i')
the_record = np.empty((1,), dtype=dt)
the_record[0] =tuple(e for e in ['10249',4, 3.2, 5])
print(pd.DataFrame.from_records(the_record, index=[1]),"<-- record")
print(dtypes)
print(dt)

dt_type = np.dtype([
    ('k', 'i4'),
    ('v', 'i4', 2)
])

pd.DataFrame.from_records(list(dt), columns=dt.dtype.names)


f0 f1   f2  f3
1  10249  4  3.2   5 <-- record
[('f0', 'S5'), ('f1', 'S200'), ('f2', 'u1'), ('f3', 'u1'), ('f4', 'u1'), ('f5', 'u1'), ('f6', 'u1'), ('f7', 'S2'), ('f8', 'u1'), ('f9', 'u1'), ('f10', 'u1'), ('f11', 'u1'), ('f12', 'S254'), ('f13', 'S216'), ('f14', 'S254'), ('f15', 'S254'), ('f16', 'S25'), ('f17', 'S64'), ('f18', 'S64'), ('f19', 'S64'), ('f20', 'S1'), ('f21', 'S25'), ('f22', 'S1000'), ('f23', 'S1'), ('f24', 'S2'), ('f25', 'S2000'), ('f26', 'u1'), ('f27', 'S2000')]
[('f0', 'O'), ('f1', '<U10'), ('f2', '<f4'), ('f3', '<i4')]


TypeError: 'numpy.dtype' object is not iterable

In [73]:
list(record_dtypes)

[('CERT_ID', dtype('S5')),
 ('CERT_NAME', dtype('S200')),
 ('ORG_ID', dtype('uint8')),
 ('TRAINING', dtype('uint8')),
 ('EXPERIENCE', dtype('uint8')),
 ('EITHER', dtype('uint8')),
 ('EXAM', dtype('uint8')),
 ('RENEWAL', dtype('S2')),
 ('CEU', dtype('uint8')),
 ('REEXAM', dtype('uint8')),
 ('CPD', dtype('uint8')),
 ('CERT_ANY', dtype('uint8')),
 ('URL', dtype('S254')),
 ('ACRONYM', dtype('S216')),
 ('NSSB_URL', dtype('S254')),
 ('CERT_URL', dtype('S254')),
 ('CERT_LAST_UPDATE', <function numpy.datetime_as_string>),
 ('KEYWORD1', dtype('S64')),
 ('KEYWORD2', dtype('S64')),
 ('KEYWORD3', dtype('S64')),
 ('SUPPRESS', dtype('S1')),
 ('DATEADDED', <function numpy.datetime_as_string>),
 ('COMMENTS', dtype('S1000')),
 ('VERIFIED', dtype('S1')),
 ('UPDATEDBY', dtype('S2')),
 ('CERT_DESCRIPTION', dtype('S2000')),
 ('DELETED', dtype('uint8')),
 ('EXAM_DETAILS', dtype('S2000'))]

In [None]:
#  now with the transformed data we 
# a) launch an active label checker, that looks for program/course names that are standardized or not standardized
# b) periodically run the cell below that calcualtes the bionomial proportion of standardized to unstandardized; ideally we want 95% - 100%

In [6]:
# insert active labeler here
# this is crude but first we set up the pipeline, following
# https://superintendent.readthedocs.io/en/latest/examples/preprocessing-data.html
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import ComplementNB  # corrects for class imbalance, SGD is pretty good too
from sklearn.pipeline import Pipeline

from superintendent import ClassLabeller
from IPython.display import display, Markdown

pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', ComplementNB()),
])

def display_func(row):
    """
    The display function gets passed your data - in the
    case of a dataframe, it gets passed a row - and then
    has to "display" your data in whatever way you want.

    It doesn't need to return anything
    """
    display(Markdown(row["STANDARDIZEDNAME_1"]))
    #display(Markdown("**At:** " + row["timestamp"]))

def preprocessor(x, y):
    # only take standardized column, leave everything else
    return x["STANDARDIZEDNAME_1"], y

labelling_widget = ClassLabeller(
    features=the_df,
    model=pipeline,
    model_preprocess=preprocessor,
    display_func=display_func,
    options=['standardized', 'not standardized'],
    acquisition_function='margin'
)

labelling_widget

ClassLabeller(children=(HBox(children=(HBox(children=(FloatProgress(value=0.0, description='Progress:', max=1.…

In [26]:
# insert bionomial proprtion esimator here

def print_CI(labels, response_is_standardized = "standardized", method = "jeffreys"):
    successful_count = sum(
        response_is_standardized == label for label in labels
    )
    not_examined_count = sum(
        None == label for label in labels
    )

    CI = proportion_confint(
            count= successful_count,
            nobs= len(labels) - not_examined_count,
            alpha = 0.95,
            method=method
        )
    print(f"{method} bionomial proportion is: [{CI[0]:.2f}, {CI[1]:.2f}]",
)
print_CI(labels=labelling_widget.new_labels)

jeffreys bionomial proportion is: [0.88, 0.88]


In [142]:
# ... finally we can write this out as our first complete lookup table
# for the NAME field
the_df.to_csv(rootpath + processedpath + "{}".format(content_is),
              index = False,
              chunksize = 10000,
              columns=columns_to_save)