In [2]:
# Importing the libraries 
import pandas as pd
import numpy as np
import random
import pickle
import re
import regex  # for better, more capbale regex api
import os
import zipfile
import more_itertools
from itertools import chain
from statsmodels.stats.proportion import proportion_confint
# active labeler related
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import ComplementNB  # corrects for class imbalance, SGD is pretty good too
from sklearn.pipeline import Pipeline
from superintendent import ClassLabeller
from IPython.display import display, Markdown

# Text classifier used by the active labeller: character uni-/bi-gram counts,
# re-weighted with TF-IDF, fed into a Complement Naive Bayes classifier.
pipeline_steps = [
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', ComplementNB()),
]
pipeline = Pipeline(pipeline_steps)
print('done')

done


In [3]:
# Set up columns to keep, fields, locations for writing
#
# The project root defaults to the original machine's location but can be
# overridden with the D4AD_ROOT environment variable so the notebook is not tied
# to one absolute path. os.path.join(..., "") guarantees a trailing separator,
# which matters because every path below is built by plain string concatenation.
rootpath = os.path.join(
    os.environ.get("D4AD_ROOT") or "/hdd/work/d4ad_standardization/",
    ""
)

# Data directories, relative to rootpath
processedpath = "D4AD_Standardization/data/processed/"
externalpath = "D4AD_Standardization/data/external/"
interimpath = "D4AD_Standardization/data/interim/"

# Base name (no extension) of the files this notebook writes
content_is = "standardized_name_and_name1"


print('done')

done


In [4]:

# Input file, read from the interim data directory. An earlier assignment of
# "./D4AD_Standardization/data/raw/etpl_all_programsJune3.xls" to `filepath`
# was overwritten on the very next line and never read, so it has been removed.
filepath = "standardized_name.csv" # builds off of notebook 3 work

# Columns loaded from the input file
columns = [
    "NAME_1",
    "STANDARDIZEDNAME",
    "NAME",
    "DESCRIPTION",
    "PREREQUISITES",
    "FEATURESDESCRIPTION",
    "STREET1",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "COUNTY",
    "NONGOVAPPROVAL",
    "STATECOMMENTS",
    "CIPCODE",
    "PROVIDERID",
    "APPROVINGAGENCYID"
]

# Columns written out at the end: the input columns plus the new standardized name
columns_to_save = ['STANDARDIZEDNAME_1'] + columns

SKIP_THIS = True # helps me be able to run all and not worry about pulling things
# I already know I have on disk

#df = pd.read_excel(rootpath + interimpath + filepath, usecols=columns)
df = pd.read_csv(rootpath + interimpath + filepath, usecols=columns)
print('done')

done


In [5]:
# Fetch the external reference datasets (skipped when SKIP_THIS is set).
if not SKIP_THIS:
    # Each entry is a (download url, local filename) pair
    ONET_TOOLS_TECH_URL_NAME = ("https://www.onetcenter.org/dl_files/database/db_20_1_text/Tools%20and%20Technology.txt", "onet_tools_tech.csv")
    CAREERONESTOP_CERTIFICATIONS_URL_NAME = ("https://www.careeronestop.org/TridionMultimedia/tcm24-48614_CareerOnestop_Certifications_07072020.zip", "career_one_stop.zip")

    filepath = rootpath + externalpath

    for dataset in (ONET_TOOLS_TECH_URL_NAME, CAREERONESTOP_CERTIFICATIONS_URL_NAME):
        url, filename = dataset
        print("running ...", f'\nwget -O {filepath+filename} {url}')
        exit_status = os.system(f'wget -O {filepath+filename} {url}')
        if exit_status != 0:
            # Fail loudly rather than carrying on with a missing or truncated file
            raise RuntimeError(
                f"wget exited with status {exit_status} while downloading {url}"
            )
        print("filetype is",  filename[-3:])

        if filename[-3:] == 'zip':
            with zipfile.ZipFile(filepath+filename,"r") as zip_ref:
                # Extract next to the archive, into a directory named after it
                zipdir = filepath+filename[:-4]
                print("unzipping {} to ...".format(filename), zipdir)
                # exist_ok so re-running the cell does not fail on an existing directory
                os.makedirs(zipdir, exist_ok=True)
                zip_ref.extractall(zipdir)
print('done')

done


In [6]:
# Work on the full frame; `the_df` is the same object as `df`, not a copy.
# Swap in the commented-out sample for a quicker run.
the_df = df #df.sample(n=10000, random_state=42)

# NOTE(review): display.max_rows is an integer-or-None option and False is
# passed here; if "no row limit" was intended, None is the documented value.
# TODO confirm the intent.
pd.set_option('display.max_rows', False)

In [7]:
# A)
# The program or course name can start or end with a matching parenthesis. In these cases
# we assume that no other matching parentheses are present and apply
# an appropriate regex for that...

def _extract_the_name(names, pattern):
    """Return the `the_name` group of `pattern` for every entry of `names`.

    `pattern` is compiled with re.VERBOSE and must define a named group
    `the_name`. Entries the pattern does not match (for example a name with an
    unbalanced parenthesis) keep their original text instead of turning into
    NaN, so no name is lost.
    """
    extracted = names.str.extract(pattern, flags=re.VERBOSE).the_name
    return extracted.fillna(names)

# First, set up standardized column with default values
the_df["STANDARDIZEDNAME_1"] = ""

# ... then extract names for those with opening parens
open_parenthesis_index = the_df.NAME_1.str[0] == '('
# raw string: '\(' is not a valid Python string escape
open_parenthesis_regex = r'''
                (?P<paren>\(.*\)) # get the first parathesis
                (?P<the_name>.*)  # then get the actual name
                '''

the_df.loc[open_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    _extract_the_name(the_df.loc[open_parenthesis_index, 'NAME_1'],
                      open_parenthesis_regex)

# ... then extract names for those with closing parens
# (applied second, so it wins for names that both start with '(' and end with ')')
close_parenthesis_index = the_df.NAME_1.str[-1] == ')'
closing_parenthesis_regex = r'''
                (?P<the_name>.*)  # get the actual name
                (?P<paren>\(.*\)) # get the last parathensis
                '''
the_df.loc[close_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    _extract_the_name(the_df.loc[close_parenthesis_index, 'NAME_1'],
                      closing_parenthesis_regex)

# ... then we copy over content that has an internal parenthesis with those
# parentheses removed and ignore everything after, e.g. "ABC (123) DEF" --> "ABC"
# na=False keeps the mask strictly boolean when NAME_1 has missing values
internal_parenthesis_index =\
    the_df['NAME_1'].str.contains(r'\(|\)', regex=True, na=False) &\
        ~(close_parenthesis_index|open_parenthesis_index)

the_df.loc[internal_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    _extract_the_name(the_df.loc[internal_parenthesis_index, 'NAME_1'],
                      closing_parenthesis_regex)

# ... finally, just copy over everything else
no_parenthesis_index = ~(close_parenthesis_index |\
                         open_parenthesis_index  |\
                         internal_parenthesis_index)
the_df.loc[no_parenthesis_index, "STANDARDIZEDNAME_1"] =\
    the_df.loc[no_parenthesis_index, 'NAME_1']

print('done')

done


In [8]:
# 2)
# So now we have silver version data of program, course names
# from the cell above, in STANDARDIZEDNAME_1
#
# To make an incrementally better version we need to expand
# abbreviations and acronyms.
#
# In a prior process (not shown here; locally in old_assorted_code.py in ../notebooks) I
# curated a set of abbreviations to expand using a labelling process; here we just load
# the abbreviations from ./data/external
label_mapper = pd.read_csv(
    rootpath + externalpath + "label_mapper.csv"
)

# .copy() makes this an independent frame: a new column is assigned to
# draft_output later in this cell, and doing that on a bare column selection
# of the_df triggers pandas' SettingWithCopyWarning.
draft_output = the_df[['STANDARDIZEDNAME_1', 'NAME', 'DESCRIPTION', 'FEATURESDESCRIPTION']].copy()
 #the_df.iloc[:1000,:][['STANDARDIZEDNAME_1', 'DESCRIPTION']]

# Using the wonderful regex (not re) Python library, we:
#
# A) Identify multiple abbreviations in one string, each of which may occur more than once,
# e.g. Win 2k & Win NT --> Windows 2k & Windows NT
#
# B) Efficiently use .sub to do all of the replacements in a single pass

def make_term_grouped_regex(term="", right_regex="", left_regex=""):
    """Wrap the literal `term` in a capturing group between two context regexes.

    `term` is passed through re.escape so it is matched verbatim; `left_regex`
    and `right_regex` are inserted as-is before and after the group.
    Returns the assembled pattern string.
    """
    pieces = (left_regex, '(', re.escape(term), ')', right_regex)
    return ''.join(pieces)

def make_grouped_regexes(replacement, left_regex="", right_regex=""):
    """Lazily build one grouped pattern per key of `replacement`.

    Each key is wrapped by make_term_grouped_regex with the given left and
    right context. Returns a generator of pattern strings, in key order.
    """
    terms = replacement.keys()
    return (
        make_term_grouped_regex(term=abbreviation,
                                right_regex=right_regex,
                                left_regex=left_regex)
        for abbreviation in terms
    )

def construct_map(label_mapper=label_mapper):
    """Build the abbreviation -> expansion lookup dict.

    Pairs up the `abbreviation` and `expanded` attributes of `label_mapper`
    element by element. The default argument is the module-level
    `label_mapper` as it existed when this function was defined.
    """
    pairs = zip(label_mapper.abbreviation, label_mapper.expanded)
    return dict(pairs)

replacement_map = construct_map()

# Caveat: Some abbreviations are sub-patterns of others.
# I've only really seen this with Oracle OCP and CDL XYZ, but in theory
# it could occur with other patterns, so I'm hesitant to special-case it.
#
# So I use POSIX leftmost-longest matching, enabled with `(?p)`,
# see: https://bitbucket.org/mrabarnett/mrab-regex/issues/150
#
# This will generally make matching slightly slower, but the extra cost should
# only grow linearly with the number of alternatives in the pattern.
# One alternation over every abbreviation is generated per (left, right)
# context below; the five alternations are then OR-ed together behind the
# `(?p)` flag.
abbrevation_pattern = regex.compile(
    "(?p)" + "|".join(
        "|".join(
            make_grouped_regexes(replacement_map,
                                 left_regex=left_context,
                                 right_regex=right_context)
        )
        for left_context, right_context in (
            (r'^',     r'[\s:]'),  # match words at start of string
            (r'\s',    r'\s'),     # match words surrounded by spaces
            (r'^',     r'$'),      # match words that make up entire fields, e.g. 'Nursing'
            (r'[\s/]', r'$'),      # match words at end of string preceded by space or slash
            (r'/',     r'[\s/]'),  # match words within string that follow a slash, end with a space or slash
        )
    )
)

def multiple_mapper(string):
    """Expand every abbreviation that abbrevation_pattern finds in `string`.

    For each match, the abbreviation is the first non-empty capture group; it
    is swapped for its replacement_map entry inside the matched text, so the
    surrounding context characters of the match are kept.
    Returns the rewritten string.
    """
    def expand(match):
        abbreviation = more_itertools.first_true(match.groups())
        expansion = replacement_map[abbreviation]
        return match.group().replace(abbreviation, expansion)

    return abbrevation_pattern.sub(expand, string)

# Expand abbreviations in every non-null cleaned name. Null names are dropped
# before mapping, so those rows come out as NaN in the new column.
draft_output['MULTI_REPLACE_STANDARDIZEDNAME_1'] = (
    draft_output['STANDARDIZEDNAME_1']
    .dropna()
    .map(multiple_mapper)  # ~ 26k rows/10 seconds
)
print('done1')

done1


In [104]:
# 3) 
# Then go after odd static patterns that are common,
# e.g. "A.A.", "AAS", names that end with "/", etc.
# "Applied Certificate in..." <--- thing is, this could really be a program
# the_df.STANDARDIZEDNAME_1 =\
#     the_df.STANDARDIZEDNAME_1.str.replace("A.A.","", case=False)

#  I think the longest matching tends to account for this when known
# things like Oracle OCP are there

18

In [9]:
# Evaluation of the program/course name standardizations (along with the
# provider name). The goal is to have 85%+ standardized, where that 85%
# figure will come from the Jeffreys interval.

# Evaluation rubric:
#   A) Only clearly wrong snippets are labelled as wrong; anything marginal is
# marked as standardized for this evaluation, because we want to err on the
# side of giving overly specific information, which includes odd info.
#   B) Click through quickly without dwelling on any one example; the goal is
# to get the evaluation done quickly since it is so manual.
#   C) Casing is ignored for now; camel casing still needs to be applied to
# all-caps names.

# Columns whose values are evaluated
columns_to_check = ['MULTI_REPLACE_STANDARDIZEDNAME_1'] # we know NAME is mostly fine, 'STANDARDIZEDNAME']
arrays_to_evaluate = (
    draft_output[columns_to_check[0]].to_numpy(),
    #the_df[columns_to_check[1]].to_numpy()
)
the_data = np.concatenate(arrays_to_evaluate)

# Fixed-seed, in-place shuffle to eliminate any bias across/within the columns
# when evaluating
random.Random(42).shuffle(the_data)
print('done', f'The data is {len(the_data)} long')

done The data is 24667 long


In [10]:
def display_func(row):
    """
    Show one item to the person doing the labelling.

    The labelling widget calls this with a single element of its data and
    the function is responsible for displaying it; here the element is
    rendered as Markdown.

    Nothing is returned.
    """
    rendered = Markdown(row)
    display(rendered)
    #display(Markdown("**At:** " + row["timestamp"]))

def preprocessor(x, y):
    """Identity preprocessing hook: return the features and labels untouched.

    Kept as a placeholder for taking only the standardized column; at present
    both arguments are handed back as the tuple (x, y).
    """
    unchanged = (x, y)
    return unchanged

# Build and show the manual labelling widget (skipped when SKIP_THIS is set).
if not SKIP_THIS:
    verification_widget = ClassLabeller(
        features=the_data,
        model=pipeline,
        model_preprocess=preprocessor,
        display_func=display_func,
        options=['standardized', 'not standardized'],
        acquisition_function='margin'
    )

    # A bare expression nested inside an `if` is not auto-rendered by the
    # notebook (only a cell's last top-level expression is), so the widget
    # has to be displayed explicitly.
    display(verification_widget)

In [12]:
# Binomial proportion estimate for the share of examined labels that are standardized

if not SKIP_THIS:
    def print_CI(labels, response_is_standardized = "standardized", method = "jeffreys", alpha = 0.05):
        """Print a binomial proportion confidence interval for the labelled data.

        Parameters
        ----------
        labels : sequence
            One entry per item; None marks an item that has not been examined.
        response_is_standardized : str
            The label value that counts as a success.
        method : str
            Interval method passed to statsmodels' proportion_confint.
        alpha : float
            Significance level; the interval covers 1 - alpha, so the default
            of 0.05 gives a 95% interval. (Passing 0.95 here, as this code did
            previously, yields a 5% interval that is far too narrow.)
        """
        successful_count = sum(
            response_is_standardized == label for label in labels
        )
        not_examined_count = sum(
            label is None for label in labels
        )
        examined_count = len(labels) - not_examined_count

        if examined_count == 0:
            # No observations: the interval is undefined, so say so instead of
            # dividing by zero inside proportion_confint.
            print(f"No labels have been examined yet. There are {len(labels)} labels.")
            return

        CI = proportion_confint(
                count= successful_count,
                nobs= examined_count,
                alpha = alpha,
                method=method
            )
        print(f"{method} bionomial proportion is: [{CI[0]:.2f}, {CI[1]:.2f}]",
    )
        print(f"We examined {len(labels) - not_examined_count} labels, of which {successful_count} are correct. There are {len(labels)} labels.")
    print_CI(labels=verification_widget.new_labels)


In [14]:
# 4)
# Write out the verified results: this is our first complete lookup table
# for the NAME field.
write_out = the_df  # same object as the_df, so the column below is set on it too
write_out['STANDARDIZEDNAME_1'] = draft_output['MULTI_REPLACE_STANDARDIZEDNAME_1']

# shuffle the rows to better remove temporal biases
# write_out =\
#     the_df.sample(frac=1, random_state=42, axis=0).reset_index(drop=True)

# Same base name for both outputs: CSV in interim, spreadsheet in processed
interim_csv_path = rootpath + interimpath + content_is + ".csv"
processed_xls_path = rootpath + processedpath + content_is + ".xls"

write_out.to_csv(
    interim_csv_path,
    columns=columns_to_save,
    index=False,
    chunksize=10000,
)

write_out.to_excel(
    processed_xls_path,
    sheet_name="Standardized NAME and NAME_1",
    columns=columns_to_save,
    index=False,
)
print('done')

done


In [168]:
# this belongs in another notebook but let's run over one of the description fields
# so we can close out my NJ todo list for today

#draft_output['MULTI_REPLACE_STANDARDIZEDNAME_1'] =\
#    draft_output['STANDARDIZEDNAME_1'].dropna().map(multiple_mapper)
#print(draft_output['MULTI_REPLACE_STANDARDIZEDNAME_1'])
#draft_output['DESCRIPTION'].dropna().map(multiple_mapper)
# okay so this runs (w 1000), takes longer but that's to be expected
#   should be in a new notebook, should be 

ion, including...
650    Designed to assist in supporting managers as a...
651    Stress the importance of good telephone techni...
652    Helps you to deal more effectively with diffic...
653    HRDI has a variety of programs available in co...
654    This Package includes Windows2003MCP/CCNA.\nMi...
655    Graduates are able to:\n-analyze financial sta...
656    The specialty of this program is that it teach...
657    Hands-on experience in computer repair, networ...
658    This program offers a combination of general e...
659    Microsoft Excel\nThis course introduces studen...
660    This course includes instruction in the follow...
661    This course is designed to introduce the alpha...
662    Quality Commercial Cleaning is a mobile cleani...
663    Alternatives, Inc. Employment Services assist ...
664    This course uses interactive software to impro...
665    Prepare for an exciting future in nail care at...
666    Learn how to design your own Web page.  Studen...
667    Salme 