In [1]:
# Importing the libraries 
import pandas as pd
import numpy as np
import random
import pickle
import re
import regex  # for better, more capbale regex api
import os
import zipfile
import more_itertools
from itertools import chain
import datetime
import time
from statsmodels.stats.proportion import proportion_confint
# active labeler related
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import ComplementNB  # corrects for class imbalance, SGD is pretty good too
from sklearn.pipeline import Pipeline
from superintendent import ClassLabeller
from IPython.display import display, Markdown

pd.set_option('display.max_colwidth', None)  # so we can peak at data and spot verify

pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', ComplementNB()),
])
print('done')

done


In [2]:
# Set up columns to keep, fields, locations for writing
rootpath = "/hdd/work/d4ad_standardization/"
processedpath = "D4AD_Standardization/data/processed/"
externalpath = "D4AD_Standardization/data/external/"
interimpath = "D4AD_Standardization/data/interim/"

content_is = "standardized_descriptions_and_degree_funding_type"


print('done')

done


In [3]:

filepath = "standardized_name_and_name1.csv" # builds off of notebook 5 work

columns = [
    "STANDARDIZEDNAME_1",
    "STANDARDIZEDNAME",
    "DESCRIPTION",
    "FEATURESDESCRIPTION",
    "NAME_1",
    "NAME",
    "PREREQUISITES",
    "STREET1",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "COUNTY",
    "NONGOVAPPROVAL",
    "STATECOMMENTS",
    "CIPCODE",
    "PROVIDERID",
    "APPROVINGAGENCYID"
]

columns_to_save = ['STANDARDIZED_DESCRIPTION', 'STANDARDIZED_FEATURESDESCRIPTION'] + columns

SKIP_THIS = True # helps me be able to run all and not worry about pulling things
# I already know I have on disk

#df = pd.read_excel(rootpath + interimpath + filepath, usecols=columns)
df = pd.read_csv(rootpath + interimpath + filepath, usecols=columns)
print('done')

done


In [4]:
pd.set_option('display.max_rows', False)

the_df = df #df.sample(n=10000, random_state=42)

In [5]:
# 2) Here we apply the abbreviation expansion to the
# description columns. This code is repeated from the 5.0 notebook and should be externalized into ./src somewhere
#
# We first construct the abbreviation mapper
#
# We also store off a copy of the df for manipulation
# this has older name fields, for informing on funding (WOIA) and degree type (?)
# as well as the standardized fields so taht we can remove the extranous content still in it
# Note: this is mixing responsibilites and should be seperated into a new notebook

label_mapper = pd.read_csv(
    rootpath + externalpath + "label_mapper.csv"
)

draft_output = the_df[['DESCRIPTION', 'FEATURESDESCRIPTION',
                       'STANDARDIZEDNAME_1', 'STANDARDIZEDNAME',
                       'NAME_1', 'NAME']]


def make_term_grouped_regex(term="", right_regex="", left_regex=""):
    mystr = left_regex + '(' +\
                re.escape(term) +\
            ')' + right_regex
    return mystr

def make_grouped_regexes(replacement, left_regex="", right_regex=""):
    return (make_term_grouped_regex(left_regex=left_regex,
                                    term=key,
                                    right_regex=right_regex)\
            for key in replacement.keys()
    )

def construct_map(label_mapper=label_mapper):
    return {
        **dict(zip(label_mapper.abbreviation, label_mapper.expanded))
    }

replacement_map = construct_map()

abbrevation_pattern =\
    regex.compile(
        "(?p)" +
        "|".join(   # match words at start of string
            make_grouped_regexes(replacement_map, left_regex=r'^', right_regex=r'[\s:]')
        ) + "|" +\
        "|".join(   # match words surrounded by spaces
            make_grouped_regexes(replacement_map, left_regex=r'\s', right_regex=r'\s')
        ) + "|" +\
        "|".join(   # match words that make up entire fields, e.g. 'Nursing'
            make_grouped_regexes(replacement_map, left_regex=r'^', right_regex=r'$')
        ) + "|" +\
        "|".join(   # match words at end of string preceded by space or slash
            make_grouped_regexes(replacement_map, left_regex=r'[\s/]', right_regex=r'$')
        ) + "|" +\
        "|".join(   # match words within string that follow a slash, end with a space or slash
            make_grouped_regexes(replacement_map, left_regex=r'/', right_regex=r'[\s/]')
        )
    )

def multiple_mapper(string):
    return abbrevation_pattern.sub(
        lambda x: \
        x.group().replace( # replace the found string
            more_itertools.first_true(x.groups() # where the first matched group...
        ),  replacement_map[more_itertools.first_true(x.groups())] # ... is replaced with the lookup
    ), string)
print('done1')

done1


In [6]:
# ... with the abbreviation mapper in hand we now simply apply to both description columns
# it takes about 2.5 minutes each to run through all rows for both descriptions.
start = datetime.datetime.now()

if not SKIP_THIS:
    draft_output['STANDARDIZED_DESCRIPTION'] =\
        draft_output['DESCRIPTION'].dropna().map(multiple_mapper)
    draft_output['STANDARDIZED_FEATURESDESCRIPTION'] =\
        draft_output['FEATURESDESCRIPTION'].dropna().map(multiple_mapper)
else:
    joining_columns = ['NAME_1', 'NAME']
    interim_csv = "standardized_descriptions_and_degree_funding_type.csv"
    already_standardized_descriptions =\
        pd.read_csv(rootpath+interimpath+interim_csv,
        usecols=[
            'STANDARDIZED_DESCRIPTION', 
            'STANDARDIZED_FEATURESDESCRIPTION'] + joining_columns)\
                .drop_duplicates(subset=joining_columns)  # not sure how or why we have dupes
    # see: https://stackoverflow.com/questions/22720739/pandas-left-outer-join-results-in-table-larger-than-left-table
    
    read_in = draft_output.merge(
            already_standardized_descriptions,
            how='left',
            on=joining_columns,
            validate="m:1"
    )

    assert len(read_in) == len(draft_output), f"read in shape {len(read_in)} does not equal draft df {len(draft_output)}!"
    draft_output = read_in
    
end = datetime.datetime.now()
print(f"Done! That took {(end-start)} time")

Done! That took 0:00:00.398888 time


In [9]:
# 3) 
# Now we have to extract course funding type from the older
# columns. 

wioa_like =\
    regex.compile(
        '''
        (title\s+[IV1234]+\b\s*?)           # WOIA has 4 titles of funding in law, at end of sentence or space
        |(wioa){d<=1}                       # is called WOIA, WIA, allowed to miss a letter
        ''',
        flags=regex.I|regex.VERBOSE)

name =\
    draft_output['NAME'].dropna()\
                        .map(wioa_like.search)\
                        .dropna().index

name_1 =\
    draft_output['NAME_1'].dropna()\
                        .map(wioa_like.search)\
                        .dropna().index

descriptions =\
    draft_output['DESCRIPTION'].dropna()\
                          .map(wioa_like.search)\
                          .dropna().index

features_description =\
    draft_output['FEATURESDESCRIPTION'].dropna()\
                          .map(wioa_like.search)\
                          .dropna().index

wioa_indices = name.union(name_1)\
                   .union(descriptions)\
                   .union(features_description)
draft_output['IS_WIOA'] = False
draft_output.loc[wioa_indices, 'IS_WIOA'] = True
print('done')

done


In [11]:
# ... Finally we extact the degree type from the older columns, repeating the
# procedure above but with slightly different regexes

as_like =\
    regex.compile(
        '''
        [\b\s](A\.A\.S\.)[\b\s]
        |[\b\s](A\.S\.)[\b\s]
        |[\b\s](AS\sDe)                   # AS Degree
        |[\b\s](AS\sSc)                   # AS Science
        |[\b\s](AAS)[\b\s]                 # applied associates of science
        ''',
        flags=regex.VERBOSE)


name =\
    draft_output['NAME'].dropna()\
                        .map(as_like.search)\
                        .dropna().index

name_1 =\
    draft_output['NAME_1'].dropna()\
                          .map(as_like.search)\
                          .dropna().index

descriptions =\
    draft_output['DESCRIPTION'].dropna()\
                          .map(as_like.search)\
                          .dropna().index

features_description =\
    draft_output['FEATURESDESCRIPTION'].dropna()\
                          .map(as_like.search)\
                          .dropna().index

as_indices = name.union(name_1)\
                  .union(descriptions)\
                  .union(features_description)

draft_output['Mentioned_Associates'] = False
draft_output.loc[as_indices, 'Mentioned_Associates'] = True

In [12]:
# Now we go back for mentions of certificate and assign those
cert_like =\
    regex.compile(
        '''
        (certification)
        |(certificate)
        |[\s\b](cert)[\s\b]
        ''',
        flags=regex.I|regex.VERBOSE)

name =\
    draft_output['NAME'].dropna()\
                        .map(cert_like.search)\
                        .dropna().index

name_1 =\
    draft_output['NAME_1'].dropna()\
                          .map(cert_like.search)\
                          .dropna().index

descriptions =\
    draft_output['DESCRIPTION'].dropna()\
                          .map(cert_like.search)\
                          .dropna().index

features_description =\
    draft_output['FEATURESDESCRIPTION'].dropna()\
                          .map(cert_like.search)\
                          .dropna().index

cert_indices = name.union(name_1)\
                   .union(descriptions)\
                   .union(features_description)
draft_output['Mentioned_Certificate'] = False
draft_output.loc[cert_indices, 'Mentioned_Certificate'] = True

In [13]:
# 4) 
# Now we do some simple removals for known degree related mentions in the name fields

degree_cert_variants =\
    ["A.S.",
     "AAS Degree",
     "AAS -",
     "A.S. Degree",
     "AS Degree",     
     "Degree",
     "degree",
     "certificate",
     "Certificate",
     "Associate of Applied Science",
     "-[\s\b]Associate",
     "^\s*In\b"]

draft_output['CLEANED_STANDARDIZED_NAME_1'] =\
    draft_output['STANDARDIZEDNAME_1'].replace(degree_cert_variants, "", regex=True)

In [12]:
# This is the evaluation part of the program and course name standardizations
# along with the provider name. My goal is to have 85%+ standardized, send out
# that 85% will come from the jefferey's interval

# Evaluation Rubric:
#   A) Here we label clearly wrong snippets, anything that is marginal we mark as
# standardized for purposes of this evaluation because we want to err on the side
# of giving overly specific information, which includes odd info
#   B) We also click through quickly, not overly dwelling one any one example, the
# goal here is to get the evaulation done quickly since it's so manual
#   C) For now we ignore casingl there does need to be a camel casing applied to
# all caps

def stratified_sample(the_data, strata, size):
    some_frac = size/len(the_data)
    return \
        the_data.groupby(
            strata
        ).apply(
            lambda g: g.sample(
                n=size
                #frac=1
                )
        )

key_factors_to_consider = ['IS_WOIA'] #, 'Mentioned_Certificate']

# We create a series of data to evaluate
columns_to_check = ['CLEANED_STANDARDIZED_NAME_1', 'IS_WOIA',
                    'Mentioned_Certificate', 'Mentioned_Associates',
                    'STANDARDIZED_DESCRIPTION', 'STANDARDIZED_FEATURESDESCRIPTION']

check_this_many = 10 #100 * len(columns_to_check) # we mark if ANY column are wrong
# the_data = draft_output.sample(check_this_many,random_state=42)\
#                        .loc[:, columns_to_check]
the_data = stratified_sample(draft_output, strata=key_factors_to_consider, size=check_this_many)

# we shuffle the data to elminate any bias across/within the columns when
# evaluting
print('done', f'The stratified validation data is {len(the_data)} long')

done The stratified validation data is 20 long


In [140]:
markdown = []

def display_func(row):
    """
    The display function gets passed your data - in the
    case of a dataframe, it gets passed a row - and then
    has to "display" your data in whatever way you want.

    It doesn't need to return anything
    """

    the_string =\
            "**IS_WOIA:** " + str(row["IS_WOIA"]) +\
            " **Cert:** " + str(row["Mentioned_Certificate"]) +\
            " **Assoc:** " + str(row["Mentioned_Associates"]) +\
            "\n\n**Provider:** " + str(row["STANDARDIZEDNAME"]) + "" +\
            "\n\n**Course Name:** " + str(row["CLEANED_STANDARDIZED_NAME_1"]) + "" +\
            "\n\n**Description:** " + str(row["STANDARDIZED_DESCRIPTION"]) + "" +\
            "\n\n**Featured Description:** " + str(row["STANDARDIZED_FEATURESDESCRIPTION"]) + "" +\
            "\n\n**(unstandardized):** [Name_1] " + str(row["NAME_1"]) + " [Name] " + str(row["NAME"])


    markdown_string =\
        Markdown(the_string)

    display(
        markdown_string
    )
    markdown.append(the_string)

def preprocessor(x, y):
    # only take standardized column, leave everything else
    return x, y

verification_widget = ClassLabeller(
    features=the_data,
    model=pipeline,
    model_preprocess=preprocessor,
    display_func=display_func,
    options=['standardized', 'not standardized'],
    acquisition_function='margin'
)

verification_widget

ClassLabeller(children=(HBox(children=(HBox(children=(FloatProgress(value=0.0, description='Progress:', max=1.…

In [148]:
for m in markdown:
    print(m)
    print('\n\n')

**IS_WOIA:** False **Cert:** False **Assoc:** False

**Provider:** Anthem Institute

**Course Name:** CECI-470-01

**Description:** A brief review of the basics and begin to create complicated spreadsheets

**Featured Description:** nan

**(unstandardized):** [Name_1] CECI-470-01 [Name] Anthem Institute - Jersey City - PVS



**IS_WOIA:** False **Cert:** False **Assoc:** False

**Provider:** Career Education International Group

**Course Name:** Business Administration Package

**Description:** Upon completion of this package students will learn to use the Microsoft Office Pro at high level of proficiency.  Student will also learn use windows XP.  After completing this package students will be able to work in any entry/intermediate level position that requires the use of these programs.  The specific job title will be receptionist, office aid, and clerical.

**Featured Description:** nan

**(unstandardized):** [Name_1] Business Administration Package [Name] Career Education Internation

In [126]:
# insert bionomial proprtion esimator here

def print_CI(labels, response_is_standardized = "standardized", method = "jeffreys"):
    successful_count = sum(
        response_is_standardized == label for label in labels
    )
    not_examined_count = sum(
        None == label for label in labels
    )

    CI = proportion_confint(
            count= successful_count,
            nobs= len(labels) - not_examined_count,
            alpha = 0.95,
            method=method
        )
    print(f"{method} bionomial proportion is: [{CI[0]:.2f}, {CI[1]:.2f}]",
)
    print(f"We examined {len(labels) - not_examined_count} labels, of which {successful_count} are correct. There are {len(labels)} labels.")
print_CI(labels=verification_widget.new_labels)


jeffreys bionomial proportion is: [0.97, 0.97]
We examined 65 labels, of which 63 are correct. There are 65 labels.


In [15]:
# 4)
# Now we write out the verfiied results
# ... finally we can write this out as our first complete lookup table
# for the NAME field
write_out = draft_output[
    [
        'IS_WIOA', 'Mentioned_Certificate', 'Mentioned_Associates',
        'STANDARDIZED_DESCRIPTION', 'STANDARDIZED_FEATURESDESCRIPTION', 
        'CLEANED_STANDARDIZED_NAME_1', 'STANDARDIZEDNAME',
        'STANDARDIZEDNAME_1', 'DESCRIPTION',
        'FEATURESDESCRIPTION', 'NAME_1', 'NAME'
    ]
]

print(
    "We're writing ...",
    write_out.columns
)

# shuffe the rows to better remove temporal baises
write_out =\
    write_out.sample(frac=1, random_state=42, axis=0).reset_index(drop=True)

write_out.to_csv(rootpath + interimpath + content_is + ".csv",
                index = False,
                chunksize = 10000)

write_out.to_excel(rootpath + processedpath + content_is + ".xls",
            sheet_name="Standardized Descriptions",
            index=False)
print('done')

We're writing ... Index(['IS_WIOA', 'Mentioned_Certificate', 'Mentioned_Associates',
       'STANDARDIZED_DESCRIPTION', 'STANDARDIZED_FEATURESDESCRIPTION',
       'CLEANED_STANDARDIZED_NAME_1', 'STANDARDIZEDNAME', 'STANDARDIZEDNAME_1',
       'DESCRIPTION', 'FEATURESDESCRIPTION', 'NAME_1', 'NAME'],
      dtype='object')
done
