In [2]:
# Importing the libraries 
import pandas as pd
import numpy as np
import random
import pickle
import re
import regex  # for better, more capbale regex api
import os
import zipfile
import more_itertools
from itertools import chain
import datetime
import time
from statsmodels.stats.proportion import proportion_confint
# active labeler related
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import ComplementNB  # corrects for class imbalance, SGD is pretty good too
from sklearn.pipeline import Pipeline
from superintendent import ClassLabeller
from IPython.display import display, Markdown

pd.set_option('display.max_colwidth', None)  # so we can peak at data and spot verify

pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', ComplementNB()),
])
print('done')

done


In [3]:
# Set up columns to keep, fields, locations for writing
rootpath = "/hdd/work/d4ad_standardization/"
processedpath = "D4AD_Standardization/data/processed/"
externalpath = "D4AD_Standardization/data/external/"
interimpath = "D4AD_Standardization/data/interim/"

content_is = "standardized_descriptions_and_degree_funding_type"


print('done')

done


In [4]:

filepath = "standardized_name_and_name1.csv" # builds off of notebook 5 work

columns = [
    "STANDARDIZEDNAME_1",
    "STANDARDIZEDNAME",
    "DESCRIPTION",
    "FEATURESDESCRIPTION",
    "NAME_1",
    "NAME",
    "PREREQUISITES",
    "STREET1",
    "CITY",
    "STATE",
    "ZIP",
    "WEBSITE",
    "COUNTY",
    "NONGOVAPPROVAL",
    "STATECOMMENTS",
    "CIPCODE",
    "PROVIDERID",
    "APPROVINGAGENCYID"
]

columns_to_save = ['STANDARDIZED_DESCRIPTION', 'STANDARDIZED_FEATURESDESCRIPTION'] + columns

SKIP_THIS = True # helps me be able to run all and not worry about pulling things
# I already know I have on disk

#df = pd.read_excel(rootpath + interimpath + filepath, usecols=columns)
df = pd.read_csv(rootpath + interimpath + filepath, usecols=columns)
print('done')

done


In [5]:
pd.set_option('display.max_rows', False)

the_df = df #df.sample(n=10000, random_state=42)

In [6]:
# 2) Here we apply the abbreviation expansion to the
# description columns. This code is repeated from the 5.0 notebook and should be externalized into ./src somewhere
#
# We first construct the abbreviation mapper
#
# We also store off a copy of the df for manipulation
# this has older name fields, for informing on funding (WOIA) and degree type (?)
# as well as the standardized fields so taht we can remove the extranous content still in it
# Note: this is mixing responsibilites and should be seperated into a new notebook

label_mapper = pd.read_csv(
    rootpath + externalpath + "label_mapper.csv"
)

draft_output = the_df[['DESCRIPTION', 'FEATURESDESCRIPTION',
                       'STANDARDIZEDNAME_1', 'STANDARDIZEDNAME',
                       'NAME_1', 'NAME']]


def make_term_grouped_regex(term="", right_regex="", left_regex=""):
    mystr = left_regex + '(' +\
                re.escape(term) +\
            ')' + right_regex
    return mystr

def make_grouped_regexes(replacement, left_regex="", right_regex=""):
    return (make_term_grouped_regex(left_regex=left_regex,
                                    term=key,
                                    right_regex=right_regex)\
            for key in replacement.keys()
    )

def construct_map(label_mapper=label_mapper):
    return {
        **dict(zip(label_mapper.abbreviation, label_mapper.expanded))
    }

replacement_map = construct_map()

abbrevation_pattern =\
    regex.compile(
        "(?p)" +
        "|".join(   # match words at start of string
            make_grouped_regexes(replacement_map, left_regex=r'^', right_regex=r'[\s:]')
        ) + "|" +\
        "|".join(   # match words surrounded by spaces
            make_grouped_regexes(replacement_map, left_regex=r'\s', right_regex=r'\s')
        ) + "|" +\
        "|".join(   # match words that make up entire fields, e.g. 'Nursing'
            make_grouped_regexes(replacement_map, left_regex=r'^', right_regex=r'$')
        ) + "|" +\
        "|".join(   # match words at end of string preceded by space or slash
            make_grouped_regexes(replacement_map, left_regex=r'[\s/]', right_regex=r'$')
        ) + "|" +\
        "|".join(   # match words within string that follow a slash, end with a space or slash
            make_grouped_regexes(replacement_map, left_regex=r'/', right_regex=r'[\s/]')
        )
    )

def multiple_mapper(string):
    return abbrevation_pattern.sub(
        lambda x: \
        x.group().replace( # replace the found string
            more_itertools.first_true(x.groups() # where the first matched group...
        ),  replacement_map[more_itertools.first_true(x.groups())] # ... is replaced with the lookup
    ), string)
print('done1')

done1


In [22]:
# ... with the abbreviation mapper in hand we now simply apply to both description columns
# it takes about 2.5 minutes each to run through all rows for both descriptions.
start = datetime.datetime.now()

if not SKIP_THIS:
    draft_output['STANDARDIZED_DESCRIPTION'] =\
        draft_output['DESCRIPTION'].dropna().map(multiple_mapper)
    draft_output['STANDARDIZED_FEATURESDESCRIPTION'] =\
        draft_output['FEATURESDESCRIPTION'].dropna().map(multiple_mapper)
else:
    joining_columns = ['NAME_1', 'NAME']
    interim_csv = "standardized_descriptions_and_degree_funding_type.csv"
    already_standardized_descriptions =\
        pd.read_csv(rootpath+interimpath+interim_csv,
        usecols=[
            'STANDARDIZED_DESCRIPTION', 
            'STANDARDIZED_FEATURESDESCRIPTION'] + joining_columns)\
                .drop_duplicates(subset=joining_columns)  # not sure how or why we have dupes
    # see: https://stackoverflow.com/questions/22720739/pandas-left-outer-join-results-in-table-larger-than-left-table
    
    read_in = draft_output.merge(
            already_standardized_descriptions,
            how='left',
            on=joining_columns,
            validate="m:1"
    )

    assert len(read_in) == len(draft_output), f"read in shape {len(read_in)} does not equal draft df {len(draft_output)}!"
    
end = datetime.datetime.now()
print(f"Done! That took {(end-start)} time")

Done! That took 0:00:00.399499 time


In [7]:
# 3) 
# Now we have to extract course funding type from the older
# columns. 

woia_like =\
    regex.compile(
        '''
         (title\s+[I|II|III|IV|1|2|3]+\s)   # WOIA has 4 titles of funding in law
        |(woia){d<=1}                       # is called WOIA, WIA, allowed to miss a letter
        ''',
        flags=regex.I|regex.VERBOSE)

name =\
    draft_output['NAME'].dropna()\
                        .map(woia_like.search)\
                        .dropna().index

name_1 =\
    draft_output['NAME_1'].dropna()\
                        .map(woia_like.search)\
                        .dropna().index

descriptions =\
    draft_output['DESCRIPTION'].dropna()\
                          .map(woia_like.search)\
                          .dropna().index

features_description =\
    draft_output['FEATURESDESCRIPTION'].dropna()\
                          .map(woia_like.search)\
                          .dropna().index

woia_indices = name.union(name_1)\
                   .union(descriptions)\
                   .union(features_description)
draft_output['IS_WOIA'] = False
draft_output.loc[woia_indices, 'IS_WOIA'] = True
print('done')

done


In [33]:
# ... Finally we extact the degree type from the older columns, repeating the
# procedure above but with slightly different regexes

aas_like =\
    regex.compile(
        '''
        [\b\s](Associates)[\b\s]
        |[\b\s](A\.A\.S\.)[\b\s]
        |[\b\s](A\.S\.)[\b\s]
        |[\b\s](AS)[\b\s](?!A\s)
        |[\b\s](AAS)[\b\s]                               # applied associates of science
        ''',
        flags=regex.VERBOSE)
#        |(applied.*associate.*science.*\.)     # sentence containing applied science 

name =\
    draft_output['NAME'].dropna()\
                        .map(aas_like.search)\
                        .dropna().index

name_1 =\
    draft_output['NAME_1'].dropna()\
                          .map(aas_like.search)\
                          .dropna().index

descriptions =\
    draft_output['DESCRIPTION'].dropna()\
                          .map(aas_like.search)\
                          .dropna().index

features_description =\
    draft_output['FEATURESDESCRIPTION'].dropna()\
                          .map(aas_like.search)\
                          .dropna().index

aas_indices = name.union(name_1)\
                   .union(descriptions)\
                   .union(features_description)

# This matchign is overly eager, remove those rows that mention
# Program... 
                   
draft_output.loc[aas_indices, 'Degree_Type'] = 'Associates'

# draft_output.loc[aas_indices, ['Degree_Type', 'DESCRIPTION', 'NAME_1', 'NAME', 'FEATURESDESCRIPTION']].head(15)
# Now we go back for those rows havign an empty degree type and check for
# the word certificate and assign those
cert_like =\
    regex.compile(
        '''
        (certification)
        |(certificate)
        |[\s\b](cert)[\s\b]
        ''',
        flags=regex.I|regex.VERBOSE)

name =\
    draft_output['NAME'].dropna()\
                        .map(cert_like.search)\
                        .dropna().index

name_1 =\
    draft_output['NAME_1'].dropna()\
                          .map(cert_like.search)\
                          .dropna().index

descriptions =\
    draft_output['DESCRIPTION'].dropna()\
                          .map(cert_like.search)\
                          .dropna().index

features_description =\
    draft_output['FEATURESDESCRIPTION'].dropna()\
                          .map(cert_like.search)\
                          .dropna().index

cert_indices = name.union(name_1)\
                   .union(descriptions)\
                   .union(features_description)

draft_output.loc[cert_indices, 'Degree_Type'] = 'Certificate'
draft_output.sample(15).loc[:,['Degree_Type', 'DESCRIPTION', 'NAME_1', 'NAME', 'FEATURESDESCRIPTION']] 

Unnamed: 0,Degree_Type,DESCRIPTION,NAME_1,NAME,FEATURESDESCRIPTION
14993,Certificate,"If you are a certified Nurse Aide (CNA) and would like to boost your career as a Patient Care Technician this is a course for you. This 50 hours course prepares you to learn the subjects and skills you need to sit for Patient Care Technician certification exam through the National Center for Competency Testing (NCCT). Upon successful completion of this course students are eligible to sit for the American Society of Phlebotomy Technicians (ASPT) National Certification Exam, which is being held at RVCC. There is additional charge for the ASPT National exam that is not included in your tuition.",Patient Care Technician (PCT) Trak II,Raritan Valley Community College - Non Credit,"Pre-requisites include clear understanding of written and spoken English, professional liability insurance and proof of the immunization."
8026,Associates,Hands on training give students the opportunity to develop the necessary skills to operate a tractor trailer and bus properly and safely.,CDL Class A Refresher and Passenger Endorsement,"Ideal Driving School, Inc. - Clifton",
2688,Certificate,"To provide the student with skills in assembling, troubleshooting and repairing computers of various kinds. Job titles include, but are not limited to PC Technician, LAN Technician, computer service technician, help-desk services, etc.",A + Technician,New Wave Computer Training - PVS,$ 264 for 2 A+ Certification exams. $ 132 each exam.
14907,Certificate,"A+ Technician Course, MCSE, and CCNA Certifications.",Network Action Pack,Access Careers - Hicksville,
20248,Associates,"Business Administration (M.B.A.). The master of business administration (M.B.A.) is a 48 credit program (18 credits of the lower courses may be waived) which features a cross-functional pedagogical approach. Students have the opportunity to study in the general M.B.A. program or one of four new concentrations ¿ accounting, entrepreneurship, finance, and music management. The accounting concentration satisfies the CPA education requirements while preparing students to take the CPA exam. The entrepreneurship concentration provides the skills to start, manage, operate, grow, and sustain an entrepreneurial venture. The finance concentration prepares students to take the CFA examination. The music management concentration is one of only two programs in the U.S. and takes advantage of William Paterson¿s close proximity to New York City. The Cotsakos College of Business at William Paterson University gives future business leaders an edge through a dynamic curriculum that is designed to meet the changing demands of an internationally and technologically oriented marketplace.\n\nThe M.B.A. program is accredited through AACSB International - The Association to Advance Collegiate Schools of Business. AACSB accreditation represents the highest standard of achievement for business schools worldwide. Only the top 15 percent of business schools in the world are accredited by AACSB.\n",Master of Business Administration,"William Paterson University, School of Continuing and Professional Education","The MBA curriculum also offers a broad range of elective courses in each of the schools¿ four departments: Accounting and Law; Economics, Finance, and Global Business; Marketing and Management Sciences, and Professional Sales. The program¿s rigorous coursework requires a familiarity with requisite quantitative and technological concepts. Upon enrollment, students are encouraged to take screening examinations in 1) quantitative analysis and 2) technology, and are advised on taking introductory courses in those areas if necessary.\n\nThe Christos M. Cotsakos College of Business has received accreditation from AACSB International - The Association to Advance Collegiate Schools of Business. AACSB accreditation represents the highest standard of achievement for business schools worldwide. Only about 15% Business Schools in the world are accredited by AACSB.\n\n"
20143,Associates,"The program is designed for the experienced journeyman, licensed contractor, senior electrician, or electrician preparing to take the New Jersey State Licensing Exam. The course will increase the individual¿s knowledge of electrical theory, the national electrical code and trouble shooting methods. Emphasis will be on commercial wiring applications and equipment.",Electrical Commercial,Technical Institute of Camden County - Sicklerville,"Out of County Tuition Fee is $1,480."
18595,Certificate,"The Patient Care Specialist Program includes the 90 hours mandated New Jersey State Department of Health Curriculum for Ancilliary Personnel in Long Term Care Facilities and the New Jersey Board of Nursing approved 76 hours Homemaker / Home Health Aide Curriculum. After successful completion of all components, participants will be eligible to take the New Jersey State Certification Examinations for both Certificates, Nurse's Assisant and Home Health Aides. The participants will be qualified to work in hospitals, clinics, clients homes, and nursing homes.",Certified Patient Care Specialist: a) Nurse Aide Certificate b) HHA Certificate,Exemplary Nursing Services Training Center,"Training program is divided into two categories, namely classroom and clinical training"
19116,Associates,"The main purpose behind Business Analysis is to provide management and other stakeholders with insights into ways to address business needs. Addressing these needs often lead to an improvement in organizational structures and processes. Platys Business Analysis course will teach students powerful quantitative methods that will have you making better, more informed effective business decisions. The course will be taught in a classroom environment with an on hands learning experience. The first part of the course would be training material teaching. The second half of the course will consist of practical learning experience with real-time projects.",Business Analyst Training Program,"Platys Group ""Level Up"" Training Center - TP","Our advance technology training program provide practical hands on training experience. In addition , we supply resume writing, interview coaching and job placement. After job/project placement, we supply additional support. The student we have access to the use of our help desk. This additional support is to be use if any student need assistance to over come any job/ project related objectives."
22155,Associates,"Instruction includes among other things preparing patients for examination and treatment, routine laboratory procedures, diagnostic testing, technical aspects of phlebotomy and the cardiac life cycle. Students will review important topics including phlebotomy, pharmacology, the proper use and administration of medications, taking and documenting vital signs, cardiology including proper lead placements, a professional workplace behavior, ethics and the legal aspects of healthcare.",Clinical Medical Assistant,Stockton University,Please consult the continuing studies website at www.stockton.edu/continuingstudies
13653,Associates,"The Jumpstart ABE with ESL program is designed to support students who read between 0 to 8th grade. (The TABE test score is of 3rd grade level and over). The Hispanic student with a language barrier will be taught English as a second language using the ABE curriculum while achieving their goals toward the GED test. All (Pre-GED) students will be utilizing in-class instruction and an interactive, computer-based learning tools that will address the individual educational needs of lower level students. This Pre-GED process will prepare students with the necessary skill sets to advance successfully in preparation for the GED test.\nThe high school equivalency assessment preparation program is a comprehensive program designed to provide adult students with mastery level secondary education completion skills necessary for successful passage of a NJ approved High School Equivalency Assessment exam. In addition, workplace skills development training, and career planning workshops are offered to help individuals seeking a sustainable employment opportunity, vocational training or transition to post-secondary education. Students in our HSEAP program participate in our College Tour Program",Adult Basic Education and High School Equivalency Assessment Preparation,Jumpstart Community Training and Services - Paterson - PCS,"The Adult Basic Education/ESL\n\nThe Adult Basic Education is a process:\n1. Getting the Student prepared to learn and focus\n2. Working with students according to their strengths and weaknesses in Math, Reading and Language Arts\n3. Enhancing the learning ability \n4. Assisting with bringing the test scores up to a 3.0\n5. Working with students using English as a second language. We use the Adult Basic Education as a tool to teach ESL.\n"


In [64]:
# 4) 
# Now we do some simple replacements for known degree related mentions in the name fields

degree_cert_variants =\
    ["A.S.",
     "AAS Degree",
     "AAS -",
     "A.S. Degree",
     "A.A.S. Degree",
     "Degree",
     "degree",
     "certificate",
     "Certificate",
     "Associate of Applied Science"]

draft_output['CLEANED_STANDARDIZED_NAME_1'] =\
    draft_output['STANDARDIZEDNAME_1'].replace(degree_cert_variants, "", regex=True)

In [88]:
# This is the evaluation part of the program and course name standardizations
# along with the provider name. My goal is to have 85%+ standardized, send out
# that 85% will come from the jefferey's interval

# Evaluation Rubric:
#   A) Here we label clearly wrong snippets, anything that is marginal we mark as
# standardized for purposes of this evaluation because we want to err on the side
# of giving overly specific information, which includes odd info
#   B) We also click through quickly, not overly dwelling one any one example, the
# goal here is to get the evaulation done quickly since it's so manual
#   C) For now we ignore casingl there does need to be a camel casing applied to
# all caps

# We create a series of data to evaluate
columns_to_check = ['CLEANED_STANDARDIZED_NAME_1'] # we know NAME is mostly fine, 'STANDARDIZEDNAME']
the_data =\
    np.concatenate(
        (
            draft_output[columns_to_check[0]].to_numpy(),
            #the_df[columns_to_check[1]].to_numpy()
        )
    )
    
# we shuffle the data to elminate any bias across/within the columns when
# evaluting
random.Random(42).shuffle(the_data)
print('done', f'The data is {len(the_data)} long')

done The data is 100 long


In [89]:
def display_func(row):
    """
    The display function gets passed your data - in the
    case of a dataframe, it gets passed a row - and then
    has to "display" your data in whatever way you want.

    It doesn't need to return anything
    """
    display(Markdown(row))
    #display(Markdown("**At:** " + row["timestamp"]))

def preprocessor(x, y):
    # only take standardized column, leave everything else
    return x, y

verification_widget = ClassLabeller(
    features=the_data,
    model=pipeline,
    model_preprocess=preprocessor,
    display_func=display_func,
    options=['standardized', 'not standardized'],
    acquisition_function='margin'
)

verification_widget

ClassLabeller(children=(HBox(children=(HBox(children=(FloatProgress(value=0.0, description='Progress:', max=1.…

In [92]:
# insert bionomial proprtion esimator here

def print_CI(labels, response_is_standardized = "standardized", method = "jeffreys"):
    successful_count = sum(
        response_is_standardized == label for label in labels
    )
    not_examined_count = sum(
        None == label for label in labels
    )

    CI = proportion_confint(
            count= successful_count,
            nobs= len(labels) - not_examined_count,
            alpha = 0.95,
            method=method
        )
    print(f"{method} bionomial proportion is: [{CI[0]:.2f}, {CI[1]:.2f}]",
)
    print(f"We examined {len(labels) - not_examined_count} labels, of which {successful_count} are correct. There are {len(labels)} labels.")
print_CI(labels=verification_widget.new_labels)


jeffreys bionomial proportion is: [0.95, 0.95]
We examined 100 labels, of which 95 are correct. There are 100 labels.


In [65]:
# 4)
# Now we write out the verfiied results
# ... finally we can write this out as our first complete lookup table
# for the NAME field
write_out = draft_output[
    [
        'IS_WOIA', 'Degree_Type',
        'STANDARDIZED_DESCRIPTION', 'STANDARDIZED_FEATURESDESCRIPTION', 
        'CLEANED_STANDARDIZED_NAME_1', 'STANDARDIZEDNAME',
        'STANDARDIZEDNAME_1', 'DESCRIPTION',
        'FEATURESDESCRIPTION', 'NAME_1', 'NAME'
    ]
]

print(
    "We're writing ...",
    write_out.columns
)

# shuffe the rows to better remove temporal baises
write_out =\
    write_out.sample(frac=1, random_state=42, axis=0).reset_index(drop=True)

write_out.to_csv(rootpath + interimpath + content_is + ".csv",
                index = False,
                chunksize = 10000)

write_out.to_excel(rootpath + processedpath + content_is + ".xls",
            sheet_name="Standardized Descriptions",
            index=False)
print('done')

Index(['IS_WOIA', 'Degree_Type', 'STANDARDIZED_DESCRIPTION',
       'STANDARDIZED_FEATURESDESCRIPTION', 'CLEANED_STANDARDIZED_NAME_1',
       'STANDARDIZEDNAME', 'STANDARDIZEDNAME_1', 'DESCRIPTION',
       'FEATURESDESCRIPTION', 'NAME_1', 'NAME'],
      dtype='object')
done
