In [1]:
# Importing the libraries 
import pandas as pd
import re

In [2]:
# Function to pad CIP codes to 6 digits
def pad_cipcodes(cip_series):
    """
    Left-pad CIP codes with zeros so each one is at least 6 characters long.

    Parameters
    ----------
    cip_series : pandas.Series
        CIP codes stored as integers, floats or strings.

    Returns
    -------
    pandas.Series
        The codes as zero-padded strings, with the same index. Missing
        entries stay missing.

    Notes
    -----
    A numeric column that contains blanks is held as float, so 10101 shows
    up as 10101.0. Casting that directly to str yields '10101.0', which is
    already longer than 6 characters and would never be padded. A missing
    value cast to str becomes 'nan' and would be padded to '000nan'. Both
    cases are handled explicitly below.
    """
    def _pad(code):
        # Leave missing values alone instead of turning them into text.
        if pd.isna(code):
            return code
        # Integer-valued floats: drop the spurious '.0' before padding.
        if isinstance(code, float) and code.is_integer():
            code = int(code)
        return str(code).zfill(6)

    return cip_series.map(_pad)

# Project root and the raw workbook (relative to that root)
rootpath = "/hdd/work/d4ad_standardization/"
filepath = "./D4AD_Standardization/data/raw/etpl_all_programsJune3.xls"

# Workbook columns to load (passed to read_excel as usecols)
columns = [
    "NAME", "NAME_1", "DESCRIPTION", "PREREQUISITES", "FEATURESDESCRIPTION",
    "STREET1", "CITY", "STATE", "ZIP", "WEBSITE", "COUNTY",
    "NONGOVAPPROVAL", "STATECOMMENTS",
    "CIPCODE", "PROVIDERID", "APPROVINGAGENCYID",
]

df = pd.read_excel(f"{rootpath}{filepath}", usecols=columns)
print("done")

# Zero-pad the CIPCODE column to 6 characters
df["CIPCODE"] = df["CIPCODE"].pipe(pad_cipcodes)

done


In [3]:
# Output folders and the file name used when writing results
processedpath = "./D4AD_Standardization/data/processed/"
interimpath = "./D4AD_Standardization/data/interim/"
content_is = "standardized_name.csv"

# Work on the full frame; use df.sample(n=100, random_state=42) for a quick run
the_df = df

# Columns to write: the standardized name followed by every loaded column
# (narrower alternative: ['STANDARDIZEDNAME', 'NAME', 'PROVIDERID',
#  'APPROVINGAGENCYID', 'CIPCODE'])
columns_to_save = ["STANDARDIZEDNAME", *columns]
print("done")

done


In [4]:
# Let's transform the NAME column into a final STANDARDIZEDNAME version,
# in three passes.

# 1) we cover the simplest case of hyphenation: keep the content before " - "
the_df['STANDARDIZEDNAME'] = the_df['NAME'].str.split(" - ", n=1).str[0]

# 2) then cover cases of 'X-' (hyphen attached to the preceding text)
# Raw string: '\s' must reach the regex engine as written rather than rely on
# Python passing an unrecognised escape sequence through.
regex_pattern = r'''
                ^                   # start from beginning
                (.+?                # capture everything non-greedily ...
                    (?:(?!-\s)      # ... except for the '- ', if it's there
                        .)          # and continue to match any character
                *)                  # ... as many times as we can
                '''

# expand=False yields a Series for the single capture group, rather than a
# one-column DataFrame, so the column assignment is unambiguous.
the_df['STANDARDIZEDNAME'] = the_df['STANDARDIZEDNAME'].str.extract(
    regex_pattern, flags=re.VERBOSE, expand=False)

# 3) Then go after odd static patterns that are common
# ... people like to put "(orange)" and "closed" in the name of the provider.
# regex=True is passed explicitly: pandas >= 2.0 defaults str.replace to
# literal matching, under which the escaped-parenthesis pattern would be
# searched for verbatim (backslashes included) and never match.
the_df['STANDARDIZEDNAME'] = the_df['STANDARDIZEDNAME'].str.replace(
    r"\(orange\)", "", case=False, regex=True)
the_df['STANDARDIZEDNAME'] = the_df['STANDARDIZEDNAME'].str.replace(
    "closed", "", case=False, regex=True)

# # todo: check the data, maybe 200 cases to get the Jeffrey's interval here
# pd.set_option('display.max_rows', None)
# the_df.sample(n=100)[['STANDARDIZEDNAME', 'NAME']] # see some rows

In [5]:
# Write the first complete lookup table for the NAME field
# to the interim data folder.
the_df.to_csv(
    f"{rootpath}{interimpath}{content_is}",
    columns=columns_to_save,
    index=False,
    chunksize=10000,
)