## Setup

In [57]:
import re
import pandas as pd
import numpy as np


# Regex alternation of the field tokens as they appear inside a rule's text.
# The surrounding spaces are part of each token.
FIELD_NAMES = ' WUC | MDS | DISC | CN | DIS | DISCREPANCY | CORRECTIVE NARRATIVE '

# Maps every token matched by FIELD_NAMES to the dataframe column it refers to.
FIELD_LOOKUP = {
    " MDS ": "Equipment_Designator",
    " WUC ": "Work_Unit_Code",
    " DISC ": "Discrepancy_Narrative",
    " DIS ": "Discrepancy_Narrative",
    " CN ": "Corrective_Narrative",
    " DISCREPANCY ": "Discrepancy_Narrative",
    " CORRECTIVE NARRATIVE ": "Corrective_Narrative",
}

# Modules handed to the rule functions, which look them up by these keys.
libraries = dict(pandas=pd, re=re, numpy=np)

In [214]:
def keep_rule_number(existing_val, new_val, libraries):
    """Return the value to store in the rule column for one row.

    A row that has no rule number yet (``None`` or a pandas null) receives
    ``new_val`` as a string.  A row that already carries a rule number keeps
    it; previously this branch returned ``None`` and wiped the existing value.

    Args:
        existing_val: Current content of the rule column for the row.
        new_val: Number of the rule being applied.
        libraries: Mapping that provides the pandas module under "pandas".

    Returns:
        ``str(new_val)`` when the row had no rule number, else ``existing_val``.
    """
    pd = libraries["pandas"]

    if existing_val is None or pd.isnull(existing_val):
        return str(new_val)
    return existing_val

def apply_rule(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """Apply one WUC re-coding rule to ``df`` and return only the rows it changed.

    The rule is free text: a leading rule number/name, one or more
    ``<FIELD> <match type> <terms>`` clauses and a closing
    ``THEN CHANGE WUC TO <wuc>`` clause.  Every field narrows the rows kept by
    the previous fields.  Inside one field each match-type clause is
    intersected with the pending rows, unless the clause text ends in ``OR``,
    in which case it is unioned instead.  Every clause only selects rows whose
    ``WUC_Rule`` is still null.  A rule that mentions ``D/C`` is evaluated
    twice, once with ``DISC`` and once with ``CN`` substituted, and the two
    results are combined.

    A one-line summary is printed for every rule; ``debug`` adds the
    intermediate parsing and matching state.

    Args:
        rule: Rule text.
        df: Records to test.  It is not modified; a copy is worked on.
        df_wuc_narratives: Frame with ``Work_Unit_Code`` and ``WUC_Narrative``
            columns, used only for the printed summary.  A frame without
            those columns (e.g. an empty one) yields an empty narrative.
        libraries: Mapping with the ``re``, ``numpy`` and ``pandas`` modules
            under the keys "re", "numpy" and "pandas".
        field_names: Regex alternation of the field tokens used in rules.
        field_lookup: Maps each token matched by ``field_names`` to a column
            of ``df``.
        debug: Print intermediate state when true.

    Returns:
        The matching rows with ``Work_Unit_Code`` set to the new WUC and
        ``WUC_Rule`` filled in, or ``None`` when the rule is deleted or a
        field leaves no rows.  A ``D/C`` rule whose two variants both yield
        nothing returns an empty DataFrame.

    Raises:
        ValueError: If the rule does not start with a rule number or has no
            recognisable "then change wuc to" clause.
    """
    re = libraries["re"]
    np = libraries["numpy"]

    # A "D/C" field stands for both narratives: run the rule once per
    # narrative and merge.  Detection ignores case because the rule is only
    # upper-cased further down.
    if re.search(r'D/C', rule, flags=re.IGNORECASE) is not None:
        pd = libraries["pandas"]

        # discrepancy
        dis_rule = re.sub(r'D/C', r'DISC', rule, flags=re.IGNORECASE)
        if debug:
            print('D/C rule as DISC rule: {} '.format(dis_rule))
        df1 = apply_rule(dis_rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=debug)
        if df1 is None:
            df1 = pd.DataFrame()

        # corrective
        cn_rule = re.sub(r'D/C', r'CN', rule, flags=re.IGNORECASE)
        if debug:
            print('D/C rule as CN rule: {}'.format(cn_rule))
        df2 = apply_rule(cn_rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=debug)
        if df2 is None:
            df2 = pd.DataFrame()

        if len(df1) > 0 and len(df2) > 0:
            df_final = pd.concat([df1, df2], ignore_index=False)
            # The index takes part in de-duplication so that only a record
            # matched by both variants collapses, not distinct records that
            # happen to hold equal values.
            df_final['index1'] = df_final.index
            df_final = df_final.drop_duplicates()
            # The helper column must not leak to the caller (the result of
            # this drop used to be discarded).
            return df_final.drop('index1', axis=1)
        if len(df1) > 0:
            return df1
        return df2

    rule = rule.upper()

    # Rule number is the leading word; the rule name additionally includes
    # the identifier that follows it, when one can be recognised.
    number_match = re.match(r'\w+', rule)
    if number_match is None:
        raise ValueError('rule does not start with a rule number: {!r}'.format(rule))
    rule_number = number_match.group()
    # "\A" inside a character class is rejected by Python 3.7+; a plain "A-Z"
    # range is what Python 2 made of it.
    name_match = re.match(r"\w+.\s?[S|A|']F?[0-9\-A-Z]+", rule)
    rule_name = name_match.group() if name_match is not None else rule_number

    # first check if rule is deleted
    tokens = rule.split(' ')
    if len(tokens) > 2 and tokens[2] == "DELETED":
        deleted_match = re.match(r'\w+.\s?\'[0-9\-]+', rule)
        # Names without an apostrophe do not match the pattern above; fall
        # back to the general rule name instead of failing.
        deleted_name = deleted_match.group() if deleted_match is not None else rule_name
        print('rule {} is deleted'.format(str(deleted_name)))
        return None

    # what to change wuc to
    change_to_str = re.search(
        r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+',
        rule, flags=re.IGNORECASE)
    if change_to_str is None:
        raise ValueError('rule {} has no "then change wuc to" clause'.format(rule_name))
    new_wuc_match = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE)
    if new_wuc_match is None:
        raise ValueError('rule {} does not name the new wuc'.format(rule_name))
    new_wuc = new_wuc_match.group()

    df = df.copy()
    if debug:
        print('originally {} rows'.format(df.shape[0]))
        print('change wuc to: ' + repr(new_wuc))

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''

    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print('rule after removing new wuc info: ' + rule)

    # create null WUC_Rule column if it does not exist
    if 'WUC_Rule' not in df.columns:
        df['WUC_Rule'] = np.nan
    # df is not modified while matching, so this mask is computed once.
    unruled = df['WUC_Rule'].isnull()

    # The two helpers are kept local so the function depends on nothing
    # beyond its arguments and keep_rule_number.

    def field_mask(values, match_type, terms, is_or_group):
        """Boolean mask of the entries of ``values`` that satisfy one clause."""
        # periods are wildcards, so escape them for the regex-based matches
        pattern = '|'.join(terms).replace('.', r'\.')
        if 'CONTAINS' in match_type:
            return values.str.contains(pat=pattern, na=False, flags=re.IGNORECASE)
        if 'NOT CONTAIN' in match_type:  # exclude if any words match
            return ~values.str.contains(pat=pattern, na=False, flags=re.IGNORECASE)
        if '=' in match_type or 'EQUALS' in match_type:
            mask = values.str.match(pat=pattern, na=False, flags=re.IGNORECASE)
            if is_or_group:
                # Length of the unescaped first alternative: the escaped one
                # is longer than any value it could match.
                mask = mask & (values.str.len() == len(terms[0]))
            # NOTE(review): without the length check (single term) this is a
            # prefix match, and with it only the first alternative's length
            # is compared - confirm whether an exact match is intended.
            return mask
        # begins/ends with compare literally, so they get the unescaped terms.
        literal = tuple(terms) if is_or_group else terms[0]
        if 'BEGINS WITH' in match_type:
            return values.str.startswith(literal, na=False)
        # ends with
        if debug:
            if is_or_group:
                print('else clause - OR match_type:' + match_type)
            else:
                print('else clause - AND match_type: ' + match_type)
        return values.str.endswith(literal, na=False)

    def narrow(pending, defined, mask, joined_by_and):
        """Fold one clause's matches into the pending index set."""
        # new matches always stay inside what earlier fields already defined
        selected = defined.intersection(set(df[mask & unruled].index))
        if joined_by_and:
            return pending.intersection(selected)
        return pending.union(selected)

    # initially all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining clauses with ORs, keep a set of 'pending' indices
    # that is combined with what is already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined

    match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"

    # loop through the field clauses; consecutive fields are AND-ed
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1 + ii]  # first piece precedes the first field, skip

        if debug:
            print(cat_field)
            print('category element: ' + rule_element)

        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item]  # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            if debug:
                print("match type: " + match_type)

            # drop a trailing 'AND THE' / 'AND' that links to the next clause
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # a trailing OR switches this clause from intersection to union
            joined_by_and = re.search(' OR$', cat_remaining, flags=re.IGNORECASE) is None
            if not joined_by_and:
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()

            if debug:
                print('cat_remaining: ' + cat_remaining)

            # group all the ands into a list, without empty pieces
            and_terms = [element
                         for element in re.split(' AND ', cat_remaining, flags=re.IGNORECASE)
                         if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_remaining.count(' OR ') > 0:
                or_terms = [blurb.strip().strip(',').replace("\"", "").replace("(", "").replace(")", "")
                            for blurb in re.split(' OR ', and_terms.pop(), flags=re.IGNORECASE)]
                if debug:
                    print('filter for ' + '|'.join(or_terms).replace('.', r'\.'))
                matching_indices_pending = narrow(
                    matching_indices_pending, matching_indices_defined,
                    field_mask(df[cat_field], match_type, or_terms, True), joined_by_and)

            if debug:
                print('after matching any ORs: {} matches'.format(len(matching_indices_pending)))

            # every remaining AND term must match on its own
            for blurb in and_terms:
                cat_AND = blurb.strip().strip(',').replace("\"", "")
                if debug:
                    print('filter for ' + cat_AND.replace('.', r'\.'))
                matching_indices_pending = narrow(
                    matching_indices_pending, matching_indices_defined,
                    field_mask(df[cat_field], match_type, [cat_AND], False), joined_by_and)

            if debug:
                print('after filtering match type: {} matches'.format(len(matching_indices_pending)))

        # finished this field: what is pending becomes the defined set
        matching_indices_defined = matching_indices_pending

        if debug:
            print('after filtering category: {} matches'.format(len(matching_indices_defined)))
        if len(matching_indices_defined) == 0:
            print('rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr))
            return None

    # filter; a boolean mask keeps the rows in their original order instead
    # of the iteration order of the set
    df = df[df.index.isin(list(matching_indices_defined))].copy()
    if debug:
        print('matching_indices: ' + str(matching_indices_defined))
        print('filtered df: ')
        print(df)
    # update wuc and rule; whole-column assignment lets the columns change
    # dtype (WUC_Rule starts out as float NaN and receives strings)
    df['Work_Unit_Code'] = new_wuc
    df['WUC_Rule'] = df['WUC_Rule'].apply(func=keep_rule_number, args=(rule_number, libraries))

    print('rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr))

    return df

## CREATE TEST FILES

In [220]:
# Read the table that pairs each test file name with the rule to apply to it,
# then discard its final row.
# NOTE(review): the reason the last row is dropped is not visible here
# (presumably a trailer or incomplete entry) - confirm against the CSV.
rules_df = pd.read_csv('C://work//client-code//mdst//testing//file_rules.csv')
rules_df = rules_df.drop(rules_df.index[-1:])

In [224]:
# function to create output tsvs
def create_tsvs(df, input_dir=None, output_dir=None):
    """Write one expected-output TSV per row of ``df``.

    For every row, ``<input_dir><file_name>.csv`` is read, the row's rule is
    applied with ``apply_rule``, the changed rows are merged back into the
    input and the result is written to ``<output_dir><file_name>.tsv``
    (tab-separated, without the index).

    Args:
        df: Frame with a ``rule`` and a ``file_name`` column.
        input_dir: Directory prefix of the input CSVs, including the trailing
            separator.  Defaults to the original hard-coded location.
        output_dir: Directory prefix for the TSVs, including the trailing
            separator.  Defaults to the original hard-coded location.
    """
    if input_dir is None:
        input_dir = "C://work//clockwork-etl//docker-compose//data/clockwork_etl//clockworkETL_workspace//mdst_c130//tests//resources//input//"
    if output_dir is None:
        output_dir = "C://work//clockwork-etl//docker-compose//data/clockwork_etl//clockworkETL_workspace//mdst_c130//tests//resources//output//"

    for _, row in df.iterrows():
        # Three characters are cut from each end of the stored rule text
        # (presumably a quoting wrapper - TODO confirm against the CSV).
        rule = row['rule'][3:-3]
        file_name = row['file_name']

        df_input = pd.read_csv(input_dir + file_name + ".csv", dtype={'Work_Unit_Code': 'object'})
        # A separate name keeps the frame being iterated from being rebound.
        df_changed = apply_rule(rule, df_input, pd.DataFrame(), libraries, FIELD_NAMES, FIELD_LOOKUP, debug=False)
        # apply_rule returns None (deleted rule / no match) or an empty frame
        # when there is nothing to merge; the input is then written unchanged.
        if df_changed is not None and len(df_changed) > 0:
            df_input.update(df_changed)

        df_input.to_csv(output_dir + file_name + ".tsv", sep="\t", index=False)

In [225]:
create_tsvs(rules_df)

rule 738. AF41-3H-152 changed 2 records to wuc 41400: 
rule 613. AF41-3H-27 changed 4 records to wuc 41Y00: 
rule 613. AF41-3H-27 changed 3 records to wuc 41Y00: 
rule 1. AF11-3H-1 changed 3 records to wuc 11300: 
rule 2396. AF13-3HP-1 changed 4 records to wuc 13400: 
rule 2396. AF13-3HP-1 changed 2 records to wuc 13400: 
rule 2580. AF61-3HP-1 changed 6 records to wuc 61500: 
rule 2580. AF61-3HP-1 changed 7 records to wuc 61500: 
rule 2484. AF46-3HP-1 changed 2 records to wuc 46700: 
rule 3039. AF11-3DHP-150 changed 7 records to wuc 11299: 
rule 3039. AF11-3DHP-150 changed 4 records to wuc 11299: 
rule 3343. AF87-3DHP-1 changed 4 records to wuc 87T00: 
rule 3289. AF49-3DHP-17 changed 4 records to wuc 49100: 
rule 3289. AF49-3DHP-17 changed 3 records to wuc 49100: 
rule 3345. AF11-3L-1 changed 7 records to wuc 11400: 
rule 4032. AF78-3L-5 changed 2 records to wuc 78G00: 
rule 4032. AF78-3L-5 changed 2 records to wuc 78G00: 
rule 3966. AF71-3L-18 changed 4 records to wuc 71G00: 
rule 396



rule 31130. SF11-34F5-1 changed 2 records to wuc 11420: 
rule 32353. SF67-34F5-1 changed 2 records to wuc 67AB0: 
rule 32353. SF67-34F5-1 changed 2 records to wuc 67AB0: 
rule 31694. SF22-34F5-47 changed 3 records to wuc 22510: 
rule 31694. SF22-34F5-47 changed 4 records to wuc 22510: 
rule 32452. SF11-35F5-1 changed 4 records to wuc 114FH: 
rule 32452. SF11-35F5-1 changed 5 records to wuc 114FH: 
rule 33680. SF67-35F5-1 changed 3 records to wuc 67ABB: 
rule 33680. SF67-35F5-1 changed 2 records to wuc 67ABB: 
rule 32536. SF11-35F5-85 changed 3 records to wuc 11425: 
rule 32536. SF11-35F5-85 changed 3 records to wuc 11425: 
rule 4512. AF11-4H-1 changed 2 records to wuc 11120: 
rule 9992. AF86-4H-65 changed 2 records to wuc 86LL0: 
rule 6420. AF32-4H-41 changed 3 records to wuc 321D0: 
rule 6420. AF32-4H-41 changed 4 records to wuc 321D0: 
rule 10168. AF11-4HP-1 changed 3 records to wuc 11110: 
rule 10580. AF77-4HP-7 changed 2 records to wuc 77WA0: 
rule 10197. AF11-4HP-30 changed 3 reco