## Setup

#### Libraries and DB

In [1]:
# import libraries and setup a dictionary in studio fashion
import numpy as np
import pandas as pd
import re
import collections
import MySQLdb
import time

libraries = {'numpy': np, 'pandas': pd, 're': re, "collections": collections, "time": time}

In [2]:
# set database credentials and create a database
dsn_database = "mdst_c130"
dsn_hostname = "localhost"
dsn_port = 3306
dsn_uid = "root"
dsn_pwd = "root"

conn = MySQLdb.connect(host = dsn_hostname, port = dsn_port, user = dsn_uid, passwd = dsn_pwd, db = dsn_database)

#### Parameters

In [3]:
# establish the level parameter
level = 3

if level is None or str(level).lower() not in ("3","4","5","all"):
    raise Exception, "requires WUC edit `level` parameter: 3, 4, 5, or all"
level = str(level).lower()

In [4]:
# field names and lookup
field_names = ' WUC | MDS | DISC | CN | DIS | DISCREPANCY | CORRECTIVE NARRATIVE | CORRECTIVE ACTION '
field_lookup = {" MDS ":"Equipment_Designator"," WUC ":"Work_Unit_Code"," DISC ": "Discrepancy_Narrative", 
    " DIS ":"Discrepancy_Narrative", " CN ":"Corrective_Narrative", " DISCREPANCY ": "Discrepancy_Narrative",
    " CORRECTIVE NARRATIVE ":"Corrective_Narrative", " CORRECTIVE ACTION ": "Corrective_Narrative"}

#### Import data

In [5]:
# import work unit codes data
table= 'mdst_c130.work_unit_code_names'
cut = ''

query = "SELECT Work_Unit_Code, WUC_Narrative FROM {} {}".format(table, cut)
df_wuc_narratives = pd.read_sql(sql=query, con=conn)

wucs = df_wuc_narratives.shape[0]
print("There are {:0,.0f} WUCs in the data.".format(wucs, df_wuc_narratives.shape[1]))

There are 25,323 WUCs in the data.


In [6]:
# import rules data
table = 'rules_list'
cut = ''

if level == 'all':
    query = "SELECT rule FROM {} WHERE type = 'Edit' {}".format(db, cut)
else:
    query = "SELECT rule FROM {} WHERE type = 'Edit' AND level = 'Level {}' {}".format(table, level, cut)

# make a list of rules
rules = list(pd.read_sql(sql=query, con=conn).rule)

print("There are {} rules in the data.".format(len(rules)))

There are 201 rules in the data.


In [7]:
# get main data
table1 = 'mdst_c130.import_remis_data'
table2 = 'mdst_c130.remove_general_support_wucs'

# pull wucs and data from the same component
primary_key_fields_list = pd.read_sql(sql="SHOW KEYS FROM {}".format(table1), con=conn).Column_name
primary_key_fields = ', '.join(primary_key_fields_list)
query = """SELECT {}, Work_Unit_Code, Equipment_Designator, Discrepancy_Narrative, Corrective_Narrative FROM {}""".format(primary_key_fields, table1)
df_total = pd.read_sql(sql=query, con=conn)

# create a filter dataset
df_filter = pd.read_sql(sql="SELECT {} FROM {}".format(primary_key_fields, table2), con=conn)

# drop specific column if overlapping
if 'Work_Unit_Code' in list(df_filter.columns):
    df_total.drop('Work_Unit_Code', axis=1, inplace=True)

# inner join to limit solution set
df1 = df_total.merge(df_filter, on=list(primary_key_fields_list))

if 'WUC_Rule' not in list(df.columns):
    # initialize rules as nan
    df['WUC_Rule'] = np.nan
    # data size
    
df2 = df1.copy()

rows_total = df_total.shape[0]  
rows = df1.shape[0]
print("There are {:0,.0f} rows in the original data and {:0,.0f} rows in the final data.".format(rows_total, rows))

NameError: name 'df' is not defined

## Speed Testing

#### Function Setup

In [None]:
def append_rule_number(existing_val, new_val, libraries):
    # add rule number to rule column
    pd = libraries["pandas"]
    
    if existing_val is None or pd.isnull(existing_val):
        return str(new_val)
    else:
        return ', '.join([str(existing_val), str(new_val)])

In [None]:
def apply_rule(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """function takes a rule text and the df, prints rule number how many edits were made 
    and returns a filtered dataframe with an updated WUC"""

    re = libraries["re"]

    df = df.copy()

    rule = rule.upper()
    
    # first check if rule is deleted
    if rule.split(' ')[2] == "DELETED":
        print 'rule {} is deleted'.format(str(re.match(r'\w+.\s?\'[0-9\-]+',rule).group()))
        return None

    try:
        rule_name = re.match(r'\w+.\s?\'[0-9\-]+',rule).group()
        rule_number = re.match(r'\w+',rule).group()
    except:
        rule_name = re.match(r'\w+',rule).group()
        rule_number = rule_name

    # what to change wuc to
    change_to_str = re.search(r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+', 
        rule, flags=re.IGNORECASE)
    new_wuc = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE).group()
    if debug:
        print 'originally {} rows'.format(df.shape[0])
        print 'change wuc to: ' + repr(new_wuc)

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''
    
    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print 'rule after removing new wuc info: ' + rule

    # initialy all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining contains/not contains clauses with ORs, need to maintain list of 'pending' indices 
    #   that must be anded with what's already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined
    
    # loop through specific category matches (i.e. field names)
    # assume always an 'AND' between category matches (e.g. CN = 'X' AND DISC = 'Y')
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1+ii]  # first piece is the wuc begins with, skip
        
        if debug:
            print cat_field
        # loop through contains or not contains clauses similar to looping through categories
        #   cannot assume an 'AND' between all the contains/not contains matches
        #   identify indices to adjust, using set unions and set intersections for ORs and ANDs
        match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"
        if debug:
            print 'category element: ' + rule_element
        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item] # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            joined_by_and = True
            # contains or not contains (true or false)
            #cat_direction = re.search("\s?contains|\s?does not contain\s?\"?", rule_element, flags=re.IGNORECASE).group()
            #match_type = re.search('contains', cat_direction, re.IGNORECASE) is not None
            if debug:
                print "match type: " + match_type # ('contains' if match_type else 'does not contain')

            # find what we're looking for for this category
            #cat_remaining = rule_element[len(cat_direction):]  # rest of phrase after contains/not contains. 
            #   also remove any trailing 'and the', e.g. for wuc ends with in "WUC begins with 14 AND ENDS WITH 00 OR 99 AND the CN contains "BOOST""
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # remove trailing OR from contains/not contains cluse and take note
            if re.search(' OR$', cat_remaining, flags=re.IGNORECASE):
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()
                joined_by_and = False

            if debug:
                print 'cat_remaining: ' + cat_remaining
            #cat_ANDs = cat_remaining.count(' AND ', re.IGNORECASE) # this is wrong if multiple cateogries linked by ANDs
            cat_ORs = cat_remaining.count(' OR ')

            # group all the ands into a list
            cat_remaining_andsplit = re.split(' AND ', cat_remaining, flags=re.IGNORECASE)

            # multiple category elements are linked by 'ands'.  remove these to avoid tricking the 'and' search later
            # remove any empty strings, '', at the end of these categories
            cat_remaining_andsplit = [element for element in cat_remaining_andsplit if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_ORs > 0:
                cat_remaining_orsplit = cat_remaining_andsplit.pop()
                cat_remaining_orsplit = re.split(' OR ', cat_remaining_orsplit, flags=re.IGNORECASE)
                # strip and remove quotes and parentheses from all the OR matches.
                cat_pattern_ORs = [blurb.strip().strip(',').replace("\"","").replace("(","").replace(")","") for blurb in cat_remaining_orsplit]
                cat_pattern_ORs = '|'.join(cat_pattern_ORs)
                # periods are wildcards, so escape
                cat_pattern_ORs = cat_pattern_ORs.replace('.',r'\.')
                if debug:
                    print 'filter for ' + cat_pattern_ORs

                if 'CONTAINS' in match_type:
                    # update indices: ensure new matches always match to the defined (e.g. WUC-starts-with) indice matches
                    #  if joined by and: pending = intersection(pending, intersection(defined, new-matches))
                    #  if joined by or: pending = union(pending, intersection(defined, new-matches))
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:  # exclude if any words match
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))                            
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))

            if debug:
                    print 'after matching any ORs: {} matches'.format(len(matching_indices_pending))

            # strip and remove quotes from all the AND matches. escape periods (wildcards). keep a list
            cat_pattern_ANDs = [blurb.strip().strip(',').replace("\"","").replace('.',r'\.') for blurb in cat_remaining_andsplit]

            # filter and phrases for this category
            for cat_AND in cat_pattern_ANDs:
                if debug:
                    print 'filter for ' + cat_AND

                if 'CONTAINS' in match_type:    
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))

            if debug:
                print 'after filtering match type: {} matches'.format(len(matching_indices_pending))

        # finished looping through contains/not contains clauses for this field/category. update defined indices
        matching_indices_defined = matching_indices_pending

        if debug:
            print 'after filtering category: {} matches'.format(len(matching_indices_defined))
        if len(matching_indices_defined) == 0:
            # print 'rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr)
            return None

    # filter
    df = df.filter(items=matching_indices_defined, axis="index")
    if debug:
        print 'matching_indices: ' + str(matching_indices_defined)
        print 'filtered df: '
        print df
    # update wuc and rule
    df.loc[:,'Work_Unit_Code'] = new_wuc
    if 'WUC_Rule' in df.columns:
        df.loc[:, 'WUC_Rule'] = df.WUC_Rule.apply(func=append_rule_number, args=(rule_number,libraries))
    else:
        df.loc[:,'WUC_Rule'] = str(rule_number)
    
    # print 'rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr)
            
    return df

In [None]:
def apply_rule_fast(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """function takes a rule text and the df, prints rule number how many edits were made 
    and returns a filtered dataframe with an updated WUC"""

    re = libraries["re"]

    df = df.copy()

    rule = rule.upper()
    
    # first check if rule is deleted
    if rule.split(' ')[2] == "DELETED":
        print 'rule {} is deleted'.format(str(re.match(r'\w+.\s?\'[0-9\-]+',rule).group()))
        return None

    try:
        rule_name = re.match(r'\w+.\s?\'[0-9\-]+',rule).group()
        rule_number = re.match(r'\w+',rule).group()
    except:
        rule_name = re.match(r'\w+',rule).group()
        rule_number = rule_name

    # what to change wuc to
    change_to_str = re.search(r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+', 
        rule, flags=re.IGNORECASE)
    new_wuc = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE).group()
    if debug:
        print 'originally {} rows'.format(df.shape[0])
        print 'change wuc to: ' + repr(new_wuc)

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''
    
    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print 'rule after removing new wuc info: ' + rule

    # initialy all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining contains/not contains clauses with ORs, need to maintain list of 'pending' indices 
    #   that must be anded with what's already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined
    
    # loop through specific category matches (i.e. field names)
    # assume always an 'AND' between category matches (e.g. CN = 'X' AND DISC = 'Y')
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1+ii]  # first piece is the wuc begins with, skip
        
        if debug:
            print cat_field
        # loop through contains or not contains clauses similar to looping through categories
        #   cannot assume an 'AND' between all the contains/not contains matches
        #   identify indices to adjust, using set unions and set intersections for ORs and ANDs
        match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"
        if debug:
            print 'category element: ' + rule_element
        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item] # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            joined_by_and = True
            # contains or not contains (true or false)
            #cat_direction = re.search("\s?contains|\s?does not contain\s?\"?", rule_element, flags=re.IGNORECASE).group()
            #match_type = re.search('contains', cat_direction, re.IGNORECASE) is not None
            if debug:
                print "match type: " + match_type # ('contains' if match_type else 'does not contain')

            # find what we're looking for for this category
            #cat_remaining = rule_element[len(cat_direction):]  # rest of phrase after contains/not contains. 
            #   also remove any trailing 'and the', e.g. for wuc ends with in "WUC begins with 14 AND ENDS WITH 00 OR 99 AND the CN contains "BOOST""
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # remove trailing OR from contains/not contains cluse and take note
            if re.search(' OR$', cat_remaining, flags=re.IGNORECASE):
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()
                joined_by_and = False

            if debug:
                print 'cat_remaining: ' + cat_remaining
            #cat_ANDs = cat_remaining.count(' AND ', re.IGNORECASE) # this is wrong if multiple cateogries linked by ANDs
            cat_ORs = cat_remaining.count(' OR ')

            # group all the ands into a list
            cat_remaining_andsplit = re.split(' AND ', cat_remaining, flags=re.IGNORECASE)

            # multiple category elements are linked by 'ands'.  remove these to avoid tricking the 'and' search later
            # remove any empty strings, '', at the end of these categories
            cat_remaining_andsplit = [element for element in cat_remaining_andsplit if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_ORs > 0:
                cat_remaining_orsplit = cat_remaining_andsplit.pop()
                cat_remaining_orsplit = re.split(' OR ', cat_remaining_orsplit, flags=re.IGNORECASE)
                # strip and remove quotes and parentheses from all the OR matches.
                cat_pattern_ORs = [blurb.strip().strip(',').replace("\"","").replace("(","").replace(")","") for blurb in cat_remaining_orsplit]
                cat_pattern_ORs = '|'.join(cat_pattern_ORs)
                # periods are wildcards, so escape
                cat_pattern_ORs = cat_pattern_ORs.replace('.',r'\.')
                if debug:
                    print 'filter for ' + cat_pattern_ORs

                if 'CONTAINS' in match_type:
                    # update indices: ensure new matches always match to the defined (e.g. WUC-starts-with) indice matches
                    #  if joined by and: pending = intersection(pending, intersection(defined, new-matches))
                    #  if joined by or: pending = union(pending, intersection(defined, new-matches))
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:  # exclude if any words match
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))                            
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))

            if debug:
                    print 'after matching any ORs: {} matches'.format(len(matching_indices_pending))

            # strip and remove quotes from all the AND matches. escape periods (wildcards). keep a list
            cat_pattern_ANDs = [blurb.strip().strip(',').replace("\"","").replace('.',r'\.') for blurb in cat_remaining_andsplit]

            # filter and phrases for this category
            for cat_AND in cat_pattern_ANDs:
                if debug:
                    print 'filter for ' + cat_AND

                if 'CONTAINS' in match_type:    
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))

            if debug:
                print 'after filtering match type: {} matches'.format(len(matching_indices_pending))

        # finished looping through contains/not contains clauses for this field/category. update defined indices
        matching_indices_defined = matching_indices_pending

        if debug:
            print 'after filtering category: {} matches'.format(len(matching_indices_defined))
        if len(matching_indices_defined) == 0:
            # print 'rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr)
            return None

    # filter
    df = df.filter(items=matching_indices_defined, axis="index")
    if debug:
        print 'matching_indices: ' + str(matching_indices_defined)
        print 'filtered df: '
        print df
    # update wuc and rule
    df.loc[:,'Work_Unit_Code'] = new_wuc
    if 'WUC_Rule' in df.columns:
        df.loc[:, 'WUC_Rule'] = df.WUC_Rule.apply(func=append_rule_number, args=(rule_number,libraries))
    else:
        df.loc[:,'WUC_Rule'] = str(rule_number)
    
    # print 'rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr)
            
    return df

In [None]:
def evaluate_speed(rules, df, df_wuc_narratives, libraries, field_names, field_lookup, version, deb='False'):
    total_rows = 0
    start = time.time()
    
    rule_size = len(rules)
    df_size = df.shape[0]
    
    if version == 'original':    
        for rule in rules:
            df_subset = apply_rule(rule, df, df_wuc_narratives, libraries, field_names, field_lookup)
            if df_subset is not None:
                total_rows += df_subset.shape[0]
                # update WUCs within original dataframe
                df.update(df_subset)
                # update rules metrics
                wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
                wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
                each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

    else:    
        for rule in rules:
            df_subset = apply_rule_fast(rule, df, df_wuc_narratives, libraries, field_names, field_lookup)
            if df_subset is not None:
                total_rows += df_subset.shape[0]
                # update WUCs within original dataframe
                df.update(df_subset)
                # update rules metrics
                wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
                wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
                each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

    end = time.time()
    total_time = round(end-start,2)
    
    
    print("--------------------------------------------------------------------")
    print("For {} rules and {} maintenance records, it took {} seconds to make {} changes made across all edits".format(rule_size, df_size, total_time, total_rows))
    print("--------------------------------------------------------------------")
    
    return total_time 

In [None]:
tt1 = evaluate_speed(rules, df1, df_wuc_narratives, libraries, field_names, field_lookup, deb='False', version = 'original')
# tt2 = evaluate_speed(rules, df2, df_wuc_narratives, libraries, field_names, field_lookup, deb='False', version = 'fast')
# difference = tt1 - tt2
# percent_difference = round(difference / tt1 * 100, 2)
# print("The improvements resulted in a {} improvement, or {} % upgrade".format(difference, percent_difference))

In [121]:
percent_difference = tt1 - tt2
percent_difference = round(difference / tt1 * 100, 2)
print(percent_difference)

-5.48


#### Setup metrics and iterate

In [None]:
# wuc_group
wuc_group_re = re.compile("\w+.\s?\'(\w+)")
wuc_group_metrics = collections.defaultdict(int)
wuc_group_indices = collections.defaultdict(set)

# each_rule
each_rule_re = re.compile("\w+.\s?\'([0-9\-]+)")
each_rule_indices = collections.defaultdict(set)

In [None]:
total_rows = 0
start = time.time()
for rule in rules:
    df_subset = apply_rule(rule, df, df_wuc_narratives, libraries, field_names, field_lookup)
    if df_subset is not None:
        total_rows += df_subset.shape[0]
        # update WUCs within original dataframe
        df.update(df_subset)
        # update rules metrics
        wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
        wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
        each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

end = time.time()
total_time = end-start
print "It took {} seconds to make {} changes made across all edits (sometimes one row is changed more than once)".format(total_time, total_rows)

#### Improvements

## Results