## Setup

In [449]:
# import libraries and setup a dictionary in studio fashion
import numpy as np
import pandas as pd
import re
import collections
import MySQLdb
import time
from bokeh import plotting, layouts, models

libraries = {'numpy': np, 'pandas': pd, 're': re, "collections": collections, "time": time}

In [450]:
# set database credentials and create a database
dsn_database = "mdst_c130"
dsn_hostname = "localhost"
dsn_port = 3306
dsn_uid = "root"
dsn_pwd = "root"

conn = MySQLdb.connect(host = dsn_hostname, port = dsn_port, user = dsn_uid, passwd = dsn_pwd, db = dsn_database)

In [451]:
# import work unit codes data
table= 'mdst_c130.work_unit_code_names'
cut = ''

query = "SELECT Work_Unit_Code, WUC_Narrative FROM {} {}".format(table, cut)
df_wuc_narratives = pd.read_sql(sql=query, con=conn)

wucs = df_wuc_narratives.shape[0]
print("There are {:0,.0f} WUCs in the data.".format(wucs, df_wuc_narratives.shape[1]))

There are 25,323 WUCs in the data.


In [452]:
# establish the level parameter
pred = 'sort_rules_list'
level = 'all'
params = {'level':level}

if level is None or str(level).lower() not in ("3","4","5","all"):
    raise Exception, "requires WUC edit `level` parameter: 3, 4, 5, or all"
level = str(level).lower()

# Rules as list
if level == 'all':
    query = """SELECT level, rule FROM {}""".format(pred)
else:
    query = "SELECT level, rule FROM {} WHERE left(level,7) = 'Level {}'".format(pred, params.get('level'))
rules = list(pd.read_sql(sql=query, con=conn).rule)
if len(rules) == 0:
    raise Exception, 'Script requires uploaded rules in the rules list component.'

# make a list of rules
rules_df = pd.read_sql(sql=query, con=conn)
rules = list(rules_df.rule)

print("There are {} rules in the data.".format(len(rules)))

There are 37594 rules in the data.


In [453]:
# get main data
table1 = 'mdst_c130.import_remis_data'
table2 = 'mdst_c130.remove_general_support_wucs'
limit = 'limit 15000'

# pull wucs and data from the same component
primary_key_fields_list = pd.read_sql(sql="SHOW KEYS FROM {}".format(table1), con=conn).Column_name
primary_key_fields = ', '.join(primary_key_fields_list)
query = """SELECT {}, Work_Unit_Code, Equipment_Designator, Discrepancy_Narrative, Corrective_Narrative FROM {} {}""".format(primary_key_fields, table1, limit)
df_total = pd.read_sql(sql=query, con=conn)

# create a filter dataset
df_filter = pd.read_sql(sql="SELECT {} FROM {}".format(primary_key_fields, table2), con=conn)

# drop specific column if overlapping
if 'Work_Unit_Code' in list(df_filter.columns):
    df_total.drop('Work_Unit_Code', axis=1, inplace=True)

# inner join to limit solution set
data1 = df_total.merge(df_filter, on=list(primary_key_fields_list))

if 'WUC_Rule' not in list(data1.columns):
    # initialize rules as nan
    data1['WUC_Rule'] = np.nan
    # data size
    
# data2 = data1.copy()

rows_total = df_total.shape[0]  
rows = data1.shape[0]
print("There are {:0,.0f} rows in the original data and {:0,.0f} rows in the final data.".format(rows_total, rows))

There are 15,000 rows in the original data and 13,455 rows in the final data.


In [454]:
# close the mysqldb connection
conn.close()

## Rule Development

In [455]:
def keep_rule_number(existing_val, new_val, libraries):
    # add rule number to rule column
    pd = libraries["pandas"]
    
    if existing_val is None or pd.isnull(existing_val):
        return str(new_val)
    else:
        return

In [458]:
def apply_rule(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """function takes a rule text and the df, prints rule number how many edits were made 
    and returns a filtered dataframe with an updated WUC"""

    re = libraries["re"]
    np = libraries["numpy"]
    
    # if rule contains "D/C" field, split into two rules - one for DISC, one for CN
    if re.search(r'D/C',rule) is not None:
        dis_rule = re.sub(r'D/C',r'DISC',rule)
        if debug:
            print 'D/C rule as DISC rule: {}'.format(dis_rule)
        df1 = apply_rule(dis_rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=debug)
        cn_rule = re.sub(r'D/C',r'CN',rule)
        if debug:
            print 'D/C rule as CN rule: {}'.format(cn_rule)
        df2 = apply_rule(cn_rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=debug)
        if df1 is not None and df2 is not None:
            return df1.append(df2,ignore_index=False).drop_duplicates(inplace=True)
        elif df1 is not None:
            return df1
        else:
            return df2

    df = df.copy()

    rule = rule.upper()

    # first check if rule is deleted
    if rule.split(' ')[2] == "DELETED":
        print 'rule {} is deleted'.format(str(re.match(r'\w+.\s?\'[0-9\-]+',rule).group()))
        return None

    try:
        rule_name = re.search(r'\'[0-9\-]+|AF[0-9\-\A-Z]+',rule).group()
        print(rule_name)
        rule_number = re.match(r'\w+',rule).group()

    except:
        rule_name = re.match(r'\w+',rule).group()
        rule_number = rule_name

    # what to change wuc to
    change_to_str = re.search(r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+', 
        rule, flags=re.IGNORECASE)
    new_wuc = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE).group()
    if debug:
        print 'originally {} rows'.format(df.shape[0])
        print 'change wuc to: ' + repr(new_wuc)

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''
    
    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print 'rule after removing new wuc info: ' + rule

    # initially all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining contains/not contains clauses with ORs, need to maintain list of 'pending' indices 
    #   that must be anded with what's already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined

    # create null WUC_RULE column if it does not exist
    if 'WUC_Rule' not in df.columns:
        df.loc[:,'WUC_Rule'] = np.nan
    
    # loop through specific category matches (i.e. field names)
    # assume always an 'AND' between category matches (e.g. CN = 'X' AND DISC = 'Y')
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1+ii]  # first piece is the wuc begins with, skip
        
        if debug:
            print cat_field
        # loop through contains or not contains clauses similar to looping through categories
        #   cannot assume an 'AND' between all the contains/not contains matches
        #   identify indices to adjust, using set unions and set intersections for ORs and ANDs
        match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"
        if debug:
            print 'category element: ' + rule_element
        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item] # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            joined_by_and = True
            # contains or not contains (true or false)
            #cat_direction = re.search("\s?contains|\s?does not contain\s?\"?", rule_element, flags=re.IGNORECASE).group()
            #match_type = re.search('contains', cat_direction, re.IGNORECASE) is not None
            if debug:
                print "match type: " + match_type # ('contains' if match_type else 'does not contain')

            # find what we're looking for for this category
            #cat_remaining = rule_element[len(cat_direction):]  # rest of phrase after contains/not contains. 
            #   also remove any trailing 'and the', e.g. for wuc ends with in "WUC begins with 14 AND ENDS WITH 00 OR 99 AND the CN contains "BOOST""
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # remove trailing OR from contains/not contains cluse and take note
            if re.search(' OR$', cat_remaining, flags=re.IGNORECASE):
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()
                joined_by_and = False

            if debug:
                print 'cat_remaining: ' + cat_remaining
            #cat_ANDs = cat_remaining.count(' AND ', re.IGNORECASE) # this is wrong if multiple cateogries linked by ANDs
            cat_ORs = cat_remaining.count(' OR ')

            # group all the ands into a list
            cat_remaining_andsplit = re.split(' AND ', cat_remaining, flags=re.IGNORECASE)

            # multiple category elements are linked by 'ands'.  remove these to avoid tricking the 'and' search later
            # remove any empty strings, '', at the end of these categories
            cat_remaining_andsplit = [element for element in cat_remaining_andsplit if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_ORs > 0:
                cat_remaining_orsplit = cat_remaining_andsplit.pop()
                cat_remaining_orsplit = re.split(' OR ', cat_remaining_orsplit, flags=re.IGNORECASE)
                # strip and remove quotes and parentheses from all the OR matches.
                cat_pattern_ORs = [blurb.strip().strip(',').replace("\"","").replace("(","").replace(")","") for blurb in cat_remaining_orsplit]
                cat_pattern_ORs = '|'.join(cat_pattern_ORs)
                # periods are wildcards, so escape
                cat_pattern_ORs = cat_pattern_ORs.replace('.',r'\.')
                if debug:
                    print 'filter for ' + cat_pattern_ORs

                if 'CONTAINS' in match_type:
                    # update indices: ensure new matches always match to the defined (e.g. WUC-starts-with) indice matches
                    #  if joined by and: pending = intersection(pending, intersection(defined, new-matches))
                    #  if joined by or: pending = union(pending, intersection(defined, new-matches))
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif 'NOT CONTAIN' in match_type:  # exclude if any words match
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0])) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0])) & (df.WUC_Rule.isnull())].index)))                            
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))
                else:  # ends with
                    if debug:
                        print 'else clause - OR match_type:' + match_type
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))

            if debug:
                    print 'after matching any ORs: {} matches'.format(len(matching_indices_pending))

            # strip and remove quotes from all the AND matches. escape periods (wildcards). keep a list
            cat_pattern_ANDs = [blurb.strip().strip(',').replace("\"","").replace('.',r'\.') for blurb in cat_remaining_andsplit]

            # filter and phrases for this category
            for cat_AND in cat_pattern_ANDs:
                if debug:
                    print 'filter for ' + cat_AND

                if 'CONTAINS' in match_type:    
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif 'NOT CONTAIN' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))
                else:  # ends with
                    if debug:
                        print 'else clause - AND match_type: ' + match_type
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))

            if debug:
                print 'after filtering match type: {} matches'.format(len(matching_indices_pending))

        # finished looping through contains/not contains clauses for this field/category. update defined indices
        matching_indices_defined = matching_indices_pending

        if debug:
            print 'after filtering category: {} matches'.format(len(matching_indices_defined))
        if len(matching_indices_defined) == 0:
            # print 'rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr)
            return None

    # filter
    df = df.filter(items=matching_indices_defined, axis="index")
    if debug:
        print 'matching_indices: ' + str(matching_indices_defined)
        print 'filtered df: '
        print df
    # update wuc and rule
    df.loc[:,'Work_Unit_Code'] = new_wuc
    df.loc[:, 'WUC_Rule'] = df.WUC_Rule.apply(func=keep_rule_number, args=(rule_number,libraries))

    # print 'rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr)
            
    return df

In [459]:
# field names and lookup
field_names = ' WUC | MDS | DISC | CN | DIS | DISCREPANCY | CORRECTIVE NARRATIVE | CORRECTIVE ACTION '
field_lookup = {" MDS ":"Equipment_Designator"," WUC ":"Work_Unit_Code"," DISC ": "Discrepancy_Narrative", 
    " DIS ":"Discrepancy_Narrative", " CN ":"Corrective_Narrative", " DISCREPANCY ": "Discrepancy_Narrative",
    " CORRECTIVE NARRATIVE ":"Corrective_Narrative", " CORRECTIVE ACTION ": "Corrective_Narrative"}

df_wuc_counts = df.groupby("Work_Unit_Code").size()
df_wuc_counts = df_wuc_counts.reset_index(name="counts").sort_values(by="counts", ascending=False)
df_wuc_counts = df_wuc_counts[0:9]

# top ten WUCs by count
p = plotting.figure(title = "Top Work Unit Codes - Before Cleaning", x_range=list(df_wuc_counts.Work_Unit_Code))
p.vbar(x=df_wuc_counts.Work_Unit_Code, top=df_wuc_counts.counts,
    width=0.5, bottom=0, color="dodgerblue")
p.xaxis.axis_label = 'Work Unit Code'
p.yaxis.axis_label = 'Maintenance Record Count'

# wuc_group
wuc_group_re = re.compile("\w+.\s?\'(\w+)|\w+.\s?AF(\w+)")
wuc_group_metrics = collections.defaultdict(int)
wuc_group_indices = collections.defaultdict(set)

# each_rule
each_rule_re = re.compile("\w+.\s?\'([0-9\-]+)|\w+.\s?AF([0-9\-\A-Z]+)")
each_rule_indices = collections.defaultdict(set)

total_rows = 0
start = time.time()
for rule in rules:
    df_subset = apply_rule(rule, data1, df_wuc_narratives, libraries, field_names, field_lookup)
    if df_subset is not None:
        total_rows += df_subset.shape[0]
        # print(total_rows)
        # update WUCs within original dataframe
        df.update(df_subset)
        # update rules metrics
        if wuc_group_re.match(rule) is None:
            print(rule)
        else:
            wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
            wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
            each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

end = time.time()
total_time = end-start
print "It took {} seconds to make {} changes made across all edits (sometimes one row is changed more than once)".format(total_time, total_rows)

'11-3-1
'11-3-2
'11-3-3
'11-3-4
'11-3-5
'11-3-6
'11-3-7
'11-3-8
'11-3-9
'11-3-10
'11-3-11
'11-3-12
'11-3-13
'11-3-14
'11-3-15
'11-3-16
'11-3-17
'11-3-18
'11-3-19
'11-3-20
'11-3-21
'11-3-22
'11-3-23
'11-3-24
'11-3-25
'11-3-26
'11-3-27
'11-3-28
'11-3-29
'11-3-30
'11-3-31
'11-3-32
'12-3-1
'12-3-2
'12-3-3
'12-3-4
'12-3-5
'13-3-1
'13-3-2
'13-3-3
'13-3-4
'13-3-5
'13-3-6
'13-3-7
'13-3-8
'13-3-9
'13-3-10
'13-3-11
'13-3-12
'13-3-13
'13-3-14
'13-3-15
'13-3-16
'14-3-1
'14-3-2
'14-3-3
'14-3-4
'14-3-5
'14-3-6
'14-3-7
'14-3-8
'14-3-9
'14-3-10
'14-3-11
'14-3-12
'14-3-13
'14-3-14
'14-3-15
'14-3-16
'14-3-17
'14-3-18
'14-3-19
'22-3-1
'22-3-2
'22-3-3
'22-3-4
'22-3-5
'22-3-6
'22-3-7
'22-3-8
'22-3-9
'22-3-10
'22-3-11
'22-3-12
'22-3-13
'22-3-14
'24-3-1
'24-3-2
'24-3-3
'24-3-4
'24-3-5
'24-3-6
'32-3-2
'41-3-1
'41-3-2
'41-3-3
'41-3-4
'41-3-5
'41-3-6
'42-3-1
'42-3-2
'42-3-3
'42-3-4
'44-3-1
'44-3-2
'44-3-3
'44-3-4
'44-3-5
'44-3-6
'44-3-7
'45-3-1
'45-3-2
'45-3-3
'45-3-4
'46-3-1
'46-3-2
'46-3-3
'46-3-4
'46-3-5
'46



AF11-3H-47
AF11-3H-48
AF11-3H-48
AF11-3H-49
AF11-3H-49
AF11-3H-50
AF11-3H-50
AF11-3H-51
AF11-3H-51
AF11-3H-52
AF11-3H-52
AF11-3H-53
AF11-3H-54
AF11-3H-55
AF11-3H-56
AF11-3H-57
AF11-3H-58
AF11-3H-59
AF11-3H-60
AF11-3H-61
AF11-3H-62
AF11-3H-63
AF11-3H-64
AF11-3H-65
AF11-3H-66
AF11-3H-67
AF11-3H-68
AF11-3H-69
AF11-3H-70
AF11-3H-71
AF11-3H-71
AF11-3H-72
AF11-3H-72
AF11-3H-73
AF11-3H-73
AF11-3H-74
AF11-3H-74
AF11-3H-75
AF11-3H-75
AF11-3H-76
AF11-3H-76
AF11-3H-77
AF11-3H-77
AF11-3H-78
AF11-3H-78
AF11-3H-79
AF11-3H-79
AF11-3H-80
AF11-3H-80
AF11-3H-81
AF11-3H-81
AF11-3H-82
AF11-3H-82
AF11-3H-83
AF11-3H-83
AF11-3H-84
AF11-3H-84
AF11-3H-85
AF11-3H-85
AF11-3H-86
AF11-3H-86
AF11-3H-87
AF11-3H-87
AF11-3H-88
AF11-3H-88
AF11-3H-89
AF11-3H-89
AF11-3H-90
AF11-3H-90
AF11-3H-91
AF11-3H-91
AF11-3H-92
AF11-3H-92
AF11-3H-93
AF11-3H-93
AF11-3H-94
AF11-3H-94
AF11-3H-95
AF11-3H-95
AF11-3H-96
AF11-3H-96
AF11-3H-97
AF11-3H-97
AF11-3H-98
AF11-3H-98
AF11-3H-99
AF11-3H-99
AF11-3H-100
AF11-3H-100
AF11-3H-101
AF11-3H

AF24-3H-29
AF24-3H-30
AF24-3H-30
AF24-3H-31
AF24-3H-31
AF24-3H-32
AF24-3H-32
AF24-3H-33
AF24-3H-34
AF24-3H-35
AF24-3H-35
AF24-3H-36
AF24-3H-36
AF24-3H-37
AF24-3H-37
AF24-3H-38
AF24-3H-38
AF24-3H-39
AF24-3H-40
AF24-3H-41
AF24-3H-42
AF24-3H-43
AF24-3H-44
AF24-3H-44
AF24-3H-45
AF24-3H-46
AF24-3H-47
AF24-3H-48
AF24-3H-48
AF24-3H-49
AF24-3H-49
AF24-3H-50
AF24-3H-50
AF24-3H-51
AF24-3H-51
AF24-3H-52
AF24-3H-52
AF24-3H-53
AF24-3H-53
AF24-3H-54
AF24-3H-54
AF24-3H-55
AF24-3H-55
AF24-3H-56
AF24-3H-56
AF24-3H-57
AF24-3H-58
AF24-3H-59
AF24-3H-60
AF24-3H-60
AF24-3H-61
AF24-3H-62
AF24-3H-63
AF24-3H-63
AF24-3H-64
AF24-3H-64
AF24-3H-65
AF24-3H-65
AF24-3H-66
AF32-3H-1
AF32-3H-2
AF32-3H-3
AF32-3H-3
AF32-3H-4
AF32-3H-4
AF32-3H-5
AF32-3H-5
AF32-3H-6
AF32-3H-6
AF32-3H-7
AF32-3H-7
AF32-3H-8
AF32-3H-8
AF32-3H-9
AF32-3H-9
AF32-3H-10
AF32-3H-10
AF32-3H-11
AF32-3H-11
AF32-3H-13
AF32-3H-14
AF32-3H-15
AF32-3H-16
AF32-3H-17
AF32-3H-18
AF32-3H-19
AF32-3H-20
AF32-3H-21
AF32-3H-22
AF32-3H-24
AF32-3H-25
AF32-3H-25
AF32

AF46-3H-153
AF46-3H-154
AF46-3H-155
AF46-3H-156
AF46-3H-157
AF46-3H-157
AF46-3H-158
AF46-3H-159
AF46-3H-160
AF46-3H-161
AF46-3H-162
AF46-3H-163
AF46-3H-164
AF46-3H-165
AF46-3H-165
AF46-3H-166
AF46-3H-167
AF46-3H-168
AF46-3H-169
AF46-3H-170
AF46-3H-171
AF46-3H-172
AF46-3H-173
AF46-3H-174
AF46-3H-175
AF46-3H-176
AF46-3H-176
AF46-3H-177
AF46-3H-178
AF46-3H-178
AF46-3H-179
AF46-3H-179
AF46-3H-180
AF46-3H-180
AF46-3H-181
AF46-3H-181
AF46-3H-182
AF46-3H-182
AF46-3H-183
AF46-3H-183
AF46-3H-184
AF46-3H-184
AF46-3H-185
AF46-3H-185
AF46-3H-186
AF46-3H-186
AF46-3H-187
AF46-3H-187
AF46-3H-188
AF46-3H-188
AF46-3H-189
AF46-3H-189
AF46-3H-190
AF46-3H-191
AF46-3H-192
AF46-3H-193
AF46-3H-194
AF46-3H-195
AF46-3H-196
AF46-3H-197
AF46-3H-198
AF46-3H-199
AF46-3H-200
AF46-3H-201
AF46-3H-202
AF46-3H-203
AF46-3H-203
AF46-3H-204
AF46-3H-205
AF46-3H-205
AF46-3H-206
AF46-3H-207
AF46-3H-208
AF46-3H-208
AF46-3H-209
AF46-3H-210
AF46-3H-211
AF46-3H-212
AF46-3H-213
AF46-3H-213
AF46-3H-214
AF46-3H-215
AF46-3H-216
AF46

AF52-3H-27
AF52-3H-28
AF52-3H-29
AF52-3H-30
AF52-3H-31
AF52-3H-32
AF52-3H-33
AF52-3H-33
AF52-3H-34
AF52-3H-34
AF52-3H-35
AF52-3H-35
AF52-3H-36
AF52-3H-36
AF52-3H-37
AF52-3H-37
AF52-3H-38
AF52-3H-38
AF52-3H-39
AF52-3H-39
AF52-3H-40
AF52-3H-40
AF52-3H-41
AF52-3H-42
AF52-3H-43
AF52-3H-43
AF52-3H-44
AF52-3H-45
AF52-3H-46
AF52-3H-47
AF52-3H-48
AF52-3H-49
AF52-3H-50
AF52-3H-50
AF55-3H-2
AF55-3H-2
AF55-3H-3
AF55-3H-3
AF55-3H-6
AF55-3H-6
AF55-3H-7
AF55-3H-7
AF56-3H-1
AF56-3H-2
AF56-3H-4
AF56-3H-4
AF56-3H-5
AF56-3H-5
AF56-3H-6
AF56-3H-6
AF56-3H-7
AF56-3H-7
AF56-3H-8
AF56-3H-8
AF56-3H-9
AF56-3H-9
AF56-3H-11
AF56-3H-12
AF56-3H-13
AF56-3H-14
AF56-3H-15
AF56-3H-17
AF56-3H-17
AF56-3H-18
AF56-3H-18
AF56-3H-19
AF56-3H-19
AF56-3H-20
AF56-3H-20
AF56-3H-21
AF56-3H-21
AF56-3H-22
AF56-3H-22
AF56-3H-24
AF56-3H-25
AF56-3H-26
AF61-3H-1
AF61-3H-1
AF61-3H-2
AF61-3H-2
AF61-3H-3
AF61-3H-3
AF61-3H-4
AF61-3H-4
AF61-3H-5
AF61-3H-5
AF61-3H-6
AF61-3H-6
AF61-3H-7
AF61-3H-7
AF61-3H-8
AF61-3H-8
AF61-3H-9
AF61-3H-9
AF61-3

AF75-3H-29
AF75-3H-29
AF75-3H-30
AF75-3H-31
AF75-3H-32
AF75-3H-33
AF75-3H-34
AF75-3H-34
AF76-3H-1
AF76-3H-2
AF76-3H-2
AF76-3H-3
AF76-3H-3
AF76-3H-4
AF76-3H-4
AF76-3H-5
AF76-3H-6
AF76-3H-7
AF76-3H-7
AF76-3H-8
AF76-3H-8
AF76-3H-9
AF76-3H-10
AF76-3H-11
AF76-3H-11
AF76-3H-12
AF76-3H-13
AF76-3H-13
AF76-3H-14
AF76-3H-14
AF76-3H-15
AF76-3H-15
AF76-3H-16
AF77-3H-1
AF77-3H-1
AF77-3H-2
AF77-3H-2
AF77-3H-3
AF77-3H-4
AF77-3H-4
AF77-3H-5
AF77-3H-5
AF77-3H-6
AF78-3H-1
AF78-3H-1
AF78-3H-2
AF78-3H-2
AF78-3H-3
AF78-3H-3
AF78-3H-4
AF78-3H-4
AF78-3H-5
AF78-3H-5
AF78-3H-6
AF78-3H-6
AF78-3H-7
AF78-3H-7
AF78-3H-8
AF78-3H-8
AF78-3H-9
AF78-3H-9
AF78-3H-10
AF78-3H-10
AF78-3H-11
AF78-3H-12
AF78-3H-12
AF78-3H-13
AF78-3H-13
AF78-3H-14
AF78-3H-14
AF78-3H-15
AF78-3H-15
AF78-3H-16
AF78-3H-16
AF78-3H-17
AF78-3H-17
AF78-3H-18
AF78-3H-18
AF78-3H-19
AF78-3H-19
AF78-3H-20
AF78-3H-20
AF78-3H-21
AF78-3H-21
AF78-3H-22
AF82-3H-1
AF82-3H-1
AF82-3H-2
AF82-3H-3
AF82-3H-4
AF82-3H-5
AF82-3H-6
AF82-3H-7
AF82-3H-7
AF82-3H-8
AF82-3H

KeyboardInterrupt: 

In [467]:
test = re.compile('\w+.\s?\'[0-9\-]+|\w+.\s?AF[0-9\-\A-Z]+')
rule1 = """284. AF14-3H-6. WUC BEGINS WITH "14" AND ENDS WITH 000 AND D/C CONTAINS " FLAP " AND D/C DOES NOT CONTAIN " 02514 " OR (" 04130T.O./FIG " OR " 04130 T.O./FIG ") OR (" 100GUAGE " OR " 100 GUAGE ") OR (" 13-5784 " OR " 135784 ") OR (" 2012T.O./FIG " OR " 2012 T.O./FIG ") OR " 29-13-14 " OR (" A0264CANNIBALIZATION " OR " A0264 CANNIBALIZATION ") OR (" A0962CANNIBALIZATION " OR " A0962 CANNIBALIZATION ") OR " A5733 " OR (" AILERON " OR " AILE RON ") OR (" AMUSENSOR " OR " AMU SENSOR ") OR (" BOOSTERDIVERTER " OR " BOOSTER DIVERTER ") OR (" C130HUTIL " OR " C130H UTIL ") OR (" CONTROLWHEEL " OR " CONTROL WHEEL ") OR (" DOCJ412GC20380024 " OR " DOC J412GC20380024 ") OR (" DRIVECOVER " OR " DRIVE COVER ") OR " DRYDEN " OR " DZUS " OR " ELEV " OR " ELEVATOR " OR " ESTOQUIA " OR (" FLAPGUAGE " OR " FLAP GUAGE ") OR (" HANDLEGAGE " OR " HANDLE GAGE ") OR " INDICATOR " OR (" JONE03JAN13 " OR " JONE 03JAN13 ") OR (" MECHFLAP " OR " MECH FLAP ") OR " PROCESS " OR " RUDDER " OR (" UTILITYDIVERTER " OR " UTILITY DIVERTER ") OR (" WE129 " OR " WE 129 ") OR " XMTER " THEN CHANGE WUC TO 14400"""
rule_name1 = test.match(rule1).group()
print(rule_name1)

284. AF14-3H-6


In [None]:
if wuc_group_re.match(rule) is None:
            # print(str(wuc_group_re))
            print(rule)
        else:
            wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
            wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
            each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))


In [393]:
df_wuc_counts = df.groupby("Work_Unit_Code").size()
df_wuc_counts = df_wuc_counts.reset_index(name="counts").sort_values(by="counts", ascending=False)
df_wuc_counts = df_wuc_counts[0:9]

# top ten WUCs by count
q = plotting.figure(title = "Top Work Unit Codes - After Cleaning", x_range=list(df_wuc_counts.Work_Unit_Code))
q.vbar(x=df_wuc_counts.Work_Unit_Code, top=df_wuc_counts.counts,
    width=0.5, bottom=0, color="dodgerblue")
q.xaxis.axis_label = 'Work Unit Code'
q.yaxis.axis_label = 'Maintenance Record Count'

# wuc groups changed to
df_wuc_group_metrics = pd.DataFrame.from_dict(wuc_group_metrics, orient="index")
df_wuc_group_metrics.reset_index(inplace=True)
df_wuc_group_metrics.columns = ['wuc', 'wuc_count']
df_wuc_group_metrics = df_wuc_group_metrics.sort_values(by=['wuc_count'], ascending=False)

r = plotting.figure(title = "Records Modified into Each Work Unit Code Group", x_range=list(df_wuc_group_metrics.wuc))
r.vbar(x=df_wuc_group_metrics.wuc, top=df_wuc_group_metrics.wuc_count,
    width=0.5, bottom=0, color="dodgerblue")
r.xaxis.axis_label = 'Work Unit Code Group'
r.yaxis.axis_label = 'Maintenance Records Changed'

In [336]:
print(data1.WUC_Rule.unique())

[nan '75' '65' '69' '107' '70' '144' '121' '117' '93' '120' '45' '71' '72'
 '25' '46' '50' '195' '26' '146' '30' '313' '103' '44' '24' '632' '17'
 '108' '176' '52' '171' '156' '284']


In [297]:
test1 = "SUCCESS a contains and does not contain b"
test2 = "SUCCESS a contain b"
test3 = "FAILURE a does not contains b"
test7 = "FAILURE a does not contain b"
test4 = "SUCCESS a contain and does not contain and contains b"
test5 = "SUCCESS a does not contains and contains b"
test6 = "a hello world b"

tests = [test1, test2, test3, test4, test5, test6, test7]
r'^(?!.*not contain)|^.*(?<!not )contain'

pattern = r'^.*(?<!not )contains.*$|^.*(?<!not )contain.*$|^(?!.*contain).*^'

# ^(?!.*not contain)|.

for test in tests:
    if re.search(pattern, test):
        print(test, "matched")

('SUCCESS a contains and does not contain b', 'matched')
('SUCCESS a contain b', 'matched')
('SUCCESS a contain and does not contain and contains b', 'matched')
('SUCCESS a does not contains and contains b', 'matched')
('a hello world b', 'matched')


In [298]:
def sort_rules(df, libraries):
    
    pd = libraries['pandas']
    re = libraries['re']
    
    # regex for finding all contain/contains that are not proceeded by 'not', OR anything that does not contain 'contain'
    pattern = r'^.*(?<!not )contains.*$|^.*(?<!not )contain.*$|^(?!.*contain).*^'

    # create flag based on regex and sort by contains_flag then rule_number
    df['contains_flag'] = df.rule.apply(lambda x: 0 if re.search(pattern, x, re.IGNORECASE) else 1)
    
    # remove all letters before int conversion
    df.rule_number = df.rule_number.str.extract('(\d+)', expand=False).astype(int)

    df.sort_values(by=['contains_flag', 'rule_number'], ascending=True, inplace=True)
    
    # set the sort order according to AF spec
    level_order = ['Level 3', 'Level 3 High', 'Level 3 High Promoted', 'Level 3 High Double Promoted', 'Level 3 Low',
                   'Level 3 Low Promoted', 'Level 3 Low Double Promoted', 'Level 3 4F4', 'Level 3 4F5', 'Level 3 5F5'
                   'Level 4', 'Level 4 High', 'Level 4 High Promoted', 'Level 4 Low', 'Level 4 Low Promoted', 
                   'Level 4 5F5', 'Level 5', 'Level 5 High', 'Level 5 Low']
    
    # create subset dfs and concat at the end
    df_sort_list = []

    for level in level_order:
        df_sort_list.append(df[df.level == level])
        
    df_sorted = pd.concat(df_sort_list)
    
    # drop the contains_flag
    df_sorted.drop(['rule_number', 'contains_flag'], axis=1, inplace=True)
    
    return df_sorted
    

test = sort_rules(rules_df, libraries)
test.to_csv('test_csv')

## Speed Testing

In [27]:
def time_this(td, old_time, new_time):
    current_time = time.time() - old_time
    td[new_time] = current_time
    return old_time, td

In [19]:
def apply_rule_fast(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """function takes a rule text and the df, prints rule number how many edits were made 
    and returns a filtered dataframe with an updated WUC"""

    time_dict = {}
    
    start_time = time.time()
    
    re = libraries["re"]

    df = df.copy()

    rule = rule.upper()
    
    # first check if rule is deleted
    if rule.split(' ')[2] == "DELETED":
        print 'rule {} is deleted'.format(str(re.match(r'\w+.\s?\'[0-9\-]+',rule).group()))
        return None

    try:
        rule_name = re.match(r'\w+.\s?\'[0-9\-]+',rule).group()
        rule_number = re.match(r'\w+',rule).group()
    except:
        rule_name = re.match(r'\w+',rule).group()
        rule_number = rule_name
    
    #################################################################################################
    delete_time, time_dict = time_this(time_dict, start_time, "delete_time")
    #################################################################################################

    # what to change wuc to
    change_to_str = re.search(r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+', 
        rule, flags=re.IGNORECASE)
    new_wuc = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE).group()
    if debug:
        print 'originally {} rows'.format(df.shape[0])
        print 'change wuc to: ' + repr(new_wuc)

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''
    
    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print 'rule after removing new wuc info: ' + rule

    #################################################################################################
    find_remove_time, time_dict = time_this(time_dict, delete_time, "find_remove_time")
    ################################################################################################# 
        
    # initialy all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining contains/not contains clauses with ORs, need to maintain list of 'pending' indices 
    #   that must be anded with what's already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined
    
    # loop through specific category matches (i.e. field names)
    # assume always an 'AND' between category matches (e.g. CN = 'X' AND DISC = 'Y')
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1+ii]  # first piece is the wuc begins with, skip
        
        if debug:
            print cat_field
        # loop through contains or not contains clauses similar to looping through categories
        #   cannot assume an 'AND' between all the contains/not contains matches
        #   identify indices to adjust, using set unions and set intersections for ORs and ANDs
        match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"
        if debug:
            print 'category element: ' + rule_element
        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item] # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            joined_by_and = True
            # contains or not contains (true or false)
            #cat_direction = re.search("\s?contains|\s?does not contain\s?\"?", rule_element, flags=re.IGNORECASE).group()
            #match_type = re.search('contains', cat_direction, re.IGNORECASE) is not None
            if debug:
                print "match type: " + match_type # ('contains' if match_type else 'does not contain')

            # find what we're looking for for this category
            #cat_remaining = rule_element[len(cat_direction):]  # rest of phrase after contains/not contains. 
            #   also remove any trailing 'and the', e.g. for wuc ends with in "WUC begins with 14 AND ENDS WITH 00 OR 99 AND the CN contains "BOOST""
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # remove trailing OR from contains/not contains cluse and take note
            if re.search(' OR$', cat_remaining, flags=re.IGNORECASE):
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()
                joined_by_and = False

            if debug:
                print 'cat_remaining: ' + cat_remaining
            #cat_ANDs = cat_remaining.count(' AND ', re.IGNORECASE) # this is wrong if multiple cateogries linked by ANDs
            cat_ORs = cat_remaining.count(' OR ')

            # group all the ands into a list
            cat_remaining_andsplit = re.split(' AND ', cat_remaining, flags=re.IGNORECASE)

            # multiple category elements are linked by 'ands'.  remove these to avoid tricking the 'and' search later
            # remove any empty strings, '', at the end of these categories
            cat_remaining_andsplit = [element for element in cat_remaining_andsplit if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_ORs > 0:
                cat_remaining_orsplit = cat_remaining_andsplit.pop()
                cat_remaining_orsplit = re.split(' OR ', cat_remaining_orsplit, flags=re.IGNORECASE)
                # strip and remove quotes and parentheses from all the OR matches.
                cat_pattern_ORs = [blurb.strip().strip(',').replace("\"","").replace("(","").replace(")","") for blurb in cat_remaining_orsplit]
                cat_pattern_ORs = '|'.join(cat_pattern_ORs)
                # periods are wildcards, so escape
                cat_pattern_ORs = cat_pattern_ORs.replace('.',r'\.')
                if debug:
                    print 'filter for ' + cat_pattern_ORs

                if 'CONTAINS' in match_type:
                    # update indices: ensure new matches always match to the defined (e.g. WUC-starts-with) indice matches
                    #  if joined by and: pending = intersection(pending, intersection(defined, new-matches))
                    #  if joined by or: pending = union(pending, intersection(defined, new-matches))
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:  # exclude if any words match
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))                            
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))

            if debug:
                    print 'after matching any ORs: {} matches'.format(len(matching_indices_pending))

            # strip and remove quotes from all the AND matches. escape periods (wildcards). keep a list
            cat_pattern_ANDs = [blurb.strip().strip(',').replace("\"","").replace('.',r'\.') for blurb in cat_remaining_andsplit]

            # filter and phrases for this category
            for cat_AND in cat_pattern_ANDs:
                if debug:
                    print 'filter for ' + cat_AND

                if 'CONTAINS' in match_type:    
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))

            if debug:
                print 'after filtering match type: {} matches'.format(len(matching_indices_pending))

        # finished looping through contains/not contains clauses for this field/category. update defined indices
        matching_indices_defined = matching_indices_pending

        if debug:
            print 'after filtering category: {} matches'.format(len(matching_indices_defined))
        if len(matching_indices_defined) == 0:
            # print 'rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr)
            return None

    #################################################################################################
    loop_time, time_dict = time_this(time_dict, find_remove_time, "loop_time")
    #################################################################################################
    # filter
    df = df.filter(items=matching_indices_defined, axis="index")
    if debug:
        print 'matching_indices: ' + str(matching_indices_defined)
        print 'filtered df: '
        print df
    # update wuc and rule
    df.loc[:,'Work_Unit_Code'] = new_wuc
    if 'WUC_Rule' in df.columns:
        df.loc[:, 'WUC_Rule'] = df.WUC_Rule.apply(func=append_rule_number, args=(rule_number,libraries))
    else:
        df.loc[:,'WUC_Rule'] = str(rule_number)
    
    # print 'rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr)
            
    return df, time_dict

In [52]:
def evaluate_speed(rules, df, d_w_narratives, lib, f_names, f_lookup, ver, deb='False'):
    total_rows = 0
    time_df = pd.DataFrame(columns=['Action', 'Time'])
    start = time.time()
    
    rule_size = len(rules)
    df_size = df.shape[0]
    
    # wuc_group
    wuc_group_re = re.compile("\w+.\s?\'(\w+)")
    wuc_group_metrics = collections.defaultdict(int)
    wuc_group_indices = collections.defaultdict(set)

    # each_rule
    each_rule_re = re.compile("\w+.\s?\'([0-9\-]+)")
    each_rule_indices = collections.defaultdict(set)
    
    if ver == 'original':
        time.sleep(5)
        for rule in rules:
            df_subset = apply_rule(rule, df, d_w_narratives, lib, f_names, f_lookup)
            if df_subset is not None:
                total_rows += df_subset.shape[0]
                # update WUCs within original dataframe
                df.update(df_subset)
                # add the time_dict to the time_dict_df
                
                # update rules metrics
                wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
                wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
                each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

    else:    
        for rule in rules:
            df_subset = apply_rule_fast(rule, df, df_wuc_narratives, lib, f_names, f_lookup)
            if df_subset is not None:
                total_rows += df_subset.shape[0]
                # update WUCs within original dataframe
                df.update(df_subset)
                # update rules metrics
                wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
                wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
                each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

    end = time.time()
    total_time = round(end-start,2)
    
    print("For {} rules and {} maintenance records, it took {} seconds to make {} changes made across all edits".format(rule_size, df_size, total_time, total_rows))
    
    return total_time, time_df

In [53]:
tt1, tdf1 = evaluate_speed(rules, data1, df_wuc_narratives, libraries, field_names, field_lookup, ver = 'original')
# tt2, tdf2 = evaluate_speed(rules, data2, df_wuc_narratives, libraries, field_names, field_lookup, ver = 'fast')
tdf1.head(100)
# tdf2.head(100)tdf2.head(100)
# difference = tt1 - tt2
# percent_difference = round(difference / tt1 * 100, 2)
# print("The improvements resulted in a {} improvement, or {} % upgrade".format(difference, percent_difference))

For 2 rules and 284209 maintenance records, it took 9.67 seconds to make 13 changes made across all edits


Unnamed: 0,Action,Time
