## Setup

In [380]:
# import libraries and setup a dictionary in studio fashion
import numpy as np
import pandas as pd
import re
import collections
import MySQLdb
import time
from bokeh import plotting, layouts, models

libraries = {'numpy': np, 'pandas': pd, 're': re, "collections": collections, "time": time}

In [381]:
# set database credentials and create a database
dsn_database = "mdst_c130"
dsn_hostname = "localhost"
dsn_port = 3306
dsn_uid = "root"
dsn_pwd = "root"

conn = MySQLdb.connect(host = dsn_hostname, port = dsn_port, user = dsn_uid, passwd = dsn_pwd, db = dsn_database)

In [382]:
# import work unit codes data
table= 'mdst_c130.work_unit_code_names'
cut = ''

query = "SELECT Work_Unit_Code, WUC_Narrative FROM {} {}".format(table, cut)
df_wuc_narratives = pd.read_sql(sql=query, con=conn)

wucs = df_wuc_narratives.shape[0]
print("There are {:0,.0f} WUCs in the data.".format(wucs, df_wuc_narratives.shape[1]))

There are 25,323 WUCs in the data.


In [383]:
# establish the level parameter
pred = 'sort_rules_list'
level = 'all'
params = {'level':level}

if level is None or str(level).lower() not in ("3","4","5","all"):
    raise Exception, "requires WUC edit `level` parameter: 3, 4, 5, or all"
level = str(level).lower()

# Rules as list
if level == 'all':
    query = """SELECT level, rule FROM {}""".format(pred)
else:
    query = "SELECT level, rule FROM {} WHERE left(level,7) = 'Level {}'".format(pred, params.get('level'))
rules = list(pd.read_sql(sql=query, con=conn).rule)
if len(rules) == 0:
    raise Exception, 'Script requires uploaded rules in the rules list component.'

# make a list of rules
rules_df = pd.read_sql(sql=query, con=conn)
rules = list(rules_df.rule)

print("There are {} rules in the data.".format(len(rules)))

There are 38916 rules in the data.


In [384]:
# get main data
table1 = 'mdst_c130.import_remis_data'
table2 = 'mdst_c130.remove_general_support_wucs'
limit = 'limit 15000'

# pull wucs and data from the same component
primary_key_fields_list = pd.read_sql(sql="SHOW KEYS FROM {}".format(table1), con=conn).Column_name
primary_key_fields = ', '.join(primary_key_fields_list)
query = """SELECT {}, Work_Unit_Code, Equipment_Designator, Discrepancy_Narrative, Corrective_Narrative FROM {} {}""".format(primary_key_fields, table1, limit)
df_total = pd.read_sql(sql=query, con=conn)

# create a filter dataset
df_filter = pd.read_sql(sql="SELECT {} FROM {}".format(primary_key_fields, table2), con=conn)

# drop specific column if overlapping
if 'Work_Unit_Code' in list(df_filter.columns):
    df_total.drop('Work_Unit_Code', axis=1, inplace=True)

# inner join to limit solution set
data1 = df_total.merge(df_filter, on=list(primary_key_fields_list))

if 'WUC_Rule' not in list(data1.columns):
    # initialize rules as nan
    data1['WUC_Rule'] = np.nan
    # data size
    
# data2 = data1.copy()

rows_total = df_total.shape[0]  
rows = data1.shape[0]
print("There are {:0,.0f} rows in the original data and {:0,.0f} rows in the final data.".format(rows_total, rows))

There are 15,000 rows in the original data and 13,455 rows in the final data.


In [385]:
# close the mysqldb connection
conn.close()

## Rule Development

In [386]:
def keep_rule_number(existing_val, new_val, libraries):
    # add rule number to rule column
    pd = libraries["pandas"]
    
    if existing_val is None or pd.isnull(existing_val):
        return str(new_val)
    else:
        return

In [428]:
rule1 = """331. '12-4-36. WUC BEGINS WITH 12D AND ENDS WITH 00 OR 99 AND DISC contains "RAMP" then change WUC to 12DA0"""
rule2 = """30. AF11-3H-30. WUC BEGINS WITH "11" AND ENDS WITH "000" AND D/C CONTAINS "HORIZONTAL " AND D/C DOES NOT CONTAIN "00101 " OR "14083 " OR ("146-176.NOTE " OR "146176.NOTE ") OR ("1C-130A-23CL-1BELLY " OR "1C-130A-23CL-1 BELLY ") OR "1C-130H-2-13CLEAN " OR "1C-130H-2-61JG-10-1OBSERVE " OR "1C-130J-3.< " OR ("202AFTO " OR "202 AFTO ") OR "2061C-130H-2-12JG-10-2 " OR ("212228 " OR "212 228 ") OR "247IAW " OR "300-14-2.CLEAN " OR "4085965 " OR "517 " OR "56-00-01 " OR ("APPLYSTM40-114 " OR "APPLY STM40-114 " OR "APPLYSTM40114 " OR "APPLY STM40114 ") OR "BALALNCE " OR ("BELLYB.L " OR "BELLY B.L ") OR "BOMB " OR ("CLEAN-UP " OR "CLEANUP ") OR "CREW " OR ("CW-23 " OR "CW23 ") OR ("DEICERBOOT " OR "DEICER BOOT ") OR "DEPRE " OR "DESTROY " OR ("EC130HLANDING " OR "EC130H LANDING ") OR ("ENGINE " OR "ENG INE ") OR ("EXITHATCHT.O " OR "EXIT HATCHT.O ") OR "HATCH " OR "HATCHREF " OR ("IIISPRAYABLE " OR "III SPRAYABLE ") OR "LOWER " OR ("MC130PEXT " OR "MC130P EXT ") OR ("OO-ALC " OR "OOALC ") OR ("OVERHEADCONTROL " OR "OVERHEAD CONTROL ") OR "PARATROOP " OR ("PLANKADJACENT " OR "PLANK ADJACENT ") OR "RAMP " OR ("SANDINGCHEMICALWIPE " OR "SANDING CHEMICALWIPE ") OR "STAB " OR "STABILIZER " OR "TRANSFORMER " OR "UNLESS " OR "WELDON " OR "WINDOW " OR "WIPER " THEN CHANGE WUC TO 11600"""
rule_name1 = re.search(r'\'[0-9\-]+|AF[0-9\-\A-Z]+',rule1).group()
rule_name2 = re.search(r'\'[0-9\-]+|AF[0-9\-\A-Z]+',rule2).group()
print(rule_name1)
print(rule_name2)

'12-4-36
AF11-3H-30


In [429]:
def apply_rule(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """function takes a rule text and the df, prints rule number how many edits were made 
    and returns a filtered dataframe with an updated WUC"""

    re = libraries["re"]
    np = libraries["numpy"]
    
    # if rule contains "D/C" field, split into two rules - one for DISC, one for CN
    if re.search(r'D/C',rule) is not None:
        dis_rule = re.sub(r'D/C',r'DISC',rule)
        if debug:
            print 'D/C rule as DISC rule: {}'.format(dis_rule)
        df1 = apply_rule(dis_rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=debug)
        cn_rule = re.sub(r'D/C',r'CN',rule)
        if debug:
            print 'D/C rule as CN rule: {}'.format(cn_rule)
        df2 = apply_rule(cn_rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=debug)
        if df1 is not None and df2 is not None:
            return df1.append(df2,ignore_index=False).drop_duplicates(inplace=True)
        elif df1 is not None:
            return df1
        else:
            return df2

    df = df.copy()

    rule = rule.upper()

    # first check if rule is deleted
    if rule.split(' ')[2] == "DELETED":
        print 'rule {} is deleted'.format(str(re.match(r'\w+.\s?\'[0-9\-]+',rule).group()))
        return None

    try:
        rule_name = re.search(r'\'[0-9\-]+|AF[0-9\-\A-Z]+',rule).group()
        rule_number = re.match(r'\w+',rule).group()

    except:
        rule_name = re.match(r'\w+',rule).group()
        rule_number = rule_name

    # what to change wuc to
    change_to_str = re.search(r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+', 
        rule, flags=re.IGNORECASE)
    new_wuc = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE).group()
    if debug:
        print 'originally {} rows'.format(df.shape[0])
        print 'change wuc to: ' + repr(new_wuc)

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''
    
    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print 'rule after removing new wuc info: ' + rule

    # initially all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining contains/not contains clauses with ORs, need to maintain list of 'pending' indices 
    #   that must be anded with what's already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined

    # create null WUC_RULE column if it does not exist
    if 'WUC_Rule' not in df.columns:
        df.loc[:,'WUC_Rule'] = np.nan
    
    # loop through specific category matches (i.e. field names)
    # assume always an 'AND' between category matches (e.g. CN = 'X' AND DISC = 'Y')
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1+ii]  # first piece is the wuc begins with, skip
        
        if debug:
            print cat_field
        # loop through contains or not contains clauses similar to looping through categories
        #   cannot assume an 'AND' between all the contains/not contains matches
        #   identify indices to adjust, using set unions and set intersections for ORs and ANDs
        match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"
        if debug:
            print 'category element: ' + rule_element
        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item] # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            joined_by_and = True
            # contains or not contains (true or false)
            #cat_direction = re.search("\s?contains|\s?does not contain\s?\"?", rule_element, flags=re.IGNORECASE).group()
            #match_type = re.search('contains', cat_direction, re.IGNORECASE) is not None
            if debug:
                print "match type: " + match_type # ('contains' if match_type else 'does not contain')

            # find what we're looking for for this category
            #cat_remaining = rule_element[len(cat_direction):]  # rest of phrase after contains/not contains. 
            #   also remove any trailing 'and the', e.g. for wuc ends with in "WUC begins with 14 AND ENDS WITH 00 OR 99 AND the CN contains "BOOST""
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # remove trailing OR from contains/not contains cluse and take note
            if re.search(' OR$', cat_remaining, flags=re.IGNORECASE):
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()
                joined_by_and = False

            if debug:
                print 'cat_remaining: ' + cat_remaining
            #cat_ANDs = cat_remaining.count(' AND ', re.IGNORECASE) # this is wrong if multiple cateogries linked by ANDs
            cat_ORs = cat_remaining.count(' OR ')

            # group all the ands into a list
            cat_remaining_andsplit = re.split(' AND ', cat_remaining, flags=re.IGNORECASE)

            # multiple category elements are linked by 'ands'.  remove these to avoid tricking the 'and' search later
            # remove any empty strings, '', at the end of these categories
            cat_remaining_andsplit = [element for element in cat_remaining_andsplit if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_ORs > 0:
                cat_remaining_orsplit = cat_remaining_andsplit.pop()
                cat_remaining_orsplit = re.split(' OR ', cat_remaining_orsplit, flags=re.IGNORECASE)
                # strip and remove quotes and parentheses from all the OR matches.
                cat_pattern_ORs = [blurb.strip().strip(',').replace("\"","").replace("(","").replace(")","") for blurb in cat_remaining_orsplit]
                cat_pattern_ORs = '|'.join(cat_pattern_ORs)
                # periods are wildcards, so escape
                cat_pattern_ORs = cat_pattern_ORs.replace('.',r'\.')
                if debug:
                    print 'filter for ' + cat_pattern_ORs

                if 'CONTAINS' in match_type:
                    # update indices: ensure new matches always match to the defined (e.g. WUC-starts-with) indice matches
                    #  if joined by and: pending = intersection(pending, intersection(defined, new-matches))
                    #  if joined by or: pending = union(pending, intersection(defined, new-matches))
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif 'NOT CONTAIN' in match_type:  # exclude if any words match
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0])) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0])) & (df.WUC_Rule.isnull())].index)))                            
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))
                else:  # ends with
                    if debug:
                        print 'else clause - OR match_type:' + match_type
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False)) & (df.WUC_Rule.isnull())].index)))

            if debug:
                    print 'after matching any ORs: {} matches'.format(len(matching_indices_pending))

            # strip and remove quotes from all the AND matches. escape periods (wildcards). keep a list
            cat_pattern_ANDs = [blurb.strip().strip(',').replace("\"","").replace('.',r'\.') for blurb in cat_remaining_andsplit]

            # filter and phrases for this category
            for cat_AND in cat_pattern_ANDs:
                if debug:
                    print 'filter for ' + cat_AND

                if 'CONTAINS' in match_type:    
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif 'NOT CONTAIN' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE)) & (df.WUC_Rule.isnull())].index)))
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))
                else:  # ends with
                    if debug:
                        print 'else clause - AND match_type: ' + match_type
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False)) & (df.WUC_Rule.isnull())].index)))

            if debug:
                print 'after filtering match type: {} matches'.format(len(matching_indices_pending))

        # finished looping through contains/not contains clauses for this field/category. update defined indices
        matching_indices_defined = matching_indices_pending

        if debug:
            print 'after filtering category: {} matches'.format(len(matching_indices_defined))
        if len(matching_indices_defined) == 0:
            # print 'rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr)
            return None

    # filter
    df = df.filter(items=matching_indices_defined, axis="index")
    if debug:
        print 'matching_indices: ' + str(matching_indices_defined)
        print 'filtered df: '
        print df
    # update wuc and rule
    df.loc[:,'Work_Unit_Code'] = new_wuc
    df.loc[:, 'WUC_Rule'] = df.WUC_Rule.apply(func=keep_rule_number, args=(rule_number,libraries))

    # print 'rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr)
            
    return df

In [None]:
# field names and lookup
field_names = ' WUC | MDS | DISC | CN | DIS | DISCREPANCY | CORRECTIVE NARRATIVE | CORRECTIVE ACTION '
field_lookup = {" MDS ":"Equipment_Designator"," WUC ":"Work_Unit_Code"," DISC ": "Discrepancy_Narrative", 
    " DIS ":"Discrepancy_Narrative", " CN ":"Corrective_Narrative", " DISCREPANCY ": "Discrepancy_Narrative",
    " CORRECTIVE NARRATIVE ":"Corrective_Narrative", " CORRECTIVE ACTION ": "Corrective_Narrative"}

df_wuc_counts = df.groupby("Work_Unit_Code").size()
df_wuc_counts = df_wuc_counts.reset_index(name="counts").sort_values(by="counts", ascending=False)
df_wuc_counts = df_wuc_counts[0:9]

# top ten WUCs by count
p = plotting.figure(title = "Top Work Unit Codes - Before Cleaning", x_range=list(df_wuc_counts.Work_Unit_Code))
p.vbar(x=df_wuc_counts.Work_Unit_Code, top=df_wuc_counts.counts,
    width=0.5, bottom=0, color="dodgerblue")
p.xaxis.axis_label = 'Work Unit Code'
p.yaxis.axis_label = 'Maintenance Record Count'

# wuc_group
wuc_group_re = re.compile("\w+.\s?\'(\w+)|\w+.\s?AF(\w+)")
wuc_group_metrics = collections.defaultdict(int)
wuc_group_indices = collections.defaultdict(set)

# each_rule
each_rule_re = re.compile("\w+.\s?\'([0-9\-]+)|\w+.\s?AF([0-9\-]+)")
each_rule_indices = collections.defaultdict(set)

total_rows = 0
start = time.time()
for rule in rules:
    df_subset = apply_rule(rule, data1, df_wuc_narratives, libraries, field_names, field_lookup)
    if df_subset is not None:
        total_rows += df_subset.shape[0]
        # print(total_rows)
        # update WUCs within original dataframe
        df.update(df_subset)
        # update rules metrics
        if wuc_group_re.match(rule) is None:
            print(rule)
        else:
            wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
            wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
            each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

end = time.time()
total_time = end-start
print "It took {} seconds to make {} changes made across all edits (sometimes one row is changed more than once)".format(total_time, total_rows)

'11-3-1
1
'11-3-2
2
'11-3-3
3
'11-3-4
4
'11-3-5
5
'11-3-6
6
'11-3-7
7
'11-3-8
8
'11-3-9
9
'11-3-10
10
'11-3-11
11
'11-3-12
12
'11-3-13
13
'11-3-14
14
'11-3-15
15
'11-3-16
16
'11-3-17
17
'11-3-18
18
'11-3-19
19
'11-3-20
20
'11-3-21
21
'11-3-22
22
'11-3-23
23
'11-3-24
24
'11-3-25
25
'11-3-26
26
'11-3-27
27
'11-3-28
28
'11-3-29
29
'11-3-30
30
'11-3-31
31
'11-3-32
32
'12-3-1
33
'12-3-2
34
'12-3-3
35
'12-3-4
36
'12-3-5
37
'13-3-1
38
'13-3-2
39
'13-3-3
40
'13-3-4
41
'13-3-5
42
'13-3-6
43
'13-3-7
44
'13-3-8
45
'13-3-9
46
'13-3-10
47
'13-3-11
48
'13-3-12
49
'13-3-13
50
'13-3-14
51
'13-3-15
52
'13-3-16
53
'14-3-1
54
'14-3-2
55
'14-3-3
56
'14-3-4
57
'14-3-5
58
'14-3-6
59
'14-3-7
60
'14-3-8
61
'14-3-9
62
'14-3-10
63
'14-3-11
64
'14-3-12
65
'14-3-13
66
'14-3-14
67
'14-3-15
68
'14-3-16
69
'14-3-17
70
'14-3-18
71
'14-3-19
72
'22-3-1
73
'22-3-2
74
'22-3-3
75
'22-3-4
76
'22-3-5
77
'22-3-6
78
'22-3-7
79
'22-3-8
80
'22-3-9
81
'22-3-10
82
'22-3-11
83
'22-3-12
84
'22-3-13
85
'22-3-14
86
'24-3-1
87
'24-3-2



AF11-3H-47
47
AF11-3H-48
48
AF11-3H-48
48
AF11-3H-49
49
AF11-3H-49
49
AF11-3H-50
50
AF11-3H-50
50
AF11-3H-51
51
AF11-3H-51
51
AF11-3H-52
52
AF11-3H-52
52
AF11-3H-53
53
AF11-3H-54
54
AF11-3H-55
55
AF11-3H-56
56
AF11-3H-57
57
AF11-3H-58
58
AF11-3H-59
59
AF11-3H-60
60
AF11-3H-61
61
AF11-3H-62
62
AF11-3H-63
63
AF11-3H-64
64
AF11-3H-65
65
AF11-3H-66
66
AF11-3H-67
67
AF11-3H-68
68
AF11-3H-69
69
AF11-3H-70
70
AF11-3H-71
71
AF11-3H-71
71
AF11-3H-72
72
AF11-3H-72
72
AF11-3H-73
73
AF11-3H-73
73
AF11-3H-74
74
AF11-3H-74
74
AF11-3H-75
75
AF11-3H-75
75
AF11-3H-76
76
AF11-3H-76
76
AF11-3H-77
77
AF11-3H-77
77
AF11-3H-78
78
AF11-3H-78
78
AF11-3H-79
79
AF11-3H-79
79
AF11-3H-80
80
AF11-3H-80
80
AF11-3H-81
81
AF11-3H-81
81
AF11-3H-82
82
AF11-3H-82
82
AF11-3H-83
83
AF11-3H-83
83
AF11-3H-84
84
AF11-3H-84
84
AF11-3H-85
85
AF11-3H-85
85
AF11-3H-86
86
AF11-3H-86
86
AF11-3H-87
87
AF11-3H-87
87
AF11-3H-88
88
AF11-3H-88
88
AF11-3H-89
89
AF11-3H-89
89
AF11-3H-90
90
AF11-3H-90
90
AF11-3H-91
91
AF11-3H-91
91
AF11-3

AF14-3H-16
294
AF14-3H-16
294
AF14-3H-17
295
AF14-3H-17
295
AF14-3H-18
296
AF14-3H-18
296
AF14-3H-19
297
AF14-3H-19
297
AF14-3H-20
298
AF14-3H-20
298
AF14-3H-21
299
AF14-3H-21
299
299.  AF14-3H-21. WUC BEGINS WITH "14" AND ENDS WITH 000 AND D/C CONTAINS "RUDDER" AND D/C DOES NOT CONTAIN "15IAW" OR ("1C-130H-2-131C-130" OR "1C-130H-2-13 1C-130") OR "46-11-10" OR ("AILERON" OR "AILE RON") OR "CONTWHEEL" OR ("D1-390" OR "D1390") OR "ELEVATOR" OR "FLAP" OR "FLAPINDICATING" OR ("LEVERROD" OR "LEVER ROD") OR ("TABXMITTER" OR "TAB XMITTER") THEN CHANGE WUC TO 14300
AF14-3H-22
300
AF14-3H-22
300
AF14-3H-23
301
AF14-3H-23
301
AF14-3H-24
302
AF14-3H-24
302
AF14-3H-25
303
AF14-3H-25
303
AF14-3H-26
304
AF14-3H-26
304
AF14-3H-27
305
AF14-3H-28
306
306.  AF14-3H-28. WUC BEGINS WITH "14" AND ENDS WITH 000 AND DISCREPANCY CONTAINS " ELEVATOR" AND DISCREPANCY DOES NOT CONTAIN (" 03668BE" OR " 03668 BE") OR " 056C144" OR (" 1-2SEC" OR " 1-2 SEC" OR " 1 2SEC" OR " 1 2 SEC" OR " 12SEC" OR " 12 SEC") OR " 

AF32-3H-25
557
AF32-3H-25
557
AF32-3H-26
558
AF32-3H-26
558
AF32-3H-27
559
AF32-3H-28
560
AF32-3H-29
561
AF32-3H-30
562
AF32-3H-30
562
AF32-3H-31
563
AF32-3H-31
563
AF32-3H-32
564
AF32-3H-32
564
AF32-3H-33
565
AF32-3H-33
565
AF32-3H-34
566
AF32-3H-34
566
AF32-3H-35
567
AF32-3H-35
567
AF32-3H-36
568
AF32-3H-36
568
AF32-3H-37
569
AF32-3H-37
569
AF32-3H-38
570
AF32-3H-38
570
AF32-3H-40
572
AF32-3H-41
573
AF32-3H-42
574
AF32-3H-43
575
AF32-3H-44
576
AF32-3H-45
577
AF32-3H-46
578
AF32-3H-47
579
AF32-3H-48
580
AF32-3H-49
581
AF32-3H-51
583
AF32-3H-52
584
AF32-3H-52
584
AF32-3H-53
585
AF32-3H-53
585
AF32-3H-54
586
AF41-3H-1
587
AF41-3H-2
588
AF41-3H-3
589
AF41-3H-4
590
AF41-3H-4
590
AF41-3H-5
591
AF41-3H-6
592
AF41-3H-7
593
AF41-3H-8
594
AF41-3H-8
594
AF41-3H-9
595
AF41-3H-10
596
AF41-3H-11
597
AF41-3H-12
598
AF41-3H-13
599
AF41-3H-14
600
AF41-3H-14
600
AF41-3H-15
601
AF41-3H-15
601
AF41-3H-16
602
AF41-3H-16
602
AF41-3H-17
603
AF41-3H-17
603
AF41-3H-18
604
AF41-3H-18
604
AF41-3H-19
605
AF41-3

AF45-3H-4
811
AF45-3H-5
812
AF45-3H-6
813
AF45-3H-6
813
AF45-3H-7
814
AF45-3H-8
815
AF45-3H-8
815
AF45-3H-9
816
AF45-3H-10
817
AF45-3H-10
817
AF45-3H-11
818
AF45-3H-11
818
AF45-3H-12
819
AF45-3H-12
819
AF45-3H-13
820
AF46-3H-1
821
AF46-3H-2
822
AF46-3H-2
822
AF46-3H-3
823
AF46-3H-4
824
AF46-3H-4
824
AF46-3H-5
825
AF46-3H-6
826
AF46-3H-7
827
AF46-3H-7
827
AF46-3H-8
828
AF46-3H-9
829
AF46-3H-10
830
AF46-3H-11
831
AF46-3H-12
832
AF46-3H-12
832
AF46-3H-13
833
AF46-3H-14
834
AF46-3H-15
835
AF46-3H-15
835
AF46-3H-16
836
AF46-3H-17
837
AF46-3H-18
838
AF46-3H-18
838
AF46-3H-19
839
AF46-3H-20
840
AF46-3H-21
841
AF46-3H-22
842
AF46-3H-22
842
AF46-3H-23
843
AF46-3H-24
844
AF46-3H-25
845
AF46-3H-26
846
AF46-3H-27
847
AF46-3H-28
848
AF46-3H-29
849
AF46-3H-30
850
AF46-3H-31
851
AF46-3H-31
851
AF46-3H-32
852
AF46-3H-33
853
AF46-3H-34
854
AF46-3H-35
855
AF46-3H-36
856
AF46-3H-37
857
AF46-3H-38
858
AF46-3H-39
859
AF46-3H-40
860
AF46-3H-41
861
AF46-3H-42
862
AF46-3H-42
862
AF46-3H-43
863
AF46-3H-43
863


AF46-3H-292
1112
AF46-3H-293
1113
AF46-3H-293
1113
AF46-3H-294
1114
AF46-3H-294
1114
AF46-3H-295
1115
AF46-3H-295
1115
AF46-3H-296
1116
AF46-3H-296
1116
AF46-3H-297
1117
AF46-3H-297
1117
AF46-3H-298
1118
AF46-3H-298
1118
AF46-3H-299
1119
AF46-3H-299
1119
AF46-3H-300
1120
AF46-3H-300
1120
AF46-3H-301
1121
AF46-3H-301
1121
AF46-3H-302
1122
AF46-3H-302
1122
AF46-3H-303
1123
AF46-3H-303
1123
AF46-3H-304
1124
AF46-3H-304
1124
AF46-3H-305
1125
AF46-3H-305
1125
AF46-3H-306
1126
AF46-3H-306
1126
AF46-3H-307
1127
AF46-3H-307
1127
AF46-3H-308
1128
AF46-3H-308
1128
AF46-3H-309
1129
AF46-3H-309
1129
AF46-3H-310
1130
AF46-3H-310
1130
AF46-3H-311
1131
AF46-3H-311
1131
AF46-3H-312
1132
AF46-3H-312
1132
AF46-3H-313
1133
AF46-3H-313
1133
AF46-3H-314
1134
AF46-3H-314
1134
AF46-3H-315
1135
AF46-3H-315
1135
AF46-3H-316
1136
AF46-3H-316
1136
AF46-3H-317
1137
AF46-3H-317
1137
AF46-3H-318
1138
AF46-3H-318
1138
AF46-3H-319
1139
AF46-3H-319
1139
AF46-3H-320
1140
AF46-3H-320
1140
AF46-3H-321
1141
AF46-3H-321
11

In [None]:
if wuc_group_re.match(rule) is None:
            # print(str(wuc_group_re))
            print(rule)
        else:
            wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
            wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
            each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))


In [393]:
df_wuc_counts = df.groupby("Work_Unit_Code").size()
df_wuc_counts = df_wuc_counts.reset_index(name="counts").sort_values(by="counts", ascending=False)
df_wuc_counts = df_wuc_counts[0:9]

# top ten WUCs by count
q = plotting.figure(title = "Top Work Unit Codes - After Cleaning", x_range=list(df_wuc_counts.Work_Unit_Code))
q.vbar(x=df_wuc_counts.Work_Unit_Code, top=df_wuc_counts.counts,
    width=0.5, bottom=0, color="dodgerblue")
q.xaxis.axis_label = 'Work Unit Code'
q.yaxis.axis_label = 'Maintenance Record Count'

# wuc groups changed to
df_wuc_group_metrics = pd.DataFrame.from_dict(wuc_group_metrics, orient="index")
df_wuc_group_metrics.reset_index(inplace=True)
df_wuc_group_metrics.columns = ['wuc', 'wuc_count']
df_wuc_group_metrics = df_wuc_group_metrics.sort_values(by=['wuc_count'], ascending=False)

r = plotting.figure(title = "Records Modified into Each Work Unit Code Group", x_range=list(df_wuc_group_metrics.wuc))
r.vbar(x=df_wuc_group_metrics.wuc, top=df_wuc_group_metrics.wuc_count,
    width=0.5, bottom=0, color="dodgerblue")
r.xaxis.axis_label = 'Work Unit Code Group'
r.yaxis.axis_label = 'Maintenance Records Changed'

In [336]:
print(data1.WUC_Rule.unique())

[nan '75' '65' '69' '107' '70' '144' '121' '117' '93' '120' '45' '71' '72'
 '25' '46' '50' '195' '26' '146' '30' '313' '103' '44' '24' '632' '17'
 '108' '176' '52' '171' '156' '284']


In [297]:
test1 = "SUCCESS a contains and does not contain b"
test2 = "SUCCESS a contain b"
test3 = "FAILURE a does not contains b"
test7 = "FAILURE a does not contain b"
test4 = "SUCCESS a contain and does not contain and contains b"
test5 = "SUCCESS a does not contains and contains b"
test6 = "a hello world b"

tests = [test1, test2, test3, test4, test5, test6, test7]
r'^(?!.*not contain)|^.*(?<!not )contain'

pattern = r'^.*(?<!not )contains.*$|^.*(?<!not )contain.*$|^(?!.*contain).*^'

# ^(?!.*not contain)|.

for test in tests:
    if re.search(pattern, test):
        print(test, "matched")

('SUCCESS a contains and does not contain b', 'matched')
('SUCCESS a contain b', 'matched')
('SUCCESS a contain and does not contain and contains b', 'matched')
('SUCCESS a does not contains and contains b', 'matched')
('a hello world b', 'matched')


In [298]:
def sort_rules(df, libraries):
    
    pd = libraries['pandas']
    re = libraries['re']
    
    # regex for finding all contain/contains that are not proceeded by 'not', OR anything that does not contain 'contain'
    pattern = r'^.*(?<!not )contains.*$|^.*(?<!not )contain.*$|^(?!.*contain).*^'

    # create flag based on regex and sort by contains_flag then rule_number
    df['contains_flag'] = df.rule.apply(lambda x: 0 if re.search(pattern, x, re.IGNORECASE) else 1)
    
    # remove all letters before int conversion
    df.rule_number = df.rule_number.str.extract('(\d+)', expand=False).astype(int)

    df.sort_values(by=['contains_flag', 'rule_number'], ascending=True, inplace=True)
    
    # set the sort order according to AF spec
    level_order = ['Level 3', 'Level 3 High', 'Level 3 High Promoted', 'Level 3 High Double Promoted', 'Level 3 Low',
                   'Level 3 Low Promoted', 'Level 3 Low Double Promoted', 'Level 3 4F4', 'Level 3 4F5', 'Level 3 5F5'
                   'Level 4', 'Level 4 High', 'Level 4 High Promoted', 'Level 4 Low', 'Level 4 Low Promoted', 
                   'Level 4 5F5', 'Level 5', 'Level 5 High', 'Level 5 Low']
    
    # create subset dfs and concat at the end
    df_sort_list = []

    for level in level_order:
        df_sort_list.append(df[df.level == level])
        
    df_sorted = pd.concat(df_sort_list)
    
    # drop the contains_flag
    df_sorted.drop(['rule_number', 'contains_flag'], axis=1, inplace=True)
    
    return df_sorted
    

test = sort_rules(rules_df, libraries)
test.to_csv('test_csv')

## Speed Testing

In [27]:
def time_this(td, old_time, new_time):
    current_time = time.time() - old_time
    td[new_time] = current_time
    return old_time, td

In [19]:
def apply_rule_fast(rule, df, df_wuc_narratives, libraries, field_names, field_lookup, debug=False):
    """function takes a rule text and the df, prints rule number how many edits were made 
    and returns a filtered dataframe with an updated WUC"""

    time_dict = {}
    
    start_time = time.time()
    
    re = libraries["re"]

    df = df.copy()

    rule = rule.upper()
    
    # first check if rule is deleted
    if rule.split(' ')[2] == "DELETED":
        print 'rule {} is deleted'.format(str(re.match(r'\w+.\s?\'[0-9\-]+',rule).group()))
        return None

    try:
        rule_name = re.match(r'\w+.\s?\'[0-9\-]+',rule).group()
        rule_number = re.match(r'\w+',rule).group()
    except:
        rule_name = re.match(r'\w+',rule).group()
        rule_number = rule_name
    
    #################################################################################################
    delete_time, time_dict = time_this(time_dict, start_time, "delete_time")
    #################################################################################################

    # what to change wuc to
    change_to_str = re.search(r'\s+then\s+change\s+wuc to\s+|\s+then wuc equals\s+|\s+then wuc =\s+|\s+then change\s+the wuc to\s+', 
        rule, flags=re.IGNORECASE)
    new_wuc = re.search(r"[0-9a-z]+", rule[change_to_str.end():], flags=re.IGNORECASE).group()
    if debug:
        print 'originally {} rows'.format(df.shape[0])
        print 'change wuc to: ' + repr(new_wuc)

    # find text for new_wuc
    try:
        wuc_narr = df_wuc_narratives[df_wuc_narratives.Work_Unit_Code == new_wuc].WUC_Narrative.iloc[0]
    except (IndexError, AttributeError):
        wuc_narr = ''
    
    # remove change wuc to for looping through categories
    rule = rule[0:change_to_str.start()]
    if debug:
        print 'rule after removing new wuc info: ' + rule

    #################################################################################################
    find_remove_time, time_dict = time_this(time_dict, delete_time, "find_remove_time")
    ################################################################################################# 
        
    # initialy all indices may be changed
    matching_indices_defined = set(df.index)
    # to account for joining contains/not contains clauses with ORs, need to maintain list of 'pending' indices 
    #   that must be anded with what's already defined (e.g. by WUC matches)
    matching_indices_pending = matching_indices_defined
    
    # loop through specific category matches (i.e. field names)
    # assume always an 'AND' between category matches (e.g. CN = 'X' AND DISC = 'Y')
    rule_split = re.split(field_names, rule, flags=re.IGNORECASE)
    for ii, cat in enumerate(re.findall(field_names, rule, re.IGNORECASE)):
        cat_field = field_lookup[cat]
        rule_element = rule_split[1+ii]  # first piece is the wuc begins with, skip
        
        if debug:
            print cat_field
        # loop through contains or not contains clauses similar to looping through categories
        #   cannot assume an 'AND' between all the contains/not contains matches
        #   identify indices to adjust, using set unions and set intersections for ORs and ANDs
        match_type_options_regex = r"\s*contains\s*|\s*does not contain\s*|\s*=\s*|\s*equals\s*|\s*begins with\s*|\s*ends with\s*"
        if debug:
            print 'category element: ' + rule_element
        match_type_split = re.split(match_type_options_regex, rule_element, flags=re.IGNORECASE)
        match_type_split = [item for item in match_type_split if item] # remove empty strings
        for jj, match_type in enumerate(re.findall(match_type_options_regex, rule_element, re.IGNORECASE)):
            joined_by_and = True
            # contains or not contains (true or false)
            #cat_direction = re.search("\s?contains|\s?does not contain\s?\"?", rule_element, flags=re.IGNORECASE).group()
            #match_type = re.search('contains', cat_direction, re.IGNORECASE) is not None
            if debug:
                print "match type: " + match_type # ('contains' if match_type else 'does not contain')

            # find what we're looking for for this category
            #cat_remaining = rule_element[len(cat_direction):]  # rest of phrase after contains/not contains. 
            #   also remove any trailing 'and the', e.g. for wuc ends with in "WUC begins with 14 AND ENDS WITH 00 OR 99 AND the CN contains "BOOST""
            cat_remaining = re.sub(' THE$', '', match_type_split[jj].strip())
            cat_remaining = re.sub(' AND$', '', cat_remaining).strip()
            # remove trailing OR from contains/not contains cluse and take note
            if re.search(' OR$', cat_remaining, flags=re.IGNORECASE):
                cat_remaining = re.split(' OR$', cat_remaining, flags=re.IGNORECASE)[0].strip()
                joined_by_and = False

            if debug:
                print 'cat_remaining: ' + cat_remaining
            #cat_ANDs = cat_remaining.count(' AND ', re.IGNORECASE) # this is wrong if multiple cateogries linked by ANDs
            cat_ORs = cat_remaining.count(' OR ')

            # group all the ands into a list
            cat_remaining_andsplit = re.split(' AND ', cat_remaining, flags=re.IGNORECASE)

            # multiple category elements are linked by 'ands'.  remove these to avoid tricking the 'and' search later
            # remove any empty strings, '', at the end of these categories
            cat_remaining_andsplit = [element for element in cat_remaining_andsplit if element]

            # if any ORs, split them off from the last item in the AND list
            if cat_ORs > 0:
                cat_remaining_orsplit = cat_remaining_andsplit.pop()
                cat_remaining_orsplit = re.split(' OR ', cat_remaining_orsplit, flags=re.IGNORECASE)
                # strip and remove quotes and parentheses from all the OR matches.
                cat_pattern_ORs = [blurb.strip().strip(',').replace("\"","").replace("(","").replace(")","") for blurb in cat_remaining_orsplit]
                cat_pattern_ORs = '|'.join(cat_pattern_ORs)
                # periods are wildcards, so escape
                cat_pattern_ORs = cat_pattern_ORs.replace('.',r'\.')
                if debug:
                    print 'filter for ' + cat_pattern_ORs

                if 'CONTAINS' in match_type:
                    # update indices: ensure new matches always match to the defined (e.g. WUC-starts-with) indice matches
                    #  if joined by and: pending = intersection(pending, intersection(defined, new-matches))
                    #  if joined by or: pending = union(pending, intersection(defined, new-matches))
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:  # exclude if any words match
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_pattern_ORs,na=False,flags=re.IGNORECASE)) & (df[cat_field].str.len() == len(cat_pattern_ORs.split('|')[0]))].index)))                            
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(tuple(cat_pattern_ORs.split('|')),na=False))].index)))

            if debug:
                    print 'after matching any ORs: {} matches'.format(len(matching_indices_pending))

            # strip and remove quotes from all the AND matches. escape periods (wildcards). keep a list
            cat_pattern_ANDs = [blurb.strip().strip(',').replace("\"","").replace('.',r'\.') for blurb in cat_remaining_andsplit]

            # filter and phrases for this category
            for cat_AND in cat_pattern_ANDs:
                if debug:
                    print 'filter for ' + cat_AND

                if 'CONTAINS' in match_type:    
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'NOT CONTAIN' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[~(df[cat_field].str.contains(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif '=' in match_type or 'EQUALS' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.match(pat=cat_AND,na=False,flags=re.IGNORECASE))].index)))
                elif 'BEGINS WITH' in match_type:
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.startswith(cat_AND,na=False))].index)))
                else:  # ends with
                    if joined_by_and:
                        matching_indices_pending = matching_indices_pending.intersection(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))
                    else:
                        matching_indices_pending = matching_indices_pending.union(matching_indices_defined.intersection(set(df[(df[cat_field].str.endswith(cat_AND,na=False))].index)))

            if debug:
                print 'after filtering match type: {} matches'.format(len(matching_indices_pending))

        # finished looping through contains/not contains clauses for this field/category. update defined indices
        matching_indices_defined = matching_indices_pending

        if debug:
            print 'after filtering category: {} matches'.format(len(matching_indices_defined))
        if len(matching_indices_defined) == 0:
            # print 'rule {} changed 0 records to wuc {}: {}'.format(str(rule_name), new_wuc, wuc_narr)
            return None

    #################################################################################################
    loop_time, time_dict = time_this(time_dict, find_remove_time, "loop_time")
    #################################################################################################
    # filter
    df = df.filter(items=matching_indices_defined, axis="index")
    if debug:
        print 'matching_indices: ' + str(matching_indices_defined)
        print 'filtered df: '
        print df
    # update wuc and rule
    df.loc[:,'Work_Unit_Code'] = new_wuc
    if 'WUC_Rule' in df.columns:
        df.loc[:, 'WUC_Rule'] = df.WUC_Rule.apply(func=append_rule_number, args=(rule_number,libraries))
    else:
        df.loc[:,'WUC_Rule'] = str(rule_number)
    
    # print 'rule {} changed {:0,.0f} records to wuc {}: {}'.format(str(rule_name), df.shape[0], new_wuc, wuc_narr)
            
    return df, time_dict

In [52]:
def evaluate_speed(rules, df, d_w_narratives, lib, f_names, f_lookup, ver, deb='False'):
    total_rows = 0
    time_df = pd.DataFrame(columns=['Action', 'Time'])
    start = time.time()
    
    rule_size = len(rules)
    df_size = df.shape[0]
    
    # wuc_group
    wuc_group_re = re.compile("\w+.\s?\'(\w+)")
    wuc_group_metrics = collections.defaultdict(int)
    wuc_group_indices = collections.defaultdict(set)

    # each_rule
    each_rule_re = re.compile("\w+.\s?\'([0-9\-]+)")
    each_rule_indices = collections.defaultdict(set)
    
    if ver == 'original':
        time.sleep(5)
        for rule in rules:
            df_subset = apply_rule(rule, df, d_w_narratives, lib, f_names, f_lookup)
            if df_subset is not None:
                total_rows += df_subset.shape[0]
                # update WUCs within original dataframe
                df.update(df_subset)
                # add the time_dict to the time_dict_df
                
                # update rules metrics
                wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
                wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
                each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

    else:    
        for rule in rules:
            df_subset = apply_rule_fast(rule, df, df_wuc_narratives, lib, f_names, f_lookup)
            if df_subset is not None:
                total_rows += df_subset.shape[0]
                # update WUCs within original dataframe
                df.update(df_subset)
                # update rules metrics
                wuc_group_metrics[str(wuc_group_re.match(rule).group(1))] += df_subset.shape[0]
                wuc_group_indices[str(wuc_group_re.match(rule).group(1))] = wuc_group_indices[str(wuc_group_re.match(rule).group(1))].union(set(df_subset.index))
                each_rule_indices[str(each_rule_re.match(rule).group(1))] = each_rule_indices[str(each_rule_re.match(rule).group(1))].union(set(df_subset.index))

    end = time.time()
    total_time = round(end-start,2)
    
    print("For {} rules and {} maintenance records, it took {} seconds to make {} changes made across all edits".format(rule_size, df_size, total_time, total_rows))
    
    return total_time, time_df

In [53]:
tt1, tdf1 = evaluate_speed(rules, data1, df_wuc_narratives, libraries, field_names, field_lookup, ver = 'original')
# tt2, tdf2 = evaluate_speed(rules, data2, df_wuc_narratives, libraries, field_names, field_lookup, ver = 'fast')
tdf1.head(100)
# tdf2.head(100)tdf2.head(100)
# difference = tt1 - tt2
# percent_difference = round(difference / tt1 * 100, 2)
# print("The improvements resulted in a {} improvement, or {} % upgrade".format(difference, percent_difference))

For 2 rules and 284209 maintenance records, it took 9.67 seconds to make 13 changes made across all edits


Unnamed: 0,Action,Time
