In [3]:
import boto3
import botocore
import sagemaker
import sys


# S3 destination used by this notebook for training input/output.
bucket = 'sciforma-performance-data'   # <--- specify a bucket you have access to
prefix = 'sagemaker/rcf-benchmarks'
execution_role = sagemaker.get_execution_role()


# Call head_bucket up front so a bad or inaccessible bucket is reported here.
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as e:
    # Known error codes get a friendly message; anything else is re-raised.
    friendly = {
        '403': "Hey! You don't have permission to access the bucket, {}.",
        '404': "Hey! Your bucket, {}, doesn't exist!",
    }.get(e.response['Error']['Code'])
    if friendly is None:
        raise
    print(friendly.format(bucket))
else:
    print('Training input/output will be stored in: s3://{}/{}'.format(bucket, prefix))


import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role()
data_key = 'performanceDataWithRole_Loc_mag_project_resource.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Read the CSV from S3 and discard its 'Unnamed: 0' column.
performanceData = pd.read_csv(data_location).drop(columns='Unnamed: 0')
# Scale ELAPSED down by 1000 (presumably milliseconds -> seconds; TODO confirm units).
performanceData['ELAPSED'] = performanceData['ELAPSED'] / 1000
# Remove every row whose scaled ELAPSED is above 1500.
performanceData = performanceData.drop(
    performanceData.index[performanceData['ELAPSED'] > 1500]
)
performanceData.head()

Training input/output will be stored in: s3://sciforma-performance-data/sagemaker/rcf-benchmarks


Unnamed: 0,STARTDATE,ELAPSED,USERID,EVENTID,EVENT_DESC,WORKSPACEID,WS_DESC,TRANSACTIONTYPE,CORE_ID,HR_ORGANIZATION,LOCATION,USER_ROLE,MAG_CODE
0,2019-04-06 16:22:34,2.324,2493686,BkTdYe4nUz,Project Resources,,Exception,1,wpmsp,The Demo Org.SP Demo Org,TX30,_Program Manager,RB4
1,2019-04-06 17:45:20,7.978,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9
2,2019-04-06 17:49:06,3.96,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9
3,2019-04-06 17:50:49,4.537,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9
4,2019-04-06 17:54:16,4.846,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9


In [6]:
# Median of each numeric column. numeric_only=True is explicit because the
# frame also holds string columns (dates, ids, org/location/role), which
# pandas >= 2.0 no longer skips silently and would raise a TypeError on.
performanceData.median(numeric_only=True)

ELAPSED            4.477500e+00
USERID             8.697462e+06
TRANSACTIONTYPE    1.000000e+00
dtype: float64

In [4]:
# Median and summary statistics of ELAPSED. Neither result is rendered,
# because only the final expression of a cell is displayed.
performanceData.ELAPSED.median()

performanceData.ELAPSED.describe()
# Earlier list-comprehension variant of the binning, kept for reference:
#performanceData["time_bin"]= ['0_to_1' if ((x <= 1) and (x >0)) else '2_to_more' for x in performanceData['ELAPSED']]  
    
def time_bin_method(df):
    """Map a row's ELAPSED value onto one of four duration labels.

    Parameters
    ----------
    df : mapping-like row (for example the row handed over by
        ``DataFrame.apply(axis=1)``) exposing an ``'ELAPSED'`` entry.

    Returns
    -------
    str or None
        ``'0_to_2'``, ``'2_to_4'``, ``'4_to_6'`` or ``'6_or_more'``.
        ``None`` when ELAPSED is not greater than zero (this includes NaN).
    """
    elapsed = df['ELAPSED']
    # Written as "not >" so that NaN is rejected too: every comparison
    # against NaN is False.
    if not elapsed > 0:
        return None
    if elapsed <= 2:
        return '0_to_2'
    if elapsed <= 4:
        return '2_to_4'
    if elapsed <= 6:
        return '4_to_6'
    return '6_or_more'

def time_bin_method_pr(df):
    """Classify one row as at-or-below versus above the 4.4 cut-off.

    Parameters
    ----------
    df : mapping-like row (for example the row handed over by
        ``DataFrame.apply(axis=1)``) exposing an ``'ELAPSED'`` entry.

    Returns
    -------
    str or None
        ``'0_to_4.4'`` when ``0 < ELAPSED <= 4.4``, ``'4.4_or_more'`` when
        ``ELAPSED > 4.4``, and ``None`` for non-positive or NaN values.
    """
    elapsed = df['ELAPSED']
    if 0 < elapsed <= 4.4:
        return '0_to_4.4'
    if elapsed > 4.4:
        return '4.4_or_more'
    # Non-positive and NaN readings stay unlabelled (as time_bin_method does)
    # instead of falling into the "slow" bucket through a catch-all else.
    return None
# Label every row with the two-way bin returned by time_bin_method_pr
# (row-wise apply), then preview the first three rows.
performanceData['time_bin'] = performanceData.apply(time_bin_method_pr, axis = 1)
performanceData.head(3)

Unnamed: 0,STARTDATE,ELAPSED,USERID,EVENTID,EVENT_DESC,WORKSPACEID,WS_DESC,TRANSACTIONTYPE,CORE_ID,HR_ORGANIZATION,LOCATION,USER_ROLE,MAG_CODE,time_bin
0,2019-04-06 16:22:34,2.324,2493686,BkTdYe4nUz,Project Resources,,Exception,1,wpmsp,The Demo Org.SP Demo Org,TX30,_Program Manager,RB4,0_to_4.4
1,2019-04-06 17:45:20,7.978,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9,4.4_or_more
2,2019-04-06 17:49:06,3.96,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9,0_to_4.4


In [5]:
# Tag each categorical value with its column kind ('org_', 'loc_', 'role_') so
# the items stay distinguishable once the columns are mixed into one itemset.
# Missing values are left as NaN (na_action='ignore') rather than being turned
# into the strings 'org_nan' / 'loc_nan' / 'role_nan', so a later dropna() can
# still remove incomplete rows.
# NOTE: this cell is not idempotent; re-running it prefixes the values again.
for column, tag in (('HR_ORGANIZATION', 'org_'),
                    ('LOCATION', 'loc_'),
                    ('USER_ROLE', 'role_')):
    performanceData[column] = performanceData[column].map(
        lambda value, tag=tag: tag + str(value), na_action='ignore'
    )
performanceData.head(3)

Unnamed: 0,STARTDATE,ELAPSED,USERID,EVENTID,EVENT_DESC,WORKSPACEID,WS_DESC,TRANSACTIONTYPE,CORE_ID,HR_ORGANIZATION,LOCATION,USER_ROLE,MAG_CODE,time_bin
0,2019-04-06 16:22:34,2.324,2493686,BkTdYe4nUz,Project Resources,,Exception,1,wpmsp,org_The Demo Org.SP Demo Org,loc_TX30,role__Program Manager,RB4,0_to_4.4
1,2019-04-06 17:45:20,7.978,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,org_HPMS.STI,loc_IN-BLR01-s1,role__Program Manager,RC9,4.4_or_more
2,2019-04-06 17:49:06,3.96,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,org_HPMS.STI,loc_IN-BLR01-s1,role__Program Manager,RC9,0_to_4.4


In [6]:
# Keep only the rows in the slow bin; these become the apriori transactions.
filters = ['4.4_or_more']
performance_subset = performanceData[performanceData.time_bin.isin(filters)]
#performance_subset = performanceData   # alternative: analyse every row

# Select the three item columns by name. The previous drop(<columns>, 1) passed
# the axis positionally, which pandas 2.0 no longer accepts. dropna() now
# returns a new frame instead of mutating a frame derived from
# performance_subset in place.
datasetApriori = performance_subset[['HR_ORGANIZATION', 'LOCATION', 'USER_ROLE']].dropna()
# One transaction per row: a list of [org, location, role] items.
datasetApriori = datasetApriori.values.tolist()

In [None]:
# Apriori analysis
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Encode the transaction lists into a table with one column per distinct item.
te = TransactionEncoder()
te.fit(datasetApriori)
te_ary = te.transform(datasetApriori)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.head(3)


In [8]:
# Count, number of distinct labels and most frequent label of the time_bin column.
performanceData['time_bin'].describe()

# Disabled earlier experiment with a higher support threshold:
#apriori(df, min_support=0.05)

count           21858
unique              2
top       4.4_or_more
freq            11084
Name: time_bin, dtype: object

In [9]:
# Mine the itemsets whose support in the encoded transactions is at least 0.01.
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
# Record how many items each itemset contains.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets.tail(10)

Unnamed: 0,support,itemsets,length
138,0.024811,"(role__Program Manager, org_HPMS.STI, loc_IN-B...",3
139,0.012901,"(org_HPMS.AAA, loc_NL-HTC01-s1, role__PMO+RM)",3
140,0.011007,"(org_HPMS.CBS, role__PM+RM, loc_NL-HTC01-s1)",3
141,0.015337,"(org_HPMS.AAA, loc_NL-NYM01-s1, role__PMO+RM)",3
142,0.012541,"(org_HPMS.IDA, role__Program Manager, loc_NL-N...",3
143,0.010646,"(role__Program Manager, loc_NL-NYM01-s1, org_H...",3
144,0.014796,"(org_T&O.TO-FO, loc_NL-NYM01-s1, role__PMO+RM)",3
145,0.046463,"(role__Core Team Support User, org_The Support...",3
146,0.026795,"(loc_US-AUS01, org_HPMS.MICR, role__PMO+RM)",3
147,0.010014,"(loc_US-CHD01, role__PM+RM, org_HPMS.AAA)",3


In [10]:
from mlxtend.frequent_patterns import association_rules
# Derive association rules from the frequent itemsets, keeping those whose
# lift meets the threshold of 2.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=2)
# Optional export of the raw rules, currently disabled:
#rules.to_csv("project_scheduler_rules_mlxtend.csv")

In [None]:
# Sort first, then build the masks from the sorted frame. Building them before
# the sort left their index in the old order, so rules[confidence] depended on
# pandas silently re-aligning the mask (and emitted a "Boolean Series key will
# be reindexed" UserWarning).
rules = rules.sort_values(by=['confidence', 'lift'], ascending=False)

# Row masks for strong rules: confidence above 0.8, lift above 20.
confidence = rules['confidence'] > 0.8
lift = rules['lift'] > 20  # available for ad-hoc filtering; not applied below

rules[confidence]

In [11]:

def unionizeFrozenset(rules):
    """Merge one rule's antecedent and consequent item sets.

    Parameters
    ----------
    rules : mapping-like row exposing ``'antecedents'`` (a frozenset) and
        ``'consequents'`` entries.

    Returns
    -------
    frozenset
        Every item that appears on either side of the rule.
    """
    left_side = rules['antecedents']
    right_side = rules['consequents']
    return frozenset.union(left_side, right_side)

# NOTE(review): this overwrites 'time_bin' with the four-way labels from
# time_bin_method. Those labels never include '4.4_or_more', so re-running the
# earlier cell that filters on that label after this one selects no rows --
# confirm the overwrite is intended.
performanceData['time_bin'] = performanceData.apply(time_bin_method, axis = 1)



# Add, per rule, the union of its antecedent and consequent item sets.
rules['Combined_frozenset']=rules.apply(unionizeFrozenset, axis = 1)

In [12]:
# Preview the first rules together with their Combined_frozenset column.
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Combined_frozenset
0,(org_HPMS.STI),(loc_AT-GRK01-s1),0.388578,0.173944,0.148051,0.381008,2.190399,0.08046,1.334517,"(org_HPMS.STI, loc_AT-GRK01-s1)"
1,(loc_AT-GRK01-s1),(org_HPMS.STI),0.173944,0.388578,0.148051,0.851141,2.190399,0.08046,4.107392,"(org_HPMS.STI, loc_AT-GRK01-s1)"
2,(loc_BE-LEU01-s1),(org_HPMS.SIP),0.012541,0.046644,0.012541,1.0,21.439072,0.011956,inf,"(loc_BE-LEU01-s1, org_HPMS.SIP)"
3,(org_HPMS.SIP),(loc_BE-LEU01-s1),0.046644,0.012541,0.012541,0.268859,21.439072,0.011956,1.350573,"(loc_BE-LEU01-s1, org_HPMS.SIP)"
4,(org_HPMS.STI),(loc_CN-SHA01-s1),0.388578,0.015698,0.014435,0.037149,2.366423,0.008335,1.022278,"(org_HPMS.STI, loc_CN-SHA01-s1)"


In [13]:
# Drop the directional antecedents/consequents columns; Combined_frozenset
# remains as the identifier of each rule's item set.
uninionizedRules = rules.drop(columns=['antecedents','consequents'])
# Summary statistics of the rule metrics.
uninionizedRules.describe()

Unnamed: 0,antecedent support,consequent support,support,confidence,lift,leverage,conviction
count,190.0,190.0,190.0,190.0,190.0,190.0,190.0
mean,0.106447,0.106447,0.024698,0.448751,6.593655,0.017472,inf
std,0.114415,0.114415,0.022822,0.323818,6.027551,0.014013,
min,0.010736,0.010736,0.010014,0.027165,2.005313,0.005428,1.014368
25%,0.026232,0.026232,0.011954,0.171457,2.532195,0.008868,1.143885
50%,0.05544,0.05544,0.013894,0.378125,4.363969,0.010927,1.477239
75%,0.127256,0.127256,0.030314,0.732444,7.883607,0.019754,2.839563
max,0.388578,0.388578,0.148051,1.0,34.433288,0.08046,inf


In [14]:
# Collapse rules that share the same combined item set (e.g. A -> B and
# B -> A) into one row holding the mean of each metric.
uninionizedRules = uninionizedRules.groupby(['Combined_frozenset']).mean()


In [39]:
# Turn the index back into an ordinary column and renumber the rows.
uninionizedRules = uninionizedRules.reset_index()
uninionizedRules.head()

Unnamed: 0,Combined_frozenset,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(org_HPMS.STI, loc_AT-GRK01-s1)",0.281261,0.281261,0.148051,0.616074,2.190399,0.08046,2.720954
1,"(role__PM+RM, org_HPMS.AMP)",0.157254,0.157254,0.013443,0.473948,3.013908,0.008983,4.127023
2,"(org_HPMS.CBS, role__PM+RM)",0.179222,0.179222,0.036359,0.369724,2.062933,0.018734,1.452542
3,"(org_HPMS.SEN, role__PMO+RM)",0.123827,0.123827,0.01642,0.370146,2.989213,0.010927,1.691908
4,"(org_T&O.SSC-GTI, role__PMO+RM)",0.127301,0.127301,0.018495,0.334322,2.626237,0.011453,1.465727


In [163]:
# Write the per-item-set rule metrics to a CSV file in the working directory
# (the row index is written as the first column).
uninionizedRules.to_csv("project_resource_Union_rules_mlxtend.csv")

In [66]:
import re
# Distinct values of each categorical column; calculatemean uses them as the
# "match everything" filter when an item set has no item of that kind.
uniqueRole = performanceData.USER_ROLE.unique()
uniqueHROrg = performanceData.HR_ORGANIZATION.unique()
uniqueLOCATION = performanceData.LOCATION.unique()

def search(list_v,subs):
    """Return the first element of ``list_v`` that starts with ``subs``.

    Elements are checked in iteration order. ``None`` is returned when no
    element matches, which includes an empty ``list_v``.
    """
    matches = (candidate for candidate in list_v if candidate.startswith(subs))
    return next(matches, None)

def calculatemean(df):
    """Mean ELAPSED over the performanceData rows matching a rule's item set.

    Parameters
    ----------
    df : mapping-like row (for example the row handed over by
        ``DataFrame.apply(axis=1)``) whose ``'Combined_frozenset'`` entry is
        an iterable of item strings tagged ``'role_'``, ``'org_'`` or
        ``'loc_'``.

    Returns
    -------
    float
        Mean of ``performanceData.ELAPSED`` over the rows whose USER_ROLE,
        HR_ORGANIZATION and LOCATION each match one of the items of the
        corresponding kind. A kind with no item in the set places no
        restriction on its column. NaN when no row matches.

    Notes
    -----
    Reads the module-level ``performanceData`` frame.
    """
    items = df['Combined_frozenset']
    mask = None
    for tag, column in (('role_', 'USER_ROLE'),
                        ('org_', 'HR_ORGANIZATION'),
                        ('loc_', 'LOCATION')):
        # The tag is a prefix, so anchor the test at the start of the item.
        # An unanchored search would also pick up values that merely contain
        # the tag (e.g. an organisation whose name contains 'loc_').
        wanted = [item for item in items if item.startswith(tag)]
        if not wanted:
            # No item of this kind: leave the column unrestricted.
            continue
        column_mask = performanceData[column].isin(wanted)
        mask = column_mask if mask is None else mask & column_mask

    matching_rows = performanceData if mask is None else performanceData[mask]
    return matching_rows.ELAPSED.mean()
    
# For every item set, store the mean ELAPSED of the matching performance rows.
uninionizedRules['time_elapsed_aggregate'] = uninionizedRules.apply(calculatemean, axis = 1)
uninionizedRules.head(30)

Unnamed: 0,Combined_frozenset,antecedent support,consequent support,support,confidence,lift,leverage,conviction,time_elapsed_aggregate
0,"(org_HPMS.STI, loc_AT-GRK01-s1)",0.281261,0.281261,0.148051,0.616074,2.190399,0.08046,2.720954,7.233511
1,"(role__PM+RM, org_HPMS.AMP)",0.157254,0.157254,0.013443,0.473948,3.013908,0.008983,4.127023,13.099603
2,"(org_HPMS.CBS, role__PM+RM)",0.179222,0.179222,0.036359,0.369724,2.062933,0.018734,1.452542,7.364921
3,"(org_HPMS.SEN, role__PMO+RM)",0.123827,0.123827,0.01642,0.370146,2.989213,0.010927,1.691908,6.502451
4,"(org_T&O.SSC-GTI, role__PMO+RM)",0.127301,0.127301,0.018495,0.334322,2.626237,0.011453,1.465727,6.259602
5,"(org_T&O.TO-FO, role__PMO+RM)",0.124188,0.124188,0.025352,0.556837,4.483819,0.019698,inf,4.547097
6,"(role__Core Team Support User, org_The Support...",0.05138,0.05138,0.046463,0.91266,17.762821,0.043848,inf,9.220211
7,"(org_HPMS.STI, role__PM+RM, loc_AT-GRK01-s1)",0.221039,0.221039,0.048448,0.515123,2.330458,0.027659,3.77792,5.882204
8,"(org_HPMS.STI, loc_AT-GRK01-s1, role__PMO+RM)",0.102986,0.102986,0.022465,0.415279,4.032371,0.016894,1.939016,8.346808
9,"(role__Program Manager, org_HPMS.STI, loc_AT-G...",0.196905,0.196905,0.066582,0.463793,2.396075,0.038562,1.99712,7.743782


In [67]:
# Write the item sets with their metrics and time_elapsed_aggregate to a CSV
# file in the working directory (the row index is written as the first column).
uninionizedRules.to_csv("project_resource_set_w_time_aggre.csv")