In [148]:
import boto3
import botocore
import sagemaker
import sys


bucket = 'sciforma-performance-data'   # <--- specify a bucket you have access to
prefix = 'sagemaker/rcf-benchmarks'
execution_role = sagemaker.get_execution_role()


# Probe the bucket with a HEAD request and report the outcome. Unknown client
# errors are re-raised; the known failure modes only print a message.
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as client_error:
    error_code = client_error.response['Error']['Code']
    known_failures = {
        '403': "Hey! You don't have permission to access the bucket, {}.",
        '404': "Hey! Your bucket, {}, doesn't exist!",
    }
    if error_code not in known_failures:
        raise
    print(known_failures[error_code].format(bucket))
else:
    print('Training input/output will be stored in: s3://{}/{}'.format(bucket, prefix))


import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role()
data_key = 'performanceDataWithRole_Loc_mag_project_resource.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Read the CSV from S3 and discard the 'Unnamed: 0' column it carries.
performanceData = pd.read_csv(data_location).drop(columns='Unnamed: 0')
# Rescale ELAPSED by 1/1000 (presumably milliseconds -> seconds; TODO confirm units).
performanceData['ELAPSED'] = performanceData['ELAPSED'] / 1000
# Remove rows whose rescaled ELAPSED is above 1500.
outlier_index = performanceData[performanceData['ELAPSED'] > 1500].index
performanceData = performanceData.drop(outlier_index)
performanceData.head()

Training input/output will be stored in: s3://sciforma-performance-data/sagemaker/rcf-benchmarks


Unnamed: 0,STARTDATE,ELAPSED,USERID,EVENTID,EVENT_DESC,WORKSPACEID,WS_DESC,TRANSACTIONTYPE,CORE_ID,HR_ORGANIZATION,LOCATION,USER_ROLE,MAG_CODE
0,2019-04-06 16:22:34,2.324,2493686,BkTdYe4nUz,Project Resources,,Exception,1,wpmsp,The Demo Org.SP Demo Org,TX30,_Program Manager,RB4
1,2019-04-06 17:45:20,7.978,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9
2,2019-04-06 17:49:06,3.96,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9
3,2019-04-06 17:50:49,4.537,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9
4,2019-04-06 17:54:16,4.846,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9


In [3]:
# NOTE(review): this command fails -- pip finds no distribution named "git"
# (see the captured output of this cell). Confirm what was intended here
# (possibly the GitPython package) or delete the cell.
!pip install git

Collecting git
[31m  Could not find a version that satisfies the requirement git (from versions: )[0m
[31mNo matching distribution found for git[0m
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [150]:
# Median of the numeric columns only. The frame also holds string columns
# (STARTDATE, EVENT_DESC, LOCATION, ...); without numeric_only=True recent
# pandas versions raise instead of silently skipping them.
performanceData.median(numeric_only=True)

ELAPSED            4.477500e+00
USERID             8.697462e+06
TRANSACTIONTYPE    1.000000e+00
dtype: float64

In [152]:
# NOTE(review): a notebook cell renders only its final expression, so the two
# results below are computed and then discarded (this cell's output shows only
# the head() table).
performanceData.ELAPSED.median()

performanceData.ELAPSED.describe()
# Earlier list-comprehension attempt at binning, kept for reference:
#performanceData["time_bin"]= ['0_to_1' if ((x <= 1) and (x >0)) else '2_to_more' for x in performanceData['ELAPSED']]  
    
def time_bin_method(df):
    """Return the four-way elapsed-time bin label for one record.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single record exposing an ``'ELAPSED'`` entry (despite the name this
        is one row, as supplied by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str or None
        ``'0_to_2'``, ``'2_to_4'``, ``'4_to_6'`` or ``'6_or_more'``. Upper
        bounds are inclusive. ``None`` is returned for a negative or NaN
        ``ELAPSED``.
    """
    elapsed = df['ELAPSED']
    # NaN fails every comparison, so this guard rejects both NaN and negatives.
    if not (elapsed >= 0):
        return None
    # An elapsed time of exactly 0 belongs to the lowest bin; it previously
    # matched no branch and fell through to an implicit None.
    if elapsed <= 2:
        return '0_to_2'
    if elapsed <= 4:
        return '2_to_4'
    if elapsed <= 6:
        return '4_to_6'
    return '6_or_more'

def time_bin_method_pr(df):
    """Return the two-way elapsed-time bin label for one record.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single record exposing an ``'ELAPSED'`` entry (despite the name this
        is one row, as supplied by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str or None
        ``'0_to_4.4'`` when ``0 <= ELAPSED <= 4.4``, ``'4.4_or_more'`` when
        ``ELAPSED > 4.4``, and ``None`` for a negative or NaN ``ELAPSED``.
    """
    elapsed = df['ELAPSED']
    # Zero, negative and NaN values used to land in the catch-all ``else`` and
    # were mislabelled '4.4_or_more'. NaN fails every comparison, so this guard
    # rejects both NaN and negatives.
    if not (elapsed >= 0):
        return None
    if elapsed <= 4.4:
        return '0_to_4.4'
    return '4.4_or_more'
# Label every row with its two-way bin (row-wise apply), then preview the frame.
two_way_bins = performanceData.apply(time_bin_method_pr, axis='columns')
performanceData['time_bin'] = two_way_bins
performanceData.head()

Unnamed: 0,STARTDATE,ELAPSED,USERID,EVENTID,EVENT_DESC,WORKSPACEID,WS_DESC,TRANSACTIONTYPE,CORE_ID,HR_ORGANIZATION,LOCATION,USER_ROLE,MAG_CODE,time_bin
0,2019-04-06 16:22:34,2.324,2493686,BkTdYe4nUz,Project Resources,,Exception,1,wpmsp,The Demo Org.SP Demo Org,TX30,_Program Manager,RB4,0_to_4.4
1,2019-04-06 17:45:20,7.978,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9,4.4_or_more
2,2019-04-06 17:49:06,3.96,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9,0_to_4.4
3,2019-04-06 17:50:49,4.537,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9,4.4_or_more
4,2019-04-06 17:54:16,4.846,8929278,BkTdYe4nUz,Project Resources,,Exception,1,NXA24209,HPMS.STI,IN-BLR01-s1,_Program Manager,RC9,4.4_or_more


In [153]:
# Restrict the analysis to rows labelled with the '4.4_or_more' bin.
filters = ['4.4_or_more']
performance_subset = performanceData[performanceData.time_bin.isin(filters)]

# Attributes that together form one Apriori "transaction" per row.
apriori_columns = ['HR_ORGANIZATION', 'LOCATION', 'USER_ROLE']

# The original ``drop(labels, 1)`` passed ``axis`` positionally, which pandas
# 2.0 no longer accepts; the ``columns=`` keyword works on old and new versions.
# Chaining ``dropna()`` (instead of ``inplace=True``) avoids mutating a frame
# derived from ``performance_subset``.
datasetApriori = (
    performance_subset
    .drop(columns=performance_subset.columns.difference(apriori_columns))
    .dropna()
    .values.tolist()
)

In [154]:
# Apriori analysis
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te.fit(datasetApriori)
te_ary = te.transform(datasetApriori)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df


Unnamed: 0,<No Rights>,AT-GRK01-s1,BE-LEU01-s1,BR-CPQ01,CA-KAN01,CN-BJS01,CN-CDU01,CN-CQI02,CN-SHA01-s1,CN-SHA02,...,_PLM+RM,_PM+RM,_PM+RM+Portfolio,_PMO+RM,_PMO+RM View Only,_Product Engineering Manager,_Product Line Marketer,_Program Manager,_Resource Controller,_Resource Manager
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
6,False,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [155]:
# Summary of the bin labels: count, number of distinct labels, most frequent label.
performanceData.time_bin.describe()

#apriori(df, min_support=0.05)

count           21858
unique              2
top       4.4_or_more
freq            11084
Name: time_bin, dtype: object

In [156]:
# Mine itemsets with support of at least 0.001, reported by item name, and
# record how many items each itemset contains.
frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets.tail(10)

Unnamed: 0,support,itemsets,length
385,0.001002,"(The Support IT.A Support PMO, _Core Team User...",3
386,0.005737,"(_Program Manager, RU-MOW02, T&O.TO-PT)",3
387,0.003187,"(SG-SGP01-s4, _Program Manager, T&O.SSC-GTI)",3
388,0.009471,"(_PMO+RM, US-AUS01, T&O.SSC-GTI)",3
389,0.001548,"(US-AUS01, _Program Manager, T&O.SSC-GTI)",3
390,0.00173,"(_PMO+RM, US-CHD01, T&O.SSC-QAL)",3
391,0.002277,"(_PMO+RM, US-AUS01, T&O.TO-FO)",3
392,0.007012,"(_PMO+RM, US-AUS02, T&O.TO-FO)",3
393,0.001366,"(_PMO+RM, T&O.TO-FO, US-CHD01)",3
394,0.001821,"(T&O.TO-PT, _PM+RM, US-CHD01)",3


In [157]:
from mlxtend.frequent_patterns import association_rules
# Derive association rules from the frequent itemsets, thresholded on lift at 2.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=2,
)
#rules.to_csv("project_scheduler_rules_mlxtend.csv")

In [121]:
# Sort first, then build the boolean masks from the sorted frame. Originally the
# masks were computed before sorting, so their index order no longer matched
# ``rules`` and pandas had to reindex the mask (emitting a UserWarning).
rules = rules.sort_values(by=['confidence', 'lift'], ascending=False)

confidence = rules['confidence'] > 0.8
lift = rules['lift'] > 20  # NOTE(review): defined but not applied in this cell

# Show only the high-confidence rules, strongest first.
rules[confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
176,"(The Support IT.A Support PMO, 6_or_more)",(NXP-REMOTE),0.020883,0.033365,0.020883,1.000000,29.971223,0.020187,inf
293,"(_Core Team Support User, NXP-REMOTE)",(The Support IT.A Support PMO),0.030725,0.033365,0.030725,1.000000,29.971223,0.029700,inf
394,"(4_to_6, _Core Team Support User, The Support ...",(NXP-REMOTE),0.010802,0.033365,0.010802,1.000000,29.971223,0.010441,inf
461,"(_Core Team Support User, 6_or_more)","(The Support IT.A Support PMO, NXP-REMOTE)",0.019923,0.033365,0.019923,1.000000,29.971223,0.019258,inf
459,"(_Core Team Support User, 6_or_more, NXP-REMOTE)",(The Support IT.A Support PMO),0.019923,0.033365,0.019923,1.000000,29.971223,0.019258,inf
458,"(_Core Team Support User, 6_or_more, The Suppo...",(NXP-REMOTE),0.019923,0.033365,0.019923,1.000000,29.971223,0.019258,inf
395,"(4_to_6, _Core Team Support User, NXP-REMOTE)",(The Support IT.A Support PMO),0.010802,0.033365,0.010802,1.000000,29.971223,0.010441,inf
120,"(4_to_6, The Support IT.A Support PMO)",(NXP-REMOTE),0.012482,0.033365,0.012482,1.000000,29.971223,0.012066,inf
48,(_Core Team Support User),(NXP-REMOTE),0.030725,0.033365,0.030725,1.000000,29.971223,0.029700,inf
54,(_Core Team Support User),(The Support IT.A Support PMO),0.030725,0.033365,0.030725,1.000000,29.971223,0.029700,inf


In [158]:

def unionizeFrozenset(rules):
    """Merge one rule's antecedent and consequent itemsets into one frozenset.

    ``rules`` is a single record exposing ``'antecedents'`` and
    ``'consequents'`` entries; the return value is their union.
    """
    antecedent_items = rules['antecedents']
    consequent_items = rules['consequents']
    return frozenset.union(antecedent_items, consequent_items)

# Overwrite the two-way labels in 'time_bin' with the four-way bins.
four_way_bins = performanceData.apply(time_bin_method, axis='columns')
performanceData['time_bin'] = four_way_bins

# Tag each rule with the union of its antecedent and consequent itemsets.
combined_itemsets = rules.apply(unionizeFrozenset, axis='columns')
rules['Combined_frozenset'] = combined_itemsets

In [159]:
# Summary statistics of the numeric rule metrics (support, confidence, lift, ...).
rules.describe()

Unnamed: 0,antecedent support,consequent support,support,confidence,lift,leverage,conviction
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.076096,0.076096,0.009293,0.377477,19.355419,0.006767,inf
std,0.108714,0.108714,0.015228,0.359085,73.654193,0.009897,
min,0.001002,0.001002,0.001002,0.002948,2.010035,0.000531,1.001587
25%,0.009858,0.009858,0.001821,0.059939,2.698034,0.001398,1.047332
50%,0.027775,0.027775,0.004827,0.228155,4.442152,0.0031,1.236243
75%,0.072489,0.072489,0.010291,0.678571,12.614726,0.00769,2.612386
max,0.392223,0.392223,0.14944,1.0,998.272727,0.080575,inf


In [160]:
# Drop the separate antecedent/consequent columns; the remaining columns are
# summarised below.
uninionizedRules = rules.drop(['antecedents', 'consequents'], axis='columns')
uninionizedRules.describe()

Unnamed: 0,antecedent support,consequent support,support,confidence,lift,leverage,conviction
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.076096,0.076096,0.009293,0.377477,19.355419,0.006767,inf
std,0.108714,0.108714,0.015228,0.359085,73.654193,0.009897,
min,0.001002,0.001002,0.001002,0.002948,2.010035,0.000531,1.001587
25%,0.009858,0.009858,0.001821,0.059939,2.698034,0.001398,1.047332
50%,0.027775,0.027775,0.004827,0.228155,4.442152,0.0031,1.236243
75%,0.072489,0.072489,0.010291,0.678571,12.614726,0.00769,2.612386
max,0.392223,0.392223,0.14944,1.0,998.272727,0.080575,inf


In [161]:
# Collapse rules that share the same combined itemset by averaging their metrics.
uninionizedRules = uninionizedRules.groupby('Combined_frozenset').mean()


In [162]:
# Preview the per-itemset averaged metrics (indexed by Combined_frozenset).
uninionizedRules.head()

Unnamed: 0_level_0,antecedent support,consequent support,support,confidence,lift,leverage,conviction
Combined_frozenset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(<No Rights>, GB-TON01)",0.007877,0.007877,0.00255,0.596552,75.731034,0.002516,inf
"(HPMS.AAA, _PMO+RM, FR-TLS02)",0.059011,0.059011,0.010291,0.359286,5.834917,0.008323,5.212335
"(HPMS.AAA, FR-TLS02, _Program Manager)",0.065887,0.065887,0.032693,0.60599,9.714034,0.0291,3.721155
"(_Program Manager, FR-TLS02, HPMS.IDA)",0.185958,0.185958,0.001275,0.50172,2.698034,0.000802,inf
"(HPMS.SEN, FR-TLS02, _Program Manager)",0.107003,0.107003,0.001366,0.305386,3.29398,0.000937,inf


In [163]:
# Write the grouped rule metrics to a CSV file at this relative path.
uninionizedRules.to_csv("project_resource_Union_rules_mlxtend.csv")