In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
# Columns kept for the analysis; every other column in the CSV is dropped.
reqd = [
    "ResourceGroupId",
    "Feature",
    "CategoryName",
    "VerificationResult",
    "ControlStringId",
]
# Load the file and narrow it to those columns, in the order listed above.
df = pd.read_csv("data.csv").loc[:, reqd]

In [3]:
# Preview the first rows to confirm the column selection worked.
df.head()


Unnamed: 0,ResourceGroupId,Feature,CategoryName,VerificationResult,ControlStringId
0,2,SQLDatabase,Storage,Verify,Azure_SQLDatabase_AuthZ_Firewall_Deny_Access_A...
1,2,SQLDatabase,DataProcessing,Verify,Azure_SQLDatabase_AuthZ_Firewall_Deny_Access_A...
2,2,SQLDatabase,Reporting,Verify,Azure_SQLDatabase_AuthZ_Firewall_Deny_Access_A...
3,2,AppService,Web Front End,Verify,Azure_AppService_AuthZ_Grant_Min_RBAC_Access
4,2,AppService,APIs,Verify,Azure_AppService_AuthZ_Grant_Min_RBAC_Access


In [4]:
# Map each resource group id to the set of distinct features seen in its rows.
feature_combinations = defaultdict(set)
for group_id, feature in zip(df["ResourceGroupId"], df["Feature"]):
    feature_combinations[group_id].add(feature)

In [5]:
# Inspect the feature set collected for each resource group.
feature_combinations

defaultdict(set,
            {2: {'AppService',
              'Automation',
              'EventHub',
              'KeyVault',
              'SQLDatabase',
              'Storage',
              'StreamAnalytics'},
             6: {'AppService',
              'Automation',
              'EventHub',
              'KeyVault',
              'SQLDatabase',
              'Storage',
              'StreamAnalytics'},
             8: {'AppService', 'KeyVault', 'LogicApps', 'Storage'},
             260: {'AppService', 'EventHub', 'KeyVault', 'Storage'},
             261: {'AppService', 'EventHub', 'KeyVault', 'Storage'},
             361: {'AppService', 'KeyVault', 'LogicApps', 'Storage'},
             362: {'AppService',
              'Automation',
              'EventHub',
              'KeyVault',
              'SQLDatabase',
              'Storage',
              'StreamAnalytics'},
             363: {'AppService', 'KeyVault', 'LogicApps', 'Storage'},
             364: {'AppService',
     

In [6]:
# Per resource group, tally how many rows were seen ("Totals"), how many had
# VerificationResult == "Passed" ("Success"), and how many had any other value ("Fails").
failures = defaultdict(dict)
for _, record in df.iterrows():
    stats = failures[record["ResourceGroupId"]]
    # Make sure all three counters exist before touching any of them.
    for counter in ("Totals", "Fails", "Success"):
        stats.setdefault(counter, 0)
    stats["Totals"] += 1
    outcome = "Success" if record["VerificationResult"] == "Passed" else "Fails"
    stats[outcome] += 1

In [53]:
# Integer multiplier for each feature name, consumed by get_feature_hash.
# The keys are the 13 feature names that df["Feature"].unique() reports in this notebook.
# NOTE(review): the values are presumably distinct primes so that a product identifies
# a feature set - confirm before adding new entries.
feature_hash_map = {
    "SQLDatabase": 3940427,
    "AppService": 3940763,
    "StreamAnalytics": 1414297,
    "KeyVault": 3125831,
    "Storage": 5392313,
    "Automation": 6305339,
    "EventHub": 7368719,
    "LogicApps": 7368629,
    "TrafficManager": 7368787,
    "VirtualNetwork": 2523893,
    "DataLakeStore": 4284113,
    "CosmosDB": 5602973,
    "RedisCache": 5603713,
}

# Integer multiplier for each category name, consumed by get_category_hash.
# Covers exactly the categories that parent_map lists for the features present in
# feature_hash_map. "Backend Processing" appears in parent_map but has no entry here,
# so get_category_hash raises KeyError for it.
# The spelling "Commuincation Hub" matches the spelling used in parent_map; do not
# correct it in one place only.
# NOTE(review): values look like consecutive primes just above 1,000,000 - not verified here.
category_hash_map = {
    "Storage" : 1000003,
    "DataProcessing" : 1000033,
    "Reporting" : 1000037,
    "Web Front End" : 1000039,
    "APIs" : 1000081,
    "Security Infra" : 1000099,
    "SubscriptionCore" : 1000117,
    "Commuincation Hub" : 1000121,
    "Hybrid" : 1000133,
    "Network Isolation" : 1000151,
    "Cache" : 1000159
}

# Hand-written lookup from feature name to the list of category names it may belong to.
# It lists more features than feature_hash_map (e.g. "DataFactory", "Search"), and uses
# the category "Backend Processing", which has no entry in category_hash_map.
# NOTE(review): another cell in this notebook rebinds parent_map from a DataFrame;
# whichever of the two cells ran last determines the value seen by later code.
parent_map = {
    "AppService": ["Web Front End", "APIs"],
    "SQLDatabase": ["Storage", "DataProcessing", "Reporting"],
    "Storage": ["Storage", "Reporting", "DataProcessing"],
    "LogicApps": ["DataProcessing"],
    "DataFactory": ["DataProcessing"],
    "DataLakeAnalytics": ["DataProcessing", "Reporting"],
    "DataLakeStore": ["Storage", "Reporting", "DataProcessing"],
    "NotificationHub": ["Commuincation Hub"],
    "ServiceFabric": ["Web Front End", "APIs", "Backend Processing"],
    "Search": ["APIs", "Backend Processing"],
    "VirtualMachine": ["Web Front End", "APIs", "Backend Processing", "DataProcessing"],
    "VirtualNetwork": ["Network Isolation", "Hybrid"],
    "AnalysisServices": ["DataProcessing", "Reporting"],
    "Batch": ["Backend Processing"],
    "RedisCache": ["Cache"],
    "EventHub": ["Commuincation Hub", "Hybrid"],
    "ODG": ["Hybrid"],
    "TrafficManager": ["Network Isolation"],
    "ERvNet": ["Hybrid", "Network Isolation"],
    "Automation": ["SubscriptionCore"],
    "CosmosDB": ["Storage", "DataProcessing", "Reporting"],
    "StreamAnalytics": ["DataProcessing", "Reporting"],
    "CloudService": ["Web Front End", "APIs", "Backend Processing"],
    "LoadBalancer": ["Network Isolation"],
    "APIConnection": ["DataProcessing"],
    "BotService": ["APIs", "Commuincation Hub", "Web Front End"],
    "ContainerInstances": ["Web Front End", "APIs", "DataProcessing", "Backend Processing"],
    "DataFactoryV2": ["DataProcessing", "Backend Processing"],
    "KeyVault": ["Security Infra"]
}

# Modulus applied after every multiplication in get_feature_hash and get_category_hash.
# NOTE(review): the name says prime, but primality is not checked anywhere in view - confirm.
BIG_PRIME = 824633720831

def get_feature_hash(features):
    """Fold a collection of feature names into a single integer.

    Starts from 1 and, for each name, multiplies by its value in
    ``feature_hash_map`` and reduces modulo ``BIG_PRIME``. Modular
    multiplication is commutative, so the result does not depend on
    iteration order and sets can be passed directly. An empty collection
    yields 1; a repeated name is multiplied in once per occurrence.

    Args:
        features: Iterable of keys of ``feature_hash_map``.

    Returns:
        int: The running product modulo ``BIG_PRIME``.

    Raises:
        KeyError: If a name is missing from ``feature_hash_map``.
    """
    product = 1
    for name in features:
        product = (product * feature_hash_map[name]) % BIG_PRIME
    return product

def get_category_hash(categories):
    """Fold a collection of category names into a single integer.

    Starts from 1 and, for each name, multiplies by its value in
    ``category_hash_map`` and reduces modulo ``BIG_PRIME``. The result is
    independent of iteration order. An empty collection yields 1; a
    repeated name is multiplied in once per occurrence.

    Args:
        categories: Iterable of keys of ``category_hash_map``.

    Returns:
        int: The running product modulo ``BIG_PRIME``.

    Raises:
        KeyError: If a name is missing from ``category_hash_map``.
    """
    product = 1
    for name in categories:
        product = (product * category_hash_map[name]) % BIG_PRIME
    return product

In [9]:
# List the distinct feature names present in the data.
df["Feature"].unique()

array(['SQLDatabase', 'AppService', 'StreamAnalytics', 'KeyVault',
       'Storage', 'Automation', 'EventHub', 'LogicApps', 'TrafficManager',
       'VirtualNetwork', 'DataLakeStore', 'CosmosDB', 'RedisCache'],
      dtype=object)

In [10]:
# Spot-check the hash of one hand-picked feature combination.
get_feature_hash(['AppService', 'StreamAnalytics', 'KeyVault', 'Storage', 'SQLDatabase'])

397933297130

In [40]:
# Group resource groups by the hash of their feature set and count how many
# resource groups share each set.
master_hash_table = dict()
for res_id, features in feature_combinations.items():
    feature_hash = get_feature_hash(features)
    # NOTE(review): "info" is taken from the first resource group that produces this
    # hash and is not aggregated over the groups counted in "counts" - confirm intent.
    entry = master_hash_table.setdefault(
        feature_hash,
        {"features": features, "counts": 0, "info": failures[res_id]},
    )
    entry["counts"] += 1

In [42]:
# Dump every grouped entry, separated by a row of 100 asterisks.
separator = "*" * 100
for entry in master_hash_table.values():
    print(entry)
    print(separator)

{'features': {'StreamAnalytics', 'Storage', 'Automation', 'SQLDatabase', 'EventHub', 'KeyVault', 'AppService'}, 'counts': 6, 'info': {'Totals': 709, 'Fails': 448, 'Success': 261}}
****************************************************************************************************
{'features': {'AppService', 'LogicApps', 'KeyVault', 'Storage'}, 'counts': 6, 'info': {'Totals': 285, 'Fails': 193, 'Success': 92}}
****************************************************************************************************
{'features': {'KeyVault', 'Storage', 'EventHub', 'AppService'}, 'counts': 2, 'info': {'Totals': 363, 'Fails': 221, 'Success': 142}}
****************************************************************************************************
{'features': {'Automation', 'EventHub'}, 'counts': 1, 'info': {'Totals': 34, 'Fails': 31, 'Success': 3}}
****************************************************************************************************
{'features': {'TrafficManager'}, 'counts': 5, '

In [59]:
# Toy inputs for trying out recurse: three features and placeholder parent
# labels for each of them.
list_ = ["StreamAnalytics", "Storage", "Automation"]

temp_par = dict(
    StreamAnalytics=["a", "b"],
    Storage=["c", "d"],
    Automation=["e"],
)


def recurse(my_list, cache):
    """Print every string formed by picking one parent label per feature.

    Takes the first feature of ``my_list``, tries each entry of
    ``temp_par[feature]`` in turn, appends it to ``cache`` and recurses on
    the remaining features. Once no features are left, the accumulated
    string is printed, followed by a line reading ``END``.

    Args:
        my_list: Sequence of keys of ``temp_par``; it is indexed and sliced,
            so it must support ``[0]`` and ``[1:]``.
        cache: String accumulated so far; pass ``""`` on the initial call.

    Returns:
        None. Results are written to stdout.

    Raises:
        KeyError: If a feature is missing from ``temp_par``.
    """
    if not my_list:
        print(cache)
        print("END")
        return
    head, rest = my_list[0], my_list[1:]
    for parent in temp_par[head]:
        # Concatenate the label; multiplying two strings raises TypeError.
        recurse(rest, cache + parent)

# Start the expansion over the toy feature list with an empty accumulator.
recurse(list_, "")

ace
END
ade
END
bce
END
bde
END


In [36]:
# Rebuild parent_map as feature -> list of categories, appending one category per row
# of `pdf` (duplicates are kept because values are lists).
# NOTE(review): `pdf` is not defined in the visible part of this notebook, and it is read
# with a "Category" column whereas `df` uses "CategoryName" - confirm where it comes from.
# NOTE(review): this rebinds parent_map, replacing the hand-written dict defined in
# another cell; execution order decides which one later code sees.
parent_map = defaultdict(list)

for indx, row in pdf.iterrows():
    parent_map[row["Feature"]].append(row["Category"])