In [18]:
from typing import Dict
import pandas as pd
import requests

import bs4
from tqdm.auto import tqdm

SCHEMA_CATS_URL = "https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category"

def fetch_az_mon_categories() -> requests.models.Response:
    """Return the AzMonitor reference page."""
    return requests.get(SCHEMA_CATS_URL)


def get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag:
    """Extract the list after the security header."""
    soup = bs4.BeautifulSoup(resp.text, "html.parser")

    result = soup.find("div", class_="content")
    sec_header =result.find("h2", id="security")
    return sec_header.find_next_sibling()


def build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]:
    """From the html list, build an index of URLs."""
    table_prefix = "https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}"
    return {
        item.a.contents[0]: {
            "href": item.a.attrs.get("href"),
            "url": table_prefix.format(**(item.a.attrs)),
        }
        for item in security_cat_list.find_all("li")
    }


def read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame:
    """Read table schema from a URL."""
    table_data = pd.read_html(ref["url"])[0]
    table_data["Table"] = table
    table_data["Url"] = ref["url"]
    print(table, table_data.columns)
    return table_data


def fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame:
    """Combine schema tables into single DF."""
    print(f"Reading schemas for {len(sec_url_dict)} tables...")
    all_tables = [
        read_table_from_url(table, ref)
        for table, ref in tqdm(sec_url_dict.items())
    ]
    return pd.concat(all_tables, ignore_index=True)



sec_cat_list = get_security_category_list(fetch_az_mon_categories())
sec_url_dict = build_table_index(sec_cat_list)
sec_url_dict = {key: val for key, val in sec_url_dict.items() if key.startswith("S")}
comb_tables = fetch_table_schemas(sec_url_dict)

Reading schemas for 11 tables...


  9%|▉         | 1/11 [00:00<00:02,  3.41it/s]

SecurityAlert Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 18%|█▊        | 2/11 [00:00<00:03,  2.49it/s]

SecurityBaseline Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 27%|██▋       | 3/11 [00:01<00:03,  2.63it/s]

SecurityBaselineSummary Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 36%|███▋      | 4/11 [00:01<00:02,  2.79it/s]

SecurityDetection Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 45%|████▌     | 5/11 [00:01<00:01,  3.21it/s]

SecurityEvent Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 64%|██████▎   | 7/11 [00:02<00:01,  3.85it/s]

SecurityIoTRawEvent Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')
SecurityRecommendation Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 73%|███████▎  | 8/11 [00:02<00:00,  3.95it/s]

SentinelAudit Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 82%|████████▏ | 9/11 [00:02<00:00,  3.85it/s]

SentinelHealth Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


 91%|█████████ | 10/11 [00:02<00:00,  3.77it/s]

SigninLogs Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')


100%|██████████| 11/11 [00:03<00:00,  3.47it/s]

Syslog Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')





In [3]:
comb_tables.head()

NameError: name 'comb_tables' is not defined

In [41]:
t_dict = {}
for table, df in comb_tables.groupby("Table"):
    url = df.iloc[0]["Url"]
    t_dict[table] = {
        "url": url,
        "schema": df.drop(columns=["Table", "Url"]).to_dict(orient="records")[0]
    }

t_dict
import json
display(json.dumps(t_dict))
display(pd.read_json(json.dumps(t_dict), orient="index"))
display(pd.json_normalize(t_dict))

'{"SecurityAlert": {"url": "https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityalert", "schema": {"Column": "AlertLink", "Type": "string", "Description": NaN}}, "SecurityBaseline": {"url": "https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitybaseline", "schema": {"Column": "ActualResult", "Type": "string", "Description": NaN}}, "SecurityBaselineSummary": {"url": "https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitybaselinesummary", "schema": {"Column": "AssessmentId", "Type": "string", "Description": NaN}}, "SecurityDetection": {"url": "https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitydetection", "schema": {"Column": "AccountsSeen", "Type": "int", "Description": NaN}}, "SecurityEvent": {"url": "https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityevent", "schema": {"Column": "AccessMask", "Type": "string", "Description": NaN}}, "SecurityIoTRawEvent": {"url": "https://learn.micros

Unnamed: 0,url,schema
SecurityAlert,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AlertLink', 'Type': 'string', 'Des..."
SecurityBaseline,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'ActualResult', 'Type': 'string', '..."
SecurityBaselineSummary,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AssessmentId', 'Type': 'string', '..."
SecurityDetection,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AccountsSeen', 'Type': 'int', 'Des..."
SecurityEvent,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AccessMask', 'Type': 'string', 'De..."
SecurityIoTRawEvent,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AgentVersion', 'Type': 'string', '..."
SecurityRecommendation,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AssessedResourceId', 'Type': 'stri..."
SentinelAudit,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'CorrelationId', 'Type': 'string', ..."
SentinelHealth,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'Description', 'Type': 'string', 'D..."
SigninLogs,https://learn.microsoft.com/azure/azure-monito...,"{'Column': 'AADTenantId', 'Type': 'string', 'D..."


Unnamed: 0,SecurityAlert.url,SecurityAlert.schema.Column,SecurityAlert.schema.Type,SecurityAlert.schema.Description,SecurityBaseline.url,SecurityBaseline.schema.Column,SecurityBaseline.schema.Type,SecurityBaseline.schema.Description,SecurityBaselineSummary.url,SecurityBaselineSummary.schema.Column,...,SentinelHealth.schema.Type,SentinelHealth.schema.Description,SigninLogs.url,SigninLogs.schema.Column,SigninLogs.schema.Type,SigninLogs.schema.Description,Syslog.url,Syslog.schema.Column,Syslog.schema.Type,Syslog.schema.Description
0,https://learn.microsoft.com/azure/azure-monito...,AlertLink,string,,https://learn.microsoft.com/azure/azure-monito...,ActualResult,string,,https://learn.microsoft.com/azure/azure-monito...,AssessmentId,...,string,The operation description.,https://learn.microsoft.com/azure/azure-monito...,AADTenantId,string,,https://learn.microsoft.com/azure/azure-monito...,Computer,string,Computer that the event was collected from.


In [2]:
comb_tables["Table"].unique()

NameError: name 'comb_tables' is not defined