## MicroVision

### Log Parsing Module using Drain3 
### Log Enrichment - Metadata

In [1]:
import os

if os.getcwd() == '/Users/matildamwendwa/Desktop/Desktop - Admin‚Äôs MacBook Pro/Python_Projects/microvision/notebooks':
    os.chdir('/Users/matildamwendwa/Desktop/Desktop - Admin‚Äôs MacBook Pro/Python_Projects/microvision')
    print("Changed!!")

print("Current working directory:", os.getcwd())

Changed!!
Current working directory: /Users/matildamwendwa/Desktop/Desktop - Admin‚Äôs MacBook Pro/Python_Projects/microvision


#### Install & Import Dependencies

In [2]:
%pip install drain3 pandas matplotlib tqdm --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import json
from tqdm import tqdm
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig
from drain3.template_miner_config import MaskingInstruction

print("‚úÖ Drain3 and dependencies imported successfully.")

‚úÖ Drain3 and dependencies imported successfully.


#### Configurations and Setup

In [5]:
config = {
        "DATA_DIR": "data",
        "DATASET_NAME": "OpenStack",
        # "DATASET_LOG": "_full.log",
        "DATASET_LOG": "_2k.log",

        "OUTPUT_CSV": "_structured.csv",
        "TEMPLATES_CSV": "_templates.csv",
        "CLEANED_CSV": "_cleaned_templates.csv",
        "MAX_LINES": None,  # Set to None to process all lines
        # "MAX_LINES": 2000,
        "PERSISTENCE_PATH": "persistence",
        "DRAIN_PATH": "drain3_state",
        # "ENRICHED_CSV": "_enriched.csv",
}


DATASET_PATH = f"{config['DATA_DIR']}/{config['DATASET_NAME']+config['DATASET_LOG']}"

# The output file will contain the structured log level csv
OUTPUT_CSV = f"{config['DATA_DIR']}/{config['DATASET_NAME']}{config['DATASET_LOG']}{config['OUTPUT_CSV']}"
TEMPLATES_CSV = f"{config['DATA_DIR']}/{config['DATASET_NAME']}{config['DATASET_LOG']}{config['TEMPLATES_CSV']}"
CLEANED_CSV = f"{config['DATA_DIR']}/{config['DATASET_NAME']}{config['DATASET_LOG']}{config['CLEANED_CSV']}"


persistence_dir = os.path.join(f"{config['DATA_DIR']}/{config['PERSISTENCE_PATH']}", config['DRAIN_PATH'])
os.makedirs(persistence_dir, exist_ok=True)
persistence = FilePersistence(f"{persistence_dir}/drain3_state.bin")

print("Dataset Path:", DATASET_PATH)
print("The extracted templates (TEMPLATE-LEVEL) will be written to:", TEMPLATES_CSV)
print("The structured log templates enriched with metadata will be written to:", OUTPUT_CSV)
print("Persistence path (Drain3)", f"{persistence_dir}")


Dataset Path: data/OpenStack_2k.log
The extracted templates (TEMPLATE-LEVEL) will be written to: data/OpenStack_2k.log_templates.csv
The structured log templates enriched with metadata will be written to: data/OpenStack_2k.log_structured.csv
Persistence path (Drain3) data/persistence/drain3_state


### Utility Functions for Log Parsing Module

In [6]:
import glob
import re

# ------- UF: Loading Raw Log files

def load_raw_logs(log_dir: str, dataset_name: str):
    log_files = glob.glob(f"{log_dir}/{dataset_name}*.log")
    raw_logs = []
    for file_path in log_files:
        with open(file_path, "r") as f:
            raw_logs.extend(f.readlines())
    return raw_logs


# ------- UF: Convert LogPai-style format to regex

def log_format_to_regex(log_format: str) -> str:
    tokens = re.findall(r"<([^>]+)>", log_format)
    regex = re.escape(log_format)
    for t in tokens:
        esc = re.escape(f"<{t}>")
        if t.lower() == "content":
            repl = rf"(?P<{t}>.*)"
        else:
            repl = rf"(?P<{t}>.+?)"
        regex = regex.replace(esc, repl, 1)
    regex = regex.replace(r"\ ", r"\s+")
    return rf"^{regex}$"


# ------- UF: Parse log line with given format 

def parse_line_with_format(line: str, log_format: str):
    """Return dict of matched groups (or {'Content': line} fallback)."""
    if not log_format:
        return {"Content": line}
    regex = log_format_to_regex(log_format)
    line = line.strip()
    m = re.match(regex, line)
    if not m:
        return {"ParseError": True, "Raw": line, "Content": line}
    return m.groupdict()


### Dynamically Select Log Format Mapping for current dataset

In [7]:
# LOG FORMAT MAPPINGS FROM LogPai's GitHub
log_format_mappings = {
    "OpenStack": "<Date> <Time> <Pid> <Level> <Component> <Content>",
    "Hadoop": "<Date> <Time> <Pid> <Level> <Component>: <Content>",
    "HDFS": "<Date> <Time> <Level> <Component>: <Content>",
    "Spark": "<Date> <Time> <Level> <Component>: <Content>",
    "Zookeeper": "<Date> <Time> <Level> <Component>: <Content>"
}

# Auto-select format for our dataset
log_format = log_format_mappings.get(f"{config['DATASET_NAME']}", None)

if log_format:
    print(f"‚úÖ Log format for {config['DATASET_NAME']}: {log_format}")
else:
    print(f"‚ö†Ô∏è No log format found for {config['DATASET_NAME']}. Please update mapping.")


‚úÖ Log format for OpenStack: <Date> <Time> <Pid> <Level> <Component> <Content>


In [8]:
# ===============================================
# üß† SEMANTIC LOG ENRICHMENT (Pre-encoding Stage)
# ===============================================

import re
import pandas as pd
from typing import List, Dict, Optional
from collections import defaultdict

# -----------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------
FIELD_CONFIG = {
    "core_fields": ["Component", "Level", "Method", "URL"],             # Key identifiers
    "enrich_fields": [
        "ReqID", "UserID", "TenantID", "IP", "Status",
        "ResponseLength", "ResponseTime", "Service"                           # Added new enrich fields
    ],
    "metadata_fields": [
        "Component", "Level", "Pid", "ReqID", "UserID",
        "TenantID", "IP", "Status", "Method", "URL",
        "ResponseLength", "ResponseTime", "Service"
    ],
}

# -----------------------------------------------------------
# ‚öôÔ∏è CACHED REGEXES (case-insensitive)
EXTRA_FIELD_PATTERNS = {
    # "ReqID": re.compile(r"(?:req[-_]?id)\s*[:=]?\s*\[?([\w-]+)\]?", re.IGNORECASE),
    "ReqID": re.compile(r"\[req-([\w-]+)\b", re.IGNORECASE),

    "UserID": re.compile(r"(?:user[-_]?id)\s*[:=]\s*([\w-]+)", re.IGNORECASE),
    "TenantID": re.compile(r"(?:tenant[-_]?id)\s*[:=]\s*([\w-]+)", re.IGNORECASE),
    "IP": re.compile(r"\b(\d{1,3}(?:\.\d{1,3}){3})\b", re.IGNORECASE),
    "Status": re.compile(r"(?:status)\s*[:=]\s*(\d{3})", re.IGNORECASE),
    "Method": re.compile(r"\b(GET|POST|PUT|DELETE|PATCH|OPTIONS)\b", re.IGNORECASE),
    "URL": re.compile(r"(https?://[^\s]+|/[\w./-]+)", re.IGNORECASE),
    "ResponseLength": re.compile(r"len[:=]\s*(\d+)", re.IGNORECASE),
    "ResponseTime": re.compile(r"time[:=]\s*([\d\.]+)", re.IGNORECASE),
}


def extract_and_build_metadata(
    content: str,
    component: str = None,
    base_row: Optional[pd.Series] = None,
    metadata_fields: Optional[List[str]] = None,
) -> Dict[str, str]:
    meta = {}

    # --- 1Ô∏è‚É£ Regex-based extraction ---
    for field, pattern in EXTRA_FIELD_PATTERNS.items():
        match = pattern.search(content)
        if match:
            meta[field.lower()] = match.group(1).strip()

    # --- 2Ô∏è‚É£ Merge base_row metadata if provided ---
    if base_row is not None and metadata_fields:
        for f in metadata_fields:
            if f in base_row and pd.notna(base_row[f]):
                key = f.lower()
                # Avoid overwriting regex-extracted values
                if key not in meta:
                    meta[key] = base_row[f]

    # --- 3Ô∏è‚É£ Timestamp reconstruction ---
    date_str = meta.get("date") or (base_row.get("Date") if base_row is not None else "")
    time_str = meta.get("time") or (base_row.get("Time") if base_row is not None else "")
    date_str = str(date_str).strip()
    time_str = str(time_str).strip()

    if date_str and time_str:
        ts = pd.to_datetime(f"{date_str} {time_str}",
                            format="%Y-%m-%d %H:%M:%S.%f",
                            errors="coerce")
        meta["timestamp"] = ts if pd.notna(ts) else None
    else:
        meta["timestamp"] = None

    # --- 4Ô∏è‚É£ Service shortname ---
    if "service" not in meta and component:
        depth = 2
        parts = component.split('.')
        # Limit depth to the number of available parts
        selected = parts[:min(depth, len(parts))]
        service = ".".join(selected)

        meta["service"] = service.replace('_', '-')

    # print(content, component)

    return meta


In [None]:
# # Additional Metadata Enrichment Functions  ---- OUTDATED

# ENRICHMENT_CONFIG = {
#     "core_fields": ["Component", "Level", "Method", "URL"],             # Core log identifiers
#     "enrich_fields": ["ReqID", "UserID", "TenantID", "IP", "Status"],   # Enrichment layer
#     "metadata_fields": [
#         "Component", "Level", "Pid", "ReqID", "UserID", 
#         "TenantID", "IP", "Status", "Method", "URL", 
#         # "Timestamp"
#     ],  
# }

# import datetime
# import pandas as pd
# from typing import List, Dict

# # ---------------- Semantic Text Builder ----------------
# def build_semantic_text(row: pd.Series, 
#                         core_fields: List[str], 
#                         enrich_fields: List[str]) -> str:
#     parts = []
#     for f in core_fields:
#         if f in row and pd.notna(row[f]):
#             parts.append(f"[{row[f]}]")
#     base = " ".join(parts) + " " + str(row.get("template", ""))

#     extras = []
#     for f in enrich_fields:
#         if f in row and pd.notna(row[f]):
#             extras.append(f"{f.lower()}={row[f]}")

#     # Return combined text
#     return base + (" " + " ".join(extras) if extras else "")


# # ---------------- Structured Metadata Builder ----------------
# def build_structured_metadata(row: pd.Series, 
#                               metadata_fields: List[str]) -> Dict[str, str]:
#     meta = {}
#     for f in metadata_fields:
#         if f in row and pd.notna(row[f]):
#             meta[f.lower()] = row[f]

#     # Step 2: Combine Date + Time into Timestamp 
#     date_str = str(row.get("Date", "")).strip()
#     time_str = str(row.get("Time", "")).strip()

#     if date_str and time_str:
#         # Use consistent format for speed and reliability
#         ts = pd.to_datetime(
#             f"{date_str} {time_str}",
#             format="%Y-%m-%d %H:%M:%S.%f",   # consistent microsecond precision
#             errors="coerce"                  # returns NaT if parsing fails
#         )
#         meta["timestamp"] = ts if pd.notna(ts) else None
#     else:
#         meta["timestamp"] = None

#     return meta


# # -------- Apply Enrichment to DataFrame ----------------
# def apply_enrichment(df: pd.DataFrame, config: dict):

#     print("üîß Applying Additional enrichment...")
#     df["semantic_text"] = df.apply(
#         lambda r: build_semantic_text(r, config["core_fields"], config["enrich_fields"]), axis=1
#     )
#     df["structured_metadata"] = df.apply(
#         lambda r: build_structured_metadata(r, config["metadata_fields"]), axis=1
#     )

#     # Extract Timestamp from structured_metadata dict
#     df["Timestamp"] = df["structured_metadata"].apply(
#         lambda m: m.get("timestamp") if isinstance(m, dict) else None
#     )
#     # Sort by time for temporal consistency
#     df = df.sort_values("Timestamp").reset_index(drop=True)
    
#     return df


In [9]:
drain_config = TemplateMinerConfig()

# --- Core Parameters ---
drain_config.profiling_enabled = True
# drain_config.drain_sim_th = 0.62         
# drain_config.drain_depth = 6
drain_config.drain_sim_th = 0.4        
drain_config.drain_depth = 5             
drain_config.mask_prefix = "<*>"         
drain_config.extra_delimiters = ["=", ",", " ", ":", "-", "\"", "[", "]", "(", ")"]

# --- Dynamic Field Masking ---

drain_config.masking_instructions = [
        MaskingInstruction(r"req-[0-9a-f-]+", "<REQ_ID>"),
        MaskingInstruction(r"\b\d{1,3}(?:\.\d{1,3}){3}\b", "<IP>"),
        MaskingInstruction(
            # r"\[instance:\s*[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\]", 
            # r"\[instance:\s*([0-9a-fA-F\-]{36})\]",
            # r"\[instance:\s*[0-9a-fA-F\-]+\]",
            r"\[instance:\s*[0-9a-fA-F0-9\-]{36}\]",
            "<INSTANCE_ID>"
            ),
    ]

# # --- Initialize TemplateMiner ---
# try:
#     template_miner = TemplateMiner(persistence, drain_config)
#     print("‚úÖ Drain3 TemplateMiner initialized successfully.")
# except Exception as e:
#     print("‚ö†Ô∏è Error initializing TemplateMiner:", e)
#     raise



### MetaDataDrainParser Class - Custom class that 

In [14]:

class MetadataDrainParser:
    def __init__(self, log_format: str, 
                 structured_csv: str, save_every: int, templates_csv: str,
                 field_config: dict, 
                 mode: str,
                 ):
        # self.template_miner = template_miner
        self.log_format = log_format
        self.structured_csv = structured_csv
        self.save_every = save_every
        self.templates_csv = templates_csv
        self.field_config = field_config or FIELD_CONFIG
        self.mode = mode.lower()

        self._init_template_miner()

        self.buffer = []
        self.total = 0
        self.unique_templates = set()

        # Clean up existing files

        for path in [self.structured_csv, self.templates_csv]:
            try:
                if os.path.exists(path):
                    os.remove(path)
                    print(f"üóëÔ∏è Removed existing {path} for a fresh run.")
            except Exception as e:
                print(f"‚ö†Ô∏è Could not remove {path}: {e}")

    def _init_template_miner(self):
        
        if self.mode == "fresh":
            print(f"üßπ Starting fresh parse (ignoring old state).")
            persist = None
        elif self.mode == "incremental":
            print(f"‚ôªÔ∏è Loading existing state from {persist} (incremental mode).")
            persist = FilePersistence(persistence)
        else:
            raise ValueError("mode must be 'fresh' or 'incremental'")

        self.template_miner = TemplateMiner(persist, drain_config)
        print("‚úÖ Drain3 TemplateMiner initialized successfully.")


    def detect_log_format(self, sample_line: str, base_format: str) -> str:

        rotated_pattern = r'^[\w\-.]+\.log(?:\.\d+)?\.\d{4}-\d{2}-\d{2}_\d{2}:\d{2}:\d{2}'
        if re.match(rotated_pattern, sample_line):
            if not base_format.startswith("<File>"):
                print("üß† Auto-detected rotated log prefix ‚Äî prepending <File> to log format.")
                return "<File> " + base_format
        return base_format

    def process_line(self, raw_line: str, line_no: int):
        """Parse one log line, enrich with metadata, and append to buffer."""
        raw = raw_line.rstrip("\n")
        
        if self.detect_log_format and line_no == 1:
            old_format = self.log_format
            self.log_format = self.detect_log_format(raw, old_format)
            if self.log_format != old_format:
                print(f"‚úÖ Adjusted log format ‚Üí {self.log_format}")

        # ------- Extract Structured metadata
        parsed_meta = parse_line_with_format(raw, self.log_format)
        content = parsed_meta.get("Content") or raw 
        # content = parsed_meta.get("Content", "")
        component = parsed_meta.get("Component")

        # ------- Extract additional metadata fields 
        unified_meta = extract_and_build_metadata(
            content,
            component,
            base_row=pd.Series(parsed_meta),
            metadata_fields=FIELD_CONFIG["metadata_fields"],
        )

        # -------- Send to Drain3 for template extraction
        try:
            result = self.template_miner.add_log_message(content)
        except Exception as e:
            result = {"cluster_id": None, "template_mined": None, "change_type": f"error:{e}"}

        # --------- Collect Drain3 + metadata output
        template = result.get("template_mined")
        template_id = result.get("cluster_id")

        self.unique_templates.add(template or f"__none_{template_id}")
        
        row = {
            "line_no": line_no,
            "raw": raw,
            "content": content,
            "template_id": template_id,
            "template": template,
        }

        row.update(unified_meta)
        # print(row)
        self.buffer.append(row)
        self.total += 1

        # Periodic flush - DRAIN
        if len(self.buffer) >= self.save_every:
            self.flush_to_csv()

                
    def flush_to_csv(self):
        df = pd.DataFrame(self.buffer)
        header = not os.path.exists(self.structured_csv)
        df.to_csv(self.structured_csv, mode="a", index=False, header=header)
        print(f"[flush] wrote {len(self.buffer)} rows ‚Üí {self.structured_csv} (total parsed {self.total})")
        self.buffer = []

    # ----- EXPORTING THE DRAIN TEMPLATES

    def export_templates(self):

        # --- GET ALL DRAIN CLUSTERS
        clusters = self.template_miner.drain.clusters
        if not clusters:
            print("‚ö†Ô∏è No templates discovered yet ‚Äî skipping export.")
            return

        # --- Normalize clusters iterable ---
        if isinstance(clusters, dict):
            cluster_list = clusters.values()
        else:
            cluster_list = clusters 

        templates = []
        for cluster in cluster_list:
            cid = getattr(cluster, "cluster_id", None)

            templates.append({
                "template_id": cid,
                "template": cluster.get_template(),
                "occurrences": getattr(cluster, "size", None),
            })
        df = pd.DataFrame(templates)
        header = not os.path.exists(self.templates_csv)
        df.to_csv(self.templates_csv, mode="w" if header else "a", index=False, header=header)

    def finalize(self):
        # ----- Final flush after finishing all lines.
        if self.buffer:
            self.flush_to_csv()
        print(f"‚úÖ Parsing complete. Total parsed lines: {self.total}")
        print(f"üß© Unique templates discovered: {len(self.unique_templates)}")
        # ----- Export template catalogue
        self.export_templates()
        print(f"üìä Template catalogue exported for validation and benchmarking.")


### Initialize and Configure Drain3

#### Running the Log Parsing and Metadata Extraction Process

In [15]:
parser = MetadataDrainParser(
    # template_miner=template_miner,
    log_format=log_format,
    field_config=FIELD_CONFIG,
    structured_csv=OUTPUT_CSV,
    templates_csv=TEMPLATES_CSV,
    mode="fresh",
    # save_every=500,
    save_every=50000,
)

with open(DATASET_PATH, "r", encoding="utf-8", errors="ignore") as fh:
    for i, line in enumerate(tqdm(fh, desc="Parsing lines"), start=1):
        if config['MAX_LINES'] and i > config['MAX_LINES']:
            break
        parser.process_line(line, i)

# Finalize and save
parser.finalize()

üßπ Starting fresh parse (ignoring old state).
‚úÖ Drain3 TemplateMiner initialized successfully.
üóëÔ∏è Removed existing data/OpenStack_2k.log_structured.csv for a fresh run.


Parsing lines: 210it [00:00, 2034.17it/s]

üß† Auto-detected rotated log prefix ‚Äî prepending <File> to log format.
‚úÖ Adjusted log format ‚Üí <File> <Date> <Time> <Pid> <Level> <Component> <Content>


Parsing lines: 2000it [00:00, 4138.23it/s]


[flush] wrote 2000 rows ‚Üí data/OpenStack_2k.log_structured.csv (total parsed 2000)
‚úÖ Parsing complete. Total parsed lines: 2000
üß© Unique templates discovered: 45
üìä Template catalogue exported for validation and benchmarking.


In [None]:
# PREVIEWING THE DRAIN CLUSTERS CREATED

# for cluster in template_miner.drain.clusters:
#     print(f"Template: {cluster.get_template()}")
#     print(f"Occurrences: {cluster.size}")
#     print(f"Cluster: {cluster}")

#### DATA CLEANING AND DEDUPLICATION

In [16]:
import re

def clean_template(text, preserve_symbols=":=0123456789"):
    # REMOVE ALL PLACEHOLDER PATTERNS
    text = re.sub(r"<\*>", "", text)
    # NORMALIZE UUIDS
    text = re.sub(r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b', "<*>", text)
    # NORMALIZE OTHER ALPHANUMERIC IDS
    text = re.sub(r'\b[0-9a-f]{20,}\b', "<*>", text)
    # REPLACE NUMBERS
    # text = re.sub(r'\b\d+\b', "<*>", text)
    # REMOVE ALL UNWANTED SYMBOLS EXCEPT THE ALLOWED SYMBOLS
    pattern = rf"[^\w\s{re.escape(preserve_symbols)}/.-]"
    text = re.sub(pattern, "", text)
    # NORMALIZE ALL WHITESPACES
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

In [None]:
os.makedirs(config["DATA_DIR"], exist_ok=True)

# CLEANING THE TEMPLATES CSV

def clean_and_group_templates(templates_csv: str, output_csv: str) -> pd.DataFrame:
    """
    Clean template texts, deduplicate by semantic content,
    and preserve all associated template_ids in a list.
    """
    templates_df = pd.read_csv(templates_csv)

    # Clean the templates
    templates_df['semantic_text'] = templates_df['template'].apply(clean_template)

    # Count occurrences
    templates_df['occurrences'] = templates_df.groupby('semantic_text')['semantic_text'].transform('count')

    # Group by the cleaned semantic_text
    grouped_df = templates_df.groupby('semantic_text').agg(
        template_ids=('template_id', lambda x: list(x)),
        occurrences=('occurrences', 'max')  # keep the max occurrences
    ).reset_index()

    # Optionally sort by occurrences descending
    grouped_df = grouped_df.sort_values('occurrences', ascending=False)

    # Save to CSV
    grouped_df.to_csv(output_csv, index=False)

    print(f"Cleaned & grouped templates saved: {output_csv}")
    print(f"Number of unique semantic templates: {len(grouped_df)}")

    return grouped_df

grouped_templates = clean_and_group_templates(TEMPLATES_CSV, CLEANED_CSV)
print(grouped_templates)


Cleaned & grouped templates saved: data/OpenStack_2k.log_cleaned_templates.csv
Number of unique semantic templates: 21
                                        semantic_text  template_ids  \
4                                    req_id - - - - -   [2, 10, 16]   
16                           req_id - - - instance_id  [13, 14, 19]   
0   - instance_id during sync_power_state the inst...          [25]   
11  req_id - - - - - the instance sync for host cp...          [23]   
19  req_id - - - instance_id took seconds to deall...          [15]   
18  req_id - - - instance_id took seconds to build...           [9]   
17  req_id - - - instance_id attempting claim: mem...          [18]   
15           req_id - - - http/1.1 status: len: time:           [1]   
14  req_id - - - http exception thrown: no instanc...          [17]   
13           req_id - - - creating event for instance           [6]   
12  req_id - - - - - while synchronizing instance ...          [24]   
10  req_id - - - - - running 