### MicroVision 

## Log Template Enrichment Notebook

In [None]:
import os

# When the kernel starts inside the project's `notebooks/` folder, step up to the
# project root so that relative paths such as "data/..." resolve the same way on
# any machine, instead of only for one hard-coded absolute path.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir(os.path.dirname(os.getcwd()))
    print("Changed!!")

print("Current working directory:", os.getcwd())

# NOTE(review): presumably silences the Hugging Face tokenizers fork/parallelism
# warning; nothing in this notebook currently imports a tokenizer — confirm it is
# still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Install numpy, pandas and tqdm into the running kernel's environment;
# --quiet suppresses pip's progress output.
%pip install numpy pandas tqdm  --quiet

## 1. Load and preprocess templates for semantic embedding

In [None]:
import os
import pandas as pd

# Notebook-wide settings: where the data lives, which dataset to process, and the
# naming conventions used for the template column and enriched output.
config = dict(
    DATA_DIR="data",
    DATASET_NAME="OpenStack",
    OUTPUT_SUFFIX="_enriched.csv",
    TEMPLATE_COL="template",
    # KNOWN_SERVICES="known_services.json",
)

# Create the data directory up front so later reads/writes have a target.
os.makedirs(config["DATA_DIR"], exist_ok=True)

print(f"üìÇ Data directory: {config['DATA_DIR']}")
print(f"üìÇ Dataset Name: {config['DATASET_NAME']}")



In [None]:
## Utility Functions that facilitate the Enrichment Process

In [None]:
# ----- UF1: CLEANING FUNCTION -------
import re  # imported in this cell so the definition does not rely on a later cell


def clean_template(text, preserve_symbols=":="):
    """Normalize a raw log template into lowercase, punctuation-stripped text.

    Steps, in order:
      1. remove every ``<*>`` wildcard placeholder;
      2. drop every character that is not a word character, whitespace,
         ``/``, ``.``, ``-`` or one of ``preserve_symbols``;
      3. collapse runs of whitespace to a single space and trim the ends;
      4. lowercase the result.

    Args:
        text: The raw template string. Non-string values (``None``, the float
            NaN pandas uses for missing cells, ...) yield ``""`` instead of
            raising.
        preserve_symbols: Extra punctuation characters to keep. ``None`` is
            treated like an empty string.

    Returns:
        The cleaned, lowercased template (possibly empty).
    """
    if not isinstance(text, str):
        # Missing values read from CSV arrive as float NaN; treat them as empty.
        return ""

    # re.escape keeps characters such as ']' or '^' literal inside the class.
    kept = re.escape(preserve_symbols or "")
    disallowed = rf"[^\w\s{kept}/.-]"

    text = re.sub(r"<\*>", "", text)
    text = re.sub(disallowed, "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

In [None]:
import re

def extract_service_hint(template, known_services=None):
    """Guess the service/component name a log template refers to.

    The template is lowercased and checked against four tiers; the first tier
    that produces a hit wins:

      1. ``known_services``: names matched literally and case-insensitively as
         whole tokens; the longest matching name is returned, lowercased.
      2. A fixed list of OpenStack / Hadoop-ecosystem service identifiers.
      3. Structured markers: ``<name>.log``, ``[<name>]``, ``module=<name>``,
         ``component=<name>``.
      4. Token heuristics: the first token containing a service-like keyword,
         otherwise the first token longer than three characters that is not a
         long hex string.

    Args:
        template: The (raw or cleaned) template text. Non-string values such
            as ``None`` or NaN return ``"unknown"``.
        known_services: Optional iterable of service names. Blank or
            non-string entries are ignored.

    Returns:
        A lowercase service hint, or ``"unknown"`` when nothing matches.
    """
    if not isinstance(template, str):
        return "unknown"

    text = template.lower()

    # Tier 1: caller-supplied service names.
    if known_services:
        matches = []
        for svc in known_services:
            if not isinstance(svc, str) or not svc.strip():
                # An empty name would otherwise match at any token boundary.
                continue
            name = svc.lower()
            # Escape the name so characters like '.' or '+' are literal, and use
            # lookarounds instead of \b so names that start or end with a
            # non-word character (e.g. "c++") can still match as whole tokens.
            if re.search(rf"(?<!\w){re.escape(name)}(?!\w)", text):
                matches.append(name)
        if matches:
            # Prefer the longest (most specific) match; ties keep list order.
            return max(matches, key=len)

    # Tier 2: common distributed-system service identifiers.
    system_patterns = [
        r"(?P<service>nova[-_]api|nova[-_]compute|nova[-_]scheduler|neutron[-_]agent|cinder[-_]volume|glance|keystone)",
        r"(?P<service>namenode|datanode|hdfs|yarn|mapreduce|spark[-_]driver|spark[-_]executor|flink|zookeeper|kafka|hbase)",
    ]
    for p in system_patterns:
        m = re.search(p, text)
        if m:
            return m.group("service")

    # Tier 3: log filename or structured marker.
    structured_patterns = [
        r"(?P<service>[a-z0-9_-]+)\.log",
        r"\[(?P<service>[a-z0-9_-]+)\]",
        r"\bmodule[:=]\s*(?P<service>[a-z0-9_-]+)\b",
        r"\bcomponent[:=]\s*(?P<service>[a-z0-9_-]+)\b",
    ]
    for p in structured_patterns:
        m = re.search(p, text)
        if m:
            return m.group("service")

    # Tier 4: token heuristics (fallback).
    tokens = re.findall(r"[a-z0-9_-]+", text)
    if tokens:
        # Prefer tokens containing service-like substrings.
        service_keywords = ("api", "compute", "agent", "service", "scheduler", "controller")
        for t in tokens:
            if any(keyword in t for keyword in service_keywords):
                return t
        # Skip short tokens and long hex strings (8+ hex characters).
        meaningful_tokens = [t for t in tokens if len(t) > 3 and not re.match(r"^[0-9a-f]{8,}$", t)]
        if meaningful_tokens:
            return meaningful_tokens[0]

    return "unknown"


In [None]:
# ----- UF3: TIMESTAMP EXTRACTION FUNCTION ------- -> Moved to Future Plans

# def extract_timestamp(template):
#     match = re.search(r"\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}", template)
#     return match.group(0) if match else None

# def extract_log_level(template):
#     match = re.search(r"\b(INFO|WARN|ERROR|DEBUG|TRACE|CRITICAL)\b", template, re.IGNORECASE)
#     return match.group(0).upper() if match else None


### Main Enrichment Function definition

In [None]:
from tqdm import tqdm

def enrich_templates(df, dataset_name="generic", known_services=None, preserve_symbols=":=", template_col="template"):
    """Add ``clean_template`` and ``service_hint`` columns to a templates frame.

    The columns are assigned directly on ``df`` (the frame is modified in place)
    and the same object is returned; later cells in this notebook read
    ``service_hint`` from the frame that was passed in.

    Args:
        df: DataFrame holding one log template per row.
        dataset_name: Label used only in the progress message.
        known_services: Optional list of service names forwarded to
            ``extract_service_hint``. When ``None``, names are derived from
            ``<name>.log`` occurrences in the template column.
        preserve_symbols: Punctuation kept by ``clean_template``.
        template_col: Name of the column containing the raw templates.

    Returns:
        ``df`` with the two extra columns.

    Raises:
        KeyError: If ``template_col`` is not a column of ``df``.
    """
    if template_col not in df.columns:
        raise KeyError(
            f"Column '{template_col}' not found in DataFrame; available columns: {list(df.columns)}"
        )

    print(f"üîß Enriching templates for dataset: {dataset_name}")

    tqdm.pandas(desc="Cleaning templates")
    df["clean_template"] = df[template_col].progress_apply(
        lambda t: clean_template(t, preserve_symbols=preserve_symbols)
    )

    if known_services is None:
        # Derive candidate names from "<name>.log" in the raw templates. They are
        # lowercased because hints are matched against lowercased template text;
        # mixed-case names would otherwise never match.
        known_services = (
            df[template_col]
            .str.extract(r"([a-zA-Z0-9_-]+)\.log")[0]
            .dropna()
            .str.lower()
            .unique()
            .tolist()
        )

    tqdm.pandas(desc="Extracting service hints")
    df["service_hint"] = df["clean_template"].progress_apply(
        lambda x: extract_service_hint(x, known_services)
    )

    # tqdm.pandas(desc="Extracting timestamps")
    # df["timestamp"] = df["clean_template"].progress_apply(extract_timestamp)

    # tqdm.pandas(desc="Extracting log levels")
    # df["log_level"] = df["clean_template"].progress_apply(extract_log_level)

    # Only the columns actually produced above are reported; timestamp and
    # log_level extraction is disabled.
    print("‚úÖ Enrichment complete ‚Äî features added: clean_template, service_hint")
    return df


## 2. Running the complete Enrichment Process and Saving the Output

In [None]:
# Load the parsed log templates, run the enrichment function, and save the result.
# Both file paths are built once here and reused below.
templates_path = f"{config['DATA_DIR']}/{config['DATASET_NAME']}_full.log_templates.csv"
enriched_path = f"{config['DATA_DIR']}/{config['DATASET_NAME']}_full.log_enriched_templates.csv"

templates_df = pd.read_csv(templates_path)
print(f"Loaded {len(templates_df)} templates")

# enrich_templates assigns its new columns on the frame it receives, so
# templates_df also carries them after this call.
enriched_templates_df = enrich_templates(
    templates_df,
    dataset_name=config["DATASET_NAME"],
    preserve_symbols=":=",
)

enriched_templates_df.to_csv(enriched_path, index=False)
print(f"‚úÖ Enriched templates saved to '{enriched_path}'")

In [None]:
# Show the ten most frequent service hints. templates_df has this column because
# enrich_templates assigned it directly on the frame it was given.
print(templates_df["service_hint"].value_counts().head(10))

### Utility Function: Persist Known Services in a JSON file

In [None]:
# import os
# import json


# def update_known_services(new_services, services_path=f"{config['DATA_DIR']}/{config['KNOWN_SERVICES']}", verbose=True):

    
#     new_services = sorted(set(s.strip().lower() for s in new_services if s and isinstance(s, str)))

#     # Load existing services if file exists
#     if os.path.exists(services_path):
#         try:
#             with open(services_path, "r") as f:
#                 existing_services = json.load(f)
#         except Exception:
#             existing_services = []
#     else:
#         existing_services = []

#     # Merge and deduplicate
#     updated_services = sorted(set(existing_services + new_services))

#     # Determine if changes occurred
#     new_added = [s for s in updated_services if s not in existing_services]

#     if new_added:
#         with open(services_path, "w") as f:
#             json.dump(updated_services, f, indent=2)
#         if verbose:
#             print(f"üÜï Added {len(new_added)} new services to {services_path}: {new_added}")
#     else:
#         if verbose:
#             print(f"‚úÖ No new services found. Existing {len(existing_services)} services retained.")

#     return updated_services


In [None]:
# Collect the distinct, non-null service hints for the known-services persistence function
new_services = templates_df["service_hint"].dropna().unique().tolist()

# 2. Persist and auto-update the known-services registry
#    (disabled: update_known_services is commented out in the cell above)
# known_services = update_known_services(new_services)

#### Context Window Builder Function

In [None]:
# from collections import defaultdict
# from typing import Optional
# from tqdm import tqdm
# import pandas as pd
# from transformers import AutoTokenizer

# def build_context_windows_tokenized(
#     enriched_df: pd.DataFrame,
#     service_col: str = "service_hint",
#     template_col: str = "template",
#     include_metadata: bool = True,
#     max_templates_per_service: Optional[int] = None,
#     tokenizer_name: str = "sentence-transformers/all-MiniLM-L6-v2",
#     max_tokens: int = 512,
#     overlap_tokens: int = 50
# ):
#     """
#     Build tokenized service-level context windows with optional metadata.
#     Ensures no window exceeds max_tokens. Uses overlapping windows for context continuity.
#     """
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
#     service_contexts = defaultdict(list)

#     print("üîß Building tokenized service-level context windows...")
#     for svc, svc_df in tqdm(enriched_df.groupby(service_col), desc="Processing services"):
#         context_lines = []

#         # Step 1: Collect all snippets per service
#         for _, row in svc_df.iterrows():
#             snippet = row[template_col]
#             if include_metadata:
#                 meta_fields = {k: v for k, v in row.items() if k not in [service_col, template_col]}
#                 meta_str = ", ".join(f"{k}: {v}" for k, v in meta_fields.items() if pd.notna(v))
#                 snippet = f"[Template: {snippet}] [{meta_str}]" if meta_str else snippet
#             context_lines.append(snippet)

#         # Step 2: Trim large services if needed
#         if max_templates_per_service:
#             context_lines = context_lines[:max_templates_per_service]

#         # Step 3: Build token windows
#         win_id = 1
#         current_tokens = []
#         for snippet in context_lines:
#             snippet_tokens = tokenizer.encode(snippet, add_special_tokens=False)
#             while len(snippet_tokens) > 0:
#                 space_left = max_tokens - len(current_tokens)
#                 take_tokens = snippet_tokens[:space_left]
#                 current_tokens.extend(take_tokens)
#                 snippet_tokens = snippet_tokens[space_left:]

#                 if len(current_tokens) >= max_tokens:
#                     # Create window
#                     window_text = tokenizer.decode(current_tokens, skip_special_tokens=True)
#                     service_contexts[svc].append({"window_id": win_id, "context_text": window_text})
#                     win_id += 1

#                     # Prepare next window with overlap
#                     if overlap_tokens > 0:
#                         current_tokens = current_tokens[-overlap_tokens:]
#                     else:
#                         current_tokens = []

#         # Step 4: Save any remaining tokens as the last window
#         if current_tokens:
#             window_text = tokenizer.decode(current_tokens, skip_special_tokens=True)
#             service_contexts[svc].append({"window_id": win_id, "context_text": window_text})

#     total_windows = sum(len(wins) for wins in service_contexts.values())
#     print("‚úÖ Tokenized context windows complete.")
#     print(f"üì¶ Generated {total_windows} total context windows.\n")

#     return dict(service_contexts)




In [None]:
# # Build token-aware context windows
# from tqdm import tqdm 

# service_contexts = build_context_windows_tokenized(
#     enriched_df,
#     service_col="service_hint",
#     template_col="template",
#     include_metadata=True,
#     max_templates_per_service=None,  # adjust if needed
#     max_tokens=512,
#     overlap_tokens=50
# )

# print(f"[INFO] Constructed {len(service_contexts)} service-level context groups.")

# # Preview first 1‚Äì2 windows per service for sanity check
# for svc, windows in list(service_contexts.items())[:5]:  # first 5 services
#     print(f"\nüîπ Service: {svc} | Total windows: {len(windows)}")
#     for win in windows[:2]:  # show first 2 windows
#         print(f"Window {win['window_id']} ({len(win['context_text'].split())} tokens preview):")
#         print(repr(win['context_text'][:200]))  # first 200 chars
#         print("‚Ä¶")


# # Flatten token windows for saving
# records = []

# for svc, windows in tqdm(service_contexts.items(), desc="Flattening tokenized service contexts"):
#     for win in windows:
#         records.append({
#             "service": svc,
#             "window_id": win["window_id"],
#             "context_text": win["context_text"]
#         })

# context_windows_df = pd.DataFrame(records)

# # SAVING CONTEXT WINDOWS TO CSV/JSONL
# from datetime import datetime
# import os

# os.makedirs(f"{config['DATA_DIR']}/{config['CONTEXT_DIR']}", exist_ok=True)

# # Timestamped filenames for traceability
# timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
# context_path_csv = os.path.join(f"{config['DATA_DIR']}/{config['CONTEXT_DIR']}", f"context_windows_{timestamp_str}.csv")
# context_path_jsonl = os.path.join(f"{config['DATA_DIR']}/{config['CONTEXT_DIR']}", f"context_windows_{timestamp_str}.jsonl")

# context_windows_df.to_csv(context_path_csv, index=False)
# context_windows_df.to_json(context_path_jsonl, orient="records", lines=True, force_ascii=False)

# print(f"üíæ Tokenized context windows saved to:\n - CSV: {context_path_csv}\n - JSONL: {context_path_jsonl}")


üîß Building tokenized service-level context windows...


Processing services: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 151.01it/s]


‚úÖ Tokenized context windows complete.
üì¶ Generated 25 total context windows.

[INFO] Constructed 5 service-level context groups.

üîπ Service: api | Total windows: 1
Window 1 (122 tokens preview):
'[ template : nova. api. openstack. compute. server _ external _ events < * > * > f7b8d1f1d4d44643b07fa10ca7d021fb e9746973ac574c6b8a9e8857f56a7608 - - - ] creating event < * > * > for instance < * > *'
‚Ä¶

üîπ Service: compute | Total windows: 18
Window 1 (266 tokens preview):
'[ template : nova. compute. manager [ req - 3ea4052c - 895d - 4b64 - 9e2d - 04d64c4d94ab - - - - - ] [ instance : < * > * > vm < * > * > ( lifecycle event ) ] [ count : 88, clean _ template : nova. co'
‚Ä¶
Window 2 (184 tokens preview):
'##1fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - - ] [ instance : < * > * > terminating instance ] [ count : 22, clean _ template : nova. compute. manager 113d3a99c3da401fbd62cc2caa5b96d2 54f'
‚Ä¶

üîπ Service: nova | Total windows: 4
Window 1 (303 tokens preview):
'[

Flattening tokenized service contexts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 39053.11it/s]

üíæ Tokenized context windows saved to:
 - CSV: data/contexts/context_windows_20251023_194121.csv
 - JSONL: data/contexts/context_windows_20251023_194121.jsonl



