<a href="https://colab.research.google.com/github/phanhuy0410/LuanVan/blob/main/Forensic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tarfile
from pathlib import Path

BASE_DIR = Path("/content/drive/MyDrive/DARPA_TC")
OUTPUT_DIR = BASE_DIR / "E3_extracted"
OUTPUT_DIR.mkdir(exist_ok=True)

tar_files = [
    "ta1-theia-e3-official-1r.json.tar.gz",
    "ta1-theia-e3-official-3.json.tar.gz",
    "ta1-theia-e3-official-5m.json.tar.gz",
    "ta1-theia-e3-official-6r.json.tar.gz"
]

for tar_name in tar_files:
    scenario = tar_name.split("-")[-1].replace(".json.tar.gz", "")
    extract_dir = OUTPUT_DIR / scenario
    extract_dir.mkdir(exist_ok=True)

    tar_path = BASE_DIR / tar_name
    print(f"Extracting {tar_name} ‚Üí {extract_dir}")

    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=extract_dir)

print("‚úÖ Done extracting all 4 files")



Extracting ta1-theia-e3-official-1r.json.tar.gz ‚Üí /content/drive/MyDrive/DARPA_TC/E3_extracted/1r


  tar.extractall(path=extract_dir)


Extracting ta1-theia-e3-official-3.json.tar.gz ‚Üí /content/drive/MyDrive/DARPA_TC/E3_extracted/3
Extracting ta1-theia-e3-official-5m.json.tar.gz ‚Üí /content/drive/MyDrive/DARPA_TC/E3_extracted/5m
Extracting ta1-theia-e3-official-6r.json.tar.gz ‚Üí /content/drive/MyDrive/DARPA_TC/E3_extracted/6r
‚úÖ Done extracting all 4 files


In [1]:
import json
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

DATA_DIR = Path("/content/drive/MyDrive/DARPA_TC/E3_extracted")
LABEL_FILE = Path("/content/drive/MyDrive/DARPA_TC/multiclass.json")
OUTPUT_PARQUET = "/content/drive/MyDrive/DARPA_TC/darpaTC_theia_labeled.parquet"

BATCH_SIZE = 100_000
with open(LABEL_FILE) as f:
    label_spec = json.load(f)

LABEL_MAPPING = {
    int(k): v["name"]
    for k, v in label_spec["label_definition"].items()
}

INDICATORS = label_spec["indicators"]
principal_info = {}

def parse_principal(record):
    datum = record.get("root", {}).get("datum", {})
    p = datum.get("com.bbn.tc.schema.avro.cdm18.Principal")
    if not p:
        return
    principal_info[p["uuid"]] = {
        "uid": p.get("uid"),
        "username": p.get("username")
    }
process_to_principal = {}

def parse_subject(record):
    datum = record.get("root", {}).get("datum", {})
    s = datum.get("com.bbn.tc.schema.avro.cdm18.Subject")
    if not s:
        return
    puuid = s.get("principal", {}).get("uuid")
    if puuid:
        process_to_principal[s["uuid"]] = puuid

def match_label(proc_name, obj_path, obj_ip):
    labels = []
    for ind in INDICATORS:
        if ind["type"] == "process_name" and proc_name == ind["value"]:
            labels.append(ind["label"])
        elif ind["type"] == "file_path" and obj_path == ind["value"]:
            labels.append(ind["label"])
        elif ind["type"] == "ip" and obj_ip == ind["value"]:
            labels.append(ind["label"])
    return max(labels) if labels else 0

def parse_event(record, scenario):
    datum = record.get("root", {}).get("datum", {})
    evt = datum.get("com.bbn.tc.schema.avro.cdm18.Event")
    if not evt:
        return None

    subj = evt.get("subject", {})
    obj = evt.get("object", {})

    proc_uuid = subj.get("uuid")
    proc_name = subj.get("name")

    obj_path = obj.get("path")
    obj_ip = obj.get("remoteAddress") or obj.get("ip")

    label_id = match_label(proc_name, obj_path, obj_ip)

    principal_uuid = process_to_principal.get(proc_uuid)
    user = principal_info.get(principal_uuid, {})

    return {
        "timestamp": evt.get("timestampNanos", 0) // 1_000_000_000,
        "operation": evt.get("predicate"),

        "proc_uuid": proc_uuid,
        "proc_name": proc_name,

        "user_uid": user.get("uid"),
        "username": user.get("username"),

        "obj_type": obj.get("type"),
        "obj_path": obj_path,
        "obj_ip": obj_ip,

        "label_id": label_id,
        "label_name": LABEL_MAPPING[label_id],
        "scenario": scenario
    }
writer = None
buffer = []

# ---------- PASS 1 & 2 ----------
print("üîç Building principal & process maps...")
for scenario_dir in DATA_DIR.iterdir():
    if not scenario_dir.is_dir():
        continue
    for f in scenario_dir.glob("*.json*"):
        with open(f) as fh:
            for line in fh:
                rec = json.loads(line)
                parse_principal(rec)
                parse_subject(rec)

# ---------- PASS 3 ----------
print("üß† Parsing events & labeling...")
for scenario_dir in DATA_DIR.iterdir():
    if not scenario_dir.is_dir():
        continue

    scenario = scenario_dir.name
    for f in scenario_dir.glob("*.json*"):
        with open(f, buffering=1024*1024) as fh:
            for line in tqdm(fh, desc=f.name):
                rec = json.loads(line)
                row = parse_event(rec, scenario)
                if not row:
                    continue

                buffer.append(row)
                if len(buffer) >= BATCH_SIZE:
                    table = pa.Table.from_pylist(buffer)
                    if writer is None:
                        writer = pq.ParquetWriter(
                            OUTPUT_PARQUET,
                            table.schema,
                            compression="snappy"
                        )
                    writer.write_table(table)
                    buffer.clear()

# ---------- FLUSH ----------
if buffer:
    writer.write_table(pa.Table.from_pylist(buffer))
if writer:
    writer.close()

print("‚úÖ DONE:", OUTPUT_PARQUET)


üîç Building principal & process maps...
üß† Parsing events & labeling...


ta1-theia-e3-official-1r.json: 5000000it [01:48, 45872.67it/s]
ta1-theia-e3-official-1r.json.1: 5000000it [01:39, 50477.01it/s]
ta1-theia-e3-official-1r.json.2: 5000000it [01:36, 51988.39it/s]
ta1-theia-e3-official-1r.json.3: 5000000it [01:40, 49806.02it/s]
ta1-theia-e3-official-1r.json.4: 5000000it [02:10, 38292.20it/s]
ta1-theia-e3-official-1r.json.5: 5000000it [01:51, 44806.13it/s]
ta1-theia-e3-official-1r.json.6: 5000000it [01:42, 48700.33it/s]
ta1-theia-e3-official-1r.json.7: 5000000it [01:43, 48307.35it/s]
ta1-theia-e3-official-1r.json.8: 5000000it [01:42, 48827.34it/s]
ta1-theia-e3-official-1r.json.9: 1761949it [00:35, 49112.66it/s]
ta1-theia-e3-official-3.json: 1588762it [00:32, 48681.76it/s]
ta1-theia-e3-official-5m.json: 1090138it [00:23, 46310.84it/s]
ta1-theia-e3-official-6r.json: 5000000it [01:55, 43405.67it/s]
ta1-theia-e3-official-6r.json.1: 5000000it [01:42, 48970.54it/s]
ta1-theia-e3-official-6r.json.2: 5000000it [01:37, 51511.64it/s]
ta1-theia-e3-official-6r.json.3: 5

‚úÖ DONE: /content/drive/MyDrive/DARPA_TC/darpaTC_theia_labeled.parquet





In [2]:
import pyarrow.parquet as pq
from collections import Counter

PARQUET_FILE = "/content/drive/MyDrive/DARPA_TC/darpaTC_theia_labeled.parquet"
BATCH_SIZE = 100_000

pf = pq.ParquetFile(PARQUET_FILE)

label_counter = Counter()
total_rows = 0

for batch in pf.iter_batches(batch_size=BATCH_SIZE, columns=["label_name"]):
    df = batch.to_pandas()
    label_counter.update(df["label_name"])
    total_rows += len(df)

print("üìä T·ªïng s·ªë event:", total_rows)
print("üìä Ph√¢n b·ªë h√†nh vi:")
for label, cnt in label_counter.items():
    print(f"{label:30s} {cnt}")


FileNotFoundError: [Errno 2] Failed to open local file '/content/drive/MyDrive/DARPA_TC/darpaTC_theia_labeled.parquet'. Detail: [errno 2] No such file or directory

In [None]:
import json
from pathlib import Path
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import gc

# --- C·∫§U H√åNH ---
DATA_DIR = Path("/content/drive/MyDrive/DARPA_TC/E3_extracted")
# File JSON ch·ª©a danh s√°ch nh√£n (Ground Truth)
LABEL_FILE = Path("/content/drive/MyDrive/DARPA_TC/multiclass.json")
OUTPUT_PARQUET = "/content/drive/MyDrive/DARPA_TC/darpaTC_theia_labeled.parquet"
BATCH_SIZE = 100_000

# Load nh√£n
with open(LABEL_FILE) as f:
    label_spec = json.load(f)

# Mapping ID -> Name (VD: 1 -> "Initial Access")
LABEL_MAPPING = label_spec["label_mapping"]
INDICATORS = label_spec["indicators"]

# --- GLOBAL MAPS (L∆∞u tr√™n RAM ƒë·ªÉ tra c·ª©u nhanh) ---
principal_info = {}       # User UUID -> {uid, username}
subject_info = {}         # Subject UUID -> {principal_uuid, proc_name}
object_info = {}          # Object UUID -> {path, ip, type}

# H√†m tr·ª£ gi√∫p: L·∫•y n·ªôi dung b√™n trong "datum" an to√†n
def get_datum(record):
    # D·ª±a tr√™n h√¨nh ·∫£nh b·∫°n g·ª≠i: root -> datum
    return record.get("root", {}).get("datum", {})

# ---------------------------------------------------------
# H√ÄM PASS 1: X√ÇY D·ª∞NG T·ª™ ƒêI·ªÇN (MAPPING UUID -> INFO)
# ---------------------------------------------------------
def parse_metadata(record):
    datum = get_datum(record)
    if not datum: return

    # 1. Parse Principal (User)
    p = datum.get("com.bbn.tc.schema.avro.cdm18.Principal")
    if p:
        principal_info[p["uuid"]] = {
            "uid": p.get("userId"),
            "username": p.get("username")
        }
        return

    # 2. Parse Subject (Process) - Quan tr·ªçng ƒë·ªÉ bi·∫øt t√™n Process
    s = datum.get("com.bbn.tc.schema.avro.cdm18.Subject")
    if s:
        # L·∫•y t√™n process. ∆Øu ti√™n 'properties.map.path', fallback sang 'cmdLine'
        proc_name = "unknown"

        # Th·ª≠ l·∫•y path t·ª´ properties
        props = s.get("properties", {}).get("map", {})
        if props and "path" in props:
             proc_name = str(props["path"]).split('/')[-1]

        # N·∫øu kh√¥ng c√≥ path, th·ª≠ cmdLine
        elif s.get("cmdLine"):
             # CmdLine v√≠ d·ª•: "/bin/bash -c ..." -> l·∫•y "bash"
             proc_name = str(s["cmdLine"]).split(' ')[0].split('/')[-1]

        subject_info[s["uuid"]] = {
            "principal_uuid": s.get("principal", {}).get("uuid") if isinstance(s.get("principal"), dict) else s.get("principal"),
            "proc_name": proc_name
        }
        return

    # 3. Parse FileObject - Quan tr·ªçng ƒë·ªÉ bi·∫øt file n√†o b·ªã truy c·∫≠p
    f_obj = datum.get("com.bbn.tc.schema.avro.cdm18.FileObject")
    if f_obj:
        # L·∫•y path t·ª´ properties
        props = f_obj.get("properties", {}).get("map", {})
        path = props.get("path")
        if path:
            object_info[f_obj["uuid"]] = {"path": path, "type": "FILE"}
        return

    # 4. Parse NetFlowObject - Quan tr·ªçng ƒë·ªÉ bi·∫øt IP k·∫øt n·ªëi
    n_obj = datum.get("com.bbn.tc.schema.avro.cdm18.NetFlowObject")
    if n_obj:
        ip = n_obj.get("remoteAddress")
        if ip:
            object_info[n_obj["uuid"]] = {"ip": ip, "type": "NETFLOW"}
        return

# ---------------------------------------------------------
# H√ÄM PASS 2: X·ª¨ L√ù S·ª∞ KI·ªÜN (EVENT) & G√ÅN NH√ÉN
# ---------------------------------------------------------
def match_label(proc_name, obj_path, obj_ip):
    labels = []
    # D√πng key "default_label" kh·ªõp v·ªõi file JSON ground truth
    target_key = "default_label"

    for ind in INDICATORS:
        # So s√°nh t√™n Process
        if ind["type"] == "process_name" and proc_name == ind["value"]:
            labels.append(ind[target_key])
        # So s√°nh ƒë∆∞·ªùng d·∫´n File
        elif ind["type"] == "file_path" and obj_path == ind["value"]:
            labels.append(ind[target_key])
        # So s√°nh IP
        elif ind["type"] == "ip" and obj_ip == ind["value"]:
            labels.append(ind[target_key])

    # L·∫•y nh√£n cao nh·∫•t (m·ª©c ƒë·ªô nghi√™m tr·ªçng nh·∫•t)
    return max(labels) if labels else 0

def parse_event(record, scenario):
    datum = get_datum(record)
    evt = datum.get("com.bbn.tc.schema.avro.cdm18.Event")

    # N·∫øu d√≤ng n√†y kh√¥ng ph·∫£i Event th√¨ b·ªè qua
    if not evt:
        return None

    # L·∫•y UUID Subject v√† Object t·ª´ Event
    # CDM ƒë√¥i khi l∆∞u uuid d·∫°ng string, ƒë√¥i khi d·∫°ng dict {"uuid": "..."}
    subj_raw = evt.get("subject")
    obj_raw = evt.get("object")

    subj_uuid = subj_raw.get("uuid") if isinstance(subj_raw, dict) else subj_raw
    obj_uuid = obj_raw.get("uuid") if isinstance(obj_raw, dict) else obj_raw

    # --- TRA C·ª®U TH√îNG TIN T·ª™ PASS 1 (LOOKUP) ---

    # 1. Th√¥ng tin Process (Subject)
    s_info = subject_info.get(subj_uuid, {})
    proc_name = s_info.get("proc_name", "unknown")
    principal_uuid = s_info.get("principal_uuid")

    # 2. Th√¥ng tin User (Principal)
    u_info = principal_info.get(principal_uuid, {})

    # 3. Th√¥ng tin Object (File/IP)
    o_info = object_info.get(obj_uuid, {})
    obj_path = o_info.get("path")
    obj_ip = o_info.get("ip")
    obj_type = o_info.get("type", "UNKNOWN")

    # --- G√ÅN NH√ÉN ---
    label_id = match_label(proc_name, obj_path, obj_ip)

    return {
        "timestamp": evt.get("timestampNanos", 0) // 1_000_000_000,
        "operation": evt.get("type"), # V√≠ d·ª•: EVENT_READ, EVENT_WRITE

        "proc_uuid": str(subj_uuid),
        "proc_name": proc_name,

        "user_uid": u_info.get("uid"),
        "username": u_info.get("username"),

        "obj_type": obj_type,
        "obj_path": obj_path,
        "obj_ip": obj_ip,

        "label_id": label_id,
        "label_name": LABEL_MAPPING.get(str(label_id), "Benign"),
        "scenario": scenario
    }

# =========================================================
# CH·∫†Y CH∆Ø∆†NG TR√åNH
# =========================================================

# T√¨m t·∫•t c·∫£ file JSON (lo·∫°i b·ªè file n√©n)
files = sorted([p for p in DATA_DIR.glob("**/*.json*") if "tar.gz" not in p.name])

# --- PASS 1: Build Maps (ƒê·ªçc l∆∞·ªõt ƒë·ªÉ l·∫•y Metadata) ---
print(f"üîç PASS 1: Qu√©t Metadata t·ª´ {len(files)} files...")
for f in tqdm(files, desc="Metadata Scan"):
    with open(f, 'r', encoding='utf-8') as fh:
        for line in fh:
            try:
                rec = json.loads(line)
                parse_metadata(rec)
            except:
                continue

print(f"üìä ƒê√£ h·ªçc ƒë∆∞·ª£c: {len(principal_info)} Users, {len(subject_info)} Process, {len(object_info)} Objects")

# --- PASS 2: Parse Events & Write Parquet ---
print("\nüß† PASS 2: X·ª≠ l√Ω Events & G√°n nh√£n...")
writer = None
buffer = []

for f in files:
    scenario = f.parent.name # L·∫•y t√™n th∆∞ m·ª•c l√†m t√™n k·ªãch b·∫£n

    with open(f, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, desc=f.name, leave=False):
            try:
                rec = json.loads(line)
                row = parse_event(rec, scenario)

                if row:
                    buffer.append(row)
            except:
                continue

            # Ghi xu·ªëng ƒëƒ©a khi buffer ƒë·∫ßy (tr√°nh tr√†n RAM)
            if len(buffer) >= BATCH_SIZE:
                table = pa.Table.from_pylist(buffer)
                if writer is None:
                    writer = pq.ParquetWriter(OUTPUT_PARQUET, table.schema, compression="snappy")
                writer.write_table(table)
                buffer.clear()

        gc.collect() # D·ªçn r√°c b·ªô nh·ªõ sau m·ªói file

# Ghi n·ªët ph·∫ßn c√≤n d∆∞
if buffer:
    table = pa.Table.from_pylist(buffer)
    if writer is None:
        writer = pq.ParquetWriter(OUTPUT_PARQUET, table.schema, compression="snappy")
    writer.write_table(table)

if writer:
    writer.close()

print(f"\n‚úÖ HO√ÄN T·∫§T! File k·∫øt qu·∫£: {OUTPUT_PARQUET}")

In [None]:
!pip install torch torchvision torchaudio



In [None]:
!pip install torch-geometric

