In [None]:
# Mount Google Drive at /content/drive so the parquet files under MyDrive can be
# read and written by the cells below. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import hashlib
import numpy as np
import pandas as pd

In [None]:
# Load the feature table; every transcript below is generated from one row of `ml`.
ml = pd.read_parquet("/content/drive/MyDrive/Hackathon - Berca/ml_features.parquet")

In [None]:
# Preview the first rows to sanity-check column names and value formats.
ml.head()

Unnamed: 0,timestamp_1h,city,city_key,province,region,olt_id,olt_key,fdt_name,fat_name,cluster_id,...,snr_avg,rx_power_avg,rx_power_avg_dbm,temperature_avg_c,temp_anomaly_score,hour_of_day,day_of_week,is_maintenance_window,label_outage_1h,row_number
0,2023-01-01,Aceh,1,Aceh,Sumatra,OLT-ACE7E-01,1,FDT-ACE7E-01-A,FAT-ACE7E-01-B,ACE7E-CL11,...,26.618261,-18.968959,-18.968959,39.449085,0.579634,0,6,0,0,0
1,2023-01-01,Sulawesi Barat,26,Sulawesi Barat,Sulawesi,OLT-SUL6F-01,57,FDT-SUL6F-01-A,FAT-SUL6F-01-B,SUL6F-CL11,...,26.302077,-19.099787,-19.099787,38.014517,0.005807,0,6,0,0,1
2,2023-01-01,Lampung,18,Lampung,Sumatra,OLT-LAMB9-02,36,FDT-LAMB9-02-A,FAT-LAMB9-02-B,LAMB9-CL23,...,27.47863,-19.143296,-19.143296,38.780984,0.312394,0,6,0,0,2
3,2023-01-01,Jakarta Raya,7,Jakarta Raya,Java,OLT-JAK33-02,14,FDT-JAK33-02-A,FAT-JAK33-02-B,JAK33-CL23,...,25.767517,-18.720779,-18.720779,37.531475,-0.18741,0,6,0,0,3
4,2023-01-01,Jambi,8,Jambi,Sumatra,OLT-JAM02-01,15,FDT-JAM02-01-A,FAT-JAM02-01-B,JAM02-CL11,...,25.850164,-19.255491,-19.255491,36.812895,-0.474842,0,6,0,0,4


In [None]:
# Show row count, column list and dtypes of the feature table.
ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3578976 entries, 0 to 3578975
Data columns (total 32 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   timestamp_1h           datetime64[ns]
 1   city                   object        
 2   city_key               int64         
 3   province               object        
 4   region                 object        
 5   olt_id                 object        
 6   olt_key                int64         
 7   fdt_name               object        
 8   fat_name               object        
 9   cluster_id             object        
 10  cluster_key            int64         
 11  cluster_name           object        
 12  ont_registered         int64         
 13  offline_ont_now        int64         
 14  offline_ont_ratio      float64       
 15  link_loss_count        int64         
 16  bad_rsl_count          int64         
 17  high_temp_count        int64         
 18  dying_gasp_count      

In [None]:
# ==========================================================
# Output: transcripts.parquet in the project Drive folder (written by the save cell below)
# Join keys: (timestamp_1h, olt_id, cluster_id)
# ==========================================================

# ---- Helpers -------------------------------------------------
def _rng_from_key(olt_id, cluster_id, ts):
    key = f"{olt_id}|{cluster_id}|{ts}"
    seed = int(hashlib.md5(key.encode("utf-8")).hexdigest()[:8], 16)
    return np.random.default_rng(seed)

def _pick(rng, options):
    return options[rng.integers(0, len(options))]

def _pct(x):
    try: return f"{100*float(x):.1f}%"
    except: return "n/a"

def _dbm(x):
    try: return f"{float(x):.1f} dBm"
    except: return "n/a"

def _sev_bucket(score):
    if score >= 3.0: return "critical"
    if score >= 1.8: return "major"
    if score >= 0.8: return "minor"
    return "normal"

# ---- Phrase banks ---------------
# Vocabulary lists: _build_row draws one entry from each of SPEAKERS, ACTIONS,
# SYMPTOMS, VERBS and CHANNELS per row. Their values fill the {spk}, {act} and
# {sym} placeholders and the leading channel tag of the transcript.
SPEAKERS = ["NOC", "Field Operations", "Tier-2", "L1 Support", "Duty Engineer", "Incident Manager"]
ACTIONS  = ["monitoring", "dispatch on standby", "reroute evaluation", "vendor escalation check",
            "pre-emptive site visit", "temporary traffic shift", "config validation"]
SYMPTOMS = ["intermittent drops", "packet loss", "unstable latency", "brief micro-outages",
            "customer pings failing", "sporadic timeouts", "jitter spikes"]
# NOTE: no template below has a placeholder for the verb; _build_row still draws one.
VERBS    = ["observed", "noted", "detected", "seeing", "tracking", "confirming"]
# Cause phrases, one list per driver family (optical / network / thermal); they
# are combined into the {drivers} placeholder.
CAUSES_OPT = ["low optical power", "fiber attenuation", "SNR degradation", "connector contamination"]
CAUSES_NET = ["rising offline ONTs", "fault burst", "congestion symptoms", "alarm flapping"]
CAUSES_TH  = ["temperature spike", "thermal stress", "overheat condition"]
CHANNELS = ["[NOC Log]", "[Ops Chat]", "[Ticket Note]", "[Pager]", "[Shift Handover]", "[Incident Memo]"]
# Closing phrases appended to maintenance notes that carry no anomaly tail.
CLOSERS  = ["continuing monitoring", "preparing mitigation", "coordinating with field",
            "awaiting stabilization", "tracking KPIs closely", "no customer broadcast yet"]
# Sentence templates, one list per event type. Placeholders are filled with
# str.format; every name used in a template must be passed by the caller.
# Base sentence for rows inside a maintenance window.
MAINT    = [
    "{spk}: Planned maintenance window active for {loc}; brief service impact possible. Metrics: {metrics}.",
    "{spk}: Maintenance in progress at {loc}; traffic may be rerouted. Current status: {metrics}.",
    "{spk}: Change window at {loc}; alarms expected. {metrics}. Proceed per plan."
]
# Tail sentence added to a maintenance note when severity is elevated ("MW" = maintenance window).
MAINT_ALERTY = [
    "Elevated signals during MW — {metrics}. Keep {act}.",
    "Precursors present despite MW ({metrics}); stage field team.",
    "Unusual spikes during MW; {metrics}. Review rollback plan."
]
# Used when the row is labelled as preceding an outage (and is not in maintenance).
OUTAGE_NEXT = [
    "{spk}: Imminent outage risk at {loc}; drivers: {drivers}. {metrics}. Users report {sym}. Action: {act}.",
    "{spk}: Precursor pattern consistent with impact next hour at {loc} — {drivers}. {metrics}.",
    "{spk}: High likelihood of service disruption soon at {loc}; indicators={drivers}. {metrics}. Mitigation in flight."
]
# Used for elevated severity without an outage label.
DEGRADED = [
    "{spk}: Service degradation at {loc}; {metrics}. Customers experiencing {sym}.",
    "{spk}: Elevated alarms with partial impact at {loc}; {metrics}. Plan: {act}.",
    "{spk}: Quality dip observed at {loc} ({sym}); {metrics}. Monitoring closely."
]
# Used for low but non-zero severity.
MINOR = [
    "{spk}: Minor anomalies at {loc}; {metrics}. Continue {act}.",
    "{spk}: Low-grade alerts at {loc}; {metrics}. No customer impact reported.",
    "{spk}: Small signal drift at {loc}; {metrics}."
]
# Used when nothing is flagged. Only the first template embeds {metrics};
# the other two produce a note without any numbers.
HEALTHY = [
    "{spk}: KPIs normal at {loc}; {metrics}.",
    "{spk}: Operations stable at {loc}; no outage indicators.",
    "{spk}: Healthy state at {loc}; no material alarms."
]

def _build_row(row):
    """Generate one synthetic operator transcript for a single feature row.

    All random choices come from a generator seeded by (olt_id, cluster_id,
    timestamp_1h), so the same row always yields the same transcript. The
    text depends on the exact order of RNG draws below; adding, removing or
    reordering a draw changes the generated output for every row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing timestamp_1h,
            city, cluster_name, olt_id, cluster_id, label_outage_1h,
            is_maintenance_window, alarm_spike_flag, fault_rate,
            offline_ont_ratio, snr_avg, rx_power_avg_dbm, temperature_avg_c,
            link_loss_count, bad_rsl_count, high_temp_count, dying_gasp_count.

    Returns:
        dict with keys timestamp_1h, olt_id, cluster_id, transcript_id,
        event_type, severity_bucket and event_transcript.

    Raises:
        ValueError / TypeError: from the int()/float() conversions if a signal
            is missing or non-numeric (e.g. int() of NaN).
    """
    # Keys: identify the row and seed its private random stream.
    ts  = row["timestamp_1h"]
    oid = row["olt_id"]
    cid = row["cluster_id"]
    rng = _rng_from_key(oid, cid, ts)

    # Signals (all from ml)
    outage_next = int(row["label_outage_1h"])
    maint = int(row["is_maintenance_window"])
    spike = int(row["alarm_spike_flag"])
    fault = float(row["fault_rate"])
    offr  = float(row["offline_ont_ratio"])
    snr   = float(row["snr_avg"])
    rxp   = float(row["rx_power_avg_dbm"])
    tc    = float(row["temperature_avg_c"])
    LL, BR, HT, DG = int(row["link_loss_count"]), int(row["bad_rsl_count"]), int(row["high_temp_count"]), int(row["dying_gasp_count"])

    # Boolean driver flags derived from fixed thresholds on the raw signals.
    optical = (rxp < -22.5) or (snr < 23.0)
    thermal = (tc  > 42.0)
    congest = (offr > 0.10) or (fault > 0.010)

    # Weighted sum of indicators (booleans count as 0/1); the maximum is 6.7.
    # Note the fault/offline cut-offs here (0.012 / 0.12) are stricter than
    # the ones used for the `congest` flag above (0.010 / 0.10).
    severity = (
        2.0*outage_next +
        1.2*spike +
        1.2*(fault > 0.012) +
        1.0*(offr > 0.12) +
        0.7*optical +
        0.6*thermal
    )
    sev_bucket = _sev_bucket(severity)

    # Per-row vocabulary. Each _pick consumes one RNG draw, in this order.
    loc = f"{row['city']} / {row['cluster_name']} ({oid})"
    spk = _pick(rng, SPEAKERS)
    act = _pick(rng, ACTIONS)
    sym = _pick(rng, SYMPTOMS)
    chan = _pick(rng, CHANNELS)
    # NOTE: `verb` is not referenced again in this function; the draw still
    # advances the RNG, so removing it would change all later choices.
    verb = _pick(rng, VERBS)

    # Metric fragments; the alarm breakdown is included only if any alarm fired.
    metrics_bits = []
    total_alarms = LL+BR+HT+DG
    if total_alarms > 0: metrics_bits.append(f"alarms LL:{LL} RSL:{BR} HT:{HT} DG:{DG}")
    metrics_bits.append(f"offline={_pct(offr)}")
    metrics_bits.append(f"fault_rate={fault:.3%}")
    metrics_bits.append(f"rx={_dbm(rxp)}, SNR={snr:.1f} dB")
    metrics_bits.append(f"temp={tc:.1f} C")
    # Shuffle order for more variety
    rng.shuffle(metrics_bits)
    metrics = "; ".join(metrics_bits)

    # Build driver text: one phrase per active driver flag, with a generic
    # fallback when no flag is set.
    driver_parts = []
    if optical: driver_parts.append(_pick(rng, CAUSES_OPT))
    if congest: driver_parts.append(_pick(rng, CAUSES_NET))
    if thermal: driver_parts.append(_pick(rng, CAUSES_TH))
    if not driver_parts:
        driver_parts.append(_pick(rng, ["multiple transient alarms", "unstable link metrics", "anomalous KPIs"]))
    # Randomly vary how many we include: with probability 0.4 keep only the
    # first driver. rng.random() is drawn only when there is more than one.
    if len(driver_parts) > 1 and rng.random() < 0.4:
        driver_parts = driver_parts[:1]
    drivers = ", ".join(driver_parts)

    # Decide template branch. Precedence: maintenance > outage label > severity.
    # The 1.8 and 0.8 cut-offs equal the "major" and "minor" floors of _sev_bucket.
    if maint:
        base = _pick(rng, MAINT).format(spk=spk, loc=loc, metrics=metrics)
        if severity >= 1.8:
            # Both the base and the tail template embed {metrics}, so the
            # metrics string appears twice in this variant.
            tail = _pick(rng, MAINT_ALERTY).format(metrics=metrics, act=act)
            text = f"{chan} {base} {tail}"
            event_type = "maintenance+anomaly"
        else:
            text = f"{chan} {base} {_pick(rng, CLOSERS)}."
            event_type = "maintenance"
    elif outage_next:
        # Only this branch uses the `drivers` text.
        text = _pick(rng, OUTAGE_NEXT).format(spk=spk, loc=loc, drivers=drivers, metrics=metrics, sym=sym, act=act)
        text = f"{chan} {text}"
        event_type = "imminent_outage"
    elif severity >= 1.8:
        text = _pick(rng, DEGRADED).format(spk=spk, loc=loc, metrics=metrics, sym=sym, act=act)
        text = f"{chan} {text}"
        event_type = "degraded"
    elif severity >= 0.8:
        text = _pick(rng, MINOR).format(spk=spk, loc=loc, metrics=metrics, act=act)
        text = f"{chan} {text}"
        event_type = "minor"
    else:
        text = _pick(rng, HEALTHY).format(spk=spk, loc=loc, metrics=metrics)
        text = f"{chan} {text}"
        event_type = "healthy"

    # Stable ID: first 16 hex digits of the MD5 of the same key string that
    # seeds the RNG, upper-cased and prefixed with "TX-".
    raw_id = f"{oid}|{cid}|{ts}"
    transcript_id = "TX-" + hashlib.md5(raw_id.encode("utf-8")).hexdigest()[:16].upper()

    return {
        "timestamp_1h": row["timestamp_1h"],
        "olt_id": row["olt_id"],
        "cluster_id": row["cluster_id"],
        "transcript_id": transcript_id,
        "event_type": event_type,
        "severity_bucket": sev_bucket,
        "event_transcript": text
    }

In [None]:
# ---- Build separate transcript table -------------------------
# Keep only the columns _build_row reads; .copy() detaches the slice from `ml`.
_subset = ml[[
    "timestamp_1h","city","cluster_name","olt_id","cluster_id",
    "label_outage_1h","is_maintenance_window","alarm_spike_flag",
    "fault_rate","offline_ont_ratio","snr_avg","rx_power_avg_dbm","temperature_avg_c",
    "link_loss_count","bad_rsl_count","high_temp_count","dying_gasp_count"
]].copy()

# Row-wise apply: _build_row runs once per row in pure Python (~3.58M rows in
# `ml`), so this cell is slow. result_type="expand" turns each returned dict
# into one output row whose columns are the dict keys.
transcripts = _subset.apply(_build_row, axis=1, result_type="expand")

In [None]:
# Save as a SEPARATE artifact
# Written next to the input features file; index=False drops the RangeIndex
# so only the transcript columns are stored.
ml_text_out = "/content/drive/MyDrive/Hackathon - Berca/transcripts.parquet"
transcripts.to_parquet(ml_text_out, index=False)

In [None]:
# Human-readable summary of the saved artifact. The column list is a
# hard-coded string, not read from `transcripts`.
print("Created transcripts table with English-only, diverse operator notes.")
print("Columns: timestamp_1h, olt_id, cluster_id, transcript_id, event_type, severity_bucket, event_transcript")
print("Saved:", ml_text_out)

Created transcripts table with English-only, diverse operator notes.
Columns: timestamp_1h, olt_id, cluster_id, transcript_id, event_type, severity_bucket, event_transcript
Saved: /content/drive/MyDrive/Hackathon - Berca/transcripts.parquet


In [None]:
# Disable column-width truncation so full transcript text is shown in previews.
pd.set_option('display.max_colwidth', None)

In [None]:
# Preview the first ten generated transcripts with all output columns.
transcripts[['timestamp_1h', 'olt_id', 'cluster_id', 'transcript_id', 'event_type', 'severity_bucket', 'event_transcript']].head(10)

Unnamed: 0,timestamp_1h,olt_id,cluster_id,transcript_id,event_type,severity_bucket,event_transcript
0,2023-01-01,OLT-ACE7E-01,ACE7E-CL11,TX-DAB28A4D8D52E0B3,healthy,normal,[Shift Handover] Tier-2: Healthy state at Aceh / Cluster-ACE7E-11 (OLT-ACE7E-01); no material alarms.
1,2023-01-01,OLT-SUL6F-01,SUL6F-CL11,TX-6DD8BCF266DFAD3F,healthy,normal,[Ops Chat] Duty Engineer: Healthy state at Sulawesi Barat / Cluster-SUL6F-11 (OLT-SUL6F-01); no material alarms.
2,2023-01-01,OLT-LAMB9-02,LAMB9-CL23,TX-24B411486258A543,healthy,normal,[Ticket Note] Tier-2: Operations stable at Lampung / Cluster-LAMB9-23 (OLT-LAMB9-02); no outage indicators.
3,2023-01-01,OLT-JAK33-02,JAK33-CL23,TX-4AB5BBDEED93E2E4,healthy,normal,[Ticket Note] Duty Engineer: Operations stable at Jakarta Raya / Cluster-JAK33-23 (OLT-JAK33-02); no outage indicators.
4,2023-01-01,OLT-JAM02-01,JAM02-CL11,TX-5284DF368D75BF19,healthy,normal,[Incident Memo] Duty Engineer: Healthy state at Jambi / Cluster-JAM02-11 (OLT-JAM02-01); no material alarms.
5,2023-01-01,OLT-KALAB-02,KALAB-CL22,TX-67FF4B8062E64936,healthy,normal,"[Shift Handover] Tier-2: KPIs normal at Kalimantan Barat / Cluster-KALAB-22 (OLT-KALAB-02); temp=37.5 C; offline=0.3%; rx=-19.6 dBm, SNR=26.5 dB; fault_rate=0.162%; alarms LL:1 RSL:0 HT:0 DG:0."
6,2023-01-01,OLT-SUL98-01,SUL98-CL13,TX-08246462226DEF43,healthy,normal,"[NOC Log] Field Operations: KPIs normal at Sulawesi Utara / Cluster-SUL98-13 (OLT-SUL98-01); alarms LL:0 RSL:0 HT:1 DG:2; offline=0.4%; rx=-18.2 dBm, SNR=26.7 dB; fault_rate=0.569%; temp=38.9 C."
7,2023-01-01,OLT-KALE7-01,KALE7-CL13,TX-89AEED4F305AB584,healthy,normal,[Shift Handover] L1 Support: Operations stable at Kalimantan Timur / Cluster-KALE7-13 (OLT-KALE7-01); no outage indicators.
8,2023-01-01,OLT-RIAE4-01,RIAE4-CL12,TX-8D191D315DDC50FB,healthy,normal,[Ticket Note] Tier-2: Healthy state at Riau / Cluster-RIAE4-12 (OLT-RIAE4-01); no material alarms.
9,2023-01-01,OLT-KALF8-02,KALF8-CL22,TX-0563CC2C73D36C99,healthy,normal,[Incident Memo] Tier-2: Healthy state at Kalimantan Utara / Cluster-KALF8-22 (OLT-KALF8-02); no material alarms.
