In [2]:
import os
import re, hashlib

import numpy as np
import pandas as pd

# ------------------- Config --------------------
# Seed the legacy global NumPy RNG; every later np.random.* call draws from this stream.
np.random.seed(7)

# Hourly simulation window and sampling frequency.
start, end = pd.Timestamp("2023-01-01 00:00:00"), pd.Timestamp("2024-12-31 23:00:00")
freq = "1h"

# Target prevalence control (approximate, global)
TARGET_OUTAGE_RATE = 0.003  # ~0.3% hours positive; adjust 0.002–0.006 for realism

# Primary geography = PROVINCES (stored under the column name "city" for schema stability).
# The list order is significant: later loops iterate it while drawing random numbers.
provinces = [
    "Aceh", "Bali", "Bangka Belitung", "Banten", "Bengkulu", "Gorontalo",
    "Jakarta Raya", "Jambi", "Jawa Barat", "Jawa Tengah", "Jawa Timur",
    "Kalimantan Barat", "Kalimantan Selatan", "Kalimantan Tengah",
    "Kalimantan Timur", "Kalimantan Utara", "Kepulauan Riau", "Lampung",
    "Maluku", "Maluku Utara", "Nusa Tenggara Barat", "Nusa Tenggara Timur",
    "Papua", "Papua Barat", "Riau", "Sulawesi Barat", "Sulawesi Selatan",
    "Sulawesi Tengah", "Sulawesi Tenggara", "Sulawesi Utara",
    "Sumatera Barat", "Sumatera Selatan", "Sumatera Utara", "Yogyakarta",
]

# Identity mapping: the "province" column simply repeats the province name.
province_map = dict(zip(provinces, provinces))

# Provinces grouped by macro-region; flattened below into province -> region.
_PROVINCES_BY_REGION = {
    "Sumatra": [
        "Aceh", "Bangka Belitung", "Bengkulu", "Jambi", "Kepulauan Riau",
        "Lampung", "Riau", "Sumatera Barat", "Sumatera Selatan", "Sumatera Utara",
    ],
    "Java": [
        "Banten", "Jakarta Raya", "Jawa Barat", "Jawa Tengah", "Jawa Timur", "Yogyakarta",
    ],
    "Kalimantan": [
        "Kalimantan Barat", "Kalimantan Selatan", "Kalimantan Tengah",
        "Kalimantan Timur", "Kalimantan Utara",
    ],
    "Sulawesi": [
        "Gorontalo", "Sulawesi Barat", "Sulawesi Selatan",
        "Sulawesi Tengah", "Sulawesi Tenggara", "Sulawesi Utara",
    ],
    "Bali & Nusa Tenggara": ["Bali", "Nusa Tenggara Barat", "Nusa Tenggara Timur"],
    "Maluku": ["Maluku", "Maluku Utara"],
    "Papua": ["Papua", "Papua Barat"],
}

# Macro-region by province
region_map = {
    member: region_name
    for region_name, members in _PROVINCES_BY_REGION.items()
    for member in members
}

# Short, deterministic code per province for IDs
def province_code(name: str) -> str:
    """Build a 5-character identifier prefix for a province name.

    The first three characters are the upper-cased initials of the alphabetic
    words in ``name`` when there are at least three words; otherwise they are
    the first three letters of the name (non-letters removed), padded with
    ``X`` when the name is shorter than three letters. The last two characters
    are the first two hex digits of the MD5 digest of ``name``.

    Note: the two hex digits only make a clash between provinces that share a
    prefix unlikely; they do not guarantee uniqueness.
    """
    words = re.findall(r"[A-Za-z]+", name)
    initials = "".join(word[0] for word in words).upper()
    # Fewer than three words: fall back to the leading letters of the whole name.
    stem = initials if len(initials) >= 3 else re.sub(r"[^A-Za-z]", "", name).upper()
    prefix = (stem + "XXX")[:3]
    digest = hashlib.md5(name.encode("utf-8")).hexdigest()
    return f"{prefix}{digest[:2].upper()}"

# Topology fan-out: OLTs per province and clusters per OLT.
olts_per_city, clusters_per_olt = 2, 3

# Hourly timeline shared by every simulated entity.
time_index = pd.date_range(start=start, end=end, freq=freq)
T = time_index.size

# Calendar features as plain NumPy arrays, one value per timeline step.
hours = time_index.hour.to_numpy()
dows = time_index.dayofweek.to_numpy()
months = time_index.month.to_numpy()
doys = time_index.dayofyear.to_numpy()

# ----------------- Seasonality -----------------
def _circular_hour_distance(hour_arr, center):
    """Shortest distance in hours between ``hour_arr`` and ``center`` on a 24-hour clock."""
    delta = np.abs(np.asarray(hour_arr) - center) % 24
    return np.minimum(delta, 24 - delta)


def diurnal_curve(hour_arr):
    """Hour-of-day load multiplier: baseline 0.6 plus two Gaussian bumps.

    Parameters
    ----------
    hour_arr : array-like of hour-of-day values (0-23).

    Returns
    -------
    numpy.ndarray (or NumPy scalar for scalar input) equal to
    ``0.6 + 0.5*G(d20, sigma=3) + 0.2*G(d12, sigma=4)``, where ``d20`` / ``d12``
    are circular distances to 20:00 and 12:00.

    The distance is measured both ways around the clock. Using
    ``(hour - center) % 24`` instead would treat 19:00 as 23 hours away from
    the 20:00 peak, so each bump would only exist after its centre.
    """
    # Busy around 20:00, smaller midday bump
    evening = 0.5 * np.exp(-_circular_hour_distance(hour_arr, 20) ** 2 / (2 * 3 ** 2))
    midday = 0.2 * np.exp(-_circular_hour_distance(hour_arr, 12) ** 2 / (2 * 4 ** 2))
    return 0.6 + evening + midday

def weekly_curve(dow_arr):
    """Day-of-week multiplier: 1.05 where ``dow_arr`` is 5 or 6, otherwise 1.0."""
    # Weekend bias
    is_weekend = np.isin(dow_arr, [5, 6])
    uplift = 0.05 * is_weekend.astype(float)
    return 1.0 + uplift

def seasonal_factor(month_arr, doy_arr):
    """Seasonal multiplier: a yearly sine wave times a rainy-month uplift.

    ``doy_arr`` (day of year) drives a +/-10% sine wave with period 365.25 days
    and phase offset 0.7 rad; ``month_arr`` adds a 20% uplift for months
    11, 12, 1, 2 and 3. The two factors are multiplied element-wise.
    """
    # Yearly + rainy season
    wet_months = [11, 12, 1, 2, 3]
    phase = 2*np.pi*doy_arr/365.25 - 0.7
    annual_wave = 1.0 + 0.1*np.sin(phase)
    wet_uplift = 1.0 + 0.2*np.isin(month_arr, wet_months).astype(float)
    return annual_wave * wet_uplift

# Seasonality multipliers evaluated once on the shared timeline.
diurnal = diurnal_curve(hours)
weekly = weekly_curve(dows)
season = seasonal_factor(months, doys)

# Maintenance flags: 0.2% per hour, doubled between 01:00 and 04:00 (inclusive).
night_window = (hours >= 1) & (hours <= 4)
p_maint = 0.002 * (1 + night_window.astype(int))
is_maint_arr = (np.random.rand(T) < p_maint).astype(int)

# Province pressure spikes (macro load bursts): a flat 0.02 floor plus 40
# Hann-shaped bursts at distinct random hours. Bursts may overlap and add up.
city_pressure = {}
for prov in provinces:
    pressure = np.full(T, 0.02, dtype=float)
    burst_centers = np.random.choice(T, size=40, replace=False)
    for center in burst_centers:
        # Draw order (width, then amplitude) matters for the seeded global RNG.
        width = np.random.randint(3, 24)
        amp = np.random.uniform(0.3, 0.8)
        half = width // 2
        # Clamp the burst window to the timeline.
        lo, hi = max(0, center - half), min(T, center + half)
        pressure[lo:hi] += amp * np.hanning(hi - lo)
    city_pressure[prov] = pressure

# ----------------------------
# Dimension keys
# ----------------------------
city_list = sorted(set(provinces))  # "city" holds province names
city_key_map = {name: key for key, name in enumerate(city_list, start=1)}

# Entities with topology names (derive OLT/cluster keys).
# One record per (province, OLT, cluster); base_onts is drawn from the global RNG
# once per cluster, in loop order.
entities = []
for prov in provinces:
    code = province_code(prov)
    for olt_no in range(1, olts_per_city + 1):
        olt_id = f"OLT-{code}-{olt_no:02d}"
        fdt_name = olt_id.replace("OLT","FDT") + "-A"
        fat_name = olt_id.replace("OLT","FAT") + "-B"
        for cluster_no in range(1, clusters_per_olt + 1):
            entities.append({
                "city": prov,
                "city_key": city_key_map[prov],
                "province": province_map[prov],
                "region": region_map[prov],
                "olt_id": olt_id,
                "fdt_name": fdt_name,
                "fat_name": fat_name,
                "cluster_id": f"{code}-CL{olt_no}{cluster_no}",
                "cluster_name": f"Cluster-{code}-{olt_no}{cluster_no}",
                "base_onts": np.random.randint(350, 900),
            })
entities_df = pd.DataFrame(entities)

# Create OLT and Cluster keys: 1-based surrogate keys in sorted-ID order.
olt_key_map = {
    olt: key for key, olt in enumerate(sorted(entities_df['olt_id'].unique()), start=1)
}
cluster_key_map = {
    cluster: key for key, cluster in enumerate(sorted(entities_df['cluster_id'].unique()), start=1)
}
entities_df['olt_key'] = entities_df['olt_id'].map(olt_key_map)
entities_df['cluster_key'] = entities_df['cluster_id'].map(cluster_key_map)

# =========================
# Helper: latent stress AR
# =========================
def latent_stress_series(T, rng, base_level=0.05, ar=0.95, shock_prob=0.002, shock_scale=0.5):
    """
    Simulate a latent stress path of length ``T`` in [0, 1].

    The path starts at ``base_level`` and follows an AR(1) recursion that
    reverts towards ``base_level`` with coefficient ``ar``. Every step adds
    N(0, 0.02) noise and, with probability ``shock_prob``, a jump drawn from
    U(0.2, shock_scale). The running state is capped to [0, 1.5] so a shock can
    persist for a while above 1; the returned array is clipped to [0, 1].

    ``rng`` must provide ``random()``, ``uniform(lo, hi)`` and ``normal(mu, sd)``;
    each step calls them in exactly that order.
    """
    path = np.zeros(T, dtype=float)
    path[0] = base_level
    previous = path[0]
    for step in range(1, T):
        # All three draws happen on every step, shock or not.
        shock_fires = rng.random() < shock_prob
        shock_size = rng.uniform(0.2, shock_scale)
        noise = rng.normal(0, 0.02)
        candidate = ar*previous + (1-ar)*base_level + shock_fires * shock_size + noise
        previous = np.clip(candidate, 0, 1.5)
        path[step] = previous
    return np.clip(path, 0, 1)

# ======================================
# Row generation with precursors
# ======================================
def _entity_seed(*parts):
    """Return a 32-bit seed derived from the string ``parts``, identical in every Python process.

    The built-in ``hash()`` of str (and of tuples containing str) is salted per
    interpreter start unless PYTHONHASHSEED is fixed, so seeding from it makes
    the generated dataset change on every kernel restart.
    """
    digest = hashlib.sha256("|".join(parts).encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "big")


# One hourly DataFrame per (province, OLT, cluster) entity; concatenated after the loop.
rows = []

for _, ent in entities_df.iterrows():
    prov = ent['city']
    olt_id = ent['olt_id']; cluster_id = ent['cluster_id']
    fdt_name = ent['fdt_name']; fat_name = ent['fat_name']; cluster_name = ent['cluster_name']
    base_onts = ent['base_onts']
    # Subscriber base grows linearly by a random 20-119 ONTs over the window
    # (drawn from the seeded global RNG, so entity order matters).
    growth = np.linspace(0, np.random.randint(20,120), T).astype(int)
    ont_registered = base_onts + growth

    # Per-entity generator with a process-independent seed (see _entity_seed).
    rng = np.random.default_rng(_entity_seed(prov, olt_id, cluster_id))

    # Latent stress combines AR process + macro factors
    stress = latent_stress_series(
        T, rng,
        base_level=0.05 + 0.05*np.mean(season) + 0.03*np.mean(weekly),
        ar=rng.uniform(0.94, 0.98),
        shock_prob=0.0025,
        shock_scale=0.6
    )
    # Add diurnal/seasonal/province spikes
    stress = np.clip(
        (0.6*stress +
         0.15*(diurnal-0.6) +
         0.10*(season-1.0) +
         0.15*city_pressure[prov]), 0, 1.2
    )

    # Base hazard tuned towards target; boosted by maintenance and stress.
    # Every multiplier below is >= 1, so the realised outage rate sits above
    # TARGET_OUTAGE_RATE rather than matching it.
    base_hazard = TARGET_OUTAGE_RATE * rng.uniform(0.8, 1.25)
    hazard = (
        base_hazard
        * (1 + 2.5*stress)
        * (1 + 0.9*is_maint_arr)
        * (1 + 0.15*(hours == 3))
    )
    hazard = np.clip(hazard, TARGET_OUTAGE_RATE*0.2, 0.08)

    # Sample outages: independent Bernoulli draw per hour.
    outage = (rng.random(T) < hazard).astype(int)

    # Precursor window: flag the 1-3 hours before each outage hour.
    # The span is drawn once per entity; the outage hour itself is not flagged here.
    precursor_span = rng.integers(1, 4)  # 1–3 hours
    precursor = np.zeros(T, dtype=int)
    outage_idx = np.where(outage==1)[0]
    for t0 in outage_idx:
        s = max(0, t0 - precursor_span)
        precursor[s:t0] = 1

    # Baseline event rates (per ONT per hour)
    base_link_loss = rng.uniform(0.0004, 0.0012)
    base_bad_rsl   = rng.uniform(0.0005, 0.0016)
    base_temp      = rng.uniform(0.0002, 0.0010)
    base_dying     = rng.uniform(0.0001, 0.0009)

    maint_mult = 1.0 + 0.6*is_maint_arr
    # Strengthen pre-outage signal by adding precursor multipliers
    link_loss_rate = (base_link_loss*diurnal*weekly*season*maint_mult
                      * (1 + 1.0*precursor + 2.5*outage))
    bad_rsl_rate   = (base_bad_rsl*(0.6 + 0.6*season)
                      * (1 + 0.8*precursor + 2.0*outage))
    temp_rate      = (base_temp*(0.7 + 0.4*diurnal)
                      * (1 + 0.5*precursor + 1.5*outage))
    dying_rate     = (base_dying*(0.8 + 0.5*weekly)
                      * (1 + 0.7*precursor + 2.2*outage))

    # Counts: Poisson with mean = per-ONT rate * registered ONTs.
    link_loss_cnt = rng.poisson(link_loss_rate*ont_registered)
    bad_rsl_cnt   = rng.poisson(bad_rsl_rate*ont_registered)
    high_temp_cnt = rng.poisson(temp_rate*ont_registered)
    dying_cnt     = rng.poisson(dying_rate*ont_registered)

    # Offline ratio affected by precursor/outage and alarm counts; capped at 35%.
    offline_ratio = np.clip(
        0.04*precursor + 0.22*outage
        + 0.35*(link_loss_cnt+bad_rsl_cnt)/np.maximum(1, ont_registered)
        + rng.normal(0.003,0.004,T),
        0, 0.35
    )
    offline_ont = np.round(offline_ratio * ont_registered).astype(int)

    # Trap/trend reacts to first differences (precursor emphasized).
    # `prev` repeats the first value so the first difference is zero.
    trap_rate = (link_loss_cnt+bad_rsl_cnt+high_temp_cnt+dying_cnt)/np.maximum(1, ont_registered)
    prev = np.r_[trap_rate[0], trap_rate[:-1]]
    trap_trend = np.clip(6*(trap_rate - prev) + 0.8*precursor + 1.2*outage, -1.0, 2.0)
    # NOTE: the flag is set directly by precursor/outage, so it encodes the label by construction.
    alarm_spike_flag = ((trap_rate > (0.008 + 0.7*hazard)) | (precursor==1) | (outage==1)).astype(int)

    # Physical indicators with pre-outage drift
    snr_avg = (26
               + 2*np.sin(2*np.pi*(doys/365.25))
               - 1.8*precursor - 5.5*outage
               + rng.normal(0,0.7,T))
    rx_power = (-19.0
                + 1.0*np.sin(2*np.pi*((doys+30)/365.25))
                - 0.8*precursor - 2.8*outage
                + rng.normal(0,0.5,T))
    temperature = (36
                   + 2.2*diurnal
                   + 0.5*np.sin(2*np.pi*(doys/365.25))
                   + 0.6*precursor + 5.5*outage
                   + rng.normal(0,0.8,T))
    temp_anom = (temperature - 38)/2.5

    # Scalars broadcast over the timeline; arrays must all have length T.
    df_ent = pd.DataFrame({
        "timestamp_1h": time_index,
        "city": prov,
        "city_key": ent['city_key'],
        "province": province_map[prov],
        "region": region_map[prov],
        "olt_id": olt_id, "olt_key": ent['olt_key'],
        "fdt_name": fdt_name, "fat_name": fat_name,
        "cluster_id": cluster_id, "cluster_key": ent['cluster_key'], "cluster_name": cluster_name,
        "ont_registered": ont_registered.astype(int),
        "offline_ont_now": offline_ont.astype(int),
        "offline_ont_ratio": offline_ratio.astype(float),
        "link_loss_count": link_loss_cnt.astype(int),
        "bad_rsl_count": bad_rsl_cnt.astype(int),
        "high_temp_count": high_temp_cnt.astype(int),
        "dying_gasp_count": dying_cnt.astype(int),
        "alarm_spike_flag": alarm_spike_flag.astype(int),
        "trap_trend_score": trap_trend.astype(float),
        "snr_avg": snr_avg.astype(float),
        "rx_power_avg": rx_power.astype(float),  # dBm
        "rx_power_avg_dbm": rx_power.astype(float),  # same values as rx_power_avg
        "temperature_avg_c": temperature.astype(float),  # Celsius
        "temp_anomaly_score": temp_anom.astype(float),
        "hour_of_day": hours,
        "day_of_week": dows,
        "is_maintenance_window": is_maint_arr,
        "outage_now": outage.astype(int),
        "precursor_flag": precursor.astype(int),
        "stress": stress.astype(float)
    })
    rows.append(df_ent)

realtime = pd.concat(rows, ignore_index=True)

# Hourly fault_rate from raw counts: total alarms per registered ONT
# (denominator floored at 1 to avoid division by zero).
_alarm_count_cols = ['link_loss_count','bad_rsl_count','high_temp_count','dying_gasp_count']
alarm_sum = realtime[_alarm_count_cols].sum(axis=1).astype(float)
_ont_floor = np.maximum(1, realtime['ont_registered'])
realtime['fault_rate'] = (alarm_sum / _ont_floor).clip(lower=0)

# ---------------- Context table ----------------
# One row per distinct topology tuple found in `realtime`, in first-seen order.
context = realtime[['city','city_key','province','region','olt_id','olt_key','fdt_name','fat_name','cluster_id','cluster_key','cluster_name']]\
    .drop_duplicates().reset_index(drop=True)
# Vendor and firmware are assigned round-robin by row position (deterministic, no RNG).
vendors = ['Huawei','ZTE','Nokia']
context['vendor'] = [vendors[i % len(vendors)] for i in range(len(context))]
# Produces 'v5.10' .. 'v5.14', cycling every 5 rows.
context['firmware_version'] = 'v5.' + (10 + pd.Series(range(len(context)))%5).astype(str)

# ------------- Customer feedback table ----------
# One synthetic customer record per cluster row of `context`.
clusters = context[['city','city_key','province','region','olt_id','olt_key','cluster_id','cluster_key','cluster_name']].drop_duplicates().reset_index(drop=True)
clusters['customer_id'] = ['CUST' + str(i).zfill(5) for i in range(len(clusters))]
# Use province code for IDs to stay unique and readable
clusters['_prov_code'] = clusters['city'].apply(province_code)
clusters['home_pass'] = "HP-" + clusters['_prov_code'] + "-" + clusters.index.astype(str).str.zfill(5)
clusters['home_connect_id'] = "HC-" + clusters['_prov_code'] + "-" + clusters.index.astype(str).str.zfill(6)
clusters = clusters.drop(columns=['_prov_code'])

# Join dates cycle deterministically over 0-999 days after 2020-01-01.
clusters['join_date'] = pd.Timestamp("2020-01-01") + pd.to_timedelta((clusters.index*30)%1000, unit='D')
clusters['active_date'] = clusters['join_date']
# The columns below draw from the seeded global np.random stream; statement
# order determines which values each column receives.
clusters['user_satisfaction_score'] = 0.8 + 0.2*np.random.rand(len(clusters))
clusters['complaints_this_month'] = np.random.poisson(3, len(clusters))
clusters['late_payment_count'] = np.random.randint(0,3, len(clusters))
clusters['churn_flag'] = np.random.choice([0,1], len(clusters), p=[0.9,0.1])
clusters['network_usage_gb'] = np.round(np.random.gamma(6, 20, size=len(clusters)), 1)  # GB
clusters['customer_age'] = np.random.randint(18, 70, len(clusters))

clusters['device_status'] = np.where(np.random.rand(len(clusters))<0.95, "Online", "Offline")
clusters['device_link_os'] = np.random.choice(["Up","Down","Flapping"], size=len(clusters), p=[0.9,0.07,0.03])
# last_online_ts falls 0-399 hours after `base`; last_offline_ts is 0-71 hours before it.
base = pd.Timestamp("2024-12-15")
clusters['last_online_ts'] = base + pd.to_timedelta(np.random.randint(0,400,size=len(clusters)), unit="h")
clusters['last_offline_ts'] = clusters['last_online_ts'] - pd.to_timedelta(np.random.randint(0,72,size=len(clusters)), unit="h")
clusters['offline_reason'] = np.random.choice(["Power outage","Fiber issue","Maintenance","Unknown"], size=len(clusters), p=[0.20,0.30,0.25,0.25])

# The rename maps 'customer_id' to itself, so no column changes; the call's
# only effect is to return a new DataFrame.
feedback = clusters.rename(columns={'customer_id':'customer_id'})

# --------------- Trend table (daily) ------------
# Calendar day (midnight timestamp) of every hourly row; added to `realtime` in place.
realtime['date'] = realtime['timestamp_1h'].dt.normalize()

# Daily sums per province. total_onts sums ont_registered over every cluster-hour,
# so it is an ONT-hour total rather than a head count.
_daily_sum_cols = ['link_loss_count', 'bad_rsl_count', 'high_temp_count', 'dying_gasp_count']
_daily_aggs = {col: (col, 'sum') for col in _daily_sum_cols}
_daily_aggs['total_onts'] = ('ont_registered', 'sum')
trend = realtime.groupby(['city','city_key','date'], as_index=False).agg(**_daily_aggs)

trend['total_alarms'] = trend[_daily_sum_cols].sum(axis=1)
trend['fault_rate_daily'] = (trend['total_alarms'] / np.maximum(1, trend['total_onts'])).clip(lower=0)

# Day-over-day relative change within each province; each province's first day has
# no predecessor and is filled with 0.
for _source_col, _change_col in [
    ('link_loss_count', 'link_loss_vs_yesterday'),
    ('bad_rsl_count', 'bad_rsl_vs_yesterday'),
    ('high_temp_count', 'high_temp_vs_yesterday'),
    ('dying_gasp_count', 'dying_gasp_vs_yesterday'),
]:
    trend[_change_col] = trend.groupby('city')[_source_col].pct_change().fillna(0)
trend['region'] = trend['city'].map(region_map)

# ---------------- Ticket table (daily) ----------
# One ticket-summary row per (province, day), seeded from the trend table's keys.
ticket = trend[['city','city_key','date','region']].copy()
# Deterministic open-ticket count: row position mod 10 plus twice the province's
# factorize() code (0 for the first province encountered, 1 for the next, ...).
ticket['ticket_open'] = (ticket.index % 10) + (ticket['city'].factorize()[0]) * 2
# 90% of open tickets closed, truncated towards zero.
ticket['ticket_closed'] = (ticket['ticket_open'] * 0.9).astype(int)
ticket['active_ticket'] = (ticket['ticket_open'] - ticket['ticket_closed']).clip(lower=0)

# Dedicated generator for the ticket table; draw order below fixes the values.
rng2 = np.random.default_rng(777)
u = rng2.random(len(ticket))
short = rng2.uniform(1, 8, size=len(ticket))
mid   = rng2.uniform(8, 24, size=len(ticket))
long  = rng2.uniform(24, 72, size=len(ticket))
# Mixture: 70% short (1-8 h), 20% mid (8-24 h), 10% long (24-72 h).
ticket['ticket_holding_time_hr'] = np.where(u < 0.7, short, np.where(u < 0.9, mid, long)).round(1)
# Cycles through 1.0, 2.0, 3.0, 4.0 depending on ticket_open mod 4.
ticket['avg_ticket_duration_hr'] = (1.0 + (ticket['ticket_open'] % 4)).astype(float)
ticket['total_downtime_duration_hr'] = (ticket['ticket_open'] * (ticket['avg_ticket_duration_hr'] * 0.8)).round(1)
# Issue-type mix, normalised to sum to 1. The caller assigns the four positions to
# the bad connection, link loss, high temp and dying gasp columns, in that order.
weights = np.array([0.4, 0.35, 0.15, 0.10])
weights = weights/weights.sum()


def split_counts(n):
    """Split ``n`` tickets across the four issue types with one multinomial draw.

    Returns a list of four counts that sum to ``int(n)``; for ``n <= 0`` returns
    four zeros without touching the RNG. Uses the global ``np.random`` stream.
    """
    nothing_to_split = n <= 0
    if nothing_to_split:
        return [0] * 4
    draw = np.random.multinomial(int(n), weights)
    return list(draw)
# One multinomial split per ticket row (rows drawn in order), stacked to shape (rows, 4).
split_vals = np.vstack([split_counts(n_open) for n_open in ticket['ticket_open'].values])
for _col_pos, _issue_col in enumerate(
    ['issue_bad_connection', 'issue_link_loss', 'issue_high_temp', 'issue_dying_gasp']
):
    ticket[_issue_col] = split_vals[:, _col_pos]

def top_rca(row):
    """Return the label of the issue type with the highest count in ``row``.

    ``row`` is indexed by the four ``issue_*`` keys. Ties go to the earliest
    label in the order Bad Connection, Link Loss, High Temp, Dying Gasp, so a
    row of all zeros yields "Bad Connection".
    """
    labelled_counts = (
        ("Bad Connection", row['issue_bad_connection']),
        ("Link Loss", row['issue_link_loss']),
        ("High Temp", row['issue_high_temp']),
        ("Dying Gasp", row['issue_dying_gasp']),
    )
    # max() keeps the first of several equal maxima.
    winner = max(labelled_counts, key=lambda pair: pair[1])
    return winner[0]
# Dominant issue label per row (row-wise apply of top_rca).
ticket['root_cause_top'] = ticket.apply(top_rca, axis=1)

# 30% of active tickets, rounded to the nearest integer.
ticket['urgency_visit_count'] = (0.3 * ticket['active_ticket']).round().astype(int)

# Sequential case IDs starting at CASE-100000; categorical fields are drawn
# uniformly from rng2, and the order of the draws below fixes the values.
ticket['case_id'] = ["CASE-" + str(100000 + i) for i in range(len(ticket))]
ticket['close_subject'] = rng2.choice(["Fiber repair","Port reset","Power restored","Vendor escalation","Config fix"], size=len(ticket))
ticket['vendor'] = rng2.choice(["Huawei","ZTE","Nokia"], size=len(ticket))
ticket['action_taken'] = rng2.choice(["Replace patchcore","Clean connector","Reboot OLT port","Reroute traffic","Dispatch field team"], size=len(ticket))
# Rows with active tickets are scheduled for 09:00 the next day, the rest for 13:00 the same day.
ticket['schedule_time'] = np.where(ticket['active_ticket']>0, (ticket['date'] + pd.Timedelta(days=1)).dt.strftime("%Y-%m-%d 09:00"), ticket['date'].dt.strftime("%Y-%m-%d 13:00"))
ticket['duration_hours'] = ticket['avg_ticket_duration_hr']
ticket['active_flag'] = (ticket['active_ticket']>0).astype(int)

# Link to customers: each ticket row gets a customer_id sampled uniformly with replacement.
customer_pool = feedback['customer_id'].values
sample_idx = rng2.integers(0, len(customer_pool), size=len(ticket))
ticket['customer_id'] = customer_pool[sample_idx]

# RCA category fields, each drawn independently of the issue counts above.
ticket['rca_a'] = rng2.choice(["Access","Aggregation","Backhaul"], size=len(ticket))
ticket['rca_g'] = rng2.choice(["Power","Physical","Configuration","Unknown"], size=len(ticket))
ticket['rca_d'] = rng2.choice(["Patchcore cut","SFP faulty","Overheat","Fiber attenuation","N/A"], size=len(ticket))
ticket['indicator_status'] = rng2.choice(["Open","Monitoring","Closed"], size=len(ticket))

# New activations: base 10 (15 where dayofweek is 4 or 5), scaled by 1.1 in months
# 1, 2, 3, 11, 12 and by 0.9 otherwise, plus an integer 0-6 from a separate
# generator; the sum is truncated to int.
weekday = ticket['date'].dt.dayofweek
month = ticket['date'].dt.month
season_boost = np.where(month.isin([1,2,3,11,12]), 1.1, 0.9)
base_act = 10 + (weekday.isin([4,5]).astype(int))*5
rng_act = np.random.default_rng(2025)
ticket['new_activation'] = (base_act * season_boost + rng_act.integers(0,7,size=len(ticket))).astype(int)

# ----------------- ML features table -----------------
# Order rows by entity, then time, so a within-entity shift(-1) means "next hour".
_entity_cols = ['city','olt_id','cluster_id']
ml = realtime.sort_values(["city","olt_id","cluster_id","timestamp_1h"])

# Predict next-hour outage. Each entity's final hour has no successor and is labelled 0.
_next_hour_outage = ml.groupby(_entity_cols)['outage_now'].shift(-1)
ml['label_outage_1h'] = _next_hour_outage.fillna(0).astype(int)

# Keep only the published schema; this drops the simulation helpers
# (outage_now, date, precursor_flag, stress).
_ml_schema = [
    "timestamp_1h", "city", "city_key", "province", "region",
    "olt_id", "olt_key", "fdt_name", "fat_name", "cluster_id", "cluster_key", "cluster_name",
    "ont_registered","offline_ont_now","offline_ont_ratio",
    "link_loss_count","bad_rsl_count","high_temp_count","dying_gasp_count",
    "alarm_spike_flag","trap_trend_score","fault_rate",
    "snr_avg","rx_power_avg","rx_power_avg_dbm","temperature_avg_c","temp_anomaly_score",
    "hour_of_day","day_of_week","is_maintenance_window",
    "label_outage_1h"
]
ml = ml[_ml_schema]

# ---------------- Dim tables (for BI) ---------------
# City dimension: one row per province name, keyed by its surrogate city_key.
_dim_city_names = list(city_key_map)              # province names
dim_city = pd.DataFrame({
    'city_key': [city_key_map[name] for name in _dim_city_names],
    'city': _dim_city_names,
    'province': [province_map[name] for name in _dim_city_names],
    'region': [region_map[name] for name in _dim_city_names],
}).sort_values('city_key')

# OLT and cluster dimensions are de-duplicated projections of the context table.
_olt_cols = ['olt_key','olt_id','city_key']
_cluster_cols = ['cluster_key','cluster_id','olt_key','city_key','cluster_name']
dim_olt = context[_olt_cols].drop_duplicates().sort_values('olt_key')
dim_cluster = context[_cluster_cols].drop_duplicates().sort_values('cluster_key')

# -------------------- Save outputs -------------------
realtime_out = "dataset/ews_network_realtime_status.parquet"
context_out  = "dataset/ews_network_context_static.parquet"
trend_out    = "dataset/ews_incident_trend_summary.parquet"
ticket_out_parquet = "dataset/ews_ticket_summary.parquet"
feedback_out = "dataset/ews_customer_feedback.parquet"
ml_out       = "dataset/ews_ml_features_table.parquet"
city_dim_out = "dataset/dim_city.csv"
olt_dim_out  = "dataset/dim_olt.csv"
cluster_dim_out = "dataset/dim_cluster.csv"

# Ensure output directory exists (`os` is imported with the other imports at the top).
os.makedirs("dataset", exist_ok=True)

realtime.to_parquet(realtime_out, index=False)
context.to_parquet(context_out, index=False)
trend.to_parquet(trend_out, index=False)
feedback.to_parquet(feedback_out, index=False)
ml.to_parquet(ml_out, index=False)

dim_city.to_csv(city_dim_out, index=False)
dim_olt.to_csv(olt_dim_out, index=False)
dim_cluster.to_csv(cluster_dim_out, index=False)

# Ticket post-processing + save (written last because columns are still being added).
if "schedule_visit" not in ticket.columns:
    # "Yes" when the row has urgent visits or active tickets; a missing column counts as 0.
    ticket["schedule_visit"] = np.where(
        (ticket.get("urgency_visit_count", 0) > 0) | (ticket.get("active_ticket", 0) > 0),
        "Yes", "No"
    )

if "xl_id" not in ticket.columns:
    # Sort and re-index FIRST, then derive the province codes from the sorted frame.
    # Computing the codes before reset_index would leave them labelled with the old
    # index, and the string concatenation below aligns on index labels, so any real
    # reordering would attach codes to the wrong rows.
    ticket = ticket.sort_values(["city","date"]).reset_index(drop=True)
    _codes = ticket['city'].apply(province_code)
    # Running number within each (province, day).
    seq = ticket.groupby(["city","date"]).cumcount() + 1
    ticket["xl_id"] = "XL-" + _codes + "-" + ticket["date"].dt.strftime("%Y%m%d") + "-" + seq.astype(str).str.zfill(5)

# Put the documented columns first (in this order), then any remaining columns.
preferred = [
    "date","city","city_key","region","ticket_open","ticket_closed","active_ticket",
    "ticket_holding_time_hr","total_downtime_duration_hr",
    "issue_bad_connection","issue_link_loss","issue_high_temp","issue_dying_gasp",
    "root_cause_top","urgency_visit_count","schedule_visit",
    "case_id","xl_id","close_subject","vendor","action_taken",
    "schedule_time","duration_hours","active_flag","customer_id",
    "rca_a","rca_g","rca_d","indicator_status",
    "new_activation"
]
cols = [c for c in preferred if c in ticket.columns] + [c for c in ticket.columns if c not in preferred]
ticket = ticket[cols]

ticket.to_parquet(ticket_out_parquet, index=False)

In [3]:
# Sanity check: row count of the ML table (entity-hours) and realised share of
# positive next-hour labels, shown next to the configured target.
total_hours, pos_rate = len(ml), ml['label_outage_1h'].mean()
summary_line = f"Rows (ML table): {total_hours:,} — Positive rate: {pos_rate:.4%} (target ~ {TARGET_OUTAGE_RATE:.2%})"
print(summary_line)
print("Saved to 'dataset/' folder.")

Rows (ML table): 3,578,976 — Positive rate: 0.4125% (target ~ 0.30%)
Saved to 'dataset/' folder.
