In [None]:
import pandas as pd
from pathlib import Path
import logging

from pathlib import Path
import logging
import pandas as pd

def clean_and_save_csv(csv_file: str) -> str:
    """Normalize a raw CAN-bus CSV so that every row carries eight data bytes.

    The input is read without a header as the five columns
    ``Timestamp, CAN ID, DLC, Data, Label``.  The single ``Data`` column is
    split into ``DATA[0]`` .. ``DATA[7]`` (two characters per byte, zero-padded
    with ``"00"`` and truncated to eight bytes), ``DLC`` is set to 8 for every
    row, and ``Label`` is placed right after ``DATA[7]``.

    The result is written back to the same path, with a header row and
    without an index column, so the original file is overwritten.

    Args:
        csv_file: Path of the CSV file to clean.

    Returns:
        The path of the written file as a string.
    """
    cols = ['Timestamp', 'CAN ID', 'DLC', 'Data', 'Label']
    # Read the identifier and payload columns as text.  With dtype inference a
    # column holding only digits (e.g. "0011223344556677") is parsed as a
    # number, which drops leading zeros and shifts every byte of the payload.
    df = pd.read_csv(csv_file, header=None, names=cols, low_memory=False,
                     dtype={'CAN ID': str, 'Data': str})

    # Rows without a usable timestamp or CAN ID are discarded.
    df['Timestamp'] = pd.to_numeric(df['Timestamp'], errors='coerce')
    df = df.dropna(subset=['Timestamp', 'CAN ID']).reset_index(drop=True)

    def process_data_field(raw):
        """Split a payload string into exactly eight two-character bytes."""
        if pd.isna(raw):
            return ['00'] * 8
        text = str(raw).replace(" ", "")  # payload bytes may be space separated
        byte_list = [text[i:i + 2] for i in range(0, len(text), 2)]
        # A negative repeat count yields an empty list, so long payloads are
        # left alone here and cut down by the slice below.
        byte_list += ['00'] * (8 - len(byte_list))
        return byte_list[:8]

    data_cols = [f'DATA[{i}]' for i in range(8)]
    data_df = pd.DataFrame(df['Data'].apply(process_data_field).tolist(),
                           columns=data_cols)

    # Every payload is now exactly eight bytes long, so DLC is 8 for all rows.
    head = df[['Timestamp', 'CAN ID']].copy()
    head['DLC'] = 8

    # Both frames share a fresh 0..n-1 index, so the concat aligns row by row.
    final_df = pd.concat([head, data_df, df[['Label']]], axis=1)

    out_file = Path(csv_file)
    final_df.to_csv(out_file, index=False)
    logging.info("[CLEAN] Saved cleaned file: %s", out_file)
    return str(out_file)

# File names of the CSV captures to clean (taken from the dataset listing).
# Each name must appear only once: the cleaner rewrites the file in place with
# a header row and twelve columns, so a second pass over the same file would
# read that output as if it were headerless five-column raw input.
# dict.fromkeys() removes repeats while keeping the original order.
csv_files = list(dict.fromkeys([
    "Attack_free_CHEVROLET_Spark_train.csv",
    "Attack_free_KIA_Soul_train.csv",
    "Flooding_CHEVROLET_Spark_train.csv",
    "Flooding_HYUNDAI_Sonata_train.csv",
    "Flooding_KIA_Soul_train.csv",
    "Fuzzy_CHEVROLET_Spark_train.csv",
    'Attack_free_HY_Sonata_train.csv',
    'Fuzzy_dataset_HY_Sonata_train.csv',
    'Fuzzy_dataset_KIA_Soul_train.csv',
    'Malfunction_1st_dataset_HY_Sonata_train.csv',
    'Malfunction_1st_dataset_KIA_Soul_train.csv',
    'Malfunction_2nd_HY_Sonata_train.csv',
    'Malfunction_2nd_KIA_Soul_train.csv',
    'Replay_dataset_HY_Sonata_train.csv',
    'Replay_dataset_KIA_Soul_train.csv',
]))


# The names are used as given, i.e. resolved against the current working
# directory.

# Clean every file; a failure on one file is reported and the rest still run.
for fname in csv_files:
    try:
        cleaned_path = clean_and_save_csv(str(fname))
        print(f"[✓] Cleaned: {cleaned_path}")
    except Exception as e:
        print(f"[✗] Failed: {fname} — {e}")



In [None]:
# ============================================================
# caae_full.py –  Conv-AAE end-to-end pipeline (A→Z)
# ============================================================

import os, glob, itertools, cv2, numpy as np, pandas as pd, tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report

# ------------------------------------------------------------
# 0) GPU setup (optional)
# ------------------------------------------------------------
# Enable on-demand memory allocation on every visible GPU.  The setting has to
# be identical across GPUs, so it is applied to all of them rather than only
# the first one.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Raised when the GPUs were already initialized (e.g. the cell is re-run);
    # memory growth can only be configured before that point.
    print(f"[GPU] memory growth not changed: {e}")

# ------------------------------------------------------------
# 1) Hyper-parameters
# ------------------------------------------------------------
IMG_SHAPE   = (32, 32, 2)              # input image shape fed to the encoder
FEATURE_DIM = int(np.prod(IMG_SHAPE))  # flattened image length: 32*32*2 = 2048
N_LABELS    = 2                        # width of the one-hot label vector

BATCH       = 128
EPOCHS      = 50

LATENT_DIM  = 64                       # size of the latent code z
λ_gp        = 10.0                     # weight of the gradient-penalty term

# Learning rates of the four Adam optimizers.
LR_AE = 5e-4   # reconstruction (auto-encoder) update
LR_DZ = 1e-4   # discriminator on z
LR_DY = 1e-4   # discriminator on y
LR_G  = 5e-5   # adversarial encoder update

ACTIVATION = 'elu'
DROPOUT    = 0.2
NORM_TYPE  = 'layer'           # 'layer' or 'batch'; any other value disables normalization

# ------------------------------------------------------------
# 2) Pre-processing & TFRecord creation
# ------------------------------------------------------------
# CSV files to convert.  Each name is listed once so that no file is
# pre-processed and written twice.
datasets = [
    "Attack_free_CHEVROLET_Spark_train.csv",
    "Attack_free_KIA_Soul_train.csv",
    "Flooding_CHEVROLET_Spark_train.csv",
    "Flooding_KIA_Soul_train.csv",
    "Fuzzy_CHEVROLET_Spark_train.csv",
    "Attack_free_HY_Sonata_train.csv",
    "Fuzzy_dataset_HY_Sonata_train.csv",
    "Fuzzy_dataset_KIA_Soul_train.csv",
    "Malfunction_1st_dataset_HY_Sonata_train.csv",
    "Malfunction_1st_dataset_KIA_Soul_train.csv",
    "Malfunction_2nd_HY_Sonata_train.csv",
    "Malfunction_2nd_KIA_Soul_train.csv",
    "Replay_dataset_HY_Sonata_train.csv",
    "Replay_dataset_KIA_Soul_train.csv",
]
# Dataset name -> CSV path; currently the name is the path itself.
csv_map = {d: d for d in datasets}



def fill_flag(row):
    """Add a ``Flag`` entry to rows whose ``Label`` is not a string.

    When ``Label`` is a string the row is returned untouched.  Otherwise
    ``Flag`` is set to the value stored under ``Data<DLC>`` (for example
    ``Data2`` when ``DLC`` is 2), falling back to the ``Label`` value itself
    if the row has no such entry.  The row is modified in place and returned.
    """
    label = row['Label']
    if isinstance(label, str):
        return row

    fallback_col = f"Data{int(row['DLC'])}"
    row['Flag'] = row.get(fallback_col, label)
    return row

def convert_canid_bits(cid):
    """Return a hexadecimal CAN ID as a vector of 29 bits (MSB first).

    Args:
        cid: The identifier; ``str(cid)`` is parsed as a base-16 number.

    Returns:
        A ``uint8`` array of length 29.  Input that is not valid hexadecimal,
        is negative, or does not fit into 29 bits yields an all-zero vector,
        so the result always has the same length and can be stacked.
    """
    n_bits = 29
    try:
        value = int(str(cid), 16)
    except ValueError:
        return np.zeros(n_bits, dtype=np.uint8)
    if value < 0 or value >= (1 << n_bits):
        return np.zeros(n_bits, dtype=np.uint8)
    return np.array([int(bit) for bit in format(value, f'0{n_bits}b')],
                    dtype=np.uint8)

def hex_to_int(x):
    """Parse ``str(x)`` as a hexadecimal integer, returning 0 if it is not one.

    Surrounding whitespace is ignored.  Only the parse failure is handled;
    other exceptions (including interpreter-level ones such as
    ``KeyboardInterrupt``) propagate to the caller.
    """
    try:
        return int(str(x).strip(), 16)
    except ValueError:
        return 0

def preprocess_windows(csv_file):
    """Convert a CAN CSV into flattened 32x32x2 window images with labels.

    The file is read without a header as ``Timestamp, canID, DLC,
    Data0..Data7, Label``.  Rows whose timestamp is not numeric or whose CAN ID
    is missing are dropped, the remaining frames are sorted by timestamp and
    cut into consecutive windows of 29 frames; trailing frames that do not
    fill a window are discarded.

    For every window two 32x32 binary channels are built:

    * channel 0 - the 29x29 matrix of CAN-ID bits (one row per frame),
      resized with nearest-neighbour interpolation;
    * channel 1 - the 64 payload bits of the *last* frame of the window,
      arranged 8x8 and resized the same way.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A list of ``(features, label)`` tuples, where ``features`` is the
        flattened image as a list of ints and ``label`` is 1 if any frame in
        the window is labelled ``'T'`` (case-insensitive), else 0.  The list is
        empty when the file holds no usable rows or fewer than 29 frames.
    """
    print(f"[DATA] Processing {csv_file}")
    data_cols = [f'Data{i}' for i in range(8)]
    cols = ['Timestamp', 'canID', 'DLC'] + data_cols + ['Label']
    df = pd.read_csv(csv_file, header=None, names=cols, low_memory=False)

    df['Timestamp'] = pd.to_numeric(df['Timestamp'], errors='coerce')
    df['DLC'] = pd.to_numeric(df['DLC'], errors='coerce').fillna(0).astype(int)
    df = df.dropna(subset=['Timestamp', 'canID'])
    if df.empty:
        # np.stack below cannot handle an empty sequence.
        return []
    df = df.apply(fill_flag, axis=1)

    # fill_flag stores the label recovered from the Data<DLC> column of short
    # frames under 'Flag'.  Fold it back into 'Label'; otherwise those frames
    # would always count as normal.
    if 'Flag' in df.columns:
        df['Label'] = df['Flag'].where(df['Flag'].notna(), df['Label'])

    for col in data_cols:
        df[col] = df[col].apply(hex_to_int).astype(np.uint8)

    df['Label'] = df['Label'].astype(str).str.upper().eq('T').astype(np.uint8)
    df['canBits'] = df['canID'].apply(convert_canid_bits)
    df = df.sort_values('Timestamp')

    bits_all = np.stack(df['canBits'].values)
    data_bytes = df[data_cols].values
    flags_all = df['Label'].values

    win = 29                                   # frames per window
    N = len(bits_all) // win                   # number of complete windows
    bits = bits_all[:N * win].reshape(N, win, 29)
    data = data_bytes[:N * win].reshape(N, win, 8)
    flags = flags_all[:N * win].reshape(N, win)

    rows = []
    for i in range(N):
        # Channel 0: CAN-ID bit matrix of the window, scaled to 32x32.
        id_img = cv2.resize(bits[i].astype(np.uint8), (32, 32),
                            interpolation=cv2.INTER_NEAREST)

        # Channel 1: payload bits of the last frame (8 bytes -> 8x8 bits).
        last_b = data[i, -1, :]
        b8 = np.unpackbits(last_b).reshape(8, 8)
        data_img = cv2.resize(b8.astype(np.float32), (32, 32),
                              interpolation=cv2.INTER_NEAREST) > .5

        two_ch = np.stack([id_img, data_img.astype(np.uint8)], axis=-1)  # 32x32x2
        rows.append((two_ch.flatten().tolist(), int(flags[i].any())))
    return rows


def write_tfrecord(rows, base):
    """Shuffle ``rows`` and write them as train/val/test TFRecord files.

    The first 70% of the shuffled rows go to ``<base>_train.tfrecord``, the
    next 15% to ``<base>_val.tfrecord`` and the remainder to
    ``<base>_test.tfrecord``.  Each record holds the int64 lists ``features``
    and ``label``.

    Args:
        rows: List of ``(features, label)`` tuples.  It is shuffled in place,
            so the caller's list is reordered as well.
        base: Prefix of the output file names.
    """
    np.random.shuffle(rows)

    total = len(rows)
    n_train = int(.7 * total)
    n_val = int(.15 * total)
    partitions = (
        ('train', rows[:n_train]),
        ('val', rows[n_train:n_train + n_val]),
        ('test', rows[n_train + n_val:]),
    )

    for phase, chunk in partitions:
        with tf.io.TFRecordWriter(f"{base}_{phase}.tfrecord") as writer:
            for feat, lbl in chunk:
                feature_map = {
                    'features': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=feat)),
                    'label': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[lbl])),
                }
                example = tf.train.Example(
                    features=tf.train.Features(feature=feature_map))
                writer.write(example.SerializeToString())

# Build the TFRecord files for every dataset whose CSV is present.
# NOTE: the records are regenerated (and re-split) on every run; existing
# files with the same name are overwritten.

print("[DATA] Creating TFRecords…")
for d in dict.fromkeys(datasets):  # a name listed twice is processed only once
    if not os.path.exists(csv_map[d]):
        print(f"[DATA] Skipping {d}: {csv_map[d]} not found")
        continue
    rows = preprocess_windows(csv_map[d])
    normals = [r for r in rows if r[1] == 0]
    attacks = [r for r in rows if r[1] == 1]
    # Normal windows get the "Normal_" prefix; attack windows keep the bare
    # dataset name and are only written when the file contains any.
    write_tfrecord(normals, f"Normal_{d}")
    if attacks:
        write_tfrecord(attacks, d)

# ------------------------------------------------------------
# 3) tf.data pipeline
# ------------------------------------------------------------
def parse_feat(proto):
    """Decode one serialized example into an image and a one-hot label.

    Args:
        proto: Serialized ``tf.train.Example`` with a ``features`` int64 list
            of length ``FEATURE_DIM`` and a single-element ``label`` int64 list.

    Returns:
        ``(x, y)`` where ``x`` is a float32 tensor of shape ``IMG_SHAPE`` and
        ``y`` is the label one-hot encoded over ``N_LABELS`` classes.
    """
    schema = {
        'features': tf.io.FixedLenFeature([FEATURE_DIM], tf.int64),
        'label': tf.io.FixedLenFeature([1], tf.int64),
    }
    parsed = tf.io.parse_single_example(proto, schema)

    image = tf.reshape(tf.cast(parsed['features'], tf.float32), IMG_SHAPE)
    label_id = tf.cast(parsed['label'][0], tf.int32)
    return image, tf.one_hot(label_id, N_LABELS)

# Training uses only the windows labelled normal.
train_files = glob.glob('Normal_*_train.tfrecord')
if not train_files:
    # Fail here with a clear message instead of dividing by a zero step count
    # later in the training loop.
    raise FileNotFoundError(
        "No 'Normal_*_train.tfrecord' files found; create the TFRecords first")

# Each element is (noisy input, clean target, one-hot label): Gaussian noise
# with standard deviation 0.01 is added to the encoder input only.
train_ds = (
    tf.data.TFRecordDataset(train_files, num_parallel_reads=tf.data.AUTOTUNE)
    .map(parse_feat, tf.data.AUTOTUNE)
    .map(lambda x,y: (x + tf.random.normal(tf.shape(x),0,0.01), x, y), tf.data.AUTOTUNE)
    .shuffle(10000).repeat()
    .batch(BATCH).prefetch(tf.data.AUTOTUNE)
)

# One pass over the files to count the records; at least one step per epoch so
# that a data set smaller than a single batch still trains.
n_train_records = sum(1 for _ in tf.data.TFRecordDataset(train_files))
steps_per_epoch = max(1, n_train_records // BATCH)
print(f"[PIPE] records={n_train_records}, steps/epoch={steps_per_epoch}")

# ------------------------------------------------------------
# 4) CAAE Model
# ------------------------------------------------------------
def dense_block(units):
    """Build a Dense -> normalization -> activation -> dropout stack.

    The normalization layer is chosen by ``NORM_TYPE`` ('layer' or 'batch';
    anything else means none), the activation by ``ACTIVATION``, and the
    dropout layer is only added when ``DROPOUT`` is positive.

    Args:
        units: Output width of the Dense layer.

    Returns:
        A ``tf.keras.Sequential`` containing the layers in that order.
    """
    norm_by_name = {
        'layer': tf.keras.layers.LayerNormalization,
        'batch': tf.keras.layers.BatchNormalization,
    }

    stack = [tf.keras.layers.Dense(units)]
    norm_cls = norm_by_name.get(NORM_TYPE)
    if norm_cls is not None:
        stack.append(norm_cls())
    stack.append(tf.keras.layers.Activation(ACTIVATION))
    if DROPOUT > 0:
        stack.append(tf.keras.layers.Dropout(DROPOUT))
    return tf.keras.Sequential(stack)

class ConvAAE(tf.keras.Model):
    """Convolutional adversarial auto-encoder with a latent code and a label head.

    The encoder maps an image to a latent vector ``z`` (``LATENT_DIM`` wide)
    and to class logits over ``N_LABELS``.  The decoder rebuilds the image
    from the concatenation of ``z`` and the class probabilities.  Two small
    dense networks, ``dz`` and ``dy``, score latent vectors and label vectors
    respectively and each output a single unbounded value.

    The model defines no ``call``; use ``encode``, ``decode`` and the two
    ``discriminate_*`` methods directly.
    """

    def __init__(self):
        super().__init__()
        # ---- encoder: two stride-2 convolutions, then a dense block
        self.enc_c1  = tf.keras.layers.Conv2D(32,(3,3),strides=2,padding='same',activation=ACTIVATION)
        self.enc_c2  = tf.keras.layers.Conv2D(64,(3,3),strides=2,padding='same',activation=ACTIVATION)
        self.enc_flat= tf.keras.layers.Flatten()
        self.enc_fc  = dense_block(256)
        self.z_layer = tf.keras.layers.Dense(LATENT_DIM)
        self.y_logits= tf.keras.layers.Dense(N_LABELS)

        # ---- decoder: dense block reshaped to 8x8x64, then two stride-2
        #      transposed convolutions and a sigmoid output with 2 channels
        self.dec_fc   = dense_block(8*8*64)
        self.dec_reshape = tf.keras.layers.Reshape((8,8,64))
        self.dec_t1  = tf.keras.layers.Conv2DTranspose(64,(3,3),strides=2,padding='same',activation=ACTIVATION)
        self.dec_t2  = tf.keras.layers.Conv2DTranspose(32,(3,3),strides=2,padding='same',activation=ACTIVATION)
        self.dec_out = tf.keras.layers.Conv2DTranspose(2,(3,3),padding='same',activation='sigmoid')

        # ---- discriminators (z , y)
        self.dz = tf.keras.Sequential([dense_block(256),
                                       dense_block(128),
                                       tf.keras.layers.Dense(1)])
        self.dy = tf.keras.Sequential([dense_block(256),
                                       dense_block(128),
                                       tf.keras.layers.Dense(1)])

    # ---------- forward passes ----------
    def encode(self, x):
        """Encode images.

        Returns:
            ``(z, y, logits)``: the latent code, the softmax class
            probabilities and the raw class logits.
        """
        h = self.enc_c2(self.enc_c1(x))
        h = self.enc_fc(self.enc_flat(h))
        z = self.z_layer(h)
        logits = self.y_logits(h)
        y = tf.nn.softmax(logits)
        return z, y, logits

    def decode(self, z, y):
        """Reconstruct images from a latent code ``z`` and a label vector ``y``."""
        h = tf.concat([z,y], axis=1)
        h = self.dec_fc(h)
        h = self.dec_reshape(h)
        h = self.dec_t2(self.dec_t1(h))
        return self.dec_out(h)

    def discriminate_z(self, z):
        """Score latent vectors with the z-discriminator."""
        return self.dz(z)

    def discriminate_y(self, y):
        """Score label vectors with the y-discriminator."""
        return self.dy(y)

    @staticmethod
    def gp(f, real, fake):
        """Gradient penalty of critic ``f`` on random real/fake interpolations.

        Args:
            f: Callable mapping a 2-D batch to scores.
            real: Batch of real samples, shape ``(batch, dim)``.
            fake: Batch of generated samples with the same shape.

        Returns:
            The mean of ``(||grad f(x)|| - 1)^2`` over the interpolated points.
        """
        # Use the dynamic batch size: the static ``real.shape[0]`` is None
        # whenever the batch dimension is unknown at trace time.
        alpha = tf.random.uniform([tf.shape(real)[0], 1], 0, 1)
        inter = real + alpha * (fake - real)
        with tf.GradientTape() as t:
            t.watch(inter)
            p = f(inter)
        g = t.gradient(p, inter)
        # The small constant keeps the square root differentiable at zero.
        slopes = tf.sqrt(tf.reduce_sum(tf.square(g), axis=1) + 1e-8)
        return tf.reduce_mean((slopes - 1.) ** 2)

# Single model instance shared by the training, saving and evaluation code.
caae = ConvAAE()

# ------------------------------------------------------------
# 4.5) Warm-up: build all variables before the optimizers
# ------------------------------------------------------------
# Keras layers create their weights on first call, so every sub-network is
# run once on a dummy batch of size 1.  After this the trainable_variables
# lists read in the next section are populated.
dummy_x = tf.zeros((1,) + IMG_SHAPE, dtype=tf.float32)
z0, y0, _ = caae.encode(dummy_x)
_ = caae.decode(z0, y0)
_ = caae.discriminate_z(tf.random.normal((1, LATENT_DIM)))
_ = caae.discriminate_y(tf.one_hot([0], depth=N_LABELS))
print("[BUILD] all layer variables created:", len(caae.trainable_variables))

# ------------------------------------------------------------
# 5) Losses & optimizers  (with fixed variable lists)
# ------------------------------------------------------------
# The variable lists are captured once here, after the warm-up calls, and
# reused by every training step.

# Encoder variables: both conv layers, the dense block and the two heads.
enc_vars = [
    var
    for layer in (caae.enc_c1, caae.enc_c2, caae.enc_fc,
                  caae.z_layer, caae.y_logits)
    for var in layer.trainable_variables
]

# Auto-encoder variables: the encoder followed by every decoder layer.
ae_vars = enc_vars + [
    var
    for layer in (caae.dec_fc, caae.dec_reshape, caae.dec_t1,
                  caae.dec_t2, caae.dec_out)
    for var in layer.trainable_variables
]

# Discriminator variables.
dz_vars = caae.dz.trainable_variables
dy_vars = caae.dy.trainable_variables

# Reconstruction loss and label loss (the latter is fed raw logits).
mse  = tf.keras.losses.MeanSquaredError()
ce   = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# One Adam optimizer per update phase, each with its own learning rate.
opt_ae = tf.keras.optimizers.Adam(LR_AE)
opt_dz = tf.keras.optimizers.Adam(LR_DZ)
opt_dy = tf.keras.optimizers.Adam(LR_DY)
opt_g  = tf.keras.optimizers.Adam(LR_G)

# ------------------------------------------------------------
# 6) Training step (tf.function)
# ------------------------------------------------------------
@tf.function
def train_step(x_noisy, x_clean, y_lbl):
    """Run one optimisation step consisting of four consecutive updates.

    1. Auto-encoder: minimise the MSE between ``x_clean`` and the
       reconstruction of ``x_noisy`` (updates ``ae_vars``).
    2. z-discriminator: critic loss with gradient penalty between samples
       from a standard normal and the encoded latents (updates ``dz_vars``).
    3. y-discriminator: the same, between ``y_lbl`` and the encoder's class
       probabilities (updates ``dy_vars``).
    4. Encoder: maximise both critic scores and minimise the cross-entropy
       between ``y_lbl`` and the class logits (updates ``enc_vars``).

    Args:
        x_noisy: Batch of noise-perturbed input images.
        x_clean: The same batch without noise.
        y_lbl: One-hot labels of the batch.

    Returns:
        ``(loss_re, loss_dz, loss_dy, loss_g)``, the four scalar losses.
    """
    # ---------- auto-encoder ----------
    with tf.GradientTape() as t_ae:
        z_enc, y_enc, _ = caae.encode(x_noisy)
        x_rec = caae.decode(z_enc, y_enc)
        loss_re = mse(x_clean, x_rec)
    grads = t_ae.gradient(loss_re, ae_vars)
    opt_ae.apply_gradients(zip(grads, ae_vars))

    # ---------- discriminator-z ----------
    # z_enc computed in the auto-encoder pass above serves as the "fake"
    # sample; only the discriminator variables receive gradients here.
    with tf.GradientTape() as t_dz:
        z_real = tf.random.normal([tf.shape(x_noisy)[0], LATENT_DIM])
        dz_r   = caae.discriminate_z(z_real)
        dz_f   = caae.discriminate_z(z_enc)
        gp_z   = caae.gp(caae.discriminate_z, z_real, z_enc)
        loss_dz = tf.reduce_mean(dz_f) - tf.reduce_mean(dz_r) + λ_gp * gp_z
    opt_dz.apply_gradients(zip(t_dz.gradient(loss_dz, dz_vars), dz_vars))

    # ---------- discriminator-y ----------
    # The true one-hot labels are the "real" samples, the encoder's class
    # probabilities for the clean images are the "fake" ones.
    with tf.GradientTape() as t_dy:
        dy_r = caae.discriminate_y(y_lbl)
        _, y_enc2, _ = caae.encode(x_clean)
        dy_f = caae.discriminate_y(y_enc2)
        gp_y = caae.gp(caae.discriminate_y, y_lbl, y_enc2)
        loss_dy = tf.reduce_mean(dy_f) - tf.reduce_mean(dy_r) + λ_gp * gp_y
    opt_dy.apply_gradients(zip(t_dy.gradient(loss_dy, dy_vars), dy_vars))

    # ---------- generator / encoder adversarial ----------
    # Re-encode with the freshly updated weights; the discriminators are
    # evaluated but only enc_vars are updated.
    with tf.GradientTape() as t_g:
        z_g, y_g, logits = caae.encode(x_clean)
        loss_g = (
            -tf.reduce_mean(caae.discriminate_z(z_g))
            -tf.reduce_mean(caae.discriminate_y(y_g))
            + ce(y_lbl, logits)
        )
    opt_g.apply_gradients(zip(t_g.gradient(loss_g, enc_vars), enc_vars))

    return loss_re, loss_dz, loss_dy, loss_g

# ------------------------------------------------------------
# 7) Training loop
# ------------------------------------------------------------
# Per-epoch averages of the four training losses and of the validation
# reconstruction loss.
re_hist, dz_hist, dy_hist, g_hist, val_hist = [],[],[],[],[]

for epoch in range(1, EPOCHS+1):
    print(f"\n[TRAIN] Epoch {epoch}/{EPOCHS}")
    ep_re=ep_dz=ep_dy=ep_g=0
    for step,(xn, xc, y) in enumerate(train_ds.take(steps_per_epoch)):
        lr,ldz,ldy,lg = train_step(xn, xc, y)
        ep_re+=lr.numpy(); ep_dz+=ldz.numpy(); ep_dy+=ldy.numpy(); ep_g+=lg.numpy()
        if step%100==0:
            print(f"  step {step}/{steps_per_epoch} | re={lr:.4f} dz={ldz:.4f} dy={ldy:.4f} g={lg:.4f}")

    # Guard the averages against a step count of zero.
    n_steps = max(steps_per_epoch, 1)
    re_hist.append(ep_re/n_steps)
    dz_hist.append(ep_dz/n_steps)
    dy_hist.append(ep_dy/n_steps)
    g_hist .append(ep_g /n_steps)

    # -------- validation recon ----------
    # Mean reconstruction MSE over all batches of the normal validation files.
    val_loss, n_batches = 0,0
    for fn in glob.glob('Normal_*_val.tfrecord'):
        for x_val,_ in tf.data.TFRecordDataset(fn).map(parse_feat).batch(BATCH):
            x_rec = caae.decode(*caae.encode(x_val)[0:2])
            val_loss += mse(x_val, x_rec).numpy()
            n_batches += 1
    # Without any validation batch there is nothing to average; record NaN
    # instead of raising ZeroDivisionError.
    val_hist.append(val_loss/n_batches if n_batches else float('nan'))
    print(f"[VAL] recon={val_hist[-1]:.4f}")

# ------------------------------------------------------------
# 8) Save encoder & decoder
# ------------------------------------------------------------
# The trained layers of `caae` are re-wired into two functional Keras models
# so that each half can be saved to its own file.
from tensorflow.keras.layers import Input, Activation, Concatenate
from tensorflow.keras.models import Model

# --- Encoder: image -> (latent code, class probabilities)
enc_in = Input(shape=IMG_SHAPE)
h = caae.enc_c1(enc_in)
h = caae.enc_c2(h)
h = caae.enc_flat(h)
h = caae.enc_fc(h)
z_out = caae.z_layer(h)
y_log = caae.y_logits(h)
y_out = Activation('softmax')(y_log)
encoder = Model(inputs=enc_in, outputs=[z_out, y_out], name='caae_encoder')

# --- Decoder: (latent code, label vector) -> reconstructed image
z_in = Input(shape=(LATENT_DIM,))
y_in = Input(shape=(N_LABELS,))
h2 = Concatenate()([z_in, y_in])
h2 = caae.dec_fc(h2)
h2 = caae.dec_reshape(h2)
h2 = caae.dec_t1(h2)
h2 = caae.dec_t2(h2)
dec_out = caae.dec_out(h2)
decoder = Model(inputs=[z_in, y_in], outputs=dec_out, name='caae_decoder')

encoder.save('caae_encoder.keras')
decoder.save('caae_decoder.keras')
print("[SAVE] models stored")

# ------------------------------------------------------------
# 9) Evaluation
# ------------------------------------------------------------
# The per-sample reconstruction MSE is the anomaly score.  The ground truth
# comes from the file name: files starting with "Normal_" are class 0, all
# other test files are class 1.
errs, ys = [], []
for fn in glob.glob('*_test.tfrecord'):
    label = 0 if fn.startswith('Normal_') else 1
    for x_batch,_ in tf.data.TFRecordDataset(fn).map(parse_feat).batch(256):
        z_p,y_p = encoder(x_batch)
        x_r = decoder([z_p,y_p])
        e = tf.reduce_mean(tf.square(x_batch - x_r), axis=[1,2,3]).numpy()
        errs.append(e); ys.append(np.full(e.shape,label))
if not errs:
    # np.concatenate cannot handle an empty list; report the real cause.
    raise RuntimeError("No records found in '*_test.tfrecord' files to evaluate")
errs = np.concatenate(errs)
ys   = np.concatenate(ys)

# Pick the threshold that maximises TPR - FPR (Youden's J statistic).
fpr,tpr,ths = roc_curve(ys, errs)
roc_auc     = auc(fpr,tpr)
best_idx    = np.argmax(tpr-fpr)
thr_opt     = ths[best_idx]

# roc_curve counts a sample as positive when score >= threshold, so the same
# comparison is used here; with a strict '>' the confusion matrix would not
# match the TPR/FPR printed below.
y_pred = (errs >= thr_opt).astype(int)

print(f"\n[RESULT] ROC-AUC={roc_auc:.4f} | Thr={thr_opt:.6f} | "
      f"TPR={tpr[best_idx]:.3f} | FPR={fpr[best_idx]:.3f}")
cm = confusion_matrix(ys, y_pred)
print("[CM]\n", cm)
print("[Report]\n", classification_report(ys, y_pred,
                                          target_names=['Normal','Attack']))

# ------------------------------------------------------------
# 10) Plotting
# ------------------------------------------------------------
# -- Reconstruction loss per epoch (training vs. validation)
plt.figure()
plt.plot(re_hist, label='Train')
plt.plot(val_hist, label='Val')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('Reconstruction Loss')
plt.legend()
plt.show()

# -- Adversarial losses per epoch
plt.figure()
plt.plot(dz_hist, label='Disc-z')
plt.plot(dy_hist, label='Disc-y')
plt.plot(g_hist, label='Gen')
plt.xlabel('Epoch')
plt.ylabel('Wasserstein')
plt.title('Adversarial Losses')
plt.legend()
plt.show()

# -- ROC curve with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, label=f'AUC={roc_auc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.legend()
plt.show()

# -- Confusion-matrix heat-map
plt.figure()
plt.imshow(cm, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
ticks = np.arange(2)
classes = ['Normal', 'Attack']
plt.xticks(ticks, classes, rotation=45)
plt.yticks(ticks, classes)
# Cells darker than half of the maximum get white text so it stays readable.
th = cm.max() / 2
for i in range(2):
    for j in range(2):
        text_color = 'white' if cm[i, j] > th else 'black'
        plt.text(j, i, cm[i, j], ha='center', color=text_color)
plt.ylabel('True')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

# -- Reconstruction-error histograms of the two classes
plt.figure()
plt.hist(errs[ys == 0], bins=50, alpha=.5, label='Normal')
plt.hist(errs[ys == 1], bins=50, alpha=.5, label='Attack')
plt.xlabel('Reconstruction error')
plt.ylabel('Count')
plt.title('Error Distribution')
plt.legend()
plt.show()
