In [1]:
# %% [markdown]
# ### 1. Import Required Libraries
import ast
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Masking, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

from bounds import bounds

# %% [markdown]
# ### 2. Data Loading and Preparation
# Read the two sheets of the workbook and stack them into a single frame.
file_name = "DataOn2025Jan08.xlsx"
df1, df2 = (
    pd.read_excel(file_name, sheet_name=sheet)
    for sheet in ("NES170K07Line2", "NES170K07Line1")
)
df = pd.concat([df1, df2], ignore_index=True)
print(f"Data shape: {df.shape}")

# Lower / upper t5 limits used later to label each batch.
t5_lb, t5_ub = bounds["170K"][0], bounds["170K"][1]

def safe_literal_eval(value):
    """Parse the string representation of a Python literal, best-effort.

    Bare ``nan`` tokens are rewritten to ``None`` before parsing because
    ``ast.literal_eval`` does not accept them.

    Args:
        value: Cell value to parse. Usually a string such as
            ``"[(0.0, 1.2), (0.5, nan)]"``; may also be a missing value or an
            already-parsed list/tuple.

    Returns:
        The parsed literal, the value itself when it is already a list or
        tuple, or ``None`` when the value is missing or cannot be parsed.
    """
    # Already-parsed containers pass straight through: pd.isna() on them
    # returns an array, whose truth value is ambiguous in the check below.
    if isinstance(value, (list, tuple)):
        return value
    if pd.isna(value):
        return None
    # Replace only stand-alone ``nan`` tokens; a plain str.replace would also
    # rewrite the letters inside words such as 'banana'.
    text = re.sub(r"\bnan\b", "None", str(value))
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return None

def _build_mdr_frame(t_S1, t_S2):
    """Combine parsed S1/S2 traces into one gap-filled DataFrame, or return None.

    Each argument is expected to be a sequence of 2-item (time, value) pairs.
    ``None`` is returned when either trace is missing/empty, when the entries
    cannot be split into two columns, or when the two traces differ in length.
    """
    if not (t_S1 and t_S2):
        return None
    try:
        _, S1 = zip(*t_S1)
        t, S2 = zip(*t_S2)  # the time axis is taken from the S2 trace
    except (TypeError, ValueError):
        # Entries that are not 2-item pairs cannot be unpacked.
        return None
    if len(S1) != len(S2):
        # pd.DataFrame raises on columns of unequal length; skip the batch
        # instead of aborting the whole load.
        return None
    return pd.DataFrame({
        "time": t[1:],  # Exclude first element
        "S1": S1[1:],
        "S2": S2[1:]
    }).interpolate(method='linear').ffill().bfill()


def organized_data(df, t5_lb, t5_ub):
    """Process raw rows into a per-batch dictionary.

    Args:
        df: Frame with columns ``t5``, ``batch_number``, ``MDRTorqueS1`` and
            ``MDRTorqueS2``.
        t5_lb: Values of ``t5`` strictly below this are labelled ``"low"``.
        t5_ub: Values of ``t5`` strictly above this are labelled ``"high"``.

    Returns:
        ``{batch_number: {"MDR": DataFrame, "t5": value, "class": label}}``
        containing only batches with a non-empty MDR frame. Rows with a
        missing ``t5`` are skipped. When a batch number occurs more than once
        the last row wins.
    """
    data = {}
    for _, row in df.iterrows():
        t5 = row["t5"]
        if pd.isna(t5):
            continue

        mdr = _build_mdr_frame(
            safe_literal_eval(row["MDRTorqueS1"]),
            safe_literal_eval(row["MDRTorqueS2"]),
        )

        # Assign class label
        if t5 < t5_lb:
            label = "low"
        elif t5 > t5_ub:
            label = "high"
        else:
            label = "normal"

        data[row["batch_number"]] = {"MDR": mdr, "t5": t5, "class": label}

    return {k: v for k, v in data.items() if v["MDR"] is not None and not v["MDR"].empty}

data = organized_data(df, t5_lb, t5_ub)

# Tally the labels in one pass, then report them in a fixed order.
class_tally = Counter(entry['class'] for entry in data.values())
print("\nClass distribution:")
for heading, label_name in (("Low", "low"), ("Normal", "normal"), ("High", "high")):
    print(f"{heading}: {class_tally[label_name]}")

# %% [markdown]
# ### 3. Data Preprocessing
# Stratified 80/20 split on the batch keys (fixed seed for repeatability)
keys = list(data)
labels = [entry['class'] for entry in data.values()]
X_train_keys, X_test_keys = train_test_split(
    keys,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Min/max per channel, computed from the training batches only
train_mdr = pd.concat([data[k]['MDR'] for k in X_train_keys])
global_min = {col: train_mdr[col].min() for col in ('S1', 'S2')}
global_max = {col: train_mdr[col].max() for col in ('S1', 'S2')}

def normalize_and_pad(keys, max_length=None):
    """Min-max scale the S1/S2 traces of the given batches and zero-pad them.

    Reads the module-level ``data``, ``global_min`` and ``global_max``.

    Args:
        keys: Batch keys to look up in ``data``.
        max_length: Number of timesteps in the result. When ``None``
            (default) the longest sequence among ``keys`` is used, so two
            separate calls can return arrays with different timestep counts;
            pass an explicit length to get matching shapes. Sequences longer
            than ``max_length`` are truncated at the end.

    Returns:
        float32 array of shape ``(len(keys), max_length, 2)`` holding the
        scaled S1 and S2 channels, padded with zeros at the end.
    """
    sequences = []
    for k in keys:
        mdr = data[k]['MDR'].copy()
        # Normalize
        for col in ('S1', 'S2'):
            span = global_max[col] - global_min[col]
            # A channel that is constant in the training data would give 0/0;
            # divide by 1 so it maps to 0 instead of NaN.
            mdr[col] = (mdr[col] - global_min[col]) / (span if span != 0 else 1.0)
        sequences.append(mdr[['S1', 'S2']].values)

    # Pad sequences
    if max_length is None:
        max_length = max(len(seq) for seq in sequences)
    return pad_sequences(
        sequences, maxlen=max_length, padding='post', truncating='post', dtype='float32'
    )

X_train = normalize_and_pad(X_train_keys)
X_test = normalize_and_pad(X_test_keys)

# Each call above pads to the longest sequence it was given, so the two arrays
# can end up with different timestep counts. The model input shape is taken
# from X_train, so bring X_test to that length: shorter test sequences get
# extra zero padding, longer ones lose their tail.
if X_test.shape[1] != X_train.shape[1]:
    X_test = pad_sequences(
        X_test, maxlen=X_train.shape[1],
        padding='post', truncating='post', dtype='float32'
    )

# Prepare labels
class_mapping = {'low': 0, 'normal': 1, 'high': 2}
y_train = np.array([class_mapping[data[k]['class']] for k in X_train_keys])
y_test = np.array([class_mapping[data[k]['class']] for k in X_test_keys])
# Pin the width of the one-hot encoding; without num_classes it is inferred
# from the largest label present, which breaks if a split lacks class 2.
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=len(class_mapping))
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=len(class_mapping))

# %% [markdown]
# ### 4. Handle Class Imbalance
class_counts = Counter(y_train)
total = len(y_train)
# Inverse-frequency weights, total / (3 * count), for 0=low, 1=normal, 2=high
class_weights = {cls: total / (3 * class_counts[cls]) for cls in (0, 1, 2)}

# %% [markdown]
# ### 5. Build LSTM Model
# Two stacked LSTM layers over the masked (zero-padded) sequences, followed by
# a small dense head with a 3-way softmax output.
model = Sequential()
model.add(Masking(mask_value=0., input_shape=(X_train.shape[1], 2)))
model.add(LSTM(128, return_sequences=True, dropout=0.3))
model.add(LSTM(64, dropout=0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)

# %% [markdown]
# ### 6. Train the Model
# Stop after 15 epochs without a lower validation loss and restore the
# weights from the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)

fit_options = {
    'epochs': 100,
    'batch_size': 32,
    'validation_split': 0.15,
    'class_weight': class_weights,
    'callbacks': [early_stop],
    'verbose': 1,
}
history = model.fit(X_train, y_train_cat, **fit_options)

# %% [markdown]
# ### 7. Evaluation Metrics
# Generate predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

class_names = ['low', 'normal', 'high']

# Confusion matrix (rows = true label, columns = predicted label).
# num_classes pins the shape to 3x3 even when the highest class index appears
# in neither the labels nor the predictions.
conf_mat = tf.math.confusion_matrix(
    y_test, y_pred_classes, num_classes=len(class_names)
)
# Work on a NumPy copy below: tf tensors lack ndarray methods such as .max()
# and format as "tf.Tensor(...)" inside f-strings.
conf_arr = conf_mat.numpy()

# Calculate metrics
# Per class: correct predictions divided by the number of true samples.
class_acc = {}
for i, class_name in enumerate(class_names):
    correct = conf_arr[i, i]
    total = conf_arr[i].sum()
    # A class with no true samples in the test set has no defined score.
    class_acc[class_name] = correct / total if total else float('nan')

avg_acc = np.trace(conf_arr) / np.sum(conf_arr)

# Print results
print("\nClassification Report:")
print(f"{'Class':<10} {'Accuracy':<10}")
for cls, acc in class_acc.items():
    print(f"{cls:<10} {acc:.2%}")
print(f"\nAverage Accuracy: {avg_acc:.2%}")

# Plot confusion matrix
plt.figure(figsize=(10, 8))
plt.imshow(conf_arr, cmap='Blues')
# Cells darker than half the maximum get white text so the count stays readable.
text_threshold = conf_arr.max() / 2
for i in range(len(class_names)):
    for j in range(len(class_names)):
        plt.text(j, i, f"{conf_arr[i, j]}",
                 ha="center", va="center",
                 color="white" if conf_arr[i, j] > text_threshold else "black")
plt.xticks([0, 1, 2], class_names)
plt.yticks([0, 1, 2], class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.colorbar()
plt.show()


2025-02-13 18:26:32.522880: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-02-13 18:26:32.522990: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Data shape: (20528, 43)

Class distribution:
Low: 365
Normal: 7297
High: 677


2025-02-13 18:29:14.848185: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-02-13 18:29:14.848221: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2025-02-13 18:29:14.848269: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ubuntu): /proc/driver/nvidia/version does not exist
2025-02-13 18:29:14.848604: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100