In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

# Default size (width, height in inches) for figures created without an explicit figsize.
plt.rcParams['figure.figsize'] = (16, 8)

# Load Data

In [None]:
# Read the raw CSV export.
df = pd.read_csv('/content/tb_sig_5.csv')

# Join the separate date and time strings into one "timestamp" column, then
# discard the two source columns.
df = (
    df
    .assign(timestamp=df['created_date'] + " " + df["created_time"])
    .drop(columns=["created_date", "created_time"])
)

print(f"Bentuk data: {df.shape}")
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

# Visualization

## Set Point vs Actual

### Dosing

In [None]:
# [set point, actual] column pairs for dosing units 1..8. The individual
# dosing_N names stay available next to the combined list.
dosing_pairs = [[f"sp_dosing_{unit}", f"actual_dosing_{unit}"] for unit in range(1, 9)]
(dosing_1, dosing_2, dosing_3, dosing_4,
 dosing_5, dosing_6, dosing_7, dosing_8) = dosing_pairs

# One figure per dosing unit, overlaying the set point and the actual value.
for col in dosing_pairs:
    plt.figure(figsize=(14, 6))
    for series_name in col:
        sns.lineplot(data=df, x="timestamp", y=series_name, label=series_name)
    plt.legend()

### Heater

In [None]:
heater = [
    'sp_heater_ctr_cross_left',
    'sp_heater_ctr_cross_right',
    'sp_heater_ctr_long_left',
    'sp_heater_ctr_long_right',
]

# One figure per heater set-point column, drawn against the timestamp.
for col in heater:
    plt.figure(figsize=(14, 6))
    sns.lineplot(x=df["timestamp"], y=df[col], label=col)
    plt.legend()

# Preprocessing Data

In [None]:
def convert_to_numeric(df):
    """Convert every text column of ``df`` to numbers, modifying ``df`` in place.

    For each column holding text (``object`` or a pandas string dtype):

    1. Numeric parsing is tried first, so numeric-looking text such as
       ``"2018"`` or ``"1.5"`` stays a number instead of being read as a date.
    2. Otherwise the column is parsed as datetimes and converted to POSIX
       timestamps in seconds (float); unparseable/missing datetimes (NaT)
       become NaN.
    3. If both attempts fail, the column is dropped and a message is printed.

    A column is only overwritten once its conversion has fully succeeded, so a
    failure never leaves a half-converted column behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    def _epoch_seconds(moment):
        # NaT has no POSIX timestamp; represent it as a missing float instead.
        return moment.timestamp() if pd.notna(moment) else float("nan")

    # Snapshot the candidate columns first so that dropping columns below
    # cannot interfere with the iteration.
    text_columns = [
        col for col in df.columns
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
    ]

    unconvertible = []
    for col in text_columns:
        try:
            converted = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            try:
                converted = pd.to_datetime(df[col]).map(_epoch_seconds)
            except (ValueError, TypeError, OverflowError):
                print(f"Dropping column '{col}' karena tidak bisa dikonversi ke numeric.")
                unconvertible.append(col)
                continue
        df[col] = converted

    if unconvertible:
        df.drop(columns=unconvertible, inplace=True)
    return df

# Convert text columns to numeric values; the function also modifies `df` in place.
df = convert_to_numeric(df)


In [None]:
# Correlation heatmap across the columns of the converted frame
plt.figure(figsize=(18, 14))
corr_matrix = df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Korelasi antar Variabel')
plt.tight_layout()
plt.show()

# Feature Selection

In [None]:
# Select the model inputs: every column except the row identifier and the
# timestamp. The result columns that the thresholding cell later writes into
# `df` are excluded as well (when present), so re-running this cell after a
# full pass does not feed the model its own outputs.
features = (
    df.columns
    .drop(['id', 'timestamp'])
    .drop(['reconstruction_error', 'is_anomaly'], errors='ignore')
)
X = df[features]

# Scale every feature with a min-max scaler fitted on the selected columns
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Train-test Split

In [None]:
# Hold out 20% of the scaled rows for validation; the fixed random_state
# keeps the partition the same on every run.
X_train, X_val = train_test_split(
    X_scaled,
    test_size=0.2,
    random_state=42,
)

print(f"Bentuk training set: {X_train.shape}")
print(f"Bentuk validation set: {X_val.shape}")

# Config

In [None]:
# Seed NumPy and TensorFlow so that weight initialisation and batch shuffling
# (and therefore the loss curve, the threshold and the flagged rows) are
# repeatable between runs. NOTE(review): some GPU kernels may still be
# non-deterministic.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Input width = number of feature columns in the training matrix
input_dim = X_train.shape[1]

# Model / training parameters
encoding_dim = 10  # width of the bottleneck (hidden) layer
epochs = 128       # maximum number of training epochs
batch_size = 8

# Model - AutoEncoder

In [None]:
# Symmetric dense autoencoder: input_dim -> 32 -> encoding_dim -> 32 -> input_dim
input_layer = Input(shape=(input_dim,))

# Encoder: compress the input down to the bottleneck representation
encoder_hidden = Dense(32, activation='relu')(input_layer)
encoder = Dense(encoding_dim, activation='relu')(encoder_hidden)

# Decoder: expand the bottleneck back to the input width; the sigmoid keeps
# every reconstructed value inside (0, 1)
decoder_hidden = Dense(32, activation='relu')(encoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder_hidden)

# Full model mapping an input row to its reconstruction
autoencoder = Model(inputs=input_layer, outputs=decoder)

# Train with Adam on the mean squared reconstruction error
autoencoder.compile(optimizer='adam', loss='mse')

# Print the layer overview
autoencoder.summary()

# Train the Model

In [None]:
# Stop once the validation loss has not improved for 5 consecutive epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True,
)

# The autoencoder learns to reproduce its own input, so the inputs double as
# the targets for both the training and the validation data.
fit_options = dict(
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
    verbose=1,
)
history = autoencoder.fit(X_train, X_train, **fit_options)

# Plot the training and validation loss per epoch
plt.figure(figsize=(14, 6))
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.grid(True)
plt.show()

# Predict

In [None]:
# Reconstruct both partitions with the trained model
X_pred_train = autoencoder.predict(X_train)
X_pred_val = autoencoder.predict(X_val)

# Per-sample mean squared error between each input row and its reconstruction
mse_train = np.square(X_train - X_pred_train).mean(axis=1)
mse_val = np.square(X_val - X_pred_val).mean(axis=1)

# Overlay the two error distributions and mark the 95th percentile of the
# training errors
plt.figure(figsize=(14, 6))
for errors, hist_label in ((mse_train, 'Training MSE'), (mse_val, 'Validation MSE')):
    plt.hist(errors, bins=50, alpha=0.5, label=hist_label)
plt.axvline(
    x=np.percentile(mse_train, 95),
    color='r',
    linestyle='--',
    label='Threshold (95th Percentile)',
)
plt.title('Distribution of Reconstruction Error (MSE)')
plt.xlabel('Reconstruction MSE')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.show()

# Set Threshold and Identify the Anomaly

In [None]:
# Anomaly cut-off: the 95th percentile of the training reconstruction errors
threshold = np.percentile(mse_train, 95)
print(f"Threshold MSE (95th percentile): {threshold:.6f}")

# Score every row of the full scaled dataset
X_pred_full = autoencoder.predict(X_scaled)
mse_full = np.square(X_scaled - X_pred_full).mean(axis=1)

# Store the score on the frame and flag rows whose error exceeds the cut-off
df['reconstruction_error'] = mse_full
df['is_anomaly'] = df['reconstruction_error'].gt(threshold)

# Report how many rows were flagged
anomaly_count = df['is_anomaly'].sum()
print(f"Jumlah anomali terdeteksi: {anomaly_count} dari {df.shape[0]} sampel ({(anomaly_count/df.shape[0])*100:.2f}%)")


# Visualize Reconstruction Error and Threshold

In [None]:
# Draw normal and anomalous samples as two separate, labelled scatter series.
# Passing a bare list of labels to plt.legend() assigns them positionally to
# the existing artists, which mislabels the threshold line when all points are
# drawn as a single scatter; labelled artists avoid that.
anomaly_mask = df['is_anomaly'].to_numpy(dtype=bool)
sample_index = np.arange(len(df))
reconstruction_error = df['reconstruction_error'].to_numpy()

plt.figure(figsize=(14, 6))
plt.scatter(sample_index[~anomaly_mask], reconstruction_error[~anomaly_mask], c='blue', label='Normal')
plt.scatter(sample_index[anomaly_mask], reconstruction_error[anomaly_mask], c='red', label='Anomali')
plt.axhline(y=threshold, color='r', linestyle='--', label=f'Threshold: {threshold:.6f}')
plt.title('Reconstruction Error untuk Setiap Sampel')
plt.ylabel('Reconstruction Error (MSE)')
plt.xlabel('Indeks Sampel')
plt.legend()  # picks up the label of each artist above
plt.grid(True)
plt.show()

In [None]:
# Show the flagged rows: identifier, timestamp, every model feature and the score
print("\nData anomali terdeteksi:")
anomaly_columns = ['id', 'timestamp', *features, 'reconstruction_error']
df.loc[df['is_anomaly'], anomaly_columns]

# Feature Importance

In [None]:
# Squared reconstruction error of every feature for every sample, computed
# once and wrapped in a frame with one column per feature (instead of
# recomputing the whole matrix for each feature and growing the frame
# column by column).
squared_errors = np.square(X_scaled - X_pred_full)
feature_errors = pd.DataFrame(squared_errors, columns=features)

# Average contribution of each feature to the error, largest first
feature_error_mean = feature_errors.mean().sort_values(ascending=False)

# Horizontal bar chart of the per-feature contribution
plt.figure(figsize=(14, 8))
sns.barplot(x=feature_error_mean.values, y=feature_error_mean.index)
plt.title('Kontribusi Rata-rata Setiap Fitur terhadap Reconstruction Error')
plt.xlabel('Mean Squared Error')
plt.tight_layout()
plt.show()

# Save Model

In [None]:
import joblib

# Trained autoencoder
autoencoder.save('anomaly_autoencoder_model.h5')

# Scoring new data also needs the anomaly threshold and the exact feature
# order used for training; persist them next to the model so the saved
# artifacts are sufficient on their own.
joblib.dump(
    {'threshold': float(threshold), 'features': list(features)},
    'anomaly_detection_config.pkl',
)

# Fitted scaler, required to normalise new data the same way as the training data
joblib.dump(scaler, 'anomaly_scaler.pkl')

# Main Function
Fungsi untuk mendeteksi Anomali pada data baru

In [None]:
def detect_anomalies(new_data, model, scaler, threshold, feature_names=None):
    """Score new samples with the autoencoder and flag the anomalous ones.

    Parameters
    ----------
    new_data : pandas.DataFrame or array-like
        Samples to score. A DataFrame must contain every feature column
        (extra columns are kept untouched). Any other input is interpreted as
        a 2-D array whose columns follow the order of ``feature_names``.
    model : object with ``predict(X)``
        Trained autoencoder returning a reconstruction of the scaled input.
    scaler : object with ``transform(X)``
        Scaler fitted on the training features.
    threshold : float
        Samples whose reconstruction MSE is strictly greater than this value
        are flagged as anomalies.
    feature_names : sequence of str, optional
        Feature columns, in training order. Defaults to the notebook-level
        ``features`` defined in the feature-selection cell.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is never modified) holding the input data plus
        the columns ``reconstruction_error`` and ``is_anomaly``.

    Raises
    ------
    ValueError
        If a DataFrame input lacks one or more feature columns, or if an
        array input cannot be laid out as one column per feature.
    """
    if feature_names is None:
        # Fall back to the feature list built by the notebook before training.
        feature_names = features
    feature_names = list(feature_names)

    # Preprocessing: always work on a fresh frame so the caller's data is
    # left untouched and array input also yields a usable result.
    if isinstance(new_data, pd.DataFrame):
        missing = [name for name in feature_names if name not in new_data.columns]
        if missing:
            raise ValueError(
                "Data baru tidak memiliki kolom yang sama dengan data training. "
                f"Kolom yang hilang: {missing}"
            )
        result = new_data.copy()
    else:
        # pandas raises ValueError when the array shape does not match the
        # number of feature names.
        result = pd.DataFrame(np.asarray(new_data), columns=feature_names)
    X_new = result[feature_names]

    # Normalise with the scaler fitted on the training data
    X_new_scaled = scaler.transform(X_new)

    # Reconstruct the scaled samples
    X_new_pred = model.predict(X_new_scaled)

    # Per-sample mean squared reconstruction error
    mse_new = np.mean(np.square(X_new_scaled - X_new_pred), axis=1)

    # Attach the score and the anomaly flag
    result['reconstruction_error'] = mse_new
    result['is_anomaly'] = mse_new > threshold

    return result

# EXAMPLE USAGE:
# new_df = pd.read_csv('new_data.csv')
# results = detect_anomalies(new_df, autoencoder, scaler, threshold)
# anomalies = results[results['is_anomaly'] == True]