In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
import joblib
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load data & model

In [None]:
def load_results(model_path, scaler_path, data_path, threshold_percentile=95):
    """
    Load the saved model, the saved scaler and the dataset to analyse.

    Every CSV column except ``id``, ``created_date``, ``created_time``,
    ``reconstruction_error`` and ``is_anomaly`` is treated as a model feature.
    When the CSV lacks either result column, both are (re)computed here: the
    per-row reconstruction error is the mean squared difference between the
    scaled features and the model output, and rows whose error exceeds the
    ``threshold_percentile``-th percentile of all errors are flagged.

    Args:
        model_path: Location handed to ``load_model``.
        scaler_path: Location handed to ``joblib.load``.
        data_path: Location (or file-like object) handed to ``pd.read_csv``.
        threshold_percentile (float): Percentile of the reconstruction error
            used as the anomaly cut-off. Defaults to 95, the value that was
            previously hard-coded. Values outside 0-100 are rejected by
            ``np.percentile``.

    Returns:
        tuple: ``(model, scaler, df, features)`` where ``df`` contains the
        ``reconstruction_error`` and ``is_anomaly`` columns and ``features``
        is the list of feature column names.
    """
    model = load_model(model_path)
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code -- only point this at scaler files from a trusted source.
    scaler = joblib.load(scaler_path)

    df = pd.read_csv(data_path)

    # Bookkeeping and result columns are never fed to the model.
    non_feature_columns = {
        'id', 'created_date', 'created_time', 'reconstruction_error', 'is_anomaly',
    }
    features = [col for col in df.columns if col not in non_feature_columns]

    # Run detection only when the stored results are incomplete.
    if 'reconstruction_error' not in df.columns or 'is_anomaly' not in df.columns:
        X_scaled = scaler.transform(df[features])
        X_pred = model.predict(X_scaled)

        # Row-wise mean squared reconstruction error.
        mse = np.mean(np.power(X_scaled - X_pred, 2), axis=1)
        threshold = np.percentile(mse, threshold_percentile)

        df['reconstruction_error'] = mse
        df['is_anomaly'] = mse > threshold

    return model, scaler, df, features

# Anomaly vs. Normal Distribution

In [None]:
def visualize_normal_vs_anomaly(df, features):
    """
    Draw per-feature density curves comparing normal rows with anomalous rows.

    Only the first eight entries of ``features`` are plotted, one per panel of
    a 2x4 subplot grid. Each panel overlays a blue curve for rows where
    ``is_anomaly`` is False and a red curve for rows where it is True.

    Args:
        df (DataFrame): Data holding the feature columns and ``is_anomaly``.
        features (list): Feature column names; entries past the eighth are ignored.
    """
    plt.figure(figsize=(20, 15))

    # (flag value, legend label, line colour) for the two overlaid curves.
    groups = (
        (False, 'Normal', 'blue'),
        (True, 'Anomali', 'red'),
    )

    for position, feature in enumerate(features[:8], start=1):
        plt.subplot(2, 4, position)
        for flag, label, color in groups:
            subset = df[df['is_anomaly'] == flag]
            sns.kdeplot(subset[feature], label=label, color=color)
        plt.title(f'Distribusi {feature}')
        plt.legend()

    plt.tight_layout()
    # The title is placed slightly above the axes area (y > 1).
    plt.suptitle('Perbandingan Distribusi Fitur: Normal vs Anomali', fontsize=16, y=1.02)
    plt.show()

# Visualization with Dimensionality Reduction

In [None]:
def visualize_with_dimension_reduction(df, features, scaler, perplexity=5, random_state=42):
    """
    Project the scaled features to 2-D with PCA and t-SNE and plot both views.

    Points in both scatter plots are coloured by ``is_anomaly``. After the
    figure is shown, the explained-variance ratio of the two principal
    components is printed.

    Args:
        df (DataFrame): Data holding the feature columns and ``is_anomaly``.
        features (list): Feature column names used for the projections.
        scaler: Fitted scaler; its ``transform`` is applied to ``df[features]``.
        perplexity (float): Requested t-SNE perplexity. scikit-learn requires
            the perplexity to be smaller than the number of samples, so it is
            lowered to ``n_samples - 1`` when the data set is too small.
            Defaults to 5 (the previously hard-coded value).
        random_state (int): Seed passed to t-SNE. Defaults to 42 (the
            previously hard-coded value).

    Raises:
        ValueError: If ``df`` has fewer than two rows, which is too few for a
            two-component projection.
    """
    X_scaled = scaler.transform(df[features])

    n_samples = len(X_scaled)
    if n_samples < 2:
        raise ValueError(
            f"Need at least 2 samples for a 2-D projection, got {n_samples}."
        )

    # Linear projection onto the first two principal components.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # t-SNE rejects perplexity >= n_samples; clamp so small frames still plot.
    effective_perplexity = min(perplexity, n_samples - 1)
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=random_state)
    X_tsne = tsne.fit_transform(X_scaled)

    plt.figure(figsize=(18, 8))

    # Left panel: PCA view.
    plt.subplot(1, 2, 1)
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['is_anomaly'], cmap='coolwarm', alpha=0.7)
    plt.title('PCA: Normal vs Anomali')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.colorbar(scatter, label='Anomali')
    plt.grid(True, linestyle='--', alpha=0.5)

    # Right panel: t-SNE view.
    plt.subplot(1, 2, 2)
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=df['is_anomaly'], cmap='coolwarm', alpha=0.7)
    plt.title('t-SNE: Normal vs Anomali')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.colorbar(scatter, label='Anomali')
    plt.grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

    # Report how much variance the two principal components retain.
    explained_variance = pca.explained_variance_ratio_
    print(f"Explained variance by PCA components: {explained_variance[0]:.4f}, {explained_variance[1]:.4f}")
    print(f"Total explained variance: {sum(explained_variance):.4f}")

# Analyze Anomaly Patterns Over Time

In [None]:
def analyze_temporal_patterns(df):
    """
    Summarise and plot the share of anomalous rows per calendar date.

    The timestamp of each row is taken from ``created_date`` + ``created_time``
    when both columns exist, otherwise from an existing ``timestamp`` column
    (parsed with ``pd.to_datetime``, so string timestamps are accepted). The
    input frame is left untouched: no helper columns are added to it.

    Args:
        df (DataFrame): Data with an ``is_anomaly`` column and either the
            ``created_date``/``created_time`` pair or a ``timestamp`` column.

    Returns:
        DataFrame or None: One row per date with the columns ``date``,
        ``anomaly_count``, ``total_count`` and ``anomaly_percentage``; ``None``
        (after printing a notice) when no time information is available.
    """
    if 'created_date' in df.columns and 'created_time' in df.columns:
        timestamps = pd.to_datetime(df['created_date'] + ' ' + df['created_time'])
    elif 'timestamp' in df.columns:
        # Parse defensively: a timestamp column read from CSV holds strings.
        timestamps = pd.to_datetime(df['timestamp'])
    else:
        print("Informasi waktu tidak tersedia dalam data.")
        return None

    # Build the grouping frame separately so the caller's df is not mutated.
    per_row = df[['is_anomaly']].assign(date=timestamps.dt.date)

    # Anomaly count and sample count per date in a single pass.
    anomaly_stats = (
        per_row.groupby('date')['is_anomaly']
        .agg(anomaly_count='sum', total_count='size')
        .reset_index()
    )
    anomaly_stats['anomaly_percentage'] = (
        anomaly_stats['anomaly_count'] / anomaly_stats['total_count']
    ) * 100

    # Bar chart of the daily anomaly percentage.
    positions = range(len(anomaly_stats))
    plt.figure(figsize=(14, 6))
    plt.bar(positions, anomaly_stats['anomaly_percentage'],
            width=0.4, color='skyblue', edgecolor='darkblue')
    plt.xticks(positions, [d.strftime('%Y-%m-%d') for d in anomaly_stats['date']], rotation=45)
    plt.title('Persentase Anomali per Tanggal')
    plt.xlabel('Tanggal')
    plt.ylabel('Persentase Anomali (%)')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return anomaly_stats

# Analysis of features that contribute most to anomalies

In [None]:
def analyze_feature_contribution(df, model, scaler, features):
    """
    Rank features by how much worse they are reconstructed on anomalous rows.

    For the anomalous and the normal rows separately, the features are scaled,
    passed through the model, and the mean squared reconstruction error is
    computed per feature. The ratio anomaly error / normal error (with a tiny
    constant added to the denominator) is plotted as a horizontal bar chart.

    Args:
        df (DataFrame): Data holding the feature columns and ``is_anomaly``.
        model: Model whose ``predict`` reconstructs the scaled features.
        scaler: Fitted scaler; its ``transform`` is applied to each subset.
        features (list): Feature column names to analyse.

    Returns:
        DataFrame: Columns ``feature``, ``anomaly_error``, ``normal_error`` and
        ``error_ratio``, sorted by ``error_ratio`` in descending order.
    """
    def _mean_squared_error_per_feature(scaled, reconstructed):
        # Column-wise mean of the squared reconstruction residuals.
        return np.mean(np.power(scaled - reconstructed, 2), axis=0)

    flagged = df['is_anomaly'] == True
    unflagged = df['is_anomaly'] == False

    # Scale each subset with the same fitted scaler.
    anomaly_scaled = scaler.transform(df[flagged][features])
    normal_scaled = scaler.transform(df[unflagged][features])

    # Reconstruct each subset with the model.
    anomaly_pred = model.predict(anomaly_scaled)
    normal_pred = model.predict(normal_scaled)

    anomaly_error_per_feature = _mean_squared_error_per_feature(anomaly_scaled, anomaly_pred)
    normal_error_per_feature = _mean_squared_error_per_feature(normal_scaled, normal_pred)

    # The small constant keeps the division finite when a normal error is zero.
    error_ratio = anomaly_error_per_feature / (normal_error_per_feature + 1e-10)

    unsorted_table = pd.DataFrame({
        'feature': features,
        'anomaly_error': anomaly_error_per_feature,
        'normal_error': normal_error_per_feature,
        'error_ratio': error_ratio
    })
    feature_contribution = unsorted_table.sort_values('error_ratio', ascending=False)

    # Horizontal bars, largest ratio first.
    plt.figure(figsize=(12, 8))
    sns.barplot(x='error_ratio', y='feature', data=feature_contribution, palette='viridis')
    plt.title('Kontribusi Fitur terhadap Anomali (Rasio Error Anomali/Normal)')
    plt.xlabel('Rasio Error (Anomali / Normal)')
    plt.ylabel('Fitur')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return feature_contribution

# Run All Analyses

In [None]:
def run_analysis(model_path, scaler_path, data_path):
    """
    Run the complete analysis and visualization pipeline end to end.

    Loads the model, scaler and data, then in order: plots the normal vs.
    anomaly feature distributions, plots the PCA / t-SNE projections, analyses
    anomalies over time, and analyses per-feature contribution.

    Args:
        model_path (str): Path to the saved model.
        scaler_path (str): Path to the saved scaler.
        data_path (str): Path to the data to analyse.

    Returns:
        tuple: ``(anomaly_stats, feature_contribution)`` as returned by
        ``analyze_temporal_patterns`` and ``analyze_feature_contribution``.
    """
    loaded = load_results(model_path, scaler_path, data_path)
    model, scaler, df, features = loaded

    # Plot-only steps (their return values are not used).
    visualize_normal_vs_anomaly(df, features)
    visualize_with_dimension_reduction(df, features, scaler)

    # Steps whose results are handed back to the caller.
    temporal_summary = analyze_temporal_patterns(df)
    contribution_table = analyze_feature_contribution(df, model, scaler, features)

    return temporal_summary, contribution_table

# EXAMPLE USAGE

In [None]:
if __name__ == "__main__":
    # Locations of the saved model, the saved scaler and the data file.
    model_path = 'autoencoder_model.h5'
    scaler_path = 'scaler.pkl'
    data_path = 'data_with_results.csv'

    # Run the whole pipeline; the figures are shown as a side effect.
    anomaly_stats, feature_contribution = run_analysis(model_path, scaler_path, data_path)

    # Print each result table under its heading (heading and table on separate lines).
    print("\nStatistik Anomali per Tanggal:", anomaly_stats, sep="\n")
    print("\nKontribusi Fitur terhadap Anomali:", feature_contribution, sep="\n")