In [17]:
#!/usr/bin/env python3
"""
Industrial Anomaly Detection - Refined and Error-Free Implementation
Addresses all identified issues in the original notebook
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pickle
from datetime import datetime
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

# Statistical Analysis
from scipy import stats
from scipy.stats import zscore

# Start-up banner: rule, title, rule — emitted as three separate lines.
print(
    "=" * 80,
    "INDUSTRIAL ANOMALY DETECTION - REFINED IMPLEMENTATION",
    "=" * 80,
    sep="\n",
)

# =============================================================================
# 1. DATA LOADING AND PREPROCESSING
# =============================================================================

def load_and_clean_data(file_path):
    """
    Load the industrial dataset from a CSV file and apply basic cleaning.

    Steps performed:
      1. Read the CSV at ``file_path`` with ``pd.read_csv``.
      2. When both 'Date' and 'Time' columns are present, remove ',0' from
         the time text, combine the two columns into a 'Timestamp' column
         (unparseable values become NaT), sort the rows chronologically and
         drop the original 'Date'/'Time' columns.
      3. Fill missing values by forward fill, then backward fill, then 0.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        Exception: any error raised while reading or cleaning is printed and
            then re-raised unchanged.
    """
    print("\n1. LOADING AND CLEANING DATA")
    print("-" * 50)

    try:
        # Load the dataset
        df = pd.read_csv(file_path)
        print(f"✓ Dataset loaded successfully: {df.shape}")

        # Handle timestamp creation
        if 'Date' in df.columns and 'Time' in df.columns:
            # Clean time format (remove the ',0' suffix written by the logger)
            df['Time'] = df['Time'].astype(str).str.replace(',0', '', regex=False)

            # Timestamp creation is best-effort: on failure the frame keeps
            # its 'Date'/'Time' columns and processing continues.
            try:
                # Cast Date to text so that a numeric-looking Date column
                # cannot break the string concatenation.
                combined = df['Date'].astype(str) + ' ' + df['Time']
                df['Timestamp'] = pd.to_datetime(combined, errors='coerce')
                df = df.sort_values('Timestamp').reset_index(drop=True)
                print("✓ Timestamp created and data sorted chronologically")

                # Drop original Date and Time columns
                df = df.drop(columns=['Date', 'Time'])

            except Exception as e:
                print(f"⚠ Warning: Could not create timestamp: {e}")

        # Check for missing values
        missing_counts = df.isnull().sum()
        if missing_counts.sum() > 0:
            print(f"⚠ Missing values found in {(missing_counts > 0).sum()} columns")
            # Forward fill, then backward fill (covers leading gaps), then 0
            # for columns that are entirely empty. ffill()/bfill() replace the
            # deprecated fillna(method=...) spelling.
            df = df.ffill().bfill().fillna(0)
            print("✓ Missing values handled")
        else:
            print("✓ No missing values detected")

        return df

    except Exception as e:
        print(f"✗ Error loading data: {e}")
        raise

# =============================================================================
# 2. FEATURE ENGINEERING
# =============================================================================

def engineer_features(df):
    """
    Add derived columns to ``df`` (in place) and return the same frame.

    Every group of features is only created when its source columns are
    present:
      - calendar fields from 'Timestamp' (Hour, Minute, DayOfWeek, IsWeekend)
      - pick/deposit time as a share of the total cycle time
      - target-minus-actual deviations plus their absolute values
      - vacuum efficiency, temperature deviation from the column mean,
        alarm totals with a derived health score, machine-running flag
      - 5-row rolling mean and standard deviation of the total cycle time

    A short summary of what was created is printed at the end.
    """
    print("\n2. FEATURE ENGINEERING")
    print("-" * 50)

    n_cols_before = df.shape[1]
    eps = 1e-6  # keeps the ratio features finite when a denominator is 0

    # --- Calendar features -------------------------------------------------
    if 'Timestamp' in df.columns:
        stamp = df['Timestamp'].dt
        df['Hour'] = stamp.hour
        df['Minute'] = stamp.minute
        df['DayOfWeek'] = stamp.dayofweek
        df['IsWeekend'] = df['DayOfWeek'].ge(5).astype(int)
        print("✓ Time-based features created")

    # --- Efficiency ratios -------------------------------------------------
    cycle_col = 'CompletedLog.TotalCycleTime'
    efficiency_features = []
    for part_col, ratio_name in (
        ('CompletedLog.ToolPickTime', 'PickEfficiency'),
        ('CompletedLog.ToolDepositTime', 'DepositEfficiency'),
    ):
        if part_col in df.columns and cycle_col in df.columns:
            df[ratio_name] = df[part_col] / (df[cycle_col] + eps)
            efficiency_features.append(ratio_name)

    # --- Target vs. actual deviations --------------------------------------
    deviation_specs = (
        ('Actuator.TargetPos', 'Actuator.CurrentPos', 'ActuatorPosDeviation'),
        ('Actuator.WantedBellowPressure', 'IO.BellowPressure', 'BellowPressureDeviation'),
        ('VacuumBlower.CMD.SpeedHz', 'Vacuum.Stat.ActSpeedHz', 'VacuumSpeedDeviation'),
    )
    deviation_features = []
    for wanted_col, measured_col, signed_name in deviation_specs:
        if wanted_col not in df.columns or measured_col not in df.columns:
            continue
        abs_name = f'{signed_name}_Abs'
        df[signed_name] = df[wanted_col] - df[measured_col]
        df[abs_name] = np.abs(df[signed_name])
        deviation_features += [signed_name, abs_name]

    # --- System health indicators ------------------------------------------
    health_features = []

    if 'IO.SoleVacuum' in df.columns and 'IO.CurrentVacuumMotor' in df.columns:
        df['VacuumEfficiency'] = df['IO.SoleVacuum'] / (df['IO.CurrentVacuumMotor'] + eps)
        health_features.append('VacuumEfficiency')

    if 'IO.ToolTemperature' in df.columns:
        tool_temp = df['IO.ToolTemperature']
        df['TempDeviation'] = np.abs(tool_temp - tool_temp.mean())
        health_features.append('TempDeviation')

    # Any column whose name contains 'Alarm.' counts towards the totals.
    alarm_cols = [name for name in df.columns if 'Alarm.' in name]
    if alarm_cols:
        df['TotalAlarms'] = df[alarm_cols].sum(axis=1)
        df['SystemHealthScore'] = 1 - (df['TotalAlarms'] / len(alarm_cols))
        health_features += ['TotalAlarms', 'SystemHealthScore']

    if 'Status.MachineStatus' in df.columns:
        df['MachineRunning'] = df['Status.MachineStatus'].gt(0).astype(int)
        health_features.append('MachineRunning')

    # --- Rolling trend statistics -------------------------------------------
    trend_features = []
    if cycle_col in df.columns:
        window = df[cycle_col].rolling(5, min_periods=1)
        df['CycleTime_MA5'] = window.mean()
        # The first row has no spread yet, so its NaN std is replaced by 0.
        df['CycleTime_Std5'] = window.std().fillna(0)
        trend_features += ['CycleTime_MA5', 'CycleTime_Std5']

    # --- Summary --------------------------------------------------------------
    new_features = df.shape[1] - n_cols_before
    print(f"✓ Created {new_features} new features:")
    if efficiency_features:
        print(f"  - Efficiency: {', '.join(efficiency_features)}")
    if deviation_features:
        print(f"  - Deviations: {', '.join(deviation_features[:3])}...")
    if health_features:
        print(f"  - Health: {', '.join(health_features[:3])}...")
    if trend_features:
        print(f"  - Trends: {', '.join(trend_features)}")

    return df

# =============================================================================
# 3. DATA PREPARATION FOR MODELING
# =============================================================================

def prepare_modeling_data(df, target_col='Alarm.ItemDroppedError'):
    """
    Build the numeric feature matrix (and, when usable, the label vector).

    The feature matrix holds every numeric column of ``df`` except the
    time/ID columns listed below and the target column itself. Remaining
    gaps are filled with the column median (then 0), and columns that are
    constant or have a standard deviation below 1e-8 are dropped.

    Returns:
        (X, y, supervised_possible) where ``y`` is the integer-cast target
        column when it exists and contains at least one value > 0, otherwise
        ``None``; ``supervised_possible`` is the matching boolean flag.
    """
    print("\n3. PREPARING MODELING DATA")
    print("-" * 50)

    # Inspect the target column first: it decides the modelling approach.
    target_exists = target_col in df.columns
    if not target_exists:
        print(f"⚠ Target column '{target_col}' not found - using unsupervised approach")
        has_positive_cases = False
    else:
        distribution = df[target_col].value_counts()
        print(f"Target column '{target_col}' distribution:")
        print(distribution)
        has_positive_cases = (df[target_col] > 0).any()

    # Columns that must never become features: time fields and ID counters,
    # plus the target itself when present.
    excluded = [
        'Relative time', 'Timestamp', 'Hour', 'Minute', 'DayOfWeek', 'IsWeekend',
        'Statistics.SequenceNr', 'BellowSoftTouch.SequenceNo',
    ]
    if target_exists:
        excluded.append(target_col)

    numeric_names = df.select_dtypes(include=[np.number]).columns
    selected = [name for name in numeric_names if name not in excluded]

    # Work on a copy so the caller's frame is left untouched.
    X = df[selected].copy()
    X = X.fillna(X.median()).fillna(0)

    # Drop columns that carry no information.
    uninformative = [
        name for name in X.columns
        if X[name].nunique() <= 1 or X[name].std() < 1e-8
    ]
    if uninformative:
        X = X.drop(columns=uninformative)
        print(f"✓ Removed {len(uninformative)} constant/near-constant features")

    # Labels are only usable when at least one positive case was recorded.
    supervised_possible = bool(target_exists and has_positive_cases)
    y = df[target_col].astype(int) if supervised_possible else None

    print(f"✓ Final feature matrix: {X.shape}")
    print(f"✓ Modeling approach: {'Supervised + Unsupervised' if supervised_possible else 'Unsupervised only'}")

    return X, y, supervised_possible

# =============================================================================
# 4. ANOMALY DETECTION MODELS
# =============================================================================

class RefinedAnomalyDetector:
    """
    Anomaly detector that combines several algorithms into one ensemble.

    Unsupervised members (always trained): Isolation Forest, DBSCAN noise
    labelling and a per-feature z-score rule. A Random Forest classifier is
    trained in addition when labels with positive cases are supplied.

    Attributes:
        models: dict mapping a name to a fitted estimator or to an array
            cached during training ('dbscan_labels', 'z_scores', 'test_data').
        scaler: the RobustScaler fitted in ``fit`` (``None`` before that).
        feature_names: list of training column names (``None`` before fit).
        results: the dictionary produced by the most recent ``predict`` call.
    """

    # Voting weight of each unsupervised method in the ensemble score.
    ENSEMBLE_WEIGHTS = (
        ('isolation_forest', 0.4),
        ('dbscan', 0.3),
        ('statistical', 0.3),
    )

    def __init__(self):
        self.models = {}
        self.scaler = None
        self.feature_names = None
        self.results = {}

    def fit(self, X, y=None, supervised_possible=False):
        """
        Scale the features and train the anomaly detection models.

        Args:
            X: feature DataFrame (its column names are stored).
            y: optional label vector for the supervised model.
            supervised_possible: train the supervised model only when this is
                true and ``y`` is not None.

        Returns:
            ``self``, to allow call chaining.
        """
        print("\n4. TRAINING ANOMALY DETECTION MODELS")
        print("-" * 50)

        # Store feature names
        self.feature_names = X.columns.tolist()

        # Scale features
        self.scaler = RobustScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Train unsupervised models (always applicable)
        self._train_unsupervised(X_scaled)

        # Train supervised models if possible
        if supervised_possible and y is not None:
            self._train_supervised(X_scaled, y)

        return self

    def _train_unsupervised(self, X_scaled):
        """Train the unsupervised models on the already-scaled matrix."""
        print("Training unsupervised models...")

        # 1. Isolation Forest
        iso_forest = IsolationForest(
            contamination=0.05,  # Expect 5% anomalies
            random_state=42,
            n_estimators=200,
            max_samples='auto'
        )
        iso_forest.fit(X_scaled)
        self.models['isolation_forest'] = iso_forest
        print("  ✓ Isolation Forest trained")

        # 2. DBSCAN for density-based anomalies
        dbscan = DBSCAN(eps=0.5, min_samples=10)
        cluster_labels = dbscan.fit_predict(X_scaled)
        self.models['dbscan'] = dbscan
        self.models['dbscan_labels'] = cluster_labels
        print("  ✓ DBSCAN clustering completed")

        # 3. Statistical Z-score method: keep the largest |z| of each row
        z_scores = np.abs(zscore(X_scaled, axis=0, nan_policy='omit'))
        max_z_scores = np.nanmax(z_scores, axis=1)
        self.models['z_scores'] = max_z_scores
        print("  ✓ Statistical analysis completed")

    def _train_supervised(self, X_scaled, y):
        """
        Train the Random Forest classifier on a stratified 70/30 split.

        Training is best-effort: any failure is reported and the detector
        continues with the unsupervised models only.
        """
        print("Training supervised models...")

        try:
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X_scaled, y, test_size=0.3, random_state=42, stratify=y
            )

            # Random Forest Classifier
            rf_clf = RandomForestClassifier(
                n_estimators=200,
                max_depth=10,
                random_state=42,
                class_weight='balanced'
            )
            rf_clf.fit(X_train, y_train)
            self.models['random_forest'] = rf_clf
            self.models['test_data'] = (X_test, y_test)
            print("  ✓ Random Forest trained")

        except Exception as e:
            print(f"  ⚠ Supervised training failed: {e}")

    def predict(self, X=None):
        """
        Generate anomaly predictions with every available method.

        Args:
            X: feature matrix with the same columns used for training.

        Returns:
            A dict keyed by method name ('isolation_forest', 'dbscan',
            'statistical', 'ensemble'); each value holds 'predictions'
            (1 = anomaly), 'anomaly_rate' in percent and, where available,
            'scores'. Returns ``None`` when ``X`` is ``None``.

        Raises:
            RuntimeError: if the detector has neither been fitted nor loaded.
        """
        print("\n5. GENERATING ANOMALY PREDICTIONS")
        print("-" * 50)

        if X is None:
            print("No data provided for prediction")
            return None

        if self.scaler is None:
            raise RuntimeError(
                "Detector is not fitted: call fit() or load_model() before predict()"
            )

        # Scale features
        X_scaled = self.scaler.transform(X)
        results = {}

        # Isolation Forest predictions (-1 marks an anomaly)
        if 'isolation_forest' in self.models:
            forest = self.models['isolation_forest']
            iso_flags = forest.predict(X_scaled) == -1
            results['isolation_forest'] = {
                'predictions': iso_flags.astype(int),
                'scores': forest.decision_function(X_scaled),
                'anomaly_rate': iso_flags.mean() * 100
            }
            print(f"  ✓ Isolation Forest: {iso_flags.sum()} anomalies ({iso_flags.mean()*100:.2f}%)")

        # DBSCAN predictions. DBSCAN has no predict(), so the given data is
        # clustered again; a fresh estimator with the stored parameters is
        # used so the estimator kept in self.models is not overwritten.
        if 'dbscan' in self.models:
            dbscan = DBSCAN(**self.models['dbscan'].get_params())
            dbscan_labels = dbscan.fit_predict(X_scaled)
            noise_mask = dbscan_labels == -1
            results['dbscan'] = {
                'predictions': noise_mask.astype(int),
                'anomaly_rate': noise_mask.mean() * 100,
                'n_clusters': len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
            }
            print(f"  ✓ DBSCAN: {noise_mask.sum()} anomalies ({noise_mask.mean()*100:.2f}%)")

        # Statistical anomalies (z-scores are computed on the given data)
        if 'z_scores' in self.models:
            z_scores = np.abs(zscore(X_scaled, axis=0, nan_policy='omit'))
            max_z_scores = np.nanmax(z_scores, axis=1)
            z_threshold = 3.0  # 3-sigma rule
            z_anomalies = max_z_scores > z_threshold
            results['statistical'] = {
                'predictions': z_anomalies.astype(int),
                'scores': max_z_scores,
                'anomaly_rate': z_anomalies.mean() * 100
            }
            print(f"  ✓ Statistical (3σ): {z_anomalies.sum()} anomalies ({z_anomalies.mean()*100:.2f}%)")

        # Ensemble prediction: weighted vote, normalised by the weights of
        # the methods that actually produced a result.
        ensemble_scores = np.zeros(len(X_scaled))
        weight_sum = 0
        for method_name, weight in self.ENSEMBLE_WEIGHTS:
            if method_name in results:
                ensemble_scores += results[method_name]['predictions'] * weight
                weight_sum += weight

        if weight_sum > 0:
            ensemble_scores /= weight_sum
            ensemble_predictions = (ensemble_scores >= 0.5).astype(int)
            results['ensemble'] = {
                'predictions': ensemble_predictions,
                'scores': ensemble_scores,
                'anomaly_rate': ensemble_predictions.mean() * 100
            }
            print(f"  ✓ Ensemble: {ensemble_predictions.sum()} anomalies ({ensemble_predictions.mean()*100:.2f}%)")

        self.results = results
        return results

    def save_model(self, filepath):
        """
        Pickle the models, scaler, feature names and metadata to ``filepath``.

        The parent directory is created when ``filepath`` contains one; a
        bare file name is written to the current working directory.
        """
        model_data = {
            'models': self.models,
            'scaler': self.scaler,
            'feature_names': self.feature_names,
            'metadata': {
                'created_at': datetime.now().isoformat(),
                'n_features': len(self.feature_names) if self.feature_names else 0,
                'model_types': list(self.models.keys())
            }
        }

        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when the path actually has a directory component.
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"✓ Model saved to {filepath}")

    def load_model(self, filepath):
        """
        Restore models, scaler and feature names from a file written by
        ``save_model``.

        NOTE(security): ``pickle.load`` can execute arbitrary code. Only load
        model files that come from a trusted source.
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.models = model_data['models']
        self.scaler = model_data['scaler']
        self.feature_names = model_data['feature_names']

        # Metadata is informational only; fall back to the model keys when a
        # bundle was written without it.
        metadata = model_data.get('metadata') or {}
        model_types = metadata.get('model_types', list(self.models.keys()))

        print(f"✓ Model loaded from {filepath}")
        print(f"  Features: {len(self.feature_names) if self.feature_names else 0}")
        print(f"  Model types: {', '.join(model_types)}")

# =============================================================================
# 6. EVALUATION AND ANALYSIS
# =============================================================================

def analyze_results(df, results, target_col='Alarm.ItemDroppedError'):
    """
    Print a summary of the anomaly detection results.

    Side effect: for every method in ``results`` that has 'predictions', a
    column named 'Anomaly_<method>' is added to ``df`` in place.

    Args:
        df: the DataFrame the predictions were computed for (same row order
            and length as the prediction arrays).
        results: dictionary returned by ``RefinedAnomalyDetector.predict``.
        target_col: accepted for interface compatibility; not used here.

    Returns:
        None. An empty/None ``results`` prints a notice and returns early.
    """
    print("\n6. RESULTS ANALYSIS")
    print("-" * 50)

    if not results:
        print("No results to analyze")
        return

    # Add predictions to dataframe
    for method_name, method_results in results.items():
        if 'predictions' in method_results:
            df[f'Anomaly_{method_name}'] = method_results['predictions']

    # Summary statistics
    print("Anomaly Detection Summary:")
    for method_name, method_results in results.items():
        if 'anomaly_rate' in method_results:
            print(f"  {method_name}: {method_results['anomaly_rate']:.2f}%")

    # Temporal analysis (if timestamp available)
    if 'Timestamp' in df.columns and 'ensemble' in results:
        anomaly_mask = results['ensemble']['predictions'] == 1
        if anomaly_mask.sum() > 0:
            anomaly_data = df[anomaly_mask]
            print(f"\nTemporal Analysis:")
            print(f"  First anomaly: {anomaly_data['Timestamp'].min()}")
            print(f"  Last anomaly: {anomaly_data['Timestamp'].max()}")

            if 'Hour' in df.columns:
                hourly_dist = anomaly_data['Hour'].value_counts().sort_index()
                if len(hourly_dist) > 0:
                    peak_hour = hourly_dist.idxmax()
                    print(f"  Peak anomaly hour: {peak_hour}:00")

    # Feature correlation analysis
    if 'ensemble' in results:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            correlations = []
            ensemble_predictions = results['ensemble']['predictions']

            # The prediction columns added above would correlate trivially
            # with the ensemble output; build their names once and skip them.
            prediction_cols = {f'Anomaly_{name}' for name in results}

            for col in numeric_cols:
                if col in prediction_cols:
                    continue
                try:
                    # Constant inputs give a 0/0 correlation (NaN); silence
                    # the floating-point warning and filter the NaN below.
                    with np.errstate(invalid='ignore', divide='ignore'):
                        corr = np.corrcoef(df[col].fillna(0), ensemble_predictions)[0, 1]
                except (TypeError, ValueError):
                    # Column cannot be correlated (bad dtype/shape): skip it
                    # rather than abort the report.
                    continue
                if not np.isnan(corr):
                    correlations.append((col, abs(corr)))

            correlations.sort(key=lambda x: x[1], reverse=True)
            print(f"\nTop 5 features correlated with anomalies:")
            for feature, corr in correlations[:5]:
                print(f"  {feature}: {corr:.3f}")

# =============================================================================
# 7. BUSINESS INSIGHTS
# =============================================================================

def generate_business_insights(df, results):
    """
    Print an operational summary, a risk rating and recommendations.

    The anomaly count and rate are taken from the 'ensemble' entry of
    ``results`` (both default to 0 when that entry is missing). When ``df``
    has a 'Timestamp' column, hourly throughput figures are printed as well.
    Nothing is returned.
    """
    print("\n7. BUSINESS INSIGHTS AND RECOMMENDATIONS")
    print("-" * 50)

    total_operations = len(df)

    ensemble = results['ensemble'] if 'ensemble' in results else None
    if ensemble is None:
        anomaly_count, anomaly_rate = 0, 0
    else:
        anomaly_count = ensemble['predictions'].sum()
        anomaly_rate = ensemble['anomaly_rate']

    print("Operational Summary:")
    print(f"  • Total operations analyzed: {total_operations:,}")
    print(f"  • Potential anomalies detected: {anomaly_count:,}")
    print(f"  • Anomaly rate: {anomaly_rate:.2f}%")

    # Risk bands: the first upper bound (in percent) the rate stays below
    # wins; anything else is rated HIGH.
    risk_level, action = "HIGH", "Immediate investigation required"
    for upper_bound, level, advice in (
        (2, "LOW", "Continue monitoring"),
        (5, "MEDIUM", "Investigate patterns"),
    ):
        if anomaly_rate < upper_bound:
            risk_level, action = level, advice
            break

    print("\nRisk Assessment:")
    print(f"  • Risk level: {risk_level}")
    print(f"  • Recommended action: {action}")

    # Throughput figures need the time span covered by the data.
    if 'Timestamp' in df.columns:
        time_span = df['Timestamp'].max() - df['Timestamp'].min()
        duration = time_span.total_seconds() / 3600
        if duration > 0:
            operations_per_hour = total_operations / duration
            anomalies_per_hour = anomaly_count / duration
        else:
            operations_per_hour = 0
            anomalies_per_hour = 0

        print("\nOperational Metrics:")
        print(f"  • Analysis duration: {duration:.1f} hours")
        print(f"  • Operations per hour: {operations_per_hour:.1f}")
        print(f"  • Anomalies per hour: {anomalies_per_hour:.1f}")

    print("\nRecommendations:")
    for recommendation in (
        "  1. Implement real-time monitoring using ensemble model",
        "  2. Set alert threshold at ensemble score > 0.7",
        "  3. Investigate top correlated features for root causes",
        "  4. Schedule preventive maintenance based on patterns",
    ):
        print(recommendation)

    if anomaly_count > 0:
        print(f"  5. Review {anomaly_count} flagged operations for validation")

# =============================================================================
# 8. MAIN EXECUTION PIPELINE
# =============================================================================

def main_pipeline(file_path, target_col='Alarm.ItemDroppedError', save_model=True,
                  model_path="models/refined_anomaly_model.pkl"):
    """
    Execute the complete anomaly detection pipeline.

    Runs, in order: data loading/cleaning, feature engineering, feature
    matrix preparation, model training, prediction, result analysis,
    business insights and (optionally) model persistence.

    Args:
        file_path: CSV file to analyse.
        target_col: name of the label column used to decide whether a
            supervised model can be trained.
        save_model: when true, the trained detector is pickled.
        model_path: destination of the pickled detector; defaults to the
            location this pipeline has always used.

    Returns:
        (detector, df, results): the trained detector, the processed
        DataFrame and the prediction dictionary.

    Raises:
        Exception: any failure of a pipeline step is printed and re-raised.
    """
    print("Starting Industrial Anomaly Detection Pipeline...")

    try:
        # Step 1: Load and clean data
        df = load_and_clean_data(file_path)

        # Step 2: Feature engineering
        df = engineer_features(df)

        # Step 3: Prepare modeling data
        X, y, supervised_possible = prepare_modeling_data(df, target_col)

        # Step 4: Train models
        detector = RefinedAnomalyDetector()
        detector.fit(X, y, supervised_possible)

        # Step 5: Generate predictions (on the training data itself)
        results = detector.predict(X)

        # Step 6: Analyze results
        analyze_results(df, results, target_col)

        # Step 7: Business insights
        generate_business_insights(df, results)

        # Step 8: Save model
        if save_model:
            detector.save_model(model_path)

        print("\n" + "="*80)
        print("PIPELINE COMPLETED SUCCESSFULLY")
        print("="*80)

        return detector, df, results

    except Exception as e:
        print(f"\n✗ Pipeline failed: {e}")
        raise

# =============================================================================
# 9. MODEL TESTING FUNCTION
# =============================================================================

def test_saved_model(model_path, test_data_path):
    """
    Load a saved detector and run it on a new CSV file.

    The new data goes through the same loading and feature-engineering steps
    as the training data. Training features that are absent from the new data
    are created as all-zero columns so the feature matrix keeps the training
    column order.

    Args:
        model_path: pickle file written by ``RefinedAnomalyDetector.save_model``.
        test_data_path: CSV file with the data to score.

    Returns:
        (df_test, results): the processed DataFrame, with 'Anomaly_Prediction'
        and 'Anomaly_Score' columns when an ensemble result exists, and the
        prediction dictionary.

    Raises:
        Exception: any failure is printed and re-raised.
    """
    print("\nTESTING SAVED MODEL")
    print("-" * 50)

    try:
        # Load the saved model
        detector = RefinedAnomalyDetector()
        detector.load_model(model_path)

        # Load new test data
        df_test = load_and_clean_data(test_data_path)
        df_test = engineer_features(df_test)

        # Prepare features (ensure same features as training)
        missing_features = [col for col in detector.feature_names if col not in df_test.columns]

        if missing_features:
            print(f"⚠ Missing features in test data: {len(missing_features)}")
            # Create missing features with zeros so the scaler still receives
            # every column it was fitted on.
            for feature in missing_features:
                df_test[feature] = 0

        # Select the columns in the exact order used during training.
        X_test = df_test[detector.feature_names]

        # Generate predictions
        results = detector.predict(X_test)

        # Add predictions to test dataframe
        if results and 'ensemble' in results:
            df_test['Anomaly_Prediction'] = results['ensemble']['predictions']
            df_test['Anomaly_Score'] = results['ensemble']['scores']

        print("✓ Testing completed successfully")
        return df_test, results

    except Exception as e:
        print(f"✗ Testing failed: {e}")
        raise

# =============================================================================
# 10. EXAMPLE USAGE
# =============================================================================

if __name__ == "__main__":
    # Location of the sample CSV - replace with the path on your machine.
    sample_path = r"C:\Users\paran_chakali\projects\New folder\AICD_sample.csv"

    # Train on the sample file, print the reports and persist the model.
    detector, df_processed, prediction_results = main_pipeline(
        sample_path, 'Alarm.ItemDroppedError', True
    )
    


INDUSTRIAL ANOMALY DETECTION - REFINED IMPLEMENTATION
Starting Industrial Anomaly Detection Pipeline...

1. LOADING AND CLEANING DATA
--------------------------------------------------
✓ Dataset loaded successfully: (20000, 96)
✓ Timestamp created and data sorted chronologically
✓ No missing values detected

2. FEATURE ENGINEERING
--------------------------------------------------
✓ Time-based features created
✓ Created 19 new features:
  - Efficiency: PickEfficiency, DepositEfficiency
  - Deviations: ActuatorPosDeviation, ActuatorPosDeviation_Abs, BellowPressureDeviation...
  - Health: VacuumEfficiency, TempDeviation, TotalAlarms...
  - Trends: CycleTime_MA5, CycleTime_Std5

3. PREPARING MODELING DATA
--------------------------------------------------
Target column 'Alarm.ItemDroppedError' distribution:
Alarm.ItemDroppedError
0.0    20000
Name: count, dtype: int64
✓ Removed 102 constant/near-constant features
✓ Final feature matrix: (20000, 3)
✓ Modeling approach: Unsupervised only

4

### Test the model 

In [18]:
# Score the sample CSV with the detector saved by the pipeline cell.
sample_path = r"C:\Users\paran_chakali\projects\New folder\AICD_sample.csv"

test_df, test_results = test_saved_model(
    model_path="models/refined_anomaly_model.pkl",
    test_data_path=sample_path,
)


TESTING SAVED MODEL
--------------------------------------------------
✓ Model loaded from models/refined_anomaly_model.pkl
  Features: 3
  Model types: isolation_forest, dbscan, dbscan_labels, z_scores

1. LOADING AND CLEANING DATA
--------------------------------------------------
✓ Dataset loaded successfully: (20000, 96)
✓ Timestamp created and data sorted chronologically
✓ No missing values detected

2. FEATURE ENGINEERING
--------------------------------------------------
✓ Time-based features created
✓ Created 19 new features:
  - Efficiency: PickEfficiency, DepositEfficiency
  - Deviations: ActuatorPosDeviation, ActuatorPosDeviation_Abs, BellowPressureDeviation...
  - Health: VacuumEfficiency, TempDeviation, TotalAlarms...
  - Trends: CycleTime_MA5, CycleTime_Std5

5. GENERATING ANOMALY PREDICTIONS
--------------------------------------------------
  ✓ Isolation Forest: 204 anomalies (1.02%)
  ✓ DBSCAN: 0 anomalies (0.00%)
  ✓ Statistical (3σ): 35 anomalies (0.18%)
  ✓ Ensembl

In [19]:
import pandas as pd
import pickle

# Restore the pickled bundle; it is read as a dict with the keys
# "model", "scaler" and "features".
with open("models/anomaly_model_v1.pkl", "rb") as model_file:
    saved = pickle.load(model_file)

model, scaler, feature_names = saved["model"], saved["scaler"], saved["features"]

# Read the dataset to score
file_path = r"C:\Users\paran_chakali\projects\New folder\AICD_sample.csv"
df_new = pd.read_csv(file_path)

# Remove the unused time columns, skipping any that are not in the file
drop_cols = ['Relative time', 'Date', 'Time', 'Timestamp']
df_new = df_new.drop(columns=[name for name in drop_cols if name in df_new.columns])

# Keep exactly the stored feature columns, with gaps replaced by 0
X_new = df_new[feature_names].fillna(0)

# Apply the stored scaler
X_scaled_new = scaler.transform(X_new)

# Predict (-1 = anomaly, 1 = normal)
df_new["Prediction"] = model.predict(X_scaled_new)

# Show the first 20 predictions
print(df_new[["Prediction"]].head(20))


    Prediction
0            1
1            1
2            1
3            1
4            1
5            1
6            1
7            1
8            1
9            1
10           1
11           1
12           1
13           1
14           1
15           1
16           1
17           1
18           1
19           1
