In [1]:
# Essential Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For modeling (optional, if needed later)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Date and time handling
from datetime import datetime

# Suppress warnings for cleaner output
import os
import warnings
warnings.filterwarnings("ignore")

# Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")


In [1]:
# Enhanced Excel Data Loader with Error Handling and Logging
import pandas as pd
import os
import re
from pathlib import Path

def load_excel_sheets(excel_path, preview_rows=2, create_globals=True):
    """
    Load all sheets from an Excel file with enhanced error handling and logging.
    
    Parameters:
    -----------
    excel_path : str
        Path to the Excel file
    preview_rows : int
        Number of rows to preview for each sheet
    create_globals : bool
        Whether to create global DataFrame variables
    
    Returns:
    --------
    dict : Dictionary with lowercase sheet names as keys and DataFrames as values
    """
    
    df_lowercase = {}
    created_variables = []
    
    # Convert to Path object for better path handling
    file_path = Path(excel_path)
    
    # Check if file exists
    if not file_path.exists():
        print(f"❌ Error: File not found at: {excel_path}")
        print(f"Current working directory: {os.getcwd()}")
        return df_lowercase
    
    try:
        print(f"📂 Loading Excel file: {excel_path}")
        
        # Read all sheets
        df_dict = pd.read_excel(excel_path, sheet_name=None)
        
        # Convert sheet names to lowercase and clean them
        df_lowercase = {}
        for sheet_name, df in df_dict.items():
            clean_sheet_name = sheet_name.lower().strip()
            df_lowercase[clean_sheet_name] = df
        
        print(f"✅ Successfully loaded {len(df_lowercase)} sheets")
        
        # Preview and create variables
        print(f"\n{'='*80}")
        print(f"📊 SHEET PREVIEW AND VARIABLE CREATION")
        print(f"{'='*80}")
        
        for sheet_key, df_value in df_lowercase.items():
            print(f"\n📄 Sheet: '{sheet_key}'")
            print(f"   Shape: ({df_value.shape[0]:,} rows, {df_value.shape[1]:,} columns)")
            
            # Show column names
            if len(df_value.columns) <= 10:
                print(f"   Columns: {list(df_value.columns)}")
            else:
                print(f"   Columns: {list(df_value.columns[:5])} ... {list(df_value.columns[-2:])}")
            
            # Preview data
            if not df_value.empty:
                print(f"   Preview (first {preview_rows} rows):")
                print(df_value.head(preview_rows).to_string(index=False, max_cols=8))
            else:
                print("   ⚠️  Sheet is empty")
            
            # Create global variable if requested
            if create_globals:
                variable_name = create_safe_variable_name(sheet_key)
                globals()[variable_name] = df_value
                created_variables.append(variable_name)
                print(f"   ✅ Created variable: {variable_name}")
            
            print("-" * 60)
        
        # Summary
        print(f"\n📋 SUMMARY:")
        print(f"   • Total sheets loaded: {len(df_lowercase)}")
        print(f"   • Total rows across all sheets: {sum(df.shape[0] for df in df_lowercase.values()):,}")
        print(f"   • Created variables: {created_variables}")
        
        return df_lowercase
        
    except Exception as e:
        print(f"❌ Error loading Excel file: {str(e)}")
        return df_lowercase

def create_safe_variable_name(sheet_name):
    """
    Create a safe Python variable name from a sheet name.
    
    Parameters:
    -----------
    sheet_name : str
        Original sheet name
    
    Returns:
    --------
    str : Safe variable name
    """
    # Replace spaces and special characters with underscores
    safe_name = re.sub(r'[^a-zA-Z0-9_]', '_', sheet_name)
    
    # Remove consecutive underscores
    safe_name = re.sub(r'_+', '_', safe_name)
    
    # Remove leading/trailing underscores
    safe_name = safe_name.strip('_')
    
    # Ensure it starts with a letter or underscore (not a number)
    if safe_name and safe_name[0].isdigit():
        safe_name = 'sheet_' + safe_name
    
    # Add df_ prefix
    return f'df_{safe_name}' if safe_name else 'df_unnamed_sheet'

def get_sheet_info(df_dict):
    """
    Get summary information about all loaded sheets.
    
    Parameters:
    -----------
    df_dict : dict
        Dictionary of DataFrames
    
    Returns:
    --------
    pd.DataFrame : Summary information
    """
    if not df_dict:
        print("No sheets to summarize.")
        return pd.DataFrame()
    
    summary_data = []
    for sheet_name, df in df_dict.items():
        info = {
            'Sheet Name': sheet_name,
            'Rows': df.shape[0],
            'Columns': df.shape[1],
            'Memory Usage (KB)': round(df.memory_usage(deep=True).sum() / 1024, 2),
            'Has Missing Values': df.isnull().any().any(),
            'Numeric Columns': df.select_dtypes(include=['number']).shape[1],
            'Text Columns': df.select_dtypes(include=['object']).shape[1],
            'Date Columns': df.select_dtypes(include=['datetime']).shape[1]
        }
        summary_data.append(info)
    
    summary_df = pd.DataFrame(summary_data)
    return summary_df.sort_values('Rows', ascending=False)

# Usage Example:
if __name__ == "__main__":
    # Define the Excel path
    excel_path = r'data\Project Assessment Data.xlsx'
    
    # Load the sheets
    df_lowercase = load_excel_sheets(excel_path, preview_rows=3, create_globals=True)
    
    # Get summary information
    if df_lowercase:
        print(f"\n{'='*80}")
        print("📈 DETAILED SHEET SUMMARY")
        print(f"{'='*80}")
        summary = get_sheet_info(df_lowercase)
        print(summary.to_string(index=False))
        
        # Show available DataFrames in globals
        df_variables = [var for var in globals() if var.startswith('df_') and isinstance(globals()[var], pd.DataFrame)]
        print(f"\n🎯 Available DataFrame variables: {df_variables}")

📂 Loading Excel file: data\Project Assessment Data.xlsx
✅ Successfully loaded 14 sheets

📊 SHEET PREVIEW AND VARIABLE CREATION

📄 Sheet: 'plant description'
   Shape: (13 rows, 2 columns)
   Columns: ['Plant Code', 'Name']
   Preview (first 3 rows):
Plant Code   Name
      A110 Plant1
      A111 Plant2
      A112 Plant3
   ✅ Created variable: df_plant_description
------------------------------------------------------------

📄 Sheet: 'afko'
   Shape: (200 rows, 183 columns)
   Columns: ['MANDT', 'AUFNR 2 Order Number', 'GLTRP', 'GSTRP', 'FTRMS'] ... ['QPGT.LTEXTV', 'QPGT.INAKTIV']
   Preview (first 3 rows):
 MANDT  AUFNR 2 Order Number    GLTRP    GSTRP  ...  QPGT.SPRACHE  QPGT.KURZTEXT  QPGT.LTEXTV  QPGT.INAKTIV
   600             340011104 20241005 20241005  ...           NaN            NaN          NaN           NaN
   600             340011105 20241005 20241005  ...           NaN            NaN          NaN           NaN
   600             340011106 20241005 20241005  ...           

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

def clean_qmel_data(df_qmel, verbose=True):
    """
    Comprehensive cleaning pipeline for QMEL (Quality Notifications) data.
    
    Parameters:
    -----------
    df_qmel : pd.DataFrame
        Raw QMEL DataFrame
    verbose : bool
        Whether to print detailed progress information
    
    Returns:
    --------
    pd.DataFrame : Cleaned QMEL DataFrame
    dict : Cleaning summary statistics
    """
    
    if df_qmel is None or df_qmel.empty:
        print("❌ DataFrame is None or empty. Cannot proceed with cleaning.")
        return df_qmel, {}
    
    # Create a copy to avoid modifying the original
    df_clean = df_qmel.copy()
    initial_shape = df_clean.shape
    cleaning_stats = {
        'initial_rows': initial_shape[0],
        'initial_columns': initial_shape[1],
        'date_columns_converted': 0,
        'missing_values_before': df_clean.isnull().sum().sum(),
        'duplicates_removed': 0
    }
    
    if verbose:
        print("🧹 STARTING QMEL DATA CLEANING PIPELINE")
        print("=" * 60)
        print(f"Initial dataset shape: {initial_shape}")
    
    # 1. Initial Data Assessment
    if verbose:
        print(f"\n📊 1. INITIAL DATA ASSESSMENT")
        print(f"Shape: {df_clean.shape}")
        print(f"Total missing values: {df_clean.isnull().sum().sum():,}")
        print(f"Memory usage: {df_clean.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    
    # 2. Handle Missing Values Analysis
    if verbose:
        print(f"\n🔍 2. MISSING VALUES ANALYSIS")
        missing_analysis = analyze_missing_values(df_clean)
        if not missing_analysis.empty:
            print(missing_analysis.to_string())
        else:
            print("✅ No missing values found!")
    
    # 3. Date Column Conversion
    if verbose:
        print(f"\n📅 3. DATE COLUMN CONVERSION")
    
    # SAP date columns in QMEL
    date_columns = {
        'ERDAT': 'Creation Date',
        'AEDAT': 'Changed Date', 
        'PSTER': 'Period From',
        'PETRI': 'Period To',
        'BEZDT': 'Reference Date',
        'QMDAT': 'Notification Date',
        'LTRMN': 'Delivery Date'
    }
    
    date_conversion_results = {}
    
    for col, description in date_columns.items():
        if col in df_clean.columns:
            result = convert_sap_date_column(df_clean, col, verbose=verbose)
            date_conversion_results[col] = result
            if result['converted']:
                cleaning_stats['date_columns_converted'] += 1
        elif verbose:
            print(f"   ⚠️  Column '{col}' ({description}) not found")
    
    # 4. Data Type Optimization
    if verbose:
        print(f"\n🔧 4. DATA TYPE OPTIMIZATION")
    
    # Convert numeric columns that might be stored as strings
    numeric_candidates = ['PRIOK', 'QMNUM', 'MATNR', 'MENGE', 'MEINS']
    for col in numeric_candidates:
        if col in df_clean.columns:
            original_dtype = df_clean[col].dtype
            if df_clean[col].dtype == 'object':
                try:
                    # Try to convert to numeric
                    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
                    if verbose:
                        print(f"   ✅ Converted '{col}' from {original_dtype} to {df_clean[col].dtype}")
                except:
                    if verbose:
                        print(f"   ❌ Failed to convert '{col}' to numeric")
    
    # 5. Text Data Cleaning
    if verbose:
        print(f"\n📝 5. TEXT DATA CLEANING")
    
    text_columns = ['QMTXT', 'KURZTEXT', 'LTXTM', 'BZMNG']
    for col in text_columns:
        if col in df_clean.columns:
            original_nulls = df_clean[col].isnull().sum()
            # Strip whitespace and convert empty strings to NaN
            df_clean[col] = df_clean[col].astype(str).str.strip()
            df_clean[col] = df_clean[col].replace(['', 'nan', 'None', '0'], pd.NaT)
            new_nulls = df_clean[col].isnull().sum()
            if verbose and new_nulls != original_nulls:
                print(f"   📝 Cleaned '{col}': {new_nulls - original_nulls} empty values converted to NaN")
    
    # 6. Categorical Data Analysis
    if verbose:
        print(f"\n🏷️  6. CATEGORICAL DATA ANALYSIS")
    
    categorical_columns = ['QMART', 'PRIOK', 'QMGRP', 'QMCOD', 'MAWERK', 'LIFNR']
    for col in categorical_columns:
        if col in df_clean.columns:
            unique_count = df_clean[col].nunique()
            if verbose:
                print(f"   📊 '{col}': {unique_count} unique values")
                if unique_count <= 20:  # Show value counts for small categorical vars
                    print(f"      Values: {df_clean[col].value_counts(dropna=False).head().to_dict()}")
    
    # 7. Remove Duplicates
    if verbose:
        print(f"\n🗑️  7. DUPLICATE REMOVAL")
    
    initial_rows = len(df_clean)
    # Check for complete duplicates
    df_clean = df_clean.drop_duplicates()
    duplicates_removed = initial_rows - len(df_clean)
    cleaning_stats['duplicates_removed'] = duplicates_removed
    
    if verbose:
        if duplicates_removed > 0:
            print(f"   🗑️  Removed {duplicates_removed} duplicate rows")
        else:
            print(f"   ✅ No duplicate rows found")
    
    # 8. Final Statistics
    cleaning_stats.update({
        'final_rows': len(df_clean),
        'final_columns': len(df_clean.columns),
        'missing_values_after': df_clean.isnull().sum().sum(),
        'memory_mb': df_clean.memory_usage(deep=True).sum() / (1024**2)
    })
    
    if verbose:
        print(f"\n📈 8. CLEANING SUMMARY")
        print(f"=" * 40)
        print(f"Rows: {cleaning_stats['initial_rows']:,} → {cleaning_stats['final_rows']:,} "
              f"({cleaning_stats['final_rows'] - cleaning_stats['initial_rows']:+,})")
        print(f"Missing values: {cleaning_stats['missing_values_before']:,} → {cleaning_stats['missing_values_after']:,}")
        print(f"Date columns converted: {cleaning_stats['date_columns_converted']}")
        print(f"Memory usage: {cleaning_stats['memory_mb']:.2f} MB")
        print(f"Duplicates removed: {cleaning_stats['duplicates_removed']:,}")
    
    return df_clean, cleaning_stats

def convert_sap_date_column(df, column_name, verbose=True):
    """
    Convert SAP date column from YYYYMMDD format to datetime.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the column
    column_name : str
        Name of the column to convert
    verbose : bool
        Whether to print conversion details
    
    Returns:
    --------
    dict : Conversion results and statistics
    """
    if column_name not in df.columns:
        return {'converted': False, 'reason': 'Column not found'}
    
    original_dtype = df[column_name].dtype
    original_nulls = df[column_name].isnull().sum()
    
    # Convert to string and handle SAP null representations
    df[column_name] = df[column_name].astype(str)
    
    # Replace SAP null representations
    sap_nulls = ['0', '0.0', '00000000', 'nan', 'None', '']
    df[column_name] = df[column_name].replace(sap_nulls, pd.NaT)
    
    # Convert to datetime
    df[column_name] = pd.to_datetime(df[column_name], format='%Y%m%d', errors='coerce')
    
    new_nulls = df[column_name].isnull().sum()
    conversion_failures = new_nulls - original_nulls
    
    if verbose:
        print(f"   📅 '{column_name}': {original_dtype} → {df[column_name].dtype}")
        if conversion_failures > 0:
            print(f"      ⚠️  {conversion_failures} values couldn't be converted")
        print(f"      📊 Valid dates: {len(df) - new_nulls:,}, Missing: {new_nulls:,}")
    
    return {
        'converted': True,
        'original_dtype': str(original_dtype),
        'new_dtype': str(df[column_name].dtype),
        'conversion_failures': conversion_failures,
        'total_nulls': new_nulls
    }

def analyze_missing_values(df):
    """
    Analyze missing values in the DataFrame.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to analyze
    
    Returns:
    --------
    pd.DataFrame : Missing value analysis
    """
    missing_data = df.isnull().sum()
    missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
    
    if missing_data.empty:
        return pd.DataFrame()
    
    missing_percent = (missing_data / len(df) * 100).round(2)
    
    missing_df = pd.DataFrame({
        'Missing_Count': missing_data,
        'Missing_Percentage': missing_percent
    })
    
    return missing_df

def get_qmel_quality_report(df_qmel_clean):
    """
    Generate a data quality report for cleaned QMEL data.
    
    Parameters:
    -----------
    df_qmel_clean : pd.DataFrame
        Cleaned QMEL DataFrame
    
    Returns:
    --------
    dict : Quality report
    """
    if df_qmel_clean.empty:
        return {'error': 'DataFrame is empty'}
    
    report = {
        'total_notifications': len(df_qmel_clean),
        'date_range': {},
        'notification_types': {},
        'priority_distribution': {},
        'data_completeness': {}
    }
    
    # Date range analysis
    if 'ERDAT' in df_qmel_clean.columns:
        valid_dates = df_qmel_clean['ERDAT'].dropna()
        if not valid_dates.empty:
            report['date_range'] = {
                'earliest': valid_dates.min(),
                'latest': valid_dates.max(),
                'span_days': (valid_dates.max() - valid_dates.min()).days
            }
    
    # Notification types
    if 'QMART' in df_qmel_clean.columns:
        report['notification_types'] = df_qmel_clean['QMART'].value_counts().head(10).to_dict()
    
    # Priority distribution
    if 'PRIOK' in df_qmel_clean.columns:
        report['priority_distribution'] = df_qmel_clean['PRIOK'].value_counts().to_dict()
    
    # Data completeness
    total_cells = df_qmel_clean.shape[0] * df_qmel_clean.shape[1]
    missing_cells = df_qmel_clean.isnull().sum().sum()
    report['data_completeness'] = {
        'completeness_percentage': round((1 - missing_cells/total_cells) * 100, 2),
        'missing_cells': missing_cells,
        'total_cells': total_cells
    }
    
    return report

# Usage example
if __name__ == "__main__":
    # Check if df_qmel exists and clean it
    if 'df_qmel' in globals() and not df_qmel.empty:
        print("🚀 Starting QMEL Data Cleaning...")
        
        # Clean the data
        df_qmel_clean, stats = clean_qmel_data(df_qmel, verbose=True)
        
        # Generate quality report
        print("\n" + "="*60)
        print("📋 QMEL DATA QUALITY REPORT")
        print("="*60)
        
        quality_report = get_qmel_quality_report(df_qmel_clean)
        
        print(f"📊 Total Notifications: {quality_report.get('total_notifications', 'N/A'):,}")
        
        if 'date_range' in quality_report and quality_report['date_range']:
            dr = quality_report['date_range']
            print(f"📅 Date Range: {dr['earliest'].strftime('%Y-%m-%d')} to {dr['latest'].strftime('%Y-%m-%d')}")
            print(f"   Span: {dr['span_days']:,} days")
        
        if 'notification_types' in quality_report and quality_report['notification_types']:
            print(f"🏷️  Top Notification Types:")
            for ntype, count in list(quality_report['notification_types'].items())[:5]:
                print(f"   {ntype}: {count:,}")
        
        if 'data_completeness' in quality_report:
            dc = quality_report['data_completeness']
            print(f"✅ Data Completeness: {dc['completeness_percentage']}%")
        
        print(f"\n🎯 Cleaned DataFrame 'df_qmel_clean' is ready for analysis!")
        
    else:
        print("❌ df_qmel not found or is empty. Please load the data first.")

🚀 Starting QMEL Data Cleaning...
🧹 STARTING QMEL DATA CLEANING PIPELINE
Initial dataset shape: (200, 165)

📊 1. INITIAL DATA ASSESSMENT
Shape: (200, 165)
Total missing values: 19,833
Memory usage: 0.36 MB

🔍 2. MISSING VALUES ANALYSIS
                        Missing_Count  Missing_Percentage
AENAM                             200               100.0
AUFNR                             200               100.0
WAERS                             200               100.0
VERID                             200               100.0
SA_AUFNR                          200               100.0
RM_WERKS                          200               100.0
RM_MATNR                          200               100.0
QMCOD                             200               100.0
AUSWIRK                           200               100.0
VKORG                             200               100.0
MATNR                             200               100.0
REVLV                             200               100.0
MATKL      

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

def prepare_plant_data():
    """
    Create the plant description dataframe from your actual plant data
    """
    plant_data = {
        'Plant_Code': ['A110', 'A111', 'A112', 'A113', 'A114', 'A210', 'A211', 
                      'A310', 'A410', 'A510', 'A610', 'A710', 'A810'],
        'Plant_Name': ['Plant1', 'Plant2', 'Plant3', 'Plant4', 'Plant5', 'Plant6', 
                      'Plant7', 'Plant8', 'Plant9', 'Plant10', 'Plant11', 'Plant12', 'Plant13']
    }
    
    df_plant_description = pd.DataFrame(plant_data)
    return df_plant_description

def enhance_codes_with_descriptions(df_quality, code_group_col, code_col, 
                                  df_qpcd, df_qpct, df_qpgt, prefix=""):
    """
    Add code descriptions to quality tables
    
    Parameters:
    - df_quality: Quality table (QMEL, QMFE, or QMUR)
    - code_group_col: Column name for code group (e.g., 'QMGRP', 'FEGRP', 'URGRP')
    - code_col: Column name for code (e.g., 'QMCOD', 'FECOD', 'URCOD')
    - df_qpcd, df_qpct, df_qpgt: Code definition tables
    - prefix: Prefix for new columns
    """
    
    enhanced_df = df_quality.copy()
    
    if df_qpcd is not None and not df_qpcd.empty and df_qpct is not None and not df_qpct.empty:
        try:
            # Join with code definitions
            enhanced_df = enhanced_df.merge(
                df_qpcd[['MANDT', 'KATALOGART', 'CODEGRUPPE', 'CODE']],
                left_on=['MANDT', code_group_col, code_col],
                right_on=['MANDT', 'CODEGRUPPE', 'CODE'],
                how='left',
                suffixes=('', f'_{prefix}DEF')
            )
            
            # Join with code texts (taking English or first available language)
            qpct_english = df_qpct[df_qpct['SPRACHE'] == 'EN'] if 'SPRACHE' in df_qpct.columns else df_qpct
            if qpct_english.empty and 'SPRACHE' in df_qpct.columns:
                qpct_english = df_qpct.groupby(['MANDT', 'KATALOGART', 'CODEGRUPPE', 'CODE']).first().reset_index()
            
            enhanced_df = enhanced_df.merge(
                qpct_english[['MANDT', 'KATALOGART', 'CODEGRUPPE', 'CODE', 'KURZTEXT']],
                on=['MANDT', 'KATALOGART', 'CODEGRUPPE', 'CODE'],
                how='left',
                suffixes=('', f'_{prefix}TEXT')
            )
            
            # Add group descriptions from QPGT if available
            if df_qpgt is not None and not df_qpgt.empty:
                qpgt_english = df_qpgt[df_qpgt['SPRACHE'] == 'EN'] if 'SPRACHE' in df_qpgt.columns else df_qpgt
                if qpgt_english.empty and 'SPRACHE' in df_qpgt.columns:
                    qpgt_english = df_qpgt.groupby(['MANDANT', 'KATALOGART', 'CODEGRUPPE']).first().reset_index()
                
                enhanced_df = enhanced_df.merge(
                    qpgt_english[['MANDANT', 'KATALOGART', 'CODEGRUPPE', 'KURZTEXT']].rename(columns={'MANDANT': 'MANDT'}),
                    left_on=['MANDT', 'KATALOGART', 'CODEGRUPPE'],
                    right_on=['MANDT', 'KATALOGART', 'CODEGRUPPE'],
                    how='left',
                    suffixes=('', f'_{prefix}GRP')
                )
            
        except Exception as e:
            print(f"   ⚠️  Code enhancement failed for {prefix}: {e}")
    
    return enhanced_df

def create_comprehensive_sap_view(df_aufk, df_afko, df_afpo, df_aufm, df_qmel, df_qmfe, 
                                df_qmur, df_qmih, df_qpcd, df_qpct, df_qpgt, df_crhd_v1, df_jest,
                                df_plant_description=None):
    """
    Creates a comprehensive SAP quality management view using your exact table structures
    
    Parameters:
    - All your SAP dataframes as loaded
    - df_plant_description: Optional, will create from your plant data if not provided
    
    Returns:
    - comprehensive_df: Main integrated dataset
    - summary_stats: Comprehensive summary statistics
    - quality_details: Detailed quality analysis
    """
    
    print("🚀 SAP COMPREHENSIVE QUALITY MANAGEMENT INTEGRATION")
    print("=" * 80)
    
    # Prepare plant data if not provided
    if df_plant_description is None:
        df_plant_description = prepare_plant_data()
        print("✓ Created plant description from your data")
    
    # Validate required data
    if df_aufk is None or df_aufk.empty:
        raise ValueError("AUFK (Order Master) data is required!")
    
    print(f"📊 Input Data Summary:")
    print(f"   • AUFK (Orders): {len(df_aufk):,} records")
    print(f"   • AFKO (Headers): {len(df_afko):,} records" if df_afko is not None and not df_afko.empty else "   • AFKO: Not available")
    print(f"   • AFPO (Items): {len(df_afpo):,} records" if df_afpo is not None and not df_afpo.empty else "   • AFPO: Not available")
    print(f"   • QMEL (Quality): {len(df_qmel):,} records" if df_qmel is not None and not df_qmel.empty else "   • QMEL: Not available")
    print(f"   • Plants: {len(df_plant_description):,} plants")
    
    # STEP 1: Build Production Order Base
    print(f"\n🏭 STEP 1: Building Production Order Foundation...")
    base_df = df_aufk.copy()
    
    # Add plant information early - primary source AUFK.WERKS
    if 'WERKS' in base_df.columns:
        base_df = base_df.merge(
            df_plant_description,
            left_on='WERKS',
            right_on='Plant_Code',
            how='left',
            suffixes=('', '_PLANT')
        )
        print(f"   ✓ Plant information added via AUFK.WERKS: {len(base_df):,} orders")
    
    # Join with AFKO (Order Header)
    if df_afko is not None and not df_afko.empty:
        join_cols = ['MANDT', 'AUFNR'] if 'MANDT' in df_afko.columns else ['AUFNR']
        base_df = base_df.merge(df_afko, on=join_cols, how='left', suffixes=('', '_AFKO'))
        print(f"   ✓ Order headers joined: {len(base_df):,} orders")
    
    # STEP 2: Add Order Items Summary (AFPO)
    print(f"\n📦 STEP 2: Processing Order Items...")
    
    if df_afpo is not None and not df_afpo.empty:
        try:
            # Create order items summary
            group_cols = ['MANDT', 'AUFNR'] if 'MANDT' in df_afpo.columns else ['AUFNR']
            
            # Build aggregation dictionary based on available columns
            agg_dict = {}
            if 'MATNR' in df_afpo.columns:
                agg_dict['MATNR'] = lambda x: ', '.join(x.unique()[:5])  # Top 5 materials
            if 'POSNR' in df_afpo.columns:
                agg_dict['POSNR'] = 'count'
            if 'CHARG' in df_afpo.columns:
                agg_dict['CHARG'] = lambda x: ', '.join(x.dropna().unique()[:3])  # Top 3 batches
            if 'PWERK' in df_afpo.columns:
                agg_dict['PWERK'] = lambda x: x.iloc[0] if len(x) > 0 else None  # Take first plant
            if 'PSMNG' in df_afpo.columns:
                agg_dict['PSMNG'] = 'sum'  # Total planned quantity
            if 'WEMNG' in df_afpo.columns:
                agg_dict['WEMNG'] = 'sum'  # Total received quantity
            
            if agg_dict:
                items_summary = df_afpo.groupby(group_cols).agg(agg_dict).reset_index()
                
                # Rename columns for clarity
                rename_dict = {}
                if 'MATNR' in agg_dict:
                    rename_dict['MATNR'] = 'ORDER_MATERIALS'
                if 'POSNR' in agg_dict:
                    rename_dict['POSNR'] = 'ORDER_ITEM_COUNT'
                if 'CHARG' in agg_dict:
                    rename_dict['CHARG'] = 'ORDER_BATCHES'
                if 'PWERK' in agg_dict:
                    rename_dict['PWERK'] = 'ITEM_PLANT'
                if 'PSMNG' in agg_dict:
                    rename_dict['PSMNG'] = 'TOTAL_PLANNED_QTY'
                if 'WEMNG' in agg_dict:
                    rename_dict['WEMNG'] = 'TOTAL_RECEIVED_QTY'
                
                items_summary = items_summary.rename(columns=rename_dict)
                
                # Join back to base
                base_df = base_df.merge(items_summary, on=group_cols, how='left')
                print(f"   ✓ Order items summary added: {len(base_df):,} orders")
                
                # Add secondary plant info if AUFK doesn't have plant
                if 'Plant_Name' not in base_df.columns and 'ITEM_PLANT' in base_df.columns:
                    base_df = base_df.merge(
                        df_plant_description,
                        left_on='ITEM_PLANT',
                        right_on='Plant_Code',
                        how='left',
                        suffixes=('', '_ITEM')
                    )
                    print(f"   ✓ Secondary plant info added via AFPO.PWERK")
            
        except Exception as e:
            print(f"   ⚠️  Order items processing failed: {e}")
    
    # STEP 3: Add Goods Movements Summary (AUFM)
    print(f"\n📊 STEP 3: Processing Goods Movements...")
    
    if df_aufm is not None and not df_aufm.empty and 'AUFNR' in df_aufm.columns:
        try:
            group_cols = ['MANDT', 'AUFNR'] if 'MANDT' in df_aufm.columns else ['AUFNR']
            
            # Summarize goods movements
            aufm_agg = {
                'MBLNR': 'count',  # Number of documents
                'BWART': lambda x: ', '.join(x.unique()[:5]),  # Movement types
                'MENGE': 'sum',  # Total quantity
                'DMBTR': 'sum'  # Total amount
            }
            
            # Only include columns that exist
            aufm_agg = {k: v for k, v in aufm_agg.items() if k in df_aufm.columns}
            
            if aufm_agg:
                movements_summary = df_aufm.groupby(group_cols).agg(aufm_agg).reset_index()
                
                rename_dict = {
                    'MBLNR': 'GOODS_MOVEMENT_COUNT',
                    'BWART': 'MOVEMENT_TYPES',
                    'MENGE': 'TOTAL_MOVEMENT_QTY',
                    'DMBTR': 'TOTAL_MOVEMENT_VALUE'
                }
                
                movements_summary = movements_summary.rename(columns={k: v for k, v in rename_dict.items() if k in movements_summary.columns})
                
                base_df = base_df.merge(movements_summary, on=group_cols, how='left')
                print(f"   ✓ Goods movements summary added: {len(base_df):,} orders")
            
        except Exception as e:
            print(f"   ⚠️  Goods movements processing failed: {e}")
    
    # STEP 4: Add Quality Notifications (QMEL)
    print(f"\n🔍 STEP 4: Processing Quality Notifications...")
    
    if df_qmel is not None and not df_qmel.empty:
        try:
            # Enhance QMEL with code descriptions
            qmel_enhanced = enhance_codes_with_descriptions(
                df_qmel, 'QMGRP', 'QMCOD', df_qpcd, df_qpct, df_qpgt, 'QM'
            )
            
            # Find all possible linking columns to AUFNR
            aufnr_cols = ['AUFNR'] + [col for col in qmel_enhanced.columns if 'ZZAUFNR' in col]
            
            quality_notifications = []
            
            for aufnr_col in aufnr_cols:
                if aufnr_col in qmel_enhanced.columns:
                    # Create subset with non-null values for this AUFNR column
                    qmel_subset = qmel_enhanced[qmel_enhanced[aufnr_col].notna()].copy()
                    if not qmel_subset.empty:
                        qmel_subset = qmel_subset.rename(columns={aufnr_col: 'AUFNR_LINK'})
                        quality_notifications.append(qmel_subset)
            
            if quality_notifications:
                # Combine all quality notifications
                all_quality = pd.concat(quality_notifications, ignore_index=True)
                
                # Group by order and create summary
                group_cols = ['MANDT', 'AUFNR_LINK'] if 'MANDT' in all_quality.columns else ['AUFNR_LINK']
                
                quality_agg = {
                    'QMNUM': 'count',
                    'QMART': lambda x: ', '.join(x.unique()),
                    'PRIOK': 'mean'  # Average priority
                }
                
                # Add description columns if available
                if 'KURZTEXT' in all_quality.columns:
                    quality_agg['KURZTEXT'] = lambda x: ' | '.join(x.dropna().unique()[:3])
                
                quality_summary = all_quality.groupby(group_cols).agg(quality_agg).reset_index()
                
                rename_dict = {
                    'AUFNR_LINK': 'AUFNR',
                    'QMNUM': 'QUALITY_NOTIF_COUNT',
                    'QMART': 'QUALITY_NOTIF_TYPES',
                    'PRIOK': 'AVG_QUALITY_PRIORITY'
                }
                
                if 'KURZTEXT' in quality_agg:
                    rename_dict['KURZTEXT'] = 'QUALITY_ISSUES_DESC'
                
                quality_summary = quality_summary.rename(columns=rename_dict)
                
                # Join to base
                join_cols = ['MANDT', 'AUFNR'] if 'MANDT' in quality_summary.columns else ['AUFNR']
                base_df = base_df.merge(quality_summary, on=join_cols, how='left')
                print(f"   ✓ Quality notifications added: {len(base_df):,} orders")
            
        except Exception as e:
            print(f"   ⚠️  Quality notifications processing failed: {e}")
    
    # STEP 5: Add Quality Defects (QMFE)
    print(f"\n⚠️  STEP 5: Processing Quality Defects...")
    
    if df_qmfe is not None and not df_qmfe.empty and df_qmel is not None and not df_qmel.empty:
        try:
            # Enhance QMFE with code descriptions
            qmfe_enhanced = enhance_codes_with_descriptions(
                df_qmfe, 'FEGRP', 'FECOD', df_qpcd, df_qpct, df_qpgt, 'DEFECT'
            )
            
            # Link defects to orders through QMEL
            qmel_aufnr_link = df_qmel[['MANDT', 'QMNUM', 'AUFNR']].dropna()
            
            defects_with_orders = qmfe_enhanced.merge(
                qmel_aufnr_link,
                on=['MANDT', 'QMNUM'] if 'MANDT' in qmel_aufnr_link.columns else ['QMNUM'],
                how='left'
            )
            
            if not defects_with_orders.empty and 'AUFNR' in defects_with_orders.columns:
                # Summarize defects per order
                group_cols = ['MANDT', 'AUFNR'] if 'MANDT' in defects_with_orders.columns else ['AUFNR']
                
                defect_agg = {
                    'FENUM': 'count',
                    'FECOD': lambda x: ', '.join(x.unique()[:5])
                }
                
                if 'ANZFEHLER' in defects_with_orders.columns:
                    defect_agg['ANZFEHLER'] = 'sum'
                
                if 'KURZTEXT' in defects_with_orders.columns:
                    defect_agg['KURZTEXT'] = lambda x: ' | '.join(x.dropna().unique()[:3])
                
                defects_summary = defects_with_orders.groupby(group_cols).agg(defect_agg).reset_index()
                
                rename_dict = {
                    'FENUM': 'DEFECT_COUNT',
                    'FECOD': 'DEFECT_CODES',
                    'ANZFEHLER': 'TOTAL_DEFECT_QUANTITY'
                }
                
                if 'KURZTEXT' in defect_agg:
                    rename_dict['KURZTEXT'] = 'DEFECT_DESCRIPTIONS'
                
                defects_summary = defects_summary.rename(columns=rename_dict)
                
                base_df = base_df.merge(defects_summary, on=group_cols, how='left')
                print(f"   ✓ Quality defects added: {len(base_df):,} orders")
            
        except Exception as e:
            print(f"   ⚠️  Quality defects processing failed: {e}")
    
    # STEP 6: Add Root Causes (QMUR)
    print(f"\n🎯 STEP 6: Processing Root Causes...")
    
    if df_qmur is not None and not df_qmur.empty and df_qmel is not None and not df_qmel.empty:
        try:
            # Enhance QMUR with code descriptions
            qmur_enhanced = enhance_codes_with_descriptions(
                df_qmur, 'URGRP', 'URCOD', df_qpcd, df_qpct, df_qpgt, 'CAUSE'
            )
            
            # Link causes to orders through QMEL
            qmel_aufnr_link = df_qmel[['MANDT', 'QMNUM', 'AUFNR']].dropna()
            
            causes_with_orders = qmur_enhanced.merge(
                qmel_aufnr_link,
                on=['MANDT', 'QMNUM'] if 'MANDT' in qmel_aufnr_link.columns else ['QMNUM'],
                how='left'
            )
            
            if not causes_with_orders.empty and 'AUFNR' in causes_with_orders.columns:
                group_cols = ['MANDT', 'AUFNR'] if 'MANDT' in causes_with_orders.columns else ['AUFNR']
                
                cause_agg = {
                    'URNUM': 'count',
                    'URCOD': lambda x: ', '.join(x.unique()[:5])
                }
                
                if 'ROOTCAUSE' in causes_with_orders.columns:
                    cause_agg['ROOTCAUSE'] = 'sum'
                
                if 'KURZTEXT' in causes_with_orders.columns:
                    cause_agg['KURZTEXT'] = lambda x: ' | '.join(x.dropna().unique()[:3])
                
                causes_summary = causes_with_orders.groupby(group_cols).agg(cause_agg).reset_index()
                
                rename_dict = {
                    'URNUM': 'CAUSE_COUNT',
                    'URCOD': 'CAUSE_CODES',
                    'ROOTCAUSE': 'ROOT_CAUSE_COUNT'
                }
                
                if 'KURZTEXT' in cause_agg:
                    rename_dict['KURZTEXT'] = 'CAUSE_DESCRIPTIONS'
                
                causes_summary = causes_summary.rename(columns=rename_dict)
                
                base_df = base_df.merge(causes_summary, on=group_cols, how='left')
                print(f"   ✓ Root causes added: {len(base_df):,} orders")
            
        except Exception as e:
            print(f"   ⚠️  Root causes processing failed: {e}")
    
    # STEP 7: Add Work Center Information
    print(f"\n⚙️  STEP 7: Adding Work Center Information...")
    
    if df_crhd_v1 is not None and not df_crhd_v1.empty:
        try:
            # Link work centers through AFKO.ARBPL_OBJID or direct ARBPL matches
            if 'ARBPL_OBJID' in base_df.columns and 'OBJID' in df_crhd_v1.columns:
                base_df = base_df.merge(
                    df_crhd_v1[['OBJID', 'ARBPL', 'WERKS', 'KTEXT']],
                    left_on='ARBPL_OBJID',
                    right_on='OBJID',
                    how='left',
                    suffixes=('', '_WC')
                )
                print(f"   ✓ Work centers linked via OBJID: {len(base_df):,} orders")
            
        except Exception as e:
            print(f"   ⚠️  Work center linking failed: {e}")
    
    # STEP 8: Add Status Information
    print(f"\n📊 STEP 8: Adding Status Information...")
    
    if df_jest is not None and not df_jest.empty and 'OBJNR' in base_df.columns:
        try:
            # Get active statuses for orders
            active_statuses = df_jest[df_jest['INACT'] == 0] if 'INACT' in df_jest.columns else df_jest
            
            status_summary = active_statuses.groupby('OBJNR').agg({
                'STAT': lambda x: ', '.join(x.unique())
            }).reset_index().rename(columns={'STAT': 'ORDER_STATUSES'})
            
            base_df = base_df.merge(status_summary, on='OBJNR', how='left')
            print(f"   ✓ Status information added: {len(base_df):,} orders")
            
        except Exception as e:
            print(f"   ⚠️  Status processing failed: {e}")
    
    # STEP 9: Create Derived Fields and KPIs
    print(f"\n🛠️  STEP 9: Creating KPIs and Derived Fields...")
    
    # Fill NaN values for numeric columns
    numeric_cols = [
        'QUALITY_NOTIF_COUNT', 'DEFECT_COUNT', 'CAUSE_COUNT', 'ROOT_CAUSE_COUNT',
        'GOODS_MOVEMENT_COUNT', 'ORDER_ITEM_COUNT', 'TOTAL_PLANNED_QTY', 
        'TOTAL_RECEIVED_QTY', 'TOTAL_MOVEMENT_QTY', 'AVG_QUALITY_PRIORITY'
    ]
    
    for col in numeric_cols:
        if col in base_df.columns:
            base_df[col] = base_df[col].fillna(0)
    
    # Create quality indicators
    base_df['HAS_QUALITY_ISSUES'] = (base_df.get('QUALITY_NOTIF_COUNT', 0) > 0)
    base_df['HAS_DEFECTS'] = (base_df.get('DEFECT_COUNT', 0) > 0)
    base_df['HAS_ROOT_CAUSES'] = (base_df.get('ROOT_CAUSE_COUNT', 0) > 0)
    base_df['HAS_GOODS_MOVEMENTS'] = (base_df.get('GOODS_MOVEMENT_COUNT', 0) > 0)
    
    # Calculate quality score (0-100, higher is better)
    max_notifications = max(base_df.get('QUALITY_NOTIF_COUNT', pd.Series([0])).max(), 1)
    max_defects = max(base_df.get('DEFECT_COUNT', pd.Series([0])).max(), 1)
    
    base_df['QUALITY_SCORE'] = 100 - (
        (base_df.get('QUALITY_NOTIF_COUNT', 0) / max_notifications * 40) +
        (base_df.get('DEFECT_COUNT', 0) / max_defects * 35) +
        (base_df['HAS_ROOT_CAUSES'].astype(int) * 25)
    )
    base_df['QUALITY_SCORE'] = base_df['QUALITY_SCORE'].clip(0, 100).round(1)
    
    # Quality category
    base_df['QUALITY_CATEGORY'] = pd.cut(
        base_df['QUALITY_SCORE'],
        bins=[0, 60, 80, 100],
        labels=['Poor', 'Good', 'Excellent'],
        include_lowest=True
    )
    
    # Production efficiency indicators
    if 'TOTAL_PLANNED_QTY' in base_df.columns and 'TOTAL_RECEIVED_QTY' in base_df.columns:
        base_df['PRODUCTION_EFFICIENCY'] = (
            base_df['TOTAL_RECEIVED_QTY'] / base_df['TOTAL_PLANNED_QTY'].replace(0, np.nan) * 100
        ).round(1)
        base_df['PRODUCTION_EFFICIENCY'] = base_df['PRODUCTION_EFFICIENCY'].clip(0, 150)  # Cap at 150%
    
    # Schedule performance (if we have dates)
    if 'GSTRP' in base_df.columns and 'GSTRS' in base_df.columns:
        try:
            base_df['GSTRP'] = pd.to_datetime(base_df['GSTRP'], errors='coerce')
            base_df['GSTRS'] = pd.to_datetime(base_df['GSTRS'], errors='coerce')
            base_df['SCHEDULE_VARIANCE_DAYS'] = (base_df['GSTRS'] - base_df['GSTRP']).dt.days
            base_df['ON_TIME_START'] = (abs(base_df['SCHEDULE_VARIANCE_DAYS']) <= 1).fillna(False)
        except:
            pass
    
    print(f"   ✓ KPIs and derived fields created")
    print(f"   ✓ Final comprehensive dataset: {len(base_df):,} orders with {len(base_df.columns)} columns")
    
    # STEP 10: Generate Comprehensive Summary Statistics
    print(f"\n📈 STEP 10: Generating Summary Statistics...")
    
    summary_stats = {
        'dataset_info': {
            'total_orders': len(base_df),
            'total_columns': len(base_df.columns),
            'creation_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        },
        'plant_analysis': {},
        'quality_analysis': {},
        'production_analysis': {},
        'order_type_analysis': {}
    }
    
    # Plant analysis
    plant_col = None
    for col in ['Plant_Name', 'Plant_Code', 'WERKS']:
        if col in base_df.columns and base_df[col].notna().sum() > 0:
            plant_col = col
            break
    
    if plant_col:
        summary_stats['plant_analysis'] = {
            'total_plants': base_df[plant_col].nunique(),
            'orders_per_plant': base_df[plant_col].value_counts().head(10).to_dict(),
            'quality_issues_by_plant': (
                base_df.groupby(plant_col)['QUALITY_NOTIF_COUNT'].sum().sort_values(ascending=False).head(10).to_dict()
                if 'QUALITY_NOTIF_COUNT' in base_df.columns else {}
            )
        }
    
    # Quality analysis
    if 'HAS_QUALITY_ISSUES' in base_df.columns:
        summary_stats['quality_analysis'] = {
            'total_orders_with_issues': base_df['HAS_QUALITY_ISSUES'].sum(),
            'quality_issue_rate': (base_df['HAS_QUALITY_ISSUES'].sum() / len(base_df) * 100).round(2),
            'total_notifications': base_df.get('QUALITY_NOTIF_COUNT', pd.Series([0])).sum(),
            'total_defects': base_df.get('DEFECT_COUNT', pd.Series([0])).sum(),
            'total_root_causes': base_df.get('ROOT_CAUSE_COUNT', pd.Series([0])).sum(),
            'avg_quality_score': base_df['QUALITY_SCORE'].mean().round(1),
            'quality_distribution': base_df['QUALITY_CATEGORY'].value_counts().to_dict()
        }
    
    # Production analysis
    summary_stats['production_analysis'] = {
        'total_order_items': base_df.get('ORDER_ITEM_COUNT', pd.Series([0])).sum(),
        'total_goods_movements': base_df.get('GOODS_MOVEMENT_COUNT', pd.Series([0])).sum(),
    }
    
    if 'PRODUCTION_EFFICIENCY' in base_df.columns:
        summary_stats['production_analysis']['avg_production_efficiency'] = base_df['PRODUCTION_EFFICIENCY'].mean().round(1)
    
    if 'ON_TIME_START' in base_df.columns:
        summary_stats['production_analysis']['on_time_start_rate'] = (base_df['ON_TIME_START'].sum() / len(base_df) * 100).round(1)
    
    # Order type analysis
    if 'AUART' in base_df.columns:
        summary_stats['order_type_analysis'] = {
            'order_types': base_df['AUART'].value_counts().to_dict(),
            'quality_issues_by_order_type': (
                base_df.groupby('AUART')['QUALITY_NOTIF_COUNT'].sum().sort_values(ascending=False).to_dict()
                if 'QUALITY_NOTIF_COUNT' in base_df.columns else {}
            )
        }
    
    # STEP 11: Create Quality Details for Deeper Analysis
    quality_details = {}
    
    if df_qmel is not None and not df_qmel.empty:
        quality_details['notification_summary'] = {
            'total_notifications': len(df_qmel),
            'notification_types': df_qmel['QMART'].value_counts().head(10).to_dict() if 'QMART' in df_qmel.columns else {},
            'priority_distribution': df_qmel['PRIOK'].value_counts().to_dict() if 'PRIOK' in df_qmel.columns else {}
        }
    
    if df_qmfe is not None and not df_qmfe.empty:
        quality_details['defect_summary'] = {
            'total_defects': len(df_qmfe),
            'defect_codes': df_qmfe['FECOD'].value_counts().head(10).to_dict() if 'FECOD' in df_qmfe.columns else {},
            'defect_categories': df_qmfe['FEKAT'].value_counts().to_dict() if 'FEKAT' in df_qmfe.columns else {}
        }
    
    if df_qmur is not None and not df_qmur.empty:
        quality_details['cause_summary'] = {
            'total_causes': len(df_qmur),
            'root_causes': df_qmur['ROOTCAUSE'].sum() if 'ROOTCAUSE' in df_qmur.columns else 0,
            'cause_codes': df_qmur['URCOD'].value_counts().head(10).to_dict() if 'URCOD' in df_qmur.columns else {}
        }
    
    print("\n✅ INTEGRATION COMPLETED SUCCESSFULLY!")
    print("=" * 80)
    print(f"📊 Final Results Summary:")
    print(f"   • Total Orders: {summary_stats['dataset_info']['total_orders']:,}")
    print(f"   • Total Columns: {summary_stats['dataset_info']['total_columns']}")
    
    if summary_stats['plant_analysis']:
        print(f"   • Plants Analyzed: {summary_stats['plant_analysis']['total_plants']}")
    
    if summary_stats['quality_analysis']:
        print(f"   • Orders with Quality Issues: {summary_stats['quality_analysis']['total_orders_with_issues']:,} ({summary_stats['quality_analysis']['quality_issue_rate']:.1f}%)")
        print(f"   • Average Quality Score: {summary_stats['quality_analysis']['avg_quality_score']:.1f}/100")
        print(f"   • Total Quality Notifications: {summary_stats['quality_analysis']['total_notifications']:,}")
        print(f"   • Total Defects: {summary_stats['quality_analysis']['total_defects']:,}")
    
    return base_df, summary_stats, quality_details

def analyze_quality_trends(comprehensive_df, summary_stats):
    """
    Perform advanced quality trend analysis
    """
    print("\n🔍 ADVANCED QUALITY ANALYSIS")
    print("=" * 60)
    
    if 'ERDAT' not in comprehensive_df.columns:
        print("   ⚠️  No creation date available for trend analysis")
        return {}
    
    try:
        # Convert dates
        comprehensive_df['ERDAT'] = pd.to_datetime(comprehensive_df['ERDAT'], errors='coerce')
        comprehensive_df['YEAR_MONTH'] = comprehensive_df['ERDAT'].dt.to_period('M')
        
        trends = {}
        
        # Monthly quality trends
        if 'QUALITY_NOTIF_COUNT' in comprehensive_df.columns:
            monthly_quality = comprehensive_df.groupby('YEAR_MONTH').agg({
                'AUFNR': 'count',
                'QUALITY_NOTIF_COUNT': 'sum',
                'QUALITY_SCORE': 'mean'
            }).round(2)
            
            trends['monthly_trends'] = monthly_quality.to_dict()
            print(f"   ✓ Monthly quality trends calculated for {len(monthly_quality)} months")
        
        # Plant performance comparison
        plant_col = None
        for col in ['Plant_Name', 'Plant_Code', 'WERKS']:
            if col in comprehensive_df.columns and comprehensive_df[col].notna().sum() > 0:
                plant_col = col
                break
        
        if plant_col and 'QUALITY_SCORE' in comprehensive_df.columns:
            plant_performance = comprehensive_df.groupby(plant_col).agg({
                'AUFNR': 'count',
                'QUALITY_SCORE': 'mean',
                'QUALITY_NOTIF_COUNT': 'sum'
            }).round(2).sort_values('QUALITY_SCORE', ascending=False)
            
            trends['plant_performance'] = plant_performance.to_dict()
            print(f"   ✓ Plant performance analysis completed for {len(plant_performance)} plants")
        
        return trends
        
    except Exception as e:
        print(f"   ⚠️  Trend analysis failed: {e}")
        return {}

def export_comprehensive_results(comprehensive_df, summary_stats, quality_details, filename_prefix="sap_comprehensive"):
    """
    Export comprehensive results to multiple formats
    """
    print(f"\n💾 EXPORTING RESULTS...")
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    try:
        # Excel export with multiple sheets
        excel_file = f"{filename_prefix}_{timestamp}.xlsx"
        with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
            # Main comprehensive data
            comprehensive_df.to_excel(writer, sheet_name='Comprehensive_Data', index=False)
            
            # Summary statistics
            summary_df = pd.json_normalize(summary_stats, sep='_')
            summary_df.to_excel(writer, sheet_name='Summary_Statistics', index=False)
            
            # Quality analysis
            if summary_stats.get('quality_analysis'):
                quality_df = pd.DataFrame([summary_stats['quality_analysis']])
                quality_df.to_excel(writer, sheet_name='Quality_Analysis', index=False)
            
            # Plant analysis
            if summary_stats.get('plant_analysis', {}).get('orders_per_plant'):
                plant_df = pd.DataFrame(list(summary_stats['plant_analysis']['orders_per_plant'].items()), 
                                      columns=['Plant', 'Order_Count'])
                plant_df.to_excel(writer, sheet_name='Plant_Analysis', index=False)
            
            # Top quality issues
            if 'QUALITY_NOTIF_COUNT' in comprehensive_df.columns:
                top_issues = comprehensive_df[comprehensive_df['QUALITY_NOTIF_COUNT'] > 0].nlargest(20, 'QUALITY_NOTIF_COUNT')
                top_issues.to_excel(writer, sheet_name='Top_Quality_Issues', index=False)
        
        print(f"   ✅ Excel file created: {excel_file}")
        
        # CSV export for easy analysis
        csv_file = f"{filename_prefix}_{timestamp}.csv"
        comprehensive_df.to_csv(csv_file, index=False)
        print(f"   ✅ CSV file created: {csv_file}")
        
        # Summary report
        report_file = f"{filename_prefix}_summary_{timestamp}.txt"
        with open(report_file, 'w') as f:
            f.write("SAP COMPREHENSIVE QUALITY MANAGEMENT ANALYSIS REPORT\n")
            f.write("=" * 60 + "\n\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            
            f.write("DATASET OVERVIEW:\n")
            f.write(f"Total Orders: {summary_stats['dataset_info']['total_orders']:,}\n")
            f.write(f"Total Columns: {summary_stats['dataset_info']['total_columns']}\n\n")
            
            if summary_stats.get('quality_analysis'):
                qa = summary_stats['quality_analysis']
                f.write("QUALITY ANALYSIS:\n")
                f.write(f"Orders with Quality Issues: {qa['total_orders_with_issues']:,} ({qa['quality_issue_rate']:.1f}%)\n")
                f.write(f"Average Quality Score: {qa['avg_quality_score']:.1f}/100\n")
                f.write(f"Total Quality Notifications: {qa['total_notifications']:,}\n")
                f.write(f"Total Defects: {qa['total_defects']:,}\n")
                f.write(f"Total Root Causes: {qa['total_root_causes']:,}\n\n")
            
            if summary_stats.get('plant_analysis'):
                pa = summary_stats['plant_analysis']
                f.write("PLANT ANALYSIS:\n")
                f.write(f"Total Plants: {pa['total_plants']}\n")
                f.write("Top Plants by Order Volume:\n")
                for plant, count in list(pa['orders_per_plant'].items())[:5]:
                    f.write(f"  {plant}: {count:,} orders\n")
        
        print(f"   ✅ Summary report created: {report_file}")
        
        return excel_file, csv_file, report_file
        
    except Exception as e:
        print(f"   ❌ Export failed: {e}")
        return None, None, None

# Main execution function
def main_integration():
    """
    Main function to demonstrate usage of the comprehensive integration
    """
    print("🚀 SAP QUALITY MANAGEMENT COMPREHENSIVE INTEGRATION")
    print("=" * 80)
    print("\nTo use this script with your data:")
    print("\n1. Load your SAP dataframes:")
    print("   df_aufk = pd.read_csv('your_aufk_file.csv')")
    print("   df_afko = pd.read_csv('your_afko_file.csv')")
    print("   # ... load all other SAP tables")
    
    print("\n2. Run the comprehensive integration:")
    print("   comprehensive_df, summary_stats, quality_details = create_comprehensive_sap_view(")
    print("       df_aufk=df_aufk, df_afko=df_afko, df_afpo=df_afpo, df_aufm=df_aufm,")
    print("       df_qmel=df_qmel, df_qmfe=df_qmfe, df_qmur=df_qmur, df_qmih=df_qmih,")
    print("       df_qpcd=df_qpcd, df_qpct=df_qpct, df_qpgt=df_qpgt,")
    print("       df_crhd_v1=df_crhd_v1, df_jest=df_jest")
    print("   )")
    
    print("\n3. Analyze trends:")
    print("   trends = analyze_quality_trends(comprehensive_df, summary_stats)")
    
    print("\n4. Export results:")
    print("   excel_file, csv_file, report_file = export_comprehensive_results(")
    print("       comprehensive_df, summary_stats, quality_details")
    print("   )")
    
    print("\n5. Explore your data:")
    print("   print(comprehensive_df.columns.tolist())")
    print("   print(summary_stats)")
    print("   comprehensive_df.head()")

if __name__ == "__main__":
    main_integration()

🚀 SAP QUALITY MANAGEMENT COMPREHENSIVE INTEGRATION

To use this script with your data:

1. Load your SAP dataframes:
   df_aufk = pd.read_csv('your_aufk_file.csv')
   df_afko = pd.read_csv('your_afko_file.csv')
   # ... load all other SAP tables

2. Run the comprehensive integration:
   comprehensive_df, summary_stats, quality_details = create_comprehensive_sap_view(
       df_aufk=df_aufk, df_afko=df_afko, df_afpo=df_afpo, df_aufm=df_aufm,
       df_qmel=df_qmel, df_qmfe=df_qmfe, df_qmur=df_qmur, df_qmih=df_qmih,
       df_qpcd=df_qpcd, df_qpct=df_qpct, df_qpgt=df_qpgt,
       df_crhd_v1=df_crhd_v1, df_jest=df_jest
   )

3. Analyze trends:
   trends = analyze_quality_trends(comprehensive_df, summary_stats)

4. Export results:
   excel_file, csv_file, report_file = export_comprehensive_results(
       comprehensive_df, summary_stats, quality_details
   )

5. Explore your data:
   print(comprehensive_df.columns.tolist())
   print(summary_stats)
   comprehensive_df.head()


In [20]:
comprehensive_df, summary_stats, quality_details = create_comprehensive_sap_view(
       df_aufk=df_aufk, df_afko=df_afko, df_afpo=df_afpo, df_aufm=df_aufm,
       df_qmel=df_qmel, df_qmfe=df_qmfe, df_qmur=df_qmur, df_qmih=df_qmih,
       df_qpcd=df_qpcd, df_qpct=df_qpct, df_qpgt=df_qpgt,
       df_crhd_v1=df_crhd_v1, df_jest=df_jest
   )

🚀 SAP COMPREHENSIVE QUALITY MANAGEMENT INTEGRATION
✓ Created plant description from your data
📊 Input Data Summary:
   • AUFK (Orders): 3 records
   • AFKO (Headers): 2 records
   • AFPO (Items): 3 records
   • QMEL (Quality): 2 records
   • Plants: 13 plants

🏭 STEP 1: Building Production Order Foundation...
   ✓ Order headers joined: 3 orders

📦 STEP 2: Processing Order Items...
   ✓ Order items summary added: 3 orders

📊 STEP 3: Processing Goods Movements...
   ✓ Goods movements summary added: 3 orders

🔍 STEP 4: Processing Quality Notifications...
   ⚠️  Quality notifications processing failed: "Column(s) ['PRIOK'] do not exist"

⚠️  STEP 5: Processing Quality Defects...
   ✓ Quality defects added: 3 orders

🎯 STEP 6: Processing Root Causes...
   ✓ Root causes added: 3 orders

⚙️  STEP 7: Adding Work Center Information...
   ✓ Work centers linked via OBJID: 3 orders

📊 STEP 8: Adding Status Information...

🛠️  STEP 9: Creating KPIs and Derived Fields...
   ✓ KPIs and derived fields

In [21]:
trends = analyze_quality_trends(comprehensive_df, summary_stats)


🔍 ADVANCED QUALITY ANALYSIS
   ⚠️  No creation date available for trend analysis


In [22]:


class SAPDataValidator:
    """
    Comprehensive data validation and quality checks for SAP data
    """
    
    def __init__(self):
        self.validation_results = {}
        self.data_quality_score = 0
        self.recommendations = []
    
    def validate_all_tables(self, **tables):
        """
        Validate all SAP tables and generate quality report
        
        Usage: validate_all_tables(df_aufk=df_aufk, df_afko=df_afko, ...)
        """
        print("🔍 SAP DATA VALIDATION AND QUALITY ASSESSMENT")
        print("=" * 70)
        
        validation_summary = {
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'tables_validated': 0,
            'total_records': 0,
            'issues_found': [],
            'quality_scores': {},
            'recommendations': []
        }
        
        # Validate each table
        for table_name, df in tables.items():
            if df is not None and not df.empty:
                print(f"\n📋 Validating {table_name.upper()}...")
                table_results = self._validate_table(df, table_name)
                self.validation_results[table_name] = table_results
                validation_summary['tables_validated'] += 1
                validation_summary['total_records'] += len(df)
                validation_summary['quality_scores'][table_name] = table_results['quality_score']
                
                if table_results['issues']:
                    validation_summary['issues_found'].extend([
                        f"{table_name}: {issue}" for issue in table_results['issues']
                    ])
        
        # Generate overall assessment
        overall_score = np.mean(list(validation_summary['quality_scores'].values())) if validation_summary['quality_scores'] else 0
        validation_summary['overall_quality_score'] = round(overall_score, 1)
        
        # Generate recommendations
        validation_summary['recommendations'] = self._generate_recommendations(validation_summary)
        
        print(f"\n✅ VALIDATION COMPLETE")
        print(f"   📊 Overall Data Quality Score: {validation_summary['overall_quality_score']}/100")
        print(f"   📁 Tables Validated: {validation_summary['tables_validated']}")
        print(f"   📈 Total Records: {validation_summary['total_records']:,}")
        print(f"   ⚠️  Issues Found: {len(validation_summary['issues_found'])}")
        
        return validation_summary
    
    def _validate_table(self, df, table_name):
        """
        Validate individual table
        """
        results = {
            'table_name': table_name,
            'record_count': len(df),
            'column_count': len(df.columns),
            'quality_score': 100,  # Start with perfect score
            'issues': [],
            'checks_performed': [],
            'data_completeness': {},
            'data_consistency': {},
            'data_accuracy': {}
        }
        
        # Basic structure checks
        results['checks_performed'].append('Structure validation')
        if len(df) == 0:
            results['issues'].append("Table is empty")
            results['quality_score'] -= 50
        
        if len(df.columns) == 0:
            results['issues'].append("No columns found")
            results['quality_score'] -= 50
            return results
        
        # Data completeness checks
        results['checks_performed'].append('Completeness analysis')
        completeness = self._check_completeness(df, table_name)
        results['data_completeness'] = completeness
        
        # Penalize for low completeness
        avg_completeness = np.mean(list(completeness['column_completeness'].values()))
        if avg_completeness < 80:
            results['issues'].append(f"Low data completeness: {avg_completeness:.1f}%")
            results['quality_score'] -= (100 - avg_completeness) * 0.3
        
        # Data consistency checks
        results['checks_performed'].append('Consistency validation')
        consistency = self._check_consistency(df, table_name)
        results['data_consistency'] = consistency
        
        if consistency['issues']:
            results['issues'].extend(consistency['issues'])
            results['quality_score'] -= len(consistency['issues']) * 5
        
        # Data accuracy checks
        results['checks_performed'].append('Accuracy validation')
        accuracy = self._check_accuracy(df, table_name)
        results['data_accuracy'] = accuracy
        
        if accuracy['issues']:
            results['issues'].extend(accuracy['issues'])
            results['quality_score'] -= len(accuracy['issues']) * 3
        
        # Table-specific validations
        table_specific_issues = self._validate_table_specific(df, table_name)
        if table_specific_issues:
            results['issues'].extend(table_specific_issues)
            results['quality_score'] -= len(table_specific_issues) * 2
        
        # Ensure score doesn't go below 0
        results['quality_score'] = max(0, round(results['quality_score'], 1))
        
        print(f"   ✓ {table_name}: Score {results['quality_score']}/100, {len(results['issues'])} issues")
        
        return results
    
    def _check_completeness(self, df, table_name):
        """
        Check data completeness
        """
        completeness = {
            'total_cells': df.shape[0] * df.shape[1],
            'missing_cells': df.isnull().sum().sum(),
            'column_completeness': {},
            'critical_columns_missing': []
        }
        
        # Calculate completeness per column
        for col in df.columns:
            missing_pct = (df[col].isnull().sum() / len(df)) * 100
            completeness['column_completeness'][col] = round(100 - missing_pct, 1)
        
        # Check critical columns based on table type
        critical_columns = self._get_critical_columns(table_name)
        for col in critical_columns:
            if col in df.columns:
                missing_pct = (df[col].isnull().sum() / len(df)) * 100
                if missing_pct > 10:  # More than 10% missing
                    completeness['critical_columns_missing'].append(f"{col} ({missing_pct:.1f}% missing)")
        
        completeness['overall_completeness'] = round(
            ((completeness['total_cells'] - completeness['missing_cells']) / completeness['total_cells']) * 100, 1
        )
        
        return completeness
    
    def _check_consistency(self, df, table_name):
        """
        Check data consistency
        """
        consistency = {
            'issues': [],
            'duplicate_records': 0,
            'data_type_issues': [],
            'value_range_issues': []
        }
        
        # Check for duplicates based on key columns
        key_columns = self._get_key_columns(table_name)
        if key_columns:
            available_keys = [col for col in key_columns if col in df.columns]
            if available_keys:
                duplicates = df.duplicated(subset=available_keys).sum()
                consistency['duplicate_records'] = duplicates
                if duplicates > 0:
                    consistency['issues'].append(f"{duplicates} duplicate records found")
        
        # Check data types
        for col in df.columns:
            if col.upper().endswith('DAT') or 'DATE' in col.upper():
                # Should be date-like
                try:
                    pd.to_datetime(df[col], errors='coerce')
                except:
                    consistency['data_type_issues'].append(f"{col} has invalid date format")
            
            elif col.upper().endswith('MNG') or col.upper().endswith('QTY'):
                # Should be numeric
                if not pd.api.types.is_numeric_dtype(df[col]):
                    non_numeric = df[col].apply(lambda x: not str(x).replace('.', '').replace('-', '').isdigit() if pd.notna(x) else False).sum()
                    if non_numeric > 0:
                        consistency['data_type_issues'].append(f"{col} has {non_numeric} non-numeric values")
        
        # Check value ranges
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].min() < -999999 or df[col].max() > 999999999:
                consistency['value_range_issues'].append(f"{col} has extreme values")
        
        if consistency['data_type_issues']:
            consistency['issues'].extend(consistency['data_type_issues'])
        
        if consistency['value_range_issues']:
            consistency['issues'].extend(consistency['value_range_issues'])
        
        return consistency
    
    def _check_accuracy(self, df, table_name):
        """
        Check data accuracy
        """
        accuracy = {
            'issues': [],
            'invalid_codes': [],
            'logical_inconsistencies': [],
            'reference_integrity_issues': []
        }
        
        # Check for valid plant codes (if applicable)
        plant_columns = ['WERKS', 'PWERK', 'Plant_Code']
        for col in plant_columns:
            if col in df.columns:
                # Check if plant codes follow expected pattern
                valid_pattern = df[col].str.match(r'^[A-Z]\d{3}$', na=False).sum() if df[col].dtype == 'object' else 0
                total_non_null = df[col].notna().sum()
                if total_non_null > 0 and (valid_pattern / total_non_null) < 0.8:
                    accuracy['invalid_codes'].append(f"{col} has non-standard plant codes")
        
        # Check order number formats
        order_columns = ['AUFNR', 'ABNUM']
        for col in order_columns:
            if col in df.columns and df[col].dtype == 'object':
                # Basic order number validation
                invalid_orders = df[col].str.len().fillna(0)
                if (invalid_orders < 6).sum() > 0 and df[col].notna().sum() > 0:
                    accuracy['invalid_codes'].append(f"{col} has suspiciously short order numbers")
        
        # Check for logical inconsistencies
        if table_name == 'df_afko':
            # Start date should be before end date
            if 'GSTRP' in df.columns and 'GLTRP' in df.columns:
                try:
                    start_dates = pd.to_datetime(df['GSTRP'], errors='coerce')
                    end_dates = pd.to_datetime(df['GLTRP'], errors='coerce')
                    invalid_dates = (start_dates > end_dates).sum()
                    if invalid_dates > 0:
                        accuracy['logical_inconsistencies'].append(
                            f"{invalid_dates} orders have start date after end date"
                        )
                except:
                    pass
        
        if table_name == 'df_afpo':
            # Planned quantity should be positive
            if 'PSMNG' in df.columns:
                negative_qty = (df['PSMNG'] < 0).sum()
                if negative_qty > 0:
                    accuracy['logical_inconsistencies'].append(
                        f"{negative_qty} items have negative planned quantities"
                    )
        
        if table_name == 'df_qmel':
            # Quality notification dates should be reasonable
            if 'QMDAT' in df.columns:
                try:
                    qm_dates = pd.to_datetime(df['QMDAT'], errors='coerce')
                    future_dates = (qm_dates > datetime.now()).sum()
                    old_dates = (qm_dates < datetime(2000, 1, 1)).sum()
                    
                    if future_dates > 0:
                        accuracy['logical_inconsistencies'].append(
                            f"{future_dates} quality notifications have future dates"
                        )
                    if old_dates > 0:
                        accuracy['logical_inconsistencies'].append(
                            f"{old_dates} quality notifications have very old dates"
                        )
                except:
                    pass
        
        # Combine all issues
        if accuracy['invalid_codes']:
            accuracy['issues'].extend(accuracy['invalid_codes'])
        if accuracy['logical_inconsistencies']:
            accuracy['issues'].extend(accuracy['logical_inconsistencies'])
        if accuracy['reference_integrity_issues']:
            accuracy['issues'].extend(accuracy['reference_integrity_issues'])
        
        return accuracy
    
    def _validate_table_specific(self, df, table_name):
        """
        Table-specific validation rules
        """
        issues = []
        
        if table_name == 'df_aufk':
            # Order master specific checks
            if 'AUFNR' in df.columns:
                if df['AUFNR'].duplicated().any():
                    issues.append("Duplicate order numbers in master data")
            
            if 'ERDAT' in df.columns:
                try:
                    creation_dates = pd.to_datetime(df['ERDAT'], errors='coerce')
                    recent_cutoff = datetime.now() - timedelta(days=3650)  # 10 years
                    very_old = (creation_dates < recent_cutoff).sum()
                    if very_old > len(df) * 0.1:  # More than 10% very old
                        issues.append("High percentage of very old orders")
                except:
                    pass
        
        elif table_name == 'df_afko':
            # Order header specific checks
            if 'GASMG' in df.columns and 'GAMNG' in df.columns:
                # Confirmed quantity shouldn't exceed ordered quantity significantly
                over_confirmed = ((df['GAMNG'] / df['GASMG'].replace(0, 1)) > 1.1).sum()
                if over_confirmed > 0:
                    issues.append(f"{over_confirmed} orders have confirmed quantity > 110% of ordered")
        
        elif table_name == 'df_qmel':
            # Quality notification specific checks
            if 'PRIOK' in df.columns:
                # Priority should be within expected range
                invalid_priority = (~df['PRIOK'].between(1, 9, na=True)).sum()
                if invalid_priority > 0:
                    issues.append(f"{invalid_priority} notifications have invalid priority values")
            
            if 'QMART' in df.columns:
                # Should have valid notification types
                if df['QMART'].notna().sum() == 0:
                    issues.append("No quality notification types specified")
        
        elif table_name == 'df_qmfe':
            # Quality defect specific checks
            if 'ANZFEHLER' in df.columns:
                # Number of defects should be reasonable
                extreme_defects = (df['ANZFEHLER'] > 1000).sum()
                if extreme_defects > 0:
                    issues.append(f"{extreme_defects} defect records have unrealistic defect counts")
        
        elif table_name == 'df_aufm':
            # Goods movement specific checks
            if 'MENGE' in df.columns and 'SHKZG' in df.columns:
                # Quantity and debit/credit should be consistent
                inconsistent_signs = 0
                for idx, row in df.iterrows():
                    if pd.notna(row.get('MENGE')) and pd.notna(row.get('SHKZG')):
                        if row['SHKZG'] == 'H' and row['MENGE'] > 0:  # Credit with positive qty
                            inconsistent_signs += 1
                        elif row['SHKZG'] == 'S' and row['MENGE'] < 0:  # Debit with negative qty
                            inconsistent_signs += 1
                
                if inconsistent_signs > 0:
                    issues.append(f"{inconsistent_signs} movements have inconsistent debit/credit signs")
        
        return issues
    
    def _get_critical_columns(self, table_name):
        """
        Get critical columns for each table type
        """
        critical_columns = {
            'df_aufk': ['AUFNR', 'AUART', 'ERDAT'],
            'df_afko': ['AUFNR', 'GSTRP', 'GLTRP'],
            'df_afpo': ['AUFNR', 'POSNR', 'MATNR'],
            'df_aufm': ['AUFNR', 'MBLNR', 'BWART', 'MATNR'],
            'df_qmel': ['QMNUM', 'QMART', 'AUFNR'],
            'df_qmfe': ['QMNUM', 'FENUM', 'FECOD'],
            'df_qmur': ['QMNUM', 'URNUM', 'URCOD'],
            'df_qmih': ['QMNUM', 'IWERK'],
            'df_qpcd': ['KATALOGART', 'CODEGRUPPE', 'CODE'],
            'df_qpct': ['KATALOGART', 'CODEGRUPPE', 'CODE', 'KURZTEXT'],
            'df_crhd_v1': ['OBJID', 'ARBPL'],
            'df_jest': ['OBJNR', 'STAT']
        }
        
        return critical_columns.get(table_name, [])
    
    def _get_key_columns(self, table_name):
        """
        Get key columns for duplicate detection
        """
        key_columns = {
            'df_aufk': ['MANDT', 'AUFNR'],
            'df_afko': ['MANDT', 'AUFNR'],
            'df_afpo': ['MANDT', 'AUFNR', 'POSNR'],
            'df_aufm': ['MANDT', 'MBLNR', 'MJAHR', 'ZEILE'],
            'df_qmel': ['MANDT', 'QMNUM'],
            'df_qmfe': ['MANDT', 'QMNUM', 'FENUM'],
            'df_qmur': ['MANDT', 'QMNUM', 'FENUM', 'URNUM'],
            'df_qmih': ['MANDT', 'QMNUM'],
            'df_qpcd': ['MANDT', 'KATALOGART', 'CODEGRUPPE', 'CODE'],
            'df_qpct': ['MANDT', 'KATALOGART', 'CODEGRUPPE', 'CODE', 'SPRACHE'],
            'df_crhd_v1': ['MANDT', 'OBJTY', 'OBJID', 'SPRAS'],
            'df_jest': ['MANDT', 'OBJNR', 'STAT']
        }
        
        return key_columns.get(table_name, [])
    
    def _generate_recommendations(self, validation_summary):
        """
        Generate specific recommendations based on validation results
        """
        recommendations = []
        overall_score = validation_summary['overall_quality_score']
        
        # Overall score recommendations
        if overall_score < 70:
            recommendations.append("CRITICAL: Data quality is poor. Immediate data cleansing required.")
        elif overall_score < 85:
            recommendations.append("Data quality needs improvement. Implement data governance processes.")
        else:
            recommendations.append("Data quality is good. Continue monitoring and maintenance.")
        
        # Specific issue recommendations
        for issue in validation_summary['issues_found']:
            if 'duplicate' in issue.lower():
                recommendations.append("Implement data deduplication process for affected tables")
            elif 'missing' in issue.lower() or 'completeness' in issue.lower():
                recommendations.append("Improve data entry processes to reduce missing values")
            elif 'date' in issue.lower():
                recommendations.append("Validate date formats and ranges during data entry")
            elif 'quantity' in issue.lower() or 'negative' in issue.lower():
                recommendations.append("Add business logic validation for numeric fields")
        
        # Table-specific recommendations
        for table_name, score in validation_summary['quality_scores'].items():
            if score < 70:
                recommendations.append(f"Focus data improvement efforts on {table_name.replace('df_', '').upper()} table")
        
        return list(set(recommendations))  # Remove duplicates
    
    def generate_data_quality_report(self, validation_summary, output_file=None):
        """
        Generate comprehensive data quality report
        """
        print("\n📊 GENERATING DATA QUALITY REPORT...")
        
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        if output_file is None:
            output_file = f"sap_data_quality_report_{timestamp}.txt"
        
        with open(output_file, 'w') as f:
            f.write("SAP DATA QUALITY ASSESSMENT REPORT\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"Generated: {validation_summary['timestamp']}\n")
            f.write(f"Overall Quality Score: {validation_summary['overall_quality_score']}/100\n\n")
            
            # Executive Summary
            f.write("EXECUTIVE SUMMARY\n")
            f.write("-" * 40 + "\n")
            f.write(f"Tables Validated: {validation_summary['tables_validated']}\n")
            f.write(f"Total Records: {validation_summary['total_records']:,}\n")
            f.write(f"Issues Identified: {len(validation_summary['issues_found'])}\n\n")
            
            # Quality Scores by Table
            f.write("QUALITY SCORES BY TABLE\n")
            f.write("-" * 40 + "\n")
            for table, score in validation_summary['quality_scores'].items():
                status = "✓ GOOD" if score >= 85 else "⚠ NEEDS IMPROVEMENT" if score >= 70 else "❌ POOR"
                f.write(f"{table.replace('df_', '').upper():15} {score:6.1f}/100  {status}\n")
            f.write("\n")
            
            # Issues Found
            if validation_summary['issues_found']:
                f.write("ISSUES IDENTIFIED\n")
                f.write("-" * 40 + "\n")
                for i, issue in enumerate(validation_summary['issues_found'], 1):
                    f.write(f"{i:2d}. {issue}\n")
                f.write("\n")
            
            # Recommendations
            f.write("RECOMMENDATIONS\n")
            f.write("-" * 40 + "\n")
            for i, rec in enumerate(validation_summary['recommendations'], 1):
                f.write(f"{i:2d}. {rec}\n")
            f.write("\n")
            
            # Detailed Findings
            f.write("DETAILED FINDINGS BY TABLE\n")
            f.write("-" * 40 + "\n")
            for table_name, results in self.validation_results.items():
                f.write(f"\n{table_name.replace('df_', '').upper()}\n")
                f.write(f"Records: {results['record_count']:,}\n")
                f.write(f"Columns: {results['column_count']}\n")
                f.write(f"Quality Score: {results['quality_score']}/100\n")
                
                if results['data_completeness']['overall_completeness'] < 95:
                    f.write(f"Data Completeness: {results['data_completeness']['overall_completeness']}%\n")
                
                if results['issues']:
                    f.write("Issues:\n")
                    for issue in results['issues']:
                        f.write(f"  • {issue}\n")
                f.write("\n")
        
        print(f"   ✅ Report saved to: {output_file}")
        return output_file
    
    def suggest_data_improvements(self, validation_summary):
        """
        Suggest specific data improvement actions
        """
        print("\n💡 GENERATING DATA IMPROVEMENT SUGGESTIONS...")
        
        improvements = {
            'immediate_actions': [],
            'process_improvements': [],
            'system_enhancements': [],
            'training_needs': []
        }
        
        # Analyze validation results for specific suggestions
        overall_score = validation_summary['overall_quality_score']
        
        if overall_score < 70:
            improvements['immediate_actions'].extend([
                "Stop using affected data for critical analysis until cleaned",
                "Perform emergency data cleansing on critical tables",
                "Identify and fix root causes of data quality issues"
            ])
        
        # Check for common issues
        issues_text = ' '.join(validation_summary['issues_found']).lower()
        
        if 'duplicate' in issues_text:
            improvements['process_improvements'].append(
                "Implement master data management (MDM) processes"
            )
            improvements['system_enhancements'].append(
                "Add duplicate detection rules to data entry systems"
            )
        
        if 'missing' in issues_text or 'completeness' in issues_text:
            improvements['process_improvements'].append(
                "Make critical fields mandatory in data entry forms"
            )
            improvements['training_needs'].append(
                "Train users on importance of complete data entry"
            )
        
        if 'date' in issues_text:
            improvements['system_enhancements'].append(
                "Implement date validation controls in SAP"
            )
            improvements['training_needs'].append(
                "Provide training on proper date entry formats"
            )
        
        if 'quantity' in issues_text or 'negative' in issues_text:
            improvements['process_improvements'].append(
                "Add business logic validation for quantity fields"
            )
            improvements['system_enhancements'].append(
                "Implement range checks for numeric fields"
            )
        
        # Table-specific suggestions
        poor_tables = [table for table, score in validation_summary['quality_scores'].items() if score < 70]
        if poor_tables:
            improvements['immediate_actions'].append(
                f"Focus data cleansing efforts on: {', '.join([t.replace('df_', '') for t in poor_tables])}"
            )
        
        print("   ✅ Data improvement suggestions generated")
        return improvements

def create_data_monitoring_plan(validation_summary):
    """
    Create ongoing data monitoring plan
    """
    print("📋 CREATING DATA MONITORING PLAN...")
    
    monitoring_plan = {
        'daily_checks': [],
        'weekly_checks': [],
        'monthly_checks': [],
        'automated_alerts': [],
        'data_steward_responsibilities': {}
    }
    
    # Daily checks for critical issues
    if validation_summary['overall_quality_score'] < 85:
        monitoring_plan['daily_checks'].extend([
            "Check for new duplicate records in master data tables",
            "Monitor completeness of critical fields",
            "Validate new order entries for data consistency"
        ])
    
    # Weekly checks
    monitoring_plan['weekly_checks'].extend([
        "Review data quality metrics dashboard",
        "Analyze data completeness trends",
        "Check for new data validation rule violations",
        "Review and resolve data quality alerts"
    ])
    
    # Monthly checks
    monitoring_plan['monthly_checks'].extend([
        "Perform comprehensive data quality assessment",
        "Review and update data validation rules",
        "Analyze data quality trends and patterns",
        "Generate data quality report for management"
    ])
    
    # Automated alerts
    monitoring_plan['automated_alerts'].extend([
        "Alert when duplicate records exceed threshold",
        "Alert when data completeness drops below 90%",
        "Alert for unusual data patterns or outliers",
        "Alert for failed data validation checks"
    ])
    
    # Data steward responsibilities
    monitoring_plan['data_steward_responsibilities'] = {
        'Master Data Steward': [
            "Monitor AUFK and AFKO data quality",
            "Resolve master data inconsistencies",
            "Maintain data validation rules"
        ],
        'Quality Data Steward': [
            "Monitor QMEL, QMFE, QMUR data quality",
            "Ensure quality notification completeness",
            "Validate quality code consistency"
        ],
        'Production Data Steward': [
            "Monitor AFPO and AUFM data quality",
            "Validate production quantities and movements",
            "Ensure material master data accuracy"
        ]
    }
    
    print("   ✅ Data monitoring plan created")
    return monitoring_plan

def run_complete_validation_suite(df_aufk=None, df_afko=None, df_afpo=None, df_aufm=None,
                                df_qmel=None, df_qmfe=None, df_qmur=None, df_qmih=None,
                                df_qpcd=None, df_qpct=None, df_qpgt=None, 
                                df_crhd_v1=None, df_jest=None, df_plant_description=None,
                                generate_reports=True):
    """
    Run complete validation suite on all SAP tables
    """
    print("🚀 SAP COMPLETE DATA VALIDATION SUITE")
    print("=" * 80)
    
    # Initialize validator
    validator = SAPDataValidator()
    
    # Prepare tables dictionary
    tables = {
        'df_aufk': df_aufk,
        'df_afko': df_afko,
        'df_afpo': df_afpo,
        'df_aufm': df_aufm,
        'df_qmel': df_qmel,
        'df_qmfe': df_qmfe,
        'df_qmur': df_qmur,
        'df_qmih': df_qmih,
        'df_qpcd': df_qpcd,
        'df_qpct': df_qpct,
        'df_qpgt': df_qpgt,
        'df_crhd_v1': df_crhd_v1,
        'df_jest': df_jest,
        'df_plant_description': df_plant_description
    }
    
    # Remove None tables
    tables = {name: df for name, df in tables.items() if df is not None and not df.empty}
    
    # Run validation
    validation_summary = validator.validate_all_tables(**tables)
    
    # Generate additional analyses
    improvements = validator.suggest_data_improvements(validation_summary)
    monitoring_plan = create_data_monitoring_plan(validation_summary)
    
    # Combine all results
    complete_results = {
        'validation_summary': validation_summary,
        'detailed_results': validator.validation_results,
        'improvement_suggestions': improvements,
        'monitoring_plan': monitoring_plan
    }
    
    # Generate reports if requested
    if generate_reports:
        print("\n📄 GENERATING REPORTS...")
        
        # Main data quality report
        report_file = validator.generate_data_quality_report(validation_summary)
        
        # Export complete results to Excel
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        excel_file = f"sap_validation_complete_{timestamp}.xlsx"
        
        try:
            with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
                # Summary sheet
                summary_df = pd.DataFrame([validation_summary])
                summary_df.to_excel(writer, sheet_name='Validation_Summary', index=False)
                
                # Quality scores
                scores_df = pd.DataFrame(list(validation_summary['quality_scores'].items()),
                                       columns=['Table', 'Quality_Score'])
                scores_df.to_excel(writer, sheet_name='Quality_Scores', index=False)
                
                # Issues
                if validation_summary['issues_found']:
                    issues_df = pd.DataFrame(validation_summary['issues_found'], columns=['Issue'])
                    issues_df.to_excel(writer, sheet_name='Issues_Found', index=False)
                
                # Recommendations
                rec_df = pd.DataFrame(validation_summary['recommendations'], columns=['Recommendation'])
                rec_df.to_excel(writer, sheet_name='Recommendations', index=False)
                
                # Improvement suggestions
                for category, suggestions in improvements.items():
                    if suggestions:
                        imp_df = pd.DataFrame(suggestions, columns=[category.replace('_', ' ').title()])
                        imp_df.to_excel(writer, sheet_name=f'Improvements_{category}', index=False)
            
            print(f"   ✅ Excel report created: {excel_file}")
            
        except Exception as e:
            print(f"   ⚠️  Excel export failed: {e}")
            excel_file = None
        
        print(f"\n📁 VALIDATION REPORTS CREATED:")
        print(f"   • Text Report: {report_file}")
        if excel_file:
            print(f"   • Excel Report: {excel_file}")
    
    print(f"\n✅ VALIDATION SUITE COMPLETED!")
    print(f"   📊 Overall Quality Score: {validation_summary['overall_quality_score']}/100")
    print(f"   🔍 Tables Validated: {validation_summary['tables_validated']}")
    print(f"   📋 Total Issues: {len(validation_summary['issues_found'])}")
    
    return complete_results

# Usage example
def example_validation_usage():
    """
    Example of how to use the validation suite
    """
    print("📚 SAP DATA VALIDATION USAGE EXAMPLE")
    print("=" * 60)
    
    print("\n1. BASIC VALIDATION:")
    print("   # Validate all your tables")
    print("   results = run_complete_validation_suite(")
    print("       df_aufk=df_aufk, df_afko=df_afko, df_afpo=df_afpo,")
    print("       df_qmel=df_qmel, df_qmfe=df_qmfe, # ... other tables")
    print("   )")
    
    print("\n2. ACCESS RESULTS:")
    print("   # Get overall quality score")
    print("   quality_score = results['validation_summary']['overall_quality_score']")
    print("   ")
    print("   # Get specific table issues")
    print("   aufk_issues = results['detailed_results']['df_aufk']['issues']")
    print("   ")
    print("   # Get improvement suggestions")
    print("   improvements = results['improvement_suggestions']")
    
    print("\n3. IMPLEMENT MONITORING:")
    print("   # Use monitoring plan for ongoing data quality")
    print("   monitoring = results['monitoring_plan']")
    print("   daily_checks = monitoring['daily_checks']")
    
    print("\n4. SCHEDULED VALIDATION:")
    print("   # Run this weekly/monthly for ongoing monitoring")
    print("   # Set up automated alerts based on quality scores")

if __name__ == "__main__":
    example_validation_usage()

📚 SAP DATA VALIDATION USAGE EXAMPLE

1. BASIC VALIDATION:
   # Validate all your tables
   results = run_complete_validation_suite(
       df_aufk=df_aufk, df_afko=df_afko, df_afpo=df_afpo,
       df_qmel=df_qmel, df_qmfe=df_qmfe, # ... other tables
   )

2. ACCESS RESULTS:
   # Get overall quality score
   quality_score = results['validation_summary']['overall_quality_score']
   
   # Get specific table issues
   aufk_issues = results['detailed_results']['df_aufk']['issues']
   
   # Get improvement suggestions
   improvements = results['improvement_suggestions']

3. IMPLEMENT MONITORING:
   # Use monitoring plan for ongoing data quality
   monitoring = results['monitoring_plan']
   daily_checks = monitoring['daily_checks']

4. SCHEDULED VALIDATION:
   # Run this weekly/monthly for ongoing monitoring
   # Set up automated alerts based on quality scores


In [23]:
# 1. Run core integration
comprehensive_df, summary, quality_details = create_comprehensive_sap_view(
    df_aufk, df_afko, df_afpo, df_aufm, df_qmel, df_qmfe, df_qmur, 
    df_qmih, df_qpcd, df_qpct, df_qpgt, df_crhd_v1, df_jest
)

# 2. Run advanced analytics
all_analyses = run_complete_advanced_analytics(
    comprehensive_df, summary, quality_details
)

# 3. Validate data quality
validation_results = run_complete_validation_suite(
    df_aufk, df_afko, df_afpo, df_qmel, df_qmfe  # ... all tables
)

# 4. Access results
quality_score = comprehensive_df['QUALITY_SCORE'].mean()
plant_performance = all_analyses['plant_performance']
action_plans = all_analyses['action_plans']

🚀 SAP COMPREHENSIVE QUALITY MANAGEMENT INTEGRATION
✓ Created plant description from your data
📊 Input Data Summary:
   • AUFK (Orders): 3 records
   • AFKO (Headers): 2 records
   • AFPO (Items): 3 records
   • QMEL (Quality): 2 records
   • Plants: 13 plants

🏭 STEP 1: Building Production Order Foundation...
   ✓ Order headers joined: 3 orders

📦 STEP 2: Processing Order Items...
   ✓ Order items summary added: 3 orders

📊 STEP 3: Processing Goods Movements...
   ✓ Goods movements summary added: 3 orders

🔍 STEP 4: Processing Quality Notifications...
   ⚠️  Quality notifications processing failed: "Column(s) ['PRIOK'] do not exist"

⚠️  STEP 5: Processing Quality Defects...
   ✓ Quality defects added: 3 orders

🎯 STEP 6: Processing Root Causes...
   ✓ Root causes added: 3 orders

⚙️  STEP 7: Adding Work Center Information...
   ✓ Work centers linked via OBJID: 3 orders

📊 STEP 8: Adding Status Information...

🛠️  STEP 9: Creating KPIs and Derived Fields...
   ✓ KPIs and derived fields

NameError: name 'run_complete_advanced_analytics' is not defined

In [24]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

class SAPResultsManager:
    """
    Manages all SAP analysis results and exports to organized folder structure
    """
    
    def __init__(self, base_folder="result"):
        self.base_folder = base_folder
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.session_folder = None
        self.created_files = []
        
    def setup_folder_structure(self):
        """
        Create organized folder structure for results
        """
        print(f"📁 Setting up folder structure in '{self.base_folder}'...")
        
        # Create main result folder
        if not os.path.exists(self.base_folder):
            os.makedirs(self.base_folder)
            print(f"   ✓ Created main folder: {self.base_folder}")
        
        # Create session-specific subfolder
        self.session_folder = os.path.join(self.base_folder, f"sap_analysis_{self.timestamp}")
        os.makedirs(self.session_folder, exist_ok=True)
        print(f"   ✓ Created session folder: {self.session_folder}")
        
        # Create subfolders for different types of outputs
        subfolders = [
            "01_comprehensive_data",
            "02_advanced_analytics", 
            "03_data_validation",
            "04_executive_reports",
            "05_detailed_analysis",
            "06_action_plans",
            "07_dashboards",
            "08_quality_scorecards"
        ]
        
        for subfolder in subfolders:
            folder_path = os.path.join(self.session_folder, subfolder)
            os.makedirs(folder_path, exist_ok=True)
            print(f"   ✓ Created: {subfolder}")
        
        print(f"   ✅ Folder structure ready!")
        return self.session_folder
    
    def save_comprehensive_data(self, comprehensive_df, summary_stats, quality_details):
        """
        Save comprehensive integration results
        """
        print("\n💾 Saving Comprehensive Data...")
        folder = os.path.join(self.session_folder, "01_comprehensive_data")
        
        # Main comprehensive dataset
        main_file = os.path.join(folder, f"comprehensive_dataset_{self.timestamp}.xlsx")
        
        with pd.ExcelWriter(main_file, engine='openpyxl') as writer:
            # Main data
            comprehensive_df.to_excel(writer, sheet_name='Comprehensive_Data', index=False)
            
            # Summary statistics
            summary_df = pd.json_normalize(summary_stats, sep='_')
            summary_df.to_excel(writer, sheet_name='Summary_Statistics', index=False)
            
            # Quality distribution
            if 'QUALITY_CATEGORY' in comprehensive_df.columns:
                quality_dist = comprehensive_df['QUALITY_CATEGORY'].value_counts().reset_index()
                quality_dist.to_excel(writer, sheet_name='Quality_Distribution', index=False)
            
            # Plant analysis
            plant_col = self._get_plant_column(comprehensive_df)
            if plant_col:
                plant_summary = comprehensive_df.groupby(plant_col).agg({
                    'AUFNR': 'count',
                    'QUALITY_NOTIF_COUNT': 'sum',
                    'DEFECT_COUNT': 'sum',
                    'QUALITY_SCORE': 'mean'
                }).reset_index()
                plant_summary.to_excel(writer, sheet_name='Plant_Analysis', index=False)
            
            # Top quality issues
            if 'QUALITY_NOTIF_COUNT' in comprehensive_df.columns:
                top_issues = comprehensive_df[comprehensive_df['QUALITY_NOTIF_COUNT'] > 0].nlargest(20, 'QUALITY_NOTIF_COUNT')
                top_issues.to_excel(writer, sheet_name='Top_Quality_Issues', index=False)
        
        self.created_files.append(main_file)
        print(f"   ✓ Comprehensive Excel: {os.path.basename(main_file)}")
        
        # CSV export for easy analysis
        csv_file = os.path.join(folder, f"comprehensive_dataset_{self.timestamp}.csv")
        comprehensive_df.to_csv(csv_file, index=False)
        self.created_files.append(csv_file)
        print(f"   ✓ Comprehensive CSV: {os.path.basename(csv_file)}")
        
        # JSON export for APIs
        json_file = os.path.join(folder, f"summary_stats_{self.timestamp}.json")
        with open(json_file, 'w') as f:
            json.dump(summary_stats, f, indent=2, default=str)
        self.created_files.append(json_file)
        print(f"   ✓ Summary JSON: {os.path.basename(json_file)}")
        
        return main_file, csv_file, json_file
    
    def save_advanced_analytics(self, all_analyses, comprehensive_df):
        """
        Save advanced analytics results
        """
        print("\n📊 Saving Advanced Analytics...")
        folder = os.path.join(self.session_folder, "02_advanced_analytics")
        
        # Main analytics Excel
        analytics_file = os.path.join(folder, f"advanced_analytics_{self.timestamp}.xlsx")
        
        with pd.ExcelWriter(analytics_file, engine='openpyxl') as writer:
            # Dashboard data
            if 'dashboard_data' in all_analyses:
                dashboard_df = pd.json_normalize(all_analyses['dashboard_data'], sep='_')
                dashboard_df.to_excel(writer, sheet_name='Dashboard_Data', index=False)
            
            # Material quality analysis
            if 'material_quality' in all_analyses and 'top_risk_materials' in all_analyses['material_quality']:
                material_df = pd.DataFrame(all_analyses['material_quality']['top_risk_materials']).T
                material_df.to_excel(writer, sheet_name='Material_Quality')
            
            # Operational efficiency
            if 'operational_efficiency' in all_analyses:
                efficiency_df = pd.json_normalize(all_analyses['operational_efficiency'], sep='_')
                efficiency_df.to_excel(writer, sheet_name='Operational_Efficiency', index=False)
            
            # Defect patterns
            if 'defect_patterns' in all_analyses:
                if 'top_defect_patterns' in all_analyses['defect_patterns']:
                    defect_df = pd.DataFrame(all_analyses['defect_patterns']['top_defect_patterns']).T
                    defect_df.to_excel(writer, sheet_name='Defect_Patterns')
            
            # Quality costs
            if 'quality_costs' in all_analyses and 'cost_by_plant' in all_analyses['quality_costs']:
                cost_df = pd.DataFrame(all_analyses['quality_costs']['cost_by_plant']).T
                cost_df.to_excel(writer, sheet_name='Quality_Costs')
            
            # High-risk orders
            if 'QUALITY_SCORE' in comprehensive_df.columns:
                high_risk = comprehensive_df[comprehensive_df['QUALITY_SCORE'] < 70].copy()
                if not high_risk.empty:
                    high_risk.to_excel(writer, sheet_name='High_Risk_Orders', index=False)
            
            # Top performers
            if 'QUALITY_SCORE' in comprehensive_df.columns:
                top_performers = comprehensive_df[comprehensive_df['QUALITY_SCORE'] >= 95].copy()
                if not top_performers.empty:
                    top_performers.to_excel(writer, sheet_name='Top_Performers', index=False)
        
        self.created_files.append(analytics_file)
        print(f"   ✓ Advanced Analytics Excel: {os.path.basename(analytics_file)}")
        
        # JSON export for APIs/dashboards
        json_file = os.path.join(folder, f"advanced_analytics_{self.timestamp}.json")
        
        # Convert non-serializable objects
        def convert_for_json(obj):
            if isinstance(obj, (pd.Timestamp, datetime)):
                return obj.isoformat()
            elif isinstance(obj, pd.Series):
                return obj.to_dict()
            elif isinstance(obj, pd.DataFrame):
                return obj.to_dict('records')
            elif isinstance(obj, np.integer):
                return int(obj)
            elif isinstance(obj, np.floating):
                return float(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            return obj
        
        json_analyses = json.loads(json.dumps(all_analyses, default=convert_for_json))
        
        with open(json_file, 'w') as f:
            json.dump(json_analyses, f, indent=2)
        
        self.created_files.append(json_file)
        print(f"   ✓ Advanced Analytics JSON: {os.path.basename(json_file)}")
        
        return analytics_file, json_file
    
    def save_data_validation_results(self, validation_results):
        """
        Save data validation results
        """
        print("\n🔍 Saving Data Validation Results...")
        folder = os.path.join(self.session_folder, "03_data_validation")
        
        validation_summary = validation_results['validation_summary']
        detailed_results = validation_results['detailed_results']
        improvements = validation_results['improvement_suggestions']
        monitoring_plan = validation_results['monitoring_plan']
        
        # Main validation report
        report_file = os.path.join(folder, f"data_quality_report_{self.timestamp}.txt")
        
        with open(report_file, 'w') as f:
            f.write("SAP DATA QUALITY ASSESSMENT REPORT\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"Generated: {validation_summary['timestamp']}\n")
            f.write(f"Overall Quality Score: {validation_summary['overall_quality_score']}/100\n\n")
            
            # Executive Summary
            f.write("EXECUTIVE SUMMARY\n")
            f.write("-" * 40 + "\n")
            f.write(f"Tables Validated: {validation_summary['tables_validated']}\n")
            f.write(f"Total Records: {validation_summary['total_records']:,}\n")
            f.write(f"Issues Identified: {len(validation_summary['issues_found'])}\n\n")
            
            # Quality Scores by Table
            f.write("QUALITY SCORES BY TABLE\n")
            f.write("-" * 40 + "\n")
            for table, score in validation_summary['quality_scores'].items():
                status = "✓ GOOD" if score >= 85 else "⚠ NEEDS IMPROVEMENT" if score >= 70 else "❌ POOR"
                f.write(f"{table.replace('df_', '').upper():15} {score:6.1f}/100  {status}\n")
            f.write("\n")
            
            # Issues Found
            if validation_summary['issues_found']:
                f.write("ISSUES IDENTIFIED\n")
                f.write("-" * 40 + "\n")
                for i, issue in enumerate(validation_summary['issues_found'], 1):
                    f.write(f"{i:2d}. {issue}\n")
                f.write("\n")
            
            # Recommendations
            f.write("RECOMMENDATIONS\n")
            f.write("-" * 40 + "\n")
            for i, rec in enumerate(validation_summary['recommendations'], 1):
                f.write(f"{i:2d}. {rec}\n")
            f.write("\n")
        
        self.created_files.append(report_file)
        print(f"   ✓ Data Quality Report: {os.path.basename(report_file)}")
        
        # Excel export
        excel_file = os.path.join(folder, f"data_validation_complete_{self.timestamp}.xlsx")
        
        with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
            # Summary sheet
            summary_df = pd.DataFrame([validation_summary])
            summary_df.to_excel(writer, sheet_name='Validation_Summary', index=False)
            
            # Quality scores
            scores_df = pd.DataFrame(list(validation_summary['quality_scores'].items()),
                                   columns=['Table', 'Quality_Score'])
            scores_df.to_excel(writer, sheet_name='Quality_Scores', index=False)
            
            # Issues
            if validation_summary['issues_found']:
                issues_df = pd.DataFrame(validation_summary['issues_found'], columns=['Issue'])
                issues_df.to_excel(writer, sheet_name='Issues_Found', index=False)
            
            # Recommendations
            rec_df = pd.DataFrame(validation_summary['recommendations'], columns=['Recommendation'])
            rec_df.to_excel(writer, sheet_name='Recommendations', index=False)
            
            # Improvement suggestions
            for category, suggestions in improvements.items():
                if suggestions:
                    imp_df = pd.DataFrame(suggestions, columns=[category.replace('_', ' ').title()])
                    imp_df.to_excel(writer, sheet_name=f'Improvements_{category[:15]}', index=False)
        
        self.created_files.append(excel_file)
        print(f"   ✓ Data Validation Excel: {os.path.basename(excel_file)}")
        
        return report_file, excel_file
    
    def save_executive_reports(self, all_analyses, comprehensive_df, validation_results):
        """
        Save executive-level reports
        """
        print("\n👔 Saving Executive Reports...")
        folder = os.path.join(self.session_folder, "04_executive_reports")
        
        # Executive Summary Report
        exec_report = os.path.join(folder, f"executive_summary_{self.timestamp}.txt")
        
        with open(exec_report, 'w') as f:
            f.write("SAP QUALITY MANAGEMENT - EXECUTIVE SUMMARY\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"Report Date: {datetime.now().strftime('%B %d, %Y')}\n")
            f.write(f"Analysis Period: {self.timestamp}\n\n")
            
            # Key Metrics
            f.write("KEY PERFORMANCE INDICATORS\n")
            f.write("-" * 40 + "\n")
            
            if 'executive_summary' in all_analyses:
                exec_sum = all_analyses['executive_summary']
                f.write(f"Total Orders Analyzed: {exec_sum['overview']['total_orders']:,}\n")
                f.write(f"Data Coverage: {exec_sum['overview']['data_coverage']}%\n")
                
                if exec_sum['key_findings']:
                    f.write("\nKEY FINDINGS:\n")
                    for finding in exec_sum['key_findings']:
                        f.write(f"• {finding}\n")
            
            # Data Quality Status
            if validation_results:
                val_summary = validation_results['validation_summary']
                f.write(f"\nDATA QUALITY STATUS\n")
                f.write("-" * 40 + "\n")
                f.write(f"Overall Data Quality Score: {val_summary['overall_quality_score']}/100\n")
                
                quality_status = "EXCELLENT" if val_summary['overall_quality_score'] >= 90 else \
                               "GOOD" if val_summary['overall_quality_score'] >= 80 else \
                               "NEEDS IMPROVEMENT" if val_summary['overall_quality_score'] >= 70 else "POOR"
                f.write(f"Data Quality Status: {quality_status}\n")
            
            # Quality Performance
            if 'quality_analysis' in all_analyses.get('dashboard_data', {}):
                qa = all_analyses['dashboard_data']['quality_analysis'] 
                f.write(f"\nQUALITY PERFORMANCE\n")
                f.write("-" * 40 + "\n")
                f.write(f"Quality Issue Rate: {qa.get('quality_issue_rate', 'N/A')}%\n")
                f.write(f"Average Quality Score: {qa.get('avg_quality_score', 'N/A')}/100\n")
                f.write(f"Orders with Quality Issues: {qa.get('orders_with_quality_issues', 'N/A'):,}\n")
            
            # Recommendations
            if 'executive_summary' in all_analyses and all_analyses['executive_summary'].get('recommendations'):
                f.write(f"\nSTRATEGIC RECOMMENDATIONS\n")
                f.write("-" * 40 + "\n")
                for i, rec in enumerate(all_analyses['executive_summary']['recommendations'], 1):
                    f.write(f"{i}. {rec}\n")
            
            # Next Steps
            if 'executive_summary' in all_analyses and all_analyses['executive_summary'].get('next_steps'):
                f.write(f"\nNEXT STEPS\n")
                f.write("-" * 40 + "\n")
                for i, step in enumerate(all_analyses['executive_summary']['next_steps'], 1):
                    f.write(f"{i}. {step}\n")
        
        self.created_files.append(exec_report)
        print(f"   ✓ Executive Summary: {os.path.basename(exec_report)}")
        
        # Executive Dashboard Data (JSON for visualizations)
        dashboard_json = os.path.join(folder, f"executive_dashboard_{self.timestamp}.json")
        
        exec_dashboard = {
            'timestamp': self.timestamp,
            'report_date': datetime.now().isoformat(),
            'kpis': {},
            'charts': {},
            'alerts': []
        }
        
        # Extract KPIs
        if 'dashboard_data' in all_analyses and 'quality_kpis' in all_analyses['dashboard_data']:
            exec_dashboard['kpis'] = all_analyses['dashboard_data']['quality_kpis']
        
        # Plant performance for charts
        if 'dashboard_data' in all_analyses and 'plant_performance' in all_analyses['dashboard_data']:
            exec_dashboard['charts']['plant_performance'] = all_analyses['dashboard_data']['plant_performance']
        
        # Quality alerts
        if validation_results:
            val_summary = validation_results['validation_summary']
            if val_summary['overall_quality_score'] < 80:
                exec_dashboard['alerts'].append({
                    'type': 'warning',
                    'message': f"Data quality score ({val_summary['overall_quality_score']}) below target (80)",
                    'priority': 'high'
                })
            
            if len(val_summary['issues_found']) > 10:
                exec_dashboard['alerts'].append({
                    'type': 'warning', 
                    'message': f"{len(val_summary['issues_found'])} data quality issues identified",
                    'priority': 'medium'
                })
        
        with open(dashboard_json, 'w') as f:
            json.dump(exec_dashboard, f, indent=2, default=str)
        
        self.created_files.append(dashboard_json)
        print(f"   ✓ Executive Dashboard JSON: {os.path.basename(dashboard_json)}")
        
        return exec_report, dashboard_json
    
    def save_action_plans(self, all_analyses):
        """
        Save action plans and recommendations
        """
        print("\n📋 Saving Action Plans...")
        folder = os.path.join(self.session_folder, "06_action_plans")
        
        if 'action_plans' not in all_analyses:
            print("   ⚠️  No action plans available")
            return None
        
        action_plans = all_analyses['action_plans']
        
        # Main action plan document
        action_file = os.path.join(folder, f"action_plan_{self.timestamp}.txt")
        
        with open(action_file, 'w') as f:
            f.write("SAP QUALITY IMPROVEMENT ACTION PLAN\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            
            # Immediate Actions (1-30 days)
            f.write("IMMEDIATE ACTIONS (1-30 DAYS)\n")
            f.write("-" * 40 + "\n")
            for i, action in enumerate(action_plans.get('immediate_actions', []), 1):
                f.write(f"{i}. {action.get('action', action)}\n")
                if isinstance(action, dict):
                    f.write(f"   Timeline: {action.get('timeline', 'TBD')}\n")
                    f.write(f"   Responsible: {action.get('responsible', 'TBD')}\n")
                    f.write(f"   Priority: {action.get('priority', 'Medium')}\n")
                f.write("\n")
            
            # Short-term Actions (1-3 months)
            f.write("SHORT-TERM ACTIONS (1-3 MONTHS)\n")
            f.write("-" * 40 + "\n")
            for i, action in enumerate(action_plans.get('short_term_actions', []), 1):
                f.write(f"{i}. {action.get('action', action)}\n")
                if isinstance(action, dict):
                    f.write(f"   Timeline: {action.get('timeline', 'TBD')}\n")
                    f.write(f"   Responsible: {action.get('responsible', 'TBD')}\n")
                    f.write(f"   Priority: {action.get('priority', 'Medium')}\n")
                f.write("\n")
            
            # Long-term Actions (3-12 months)
            f.write("LONG-TERM ACTIONS (3-12 MONTHS)\n")
            f.write("-" * 40 + "\n")
            for i, action in enumerate(action_plans.get('long_term_actions', []), 1):
                f.write(f"{i}. {action.get('action', action)}\n")
                if isinstance(action, dict):
                    f.write(f"   Timeline: {action.get('timeline', 'TBD')}\n")
                    f.write(f"   Responsible: {action.get('responsible', 'TBD')}\n")
                    f.write(f"   Priority: {action.get('priority', 'Medium')}\n")
                f.write("\n")
            
            # Responsible Parties
            if 'responsible_parties' in action_plans:
                f.write("RESPONSIBLE PARTIES\n")
                f.write("-" * 40 + "\n")
                for party, responsibilities in action_plans['responsible_parties'].items():
                    f.write(f"{party}:\n")
                    f.write(f"  {responsibilities}\n\n")
        
        self.created_files.append(action_file)
        print(f"   ✓ Action Plan: {os.path.basename(action_file)}")
        
        # Excel format for tracking
        excel_file = os.path.join(folder, f"action_plan_tracker_{self.timestamp}.xlsx")
        
        # Convert action plans to tracking format
        tracking_data = []
        
        for category, actions in [
            ('Immediate', action_plans.get('immediate_actions', [])),
            ('Short-term', action_plans.get('short_term_actions', [])),
            ('Long-term', action_plans.get('long_term_actions', []))
        ]:
            for action in actions:
                if isinstance(action, dict):
                    tracking_data.append({
                        'Category': category,
                        'Action': action.get('action', ''),
                        'Timeline': action.get('timeline', 'TBD'),
                        'Responsible': action.get('responsible', 'TBD'),
                        'Priority': action.get('priority', 'Medium'),
                        'Status': 'Not Started',
                        'Progress': 0,
                        'Notes': ''
                    })
                else:
                    tracking_data.append({
                        'Category': category,
                        'Action': str(action),
                        'Timeline': 'TBD',
                        'Responsible': 'TBD',
                        'Priority': 'Medium',
                        'Status': 'Not Started',
                        'Progress': 0,
                        'Notes': ''
                    })
        
        if tracking_data:
            tracking_df = pd.DataFrame(tracking_data)
            tracking_df.to_excel(excel_file, index=False)
            self.created_files.append(excel_file)
            print(f"   ✓ Action Plan Tracker: {os.path.basename(excel_file)}")
        
        return action_file, excel_file
    
    def save_quality_scorecards(self, all_analyses, comprehensive_df):
        """
        Save quality scorecards
        """
        print("\n🏆 Saving Quality Scorecards...")
        folder = os.path.join(self.session_folder, "08_quality_scorecards")
        
        # Overall scorecard
        if 'quality_scorecard' in all_analyses:
            scorecard = all_analyses['quality_scorecard']
            
            scorecard_file = os.path.join(folder, f"quality_scorecard_{self.timestamp}.txt")
            
            with open(scorecard_file, 'w') as f:
                f.write("QUALITY PERFORMANCE SCORECARD\n")
                f.write("=" * 60 + "\n\n")
                f.write(f"Period: {scorecard.get('period', self.timestamp)}\n")
                f.write(f"Scope: {scorecard.get('scope', 'All Plants')}\n\n")
                
                f.write("PERFORMANCE METRICS\n")
                f.write("-" * 30 + "\n")
                
                for metric, data in scorecard.get('metrics', {}).items():
                    f.write(f"{metric}:\n")
                    f.write(f"  Current Value: {data.get('value', 'N/A')}\n")
                    f.write(f"  Target: {data.get('target', 'N/A')}\n")
                    f.write(f"  Status: {data.get('status', 'N/A')}\n")
                    f.write(f"  Trend: {data.get('trend', 'N/A')}\n\n")
            
            self.created_files.append(scorecard_file)
            print(f"   ✓ Quality Scorecard: {os.path.basename(scorecard_file)}")
        
        # Plant-specific scorecards
        plant_col = self._get_plant_column(comprehensive_df)
        if plant_col:
            plants = comprehensive_df[plant_col].unique()
            
            for plant in plants[:5]:  # Top 5 plants
                if pd.notna(plant):
                    plant_scorecard_file = os.path.join(folder, f"scorecard_{plant}_{self.timestamp}.txt")
                    
                    plant_data = comprehensive_df[comprehensive_df[plant_col] == plant]
                    
                    with open(plant_scorecard_file, 'w') as f:
                        f.write(f"QUALITY SCORECARD - {plant}\n")
                        f.write("=" * 50 + "\n\n")
                        f.write(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d')}\n")
                        f.write(f"Total Orders: {len(plant_data):,}\n\n")
                        
                        if 'QUALITY_SCORE' in plant_data.columns:
                            avg_score = plant_data['QUALITY_SCORE'].mean()
                            f.write(f"Average Quality Score: {avg_score:.1f}/100\n")
                        
                        if 'HAS_QUALITY_ISSUES' in plant_data.columns:
                            issue_rate = (plant_data['HAS_QUALITY_ISSUES'].sum() / len(plant_data) * 100)
                            f.write(f"Quality Issue Rate: {issue_rate:.1f}%\n")
                        
                        if 'PRODUCTION_EFFICIENCY' in plant_data.columns:
                            efficiency = plant_data['PRODUCTION_EFFICIENCY'].mean()
                            f.write(f"Production Efficiency: {efficiency:.1f}%\n")
                    
                    self.created_files.append(plant_scorecard_file)
            
            print(f"   ✓ Plant scorecards created for {min(len(plants), 5)} plants")
    
    def generate_summary_index(self):
        """
        Generate an index file listing all created reports
        """
        print("\n📋 Generating Summary Index...")
        
        index_file = os.path.join(self.session_folder, f"INDEX_README_{self.timestamp}.txt")
        
        with open(index_file, 'w') as f:
            f.write("SAP QUALITY MANAGEMENT ANALYSIS - FILE INDEX\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"Analysis Session: {self.timestamp}\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Files Created: {len(self.created_files)}\n\n")
            
            f.write("FOLDER STRUCTURE AND FILES\n")
            f.write("-" * 40 + "\n\n")
            
            # Group files by folder
            folder_files = {}
            for file_path in self.created_files:
                folder = os.path.dirname(file_path)
                folder_name = os.path.basename(folder)
                if folder_name not in folder_files:
                    folder_files[folder_name] = []
                folder_files[folder_name].append(os.path.basename(file_path))
            
            for folder, files in sorted(folder_files.items()):
                f.write(f"{folder}/\n")
                for file in sorted(files):
                    f.write(f"  ├── {file}\n")
                f.write("\n")
            
            f.write("FILE DESCRIPTIONS\n")
            f.write("-" * 40 + "\n")
            f.write("01_comprehensive_data/    - Main integrated dataset and summaries\n")
            f.write("02_advanced_analytics/    - Detailed analysis results and insights\n")
            f.write("03_data_validation/       - Data quality assessment reports\n")
            f.write("04_executive_reports/     - Management-level summaries\n")
            f.write("05_detailed_analysis/     - Deep-dive analysis results\n")
            f.write("06_action_plans/          - Improvement action plans and trackers\n")
            f.write("07_dashboards/            - Dashboard configurations and data\n")
            f.write("08_quality_scorecards/    - Performance scorecards by plant\n\n")
            
            f.write("QUICK START GUIDE\n")
            f.write("-" * 40 + "\n")
            f.write("1. Start with: 04_executive_reports/executive_summary_*.txt\n")
            f.write("2. Review: 01_comprehensive_data/comprehensive_dataset_*.xlsx\n")
            f.write("3. Check: 03_data_validation/data_quality_report_*.txt\n")
            f.write("4. Implement: 06_action_plans/action_plan_*.txt\n")
            f.write("5. Monitor: 08_quality_scorecards/quality_scorecard_*.txt\n")
        
        self.created_files.append(index_file)
        print(f"   ✓ Index file: {os.path.basename(index_file)}")
        
        return index_file
    
    def _get_plant_column(self, df):
        """Helper method to find the best plant column"""
        for col in ['Plant_Name', 'Plant_Code', 'WERKS']:
            if col in df.columns and df[col].notna().sum() > 0:
                return col
        return None

def run_complete_analysis_with_results_folder(
    df_aufk=None, df_afko=None, df_afpo=None, df_aufm=None,
    df_qmel=None, df_qmfe=None, df_qmur=None, df_qmih=None,
    df_qpcd=None, df_qpct=None, df_qpgt=None, 
    df_crhd_v1=None, df_jest=None, df_plant_description=None,
    result_folder="result"
):
    """
    Run complete SAP analysis and save all results to organized folder structure
    """
    print("🚀 SAP COMPLETE ANALYSIS WITH RESULTS MANAGEMENT")
    print("=" * 80)
    
    # Initialize results manager
    results_manager = SAPResultsManager(result_folder)
    session_folder = results_manager.setup_folder_structure()
    
    try:
        # Import the required functions (assuming they're available)
        from complete_sap_integration import create_comprehensive_sap_view
        from sap_advanced_analytics import create_all_advanced_analyses
        from sap_data_validation import run_complete_validation_suite
        
        print("\n🔄 Running Core Integration...")
        
        # Step 1: Create comprehensive view
        comprehensive_df, summary_stats, quality_details = create_comprehensive_sap_view(
            df_aufk, df_afko, df_afpo, df_aufm, df_qmel, df_qmfe, df_qmur, 
            df_qmih, df_qpcd, df_qpct, df_qpgt, df_crhd_v1, df_jest, df_plant_description
        )
        
        # Save comprehensive data
        results_manager.save_comprehensive_data(comprehensive_df, summary_stats, quality_details)
        
        print("\n📊 Running Advanced Analytics...")
        
        # Step 2: Run advanced analytics
        all_analyses = create_all_advanced_analyses(comprehensive_df, summary_stats, quality_details)
        
        # Save advanced analytics
        results_manager.save_advanced_analytics(all_analyses, comprehensive_df)
        
        print("\n🔍 Running Data Validation...")
        
        # Step 3: Run data validation
        tables = {
            'df_aufk': df_aufk, 'df_afko': df_afko, 'df_afpo': df_afpo, 'df_aufm': df_aufm,
            'df_qmel': df_qmel, 'df_qmfe': df_qmfe, 'df_qmur': df_qmur, 'df_qmih': df_qmih,
            'df_qpcd': df_qpcd, 'df_qpct': df_qpct, 'df_qpgt': df_qpgt,
            'df_crhd_v1': df_crhd_v1, 'df_jest': df_jest, 'df_plant_description': df_plant_description
        }
        
        # Remove None tables
        tables = {name: df for name, df in tables.items() if df is not None and not df.empty}
        
        validation_results = run_complete_validation_suite(**tables, generate_reports=False)
        
        # Save validation results
        results_manager.save_data_validation_results(validation_results)
        
        print("\n📋 Generating Executive Reports...")
        
        # Step 4: Save executive reports
        results_manager.save_executive_reports(all_analyses, comprehensive_df, validation_results)
        
        print("\n📝 Saving Action Plans...")
        
        # Step 5: Save action plans
        results_manager.save_action_plans(all_analyses)
        
        print("\n🏆 Creating Quality Scorecards...")
        
        # Step 6: Save quality scorecards
        results_manager.save_quality_scorecards(all_analyses, comprehensive_df)
        
        print("\n📑 Generating Summary Index...")
        
        # Step 7: Generate index file
        index_file = results_manager.generate_summary_index()
        
        print(f"\n✅ ANALYSIS COMPLETE!")
        print("=" * 80)
        print(f"📁 Results saved to: {session_folder}")
        print(f"📋 Total files created: {len(results_manager.created_files)}")
        print(f"📄 Start with: {os.path.basename(index_file)}")
        
        # Return summary for further use
        return {
            'session_folder': session_folder,
            'comprehensive_df': comprehensive_df,
            'all_analyses': all_analyses,
            'validation_results': validation_results,
            'created_files': results_manager.created_files,
            'results_manager': results_manager
        }
        
    except ImportError as e:
        print(f"❌ Import Error: {e}")
        print("Please ensure all required modules are available")
        return None
    
    except Exception as e:
        print(f"❌ Analysis Error: {e}")
        print("Check your data and try again")
        return None

def create_standalone_result_manager():
    """
    Create a standalone result manager for use with existing analysis results
    """
    print("📁 STANDALONE RESULT MANAGER")
    print("=" * 50)
    
    print("\nThis function helps you organize existing analysis results into folders.")
    print("\nUsage example:")
    print("```python")
    print("# After running your analysis")
    print("results = run_complete_analysis_with_results_folder(")
    print("    df_aufk=df_aufk, df_afko=df_afko, df_afpo=df_afpo,")
    print("    df_qmel=df_qmel, df_qmfe=df_qmfe, df_qmur=df_qmur,")
    print("    # ... other tables")
    print("    result_folder='result'  # Your chosen folder name")
    print(")")
    print("")
    print("# Access results")
    print("session_folder = results['session_folder']")
    print("comprehensive_data = results['comprehensive_df']")
    print("analytics = results['all_analyses']")
    print("```")
    
    return SAPResultsManager()

def demonstrate_folder_structure():
    """
    Show the folder structure that will be created
    """
    print("📁 FOLDER STRUCTURE PREVIEW")
    print("=" * 50)
    
    structure = """
result/
└── sap_analysis_YYYYMMDD_HHMMSS/
    ├── INDEX_README_YYYYMMDD_HHMMSS.txt
    ├── 01_comprehensive_data/
    │   ├── comprehensive_dataset_YYYYMMDD_HHMMSS.xlsx
    │   ├── comprehensive_dataset_YYYYMMDD_HHMMSS.csv
    │   └── summary_stats_YYYYMMDD_HHMMSS.json
    ├── 02_advanced_analytics/
    │   ├── advanced_analytics_YYYYMMDD_HHMMSS.xlsx
    │   └── advanced_analytics_YYYYMMDD_HHMMSS.json
    ├── 03_data_validation/
    │   ├── data_quality_report_YYYYMMDD_HHMMSS.txt
    │   └── data_validation_complete_YYYYMMDD_HHMMSS.xlsx
    ├── 04_executive_reports/
    │   ├── executive_summary_YYYYMMDD_HHMMSS.txt
    │   └── executive_dashboard_YYYYMMDD_HHMMSS.json
    ├── 05_detailed_analysis/
    │   └── (Future: Additional detailed reports)
    ├── 06_action_plans/
    │   ├── action_plan_YYYYMMDD_HHMMSS.txt
    │   └── action_plan_tracker_YYYYMMDD_HHMMSS.xlsx
    ├── 07_dashboards/
    │   └── (Future: Dashboard configurations)
    └── 08_quality_scorecards/
        ├── quality_scorecard_YYYYMMDD_HHMMSS.txt
        ├── scorecard_A110_YYYYMMDD_HHMMSS.txt
        ├── scorecard_A111_YYYYMMDD_HHMMSS.txt
        └── ... (one per plant)
    """
    
    print(structure)
    
    print("\n📋 FILE DESCRIPTIONS:")
    descriptions = {
        "INDEX_README": "Main index with file listing and quick start guide",
        "comprehensive_dataset.xlsx": "Main integrated dataset with all SAP data",
        "comprehensive_dataset.csv": "CSV version for analysis tools",
        "advanced_analytics.xlsx": "Detailed analytics with multiple sheets",
        "data_quality_report.txt": "Data validation and quality assessment",
        "executive_summary.txt": "Management-level summary and KPIs",
        "action_plan.txt": "Specific improvement actions with timelines",
        "quality_scorecard.txt": "Performance metrics and targets"
    }
    
    for file, desc in descriptions.items():
        print(f"• {file:25} - {desc}")

def example_complete_workflow():
    """
    Complete example workflow with result folder management
    """
    print("📚 COMPLETE WORKFLOW EXAMPLE")
    print("=" * 60)
    
    print("\n1. PREPARE YOUR DATA:")
    print("```python")
    print("import pandas as pd")
    print("")
    print("# Load your SAP tables")
    print("df_aufk = pd.read_csv('aufk_data.csv')")
    print("df_afko = pd.read_csv('afko_data.csv')")
    print("df_afpo = pd.read_csv('afpo_data.csv')")
    print("df_qmel = pd.read_csv('qmel_data.csv')")
    print("# ... load other tables")
    print("```")
    
    print("\n2. RUN COMPLETE ANALYSIS:")
    print("```python")
    print("# Run everything with organized results")
    print("results = run_complete_analysis_with_results_folder(")
    print("    df_aufk=df_aufk,")
    print("    df_afko=df_afko,")
    print("    df_afpo=df_afpo,")
    print("    df_qmel=df_qmel,")
    print("    df_qmfe=df_qmfe,")
    print("    # ... other tables")
    print("    result_folder='my_sap_analysis'  # Custom folder name")
    print(")")
    print("```")
    
    print("\n3. ACCESS RESULTS:")
    print("```python")
    print("# Get the session folder path")
    print("session_folder = results['session_folder']")
    print("print(f'Results saved to: {session_folder}')")
    print("")
    print("# Access the data")
    print("comprehensive_data = results['comprehensive_df']")
    print("analytics = results['all_analyses']")
    print("validation = results['validation_results']")
    print("")
    print("# List all created files")
    print("for file in results['created_files']:")
    print("    print(file)")
    print("```")
    
    print("\n4. REVIEW REPORTS:")
    print("```")
    print("# Start with the executive summary")
    print("result/sap_analysis_*/04_executive_reports/executive_summary_*.txt")
    print("")
    print("# Check data quality")
    print("result/sap_analysis_*/03_data_validation/data_quality_report_*.txt")
    print("")
    print("# Review action plans")
    print("result/sap_analysis_*/06_action_plans/action_plan_*.txt")
    print("")
    print("# Use the Excel files for detailed analysis")
    print("result/sap_analysis_*/01_comprehensive_data/comprehensive_dataset_*.xlsx")
    print("```")
    
    print("\n5. IMPLEMENT IMPROVEMENTS:")
    print("• Use action_plan_tracker.xlsx to track progress")
    print("• Share executive_summary.txt with management")
    print("• Use plant scorecards for operational reviews")
    print("• Monitor data quality scores regularly")

if __name__ == "__main__":
    print("🚀 SAP RESULTS MANAGER")
    print("=" * 50)
    
    print("\nChoose an option:")
    print("1. demonstrate_folder_structure() - See folder layout")
    print("2. example_complete_workflow() - See usage example")
    print("3. create_standalone_result_manager() - Create manager instance")
    
    print("\nMain function:")
    print("run_complete_analysis_with_results_folder(...)")
    
    demonstrate_folder_structure()
    example_complete_workflow()

🚀 SAP RESULTS MANAGER

Choose an option:
1. demonstrate_folder_structure() - See folder layout
2. example_complete_workflow() - See usage example
3. create_standalone_result_manager() - Create manager instance

Main function:
run_complete_analysis_with_results_folder(...)
📁 FOLDER STRUCTURE PREVIEW

result/
└── sap_analysis_YYYYMMDD_HHMMSS/
    ├── INDEX_README_YYYYMMDD_HHMMSS.txt
    ├── 01_comprehensive_data/
    │   ├── comprehensive_dataset_YYYYMMDD_HHMMSS.xlsx
    │   ├── comprehensive_dataset_YYYYMMDD_HHMMSS.csv
    │   └── summary_stats_YYYYMMDD_HHMMSS.json
    ├── 02_advanced_analytics/
    │   ├── advanced_analytics_YYYYMMDD_HHMMSS.xlsx
    │   └── advanced_analytics_YYYYMMDD_HHMMSS.json
    ├── 03_data_validation/
    │   ├── data_quality_report_YYYYMMDD_HHMMSS.txt
    │   └── data_validation_complete_YYYYMMDD_HHMMSS.xlsx
    ├── 04_executive_reports/
    │   ├── executive_summary_YYYYMMDD_HHMMSS.txt
    │   └── executive_dashboard_YYYYMMDD_HHMMSS.json
    ├── 05_detailed_ana

In [25]:
# 4. Extract key insights for your report
comprehensive_df = results['comprehensive_df']
all_analyses = results['all_analyses']

# Key metrics for your assessment
print("=== KEY FINDINGS ===")
print(f"Total Orders Analyzed: {len(comprehensive_df):,}")
print(f"Quality Issue Rate: {(comprehensive_df['HAS_QUALITY_ISSUES'].sum()/len(comprehensive_df)*100):.1f}%")
print(f"Average Quality Score: {comprehensive_df['QUALITY_SCORE'].mean():.1f}/100")

# Plant performance analysis
plant_performance = comprehensive_df.groupby('Plant_Code').agg({
    'QUALITY_SCORE': 'mean',
    'PRODUCTION_EFFICIENCY': 'mean',
    'QUALITY_NOTIF_COUNT': 'sum'
}).round(2)

print("\n=== PLANT PERFORMANCE ===")
print(plant_performance)

# Material risk analysis
if 'material_quality' in all_analyses:
    material_risks = all_analyses['material_quality']['top_risk_materials']
    print("\n=== HIGH RISK MATERIALS ===")
    for material, data in list(material_risks.items())[:5]:
        print(f"{material}: Risk Score {data['QUALITY_RISK_SCORE']}")

NameError: name 'results' is not defined

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

class SAPMachineLearningAnalytics:
    """
    Comprehensive Machine Learning Analytics for SAP Manufacturing and Quality Data
    """
    
    def __init__(self, comprehensive_df):
        self.comprehensive_df = comprehensive_df
        self.models = {}
        self.predictions = {}
        self.feature_importance = {}
        self.model_performance = {}
        
    def prepare_ml_features(self):
        """
        Prepare features for machine learning models
        """
        print("🔧 Preparing ML Features...")
        
        df = self.comprehensive_df.copy()
        
        # Create feature engineering
        feature_df = pd.DataFrame()
        
        # Basic order features
        if 'AUFNR' in df.columns:
            feature_df['ORDER_ID'] = df['AUFNR']
        
        # Plant features
        plant_cols = ['Plant_Code', 'Plant_Name', 'WERKS']
        for col in plant_cols:
            if col in df.columns:
                le = LabelEncoder()
                feature_df['PLANT_ENCODED'] = le.fit_transform(df[col].fillna('Unknown'))
                feature_df['PLANT_COUNT'] = df.groupby(col)['AUFNR'].transform('count')
                break
        
        # Order type features
        if 'AUART' in df.columns:
            le = LabelEncoder()
            feature_df['ORDER_TYPE_ENCODED'] = le.fit_transform(df['AUART'].fillna('Unknown'))
        
        # Quantity features
        qty_cols = ['TOTAL_PLANNED_QTY', 'TOTAL_RECEIVED_QTY', 'ORDER_ITEM_COUNT']
        for col in qty_cols:
            if col in df.columns:
                feature_df[col] = df[col].fillna(0)
                feature_df[f'{col}_LOG'] = np.log1p(df[col].fillna(0))
        
        # Quality features
        quality_cols = ['QUALITY_NOTIF_COUNT', 'DEFECT_COUNT', 'CAUSE_COUNT', 'ROOT_CAUSE_COUNT']
        for col in quality_cols:
            if col in df.columns:
                feature_df[col] = df[col].fillna(0)
                feature_df[f'{col}_BINARY'] = (df[col] > 0).astype(int)
        
        # Production features
        prod_cols = ['GOODS_MOVEMENT_COUNT', 'PRODUCTION_EFFICIENCY']
        for col in prod_cols:
            if col in df.columns:
                feature_df[col] = df[col].fillna(0)
        
        # Time-based features
        if 'ERDAT' in df.columns:
            try:
                dates = pd.to_datetime(df['ERDAT'], errors='coerce')
                feature_df['CREATION_YEAR'] = dates.dt.year
                feature_df['CREATION_MONTH'] = dates.dt.month
                feature_df['CREATION_QUARTER'] = dates.dt.quarter
                feature_df['CREATION_DAYOFWEEK'] = dates.dt.dayofweek
                feature_df['DAYS_SINCE_CREATION'] = (pd.Timestamp.now() - dates).dt.days
            except:
                pass
        
        # Target variables
        targets = {}
        
        # Quality prediction targets
        if 'HAS_QUALITY_ISSUES' in df.columns:
            targets['quality_issues'] = df['HAS_QUALITY_ISSUES'].astype(int)
        
        if 'QUALITY_SCORE' in df.columns:
            targets['quality_score'] = df['QUALITY_SCORE']
        
        if 'QUALITY_CATEGORY' in df.columns:
            le = LabelEncoder()
            targets['quality_category'] = le.fit_transform(df['QUALITY_CATEGORY'].fillna('Good'))
        
        # Production efficiency targets
        if 'PRODUCTION_EFFICIENCY' in df.columns:
            targets['production_efficiency'] = df['PRODUCTION_EFFICIENCY']
            targets['high_efficiency'] = (df['PRODUCTION_EFFICIENCY'] >= 95).astype(int)
        
        # Defect prediction targets
        if 'HAS_DEFECTS' in df.columns:
            targets['has_defects'] = df['HAS_DEFECTS'].astype(int)
        
        # Remove rows with all NaN values
        feature_df = feature_df.dropna(how='all')
        
        print(f"   ✓ Created {len(feature_df.columns)} features for {len(feature_df)} samples")
        print(f"   ✓ Target variables: {list(targets.keys())}")
        
        self.features = feature_df
        self.targets = targets
        
        return feature_df, targets
    
    def quality_issue_prediction(self):
        """
        Predict quality issues using classification models
        """
        print("\n🎯 Building Quality Issue Prediction Models...")
        
        if 'quality_issues' not in self.targets:
            print("   ⚠️  Quality issues target not available")
            return None
        
        X = self.features.select_dtypes(include=[np.number]).fillna(0)
        y = self.targets['quality_issues']
        
        # Remove samples where target is NaN
        mask = ~pd.isna(y)
        X, y = X[mask], y[mask]
        
        if len(X) < 50:
            print(f"   ⚠️  Insufficient data for modeling ({len(X)} samples)")
            return None
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Models to try
        models = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42),
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
        }
        
        results = {}
        
        for name, model in models.items():
            try:
                # Train model
                if name == 'Logistic Regression':
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)
                    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
                else:
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    y_pred_proba = model.predict_proba(X_test)[:, 1]
                
                # Calculate metrics
                accuracy = (y_pred == y_test).mean()
                
                # Cross-validation
                cv_scores = cross_val_score(model, X_train_scaled if name == 'Logistic Regression' else X_train, 
                                          y_train, cv=5, scoring='accuracy')
                
                results[name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'predictions': y_pred,
                    'probabilities': y_pred_proba,
                    'test_indices': X_test.index
                }
                
                print(f"   ✓ {name}: Accuracy {accuracy:.3f}, CV {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")
                
            except Exception as e:
                print(f"   ❌ {name} failed: {e}")
        
        # Select best model
        if results:
            best_model_name = max(results.keys(), key=lambda x: results[x]['cv_mean'])
            best_model = results[best_model_name]
            
            print(f"   🏆 Best model: {best_model_name}")
            
            # Feature importance
            if hasattr(best_model['model'], 'feature_importances_'):
                importance_df = pd.DataFrame({
                    'feature': X.columns,
                    'importance': best_model['model'].feature_importances_
                }).sort_values('importance', ascending=False)
                
                self.feature_importance['quality_issues'] = importance_df
                print(f"   📊 Top features: {', '.join(importance_df.head(3)['feature'].tolist())}")
        
        self.models['quality_issues'] = results
        return results
    
    def production_efficiency_prediction(self):
        """
        Predict production efficiency using regression models
        """
        print("\n📈 Building Production Efficiency Prediction Models...")
        
        if 'production_efficiency' not in self.targets:
            print("   ⚠️  Production efficiency target not available")
            return None
        
        X = self.features.select_dtypes(include=[np.number]).fillna(0)
        y = self.targets['production_efficiency']
        
        # Remove samples where target is NaN
        mask = ~pd.isna(y)
        X, y = X[mask], y[mask]
        
        if len(X) < 50:
            print(f"   ⚠️  Insufficient data for modeling ({len(X)} samples)")
            return None
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Models to try
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Linear Regression': LinearRegression()
        }
        
        results = {}
        
        for name, model in models.items():
            try:
                # Train model
                if name == 'Linear Regression':
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)
                else:
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                
                # Calculate metrics
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                
                # Cross-validation
                cv_scores = cross_val_score(model, X_train_scaled if name == 'Linear Regression' else X_train, 
                                          y_train, cv=5, scoring='r2')
                
                results[name] = {
                    'model': model,
                    'mse': mse,
                    'r2': r2,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'predictions': y_pred,
                    'test_indices': X_test.index
                }
                
                print(f"   ✓ {name}: R² {r2:.3f}, MSE {mse:.3f}, CV {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")
                
            except Exception as e:
                print(f"   ❌ {name} failed: {e}")
        
        # Select best model
        if results:
            best_model_name = max(results.keys(), key=lambda x: results[x]['r2'])
            best_model = results[best_model_name]
            
            print(f"   🏆 Best model: {best_model_name}")
            
            # Feature importance
            if hasattr(best_model['model'], 'feature_importances_'):
                importance_df = pd.DataFrame({
                    'feature': X.columns,
                    'importance': best_model['model'].feature_importances_
                }).sort_values('importance', ascending=False)
                
                self.feature_importance['production_efficiency'] = importance_df
                print(f"   📊 Top features: {', '.join(importance_df.head(3)['feature'].tolist())}")
        
        self.models['production_efficiency'] = results
        return results
    
    def quality_clustering_analysis(self):
        """
        Perform clustering analysis to identify quality patterns
        """
        print("\n🔍 Performing Quality Clustering Analysis...")
        
        # Select quality-related features
        quality_features = []
        for col in self.features.columns:
            if any(keyword in col.upper() for keyword in ['QUALITY', 'DEFECT', 'CAUSE', 'NOTIF']):
                quality_features.append(col)
        
        if len(quality_features) < 2:
            print("   ⚠️  Insufficient quality features for clustering")
            return None
        
        X = self.features[quality_features].fillna(0)
        
        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Determine optimal number of clusters
        inertias = []
        k_range = range(2, min(10, len(X)//10))
        
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(X_scaled)
            inertias.append(kmeans.inertia_)
        
        # Use elbow method (simple implementation)
        if len(inertias) >= 3:
            # Find the point with maximum rate of decrease
            rates = [inertias[i-1] - inertias[i] for i in range(1, len(inertias))]
            optimal_k = k_range[rates.index(max(rates)) + 1]
        else:
            optimal_k = 3
        
        # Perform clustering with optimal k
        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(X_scaled)
        
        # Analyze clusters
        cluster_analysis = pd.DataFrame(X)
        cluster_analysis['Cluster'] = clusters
        
        cluster_summary = cluster_analysis.groupby('Cluster').agg({
            col: 'mean' for col in quality_features
        }).round(2)
        
        # Add cluster interpretation
        cluster_profiles = {}
        for cluster_id in range(optimal_k):
            cluster_data = cluster_summary.loc[cluster_id]
            
            # Determine cluster characteristics
            high_quality_score = cluster_data.get('QUALITY_SCORE', 0)
            high_defects = cluster_data.get('DEFECT_COUNT', 0)
            high_notifications = cluster_data.get('QUALITY_NOTIF_COUNT', 0)
            
            if high_defects > cluster_summary['DEFECT_COUNT'].mean():
                profile = "High Defect Risk"
            elif high_notifications > cluster_summary['QUALITY_NOTIF_COUNT'].mean():
                profile = "Quality Issues Prone"
            elif high_quality_score > cluster_summary.get('QUALITY_SCORE', pd.Series([0])).mean():
                profile = "High Quality Performance"
            else:
                profile = "Standard Performance"
            
            cluster_profiles[cluster_id] = profile
        
        results = {
            'optimal_k': optimal_k,
            'clusters': clusters,
            'cluster_summary': cluster_summary,
            'cluster_profiles': cluster_profiles,
            'feature_names': quality_features
        }
        
        print(f"   ✓ Identified {optimal_k} quality clusters")
        for cluster_id, profile in cluster_profiles.items():
            count = sum(clusters == cluster_id)
            print(f"   • Cluster {cluster_id}: {profile} ({count} orders)")
        
        self.models['quality_clustering'] = results
        return results
    
    def anomaly_detection(self):
        """
        Detect anomalous orders using isolation forest and statistical methods
        """
        print("\n🚨 Performing Anomaly Detection...")
        
        from sklearn.ensemble import IsolationForest
        from scipy import stats
        
        # Select numeric features
        X = self.features.select_dtypes(include=[np.number]).fillna(0)
        
        if len(X.columns) < 3:
            print("   ⚠️  Insufficient features for anomaly detection")
            return None
        
        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Isolation Forest
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        anomaly_labels = iso_forest.fit_predict(X_scaled)
        
        # Statistical anomaly detection (Z-score based)
        z_scores = np.abs(stats.zscore(X_scaled, axis=0))
        statistical_anomalies = (z_scores > 3).any(axis=1)
        
        # Combine results
        anomalies_df = pd.DataFrame({
            'ORDER_INDEX': X.index,
            'ISO_FOREST_ANOMALY': anomaly_labels == -1,
            'STATISTICAL_ANOMALY': statistical_anomalies,
            'COMBINED_ANOMALY': (anomaly_labels == -1) | statistical_anomalies
        })
        
        # Add anomaly scores
        anomalies_df['ANOMALY_SCORE'] = iso_forest.decision_function(X_scaled)
        
        # Analyze anomalies
        total_anomalies = anomalies_df['COMBINED_ANOMALY'].sum()
        iso_anomalies = anomalies_df['ISO_FOREST_ANOMALY'].sum()
        stat_anomalies = anomalies_df['STATISTICAL_ANOMALY'].sum()
        
        print(f"   ✓ Detected {total_anomalies} anomalous orders")
        print(f"   • Isolation Forest: {iso_anomalies}")
        print(f"   • Statistical: {stat_anomalies}")
        
        # Get most anomalous orders
        top_anomalies = anomalies_df.nsmallest(10, 'ANOMALY_SCORE')
        
        results = {
            'anomalies_df': anomalies_df,
            'total_anomalies': total_anomalies,
            'top_anomalies': top_anomalies,
            'anomaly_rate': total_anomalies / len(X) * 100
        }
        
        self.models['anomaly_detection'] = results
        return results
    
    def feature_importance_analysis(self):
        """
        Analyze feature importance across all models
        """
        print("\n📊 Analyzing Feature Importance...")
        
        if not self.feature_importance:
            print("   ⚠️  No feature importance data available")
            return None
        
        # Combine feature importance from all models
        combined_importance = pd.DataFrame()
        
        for model_name, importance_df in self.feature_importance.items():
            importance_df = importance_df.copy()
            importance_df['model'] = model_name
            combined_importance = pd.concat([combined_importance, importance_df], ignore_index=True)
        
        # Calculate average importance across models
        avg_importance = combined_importance.groupby('feature')['importance'].agg(['mean', 'std', 'count']).reset_index()
        avg_importance = avg_importance.sort_values('mean', ascending=False)
        
        print(f"   ✓ Analyzed {len(avg_importance)} features across {len(self.feature_importance)} models")
        print("   📈 Top 5 most important features:")
        for i, row in avg_importance.head().iterrows():
            print(f"      {i+1}. {row['feature']}: {row['mean']:.3f} (±{row['std']:.3f})")
        
        return avg_importance
    
    def generate_ml_predictions(self):
        """
        Generate predictions for all orders using trained models
        """
        print("\n🔮 Generating ML Predictions...")
        
        predictions_df = pd.DataFrame(index=self.comprehensive_df.index)
        
        # Quality issue predictions
        if 'quality_issues' in self.models and self.models['quality_issues']:
            best_model_name = max(self.models['quality_issues'].keys(), 
                                key=lambda x: self.models['quality_issues'][x]['cv_mean'])
            model_data = self.models['quality_issues'][best_model_name]
            
            # Get full dataset predictions
            X_full = self.features.select_dtypes(include=[np.number]).fillna(0)
            
            try:
                if best_model_name == 'Logistic Regression':
                    scaler = StandardScaler()
                    X_scaled = scaler.fit_transform(X_full)
                    quality_pred = model_data['model'].predict_proba(X_scaled)[:, 1]
                else:
                    quality_pred = model_data['model'].predict_proba(X_full)[:, 1]
                
                predictions_df['QUALITY_ISSUE_PROBABILITY'] = quality_pred
                predictions_df['QUALITY_ISSUE_PREDICTION'] = quality_pred > 0.5
                
            except Exception as e:
                print(f"   ⚠️  Quality prediction failed: {e}")
        
        # Production efficiency predictions
        if 'production_efficiency' in self.models and self.models['production_efficiency']:
            best_model_name = max(self.models['production_efficiency'].keys(), 
                                key=lambda x: self.models['production_efficiency'][x]['r2'])
            model_data = self.models['production_efficiency'][best_model_name]
            
            X_full = self.features.select_dtypes(include=[np.number]).fillna(0)
            
            try:
                if best_model_name == 'Linear Regression':
                    scaler = StandardScaler()
                    X_scaled = scaler.fit_transform(X_full)
                    efficiency_pred = model_data['model'].predict(X_scaled)
                else:
                    efficiency_pred = model_data['model'].predict(X_full)
                
                predictions_df['PREDICTED_EFFICIENCY'] = efficiency_pred
                predictions_df['EFFICIENCY_IMPROVEMENT_POTENTIAL'] = np.maximum(0, 100 - efficiency_pred)
                
            except Exception as e:
                print(f"   ⚠️  Efficiency prediction failed: {e}")
        
        # Quality clusters
        if 'quality_clustering' in self.models:
            cluster_data = self.models['quality_clustering']
            predictions_df['QUALITY_CLUSTER'] = cluster_data['clusters']
            
            # Map cluster profiles
            cluster_mapping = cluster_data['cluster_profiles']
            predictions_df['QUALITY_CLUSTER_PROFILE'] = predictions_df['QUALITY_CLUSTER'].map(cluster_mapping)
        
        # Anomaly flags
        if 'anomaly_detection' in self.models:
            anomaly_data = self.models['anomaly_detection']
            anomaly_df = anomaly_data['anomalies_df'].set_index('ORDER_INDEX')
            
            for col in ['ISO_FOREST_ANOMALY', 'STATISTICAL_ANOMALY', 'COMBINED_ANOMALY', 'ANOMALY_SCORE']:
                if col in anomaly_df.columns:
                    predictions_df[col] = anomaly_df[col]
        
        print(f"   ✓ Generated {len(predictions_df.columns)} prediction columns")
        
        self.predictions = predictions_df
        return predictions_df
    
    def create_ml_insights(self):
        """
        Create actionable insights from ML models
        """
        print("\n💡 Generating ML-Based Insights...")
        
        insights = {
            'high_risk_orders': [],
            'improvement_opportunities': [],
            'quality_patterns': [],
            'efficiency_drivers': [],
            'anomaly_alerts': []
        }
        
        if self.predictions.empty:
            print("   ⚠️  No predictions available for insights")
            return insights
        
        # High-risk orders
        if 'QUALITY_ISSUE_PROBABILITY' in self.predictions.columns:
            high_risk = self.predictions[self.predictions['QUALITY_ISSUE_PROBABILITY'] > 0.7]
            insights['high_risk_orders'] = {
                'count': len(high_risk),
                'percentage': len(high_risk) / len(self.predictions) * 100,
                'order_indices': high_risk.index.tolist()[:10],  # Top 10
                'avg_probability': high_risk['QUALITY_ISSUE_PROBABILITY'].mean()
            }
        
        # Efficiency improvement opportunities
        if 'EFFICIENCY_IMPROVEMENT_POTENTIAL' in self.predictions.columns:
            high_potential = self.predictions[self.predictions['EFFICIENCY_IMPROVEMENT_POTENTIAL'] > 20]
            insights['improvement_opportunities'] = {
                'count': len(high_potential),
                'avg_potential': high_potential['EFFICIENCY_IMPROVEMENT_POTENTIAL'].mean(),
                'total_potential': high_potential['EFFICIENCY_IMPROVEMENT_POTENTIAL'].sum(),
                'order_indices': high_potential.index.tolist()[:10]
            }
        
        # Quality cluster patterns
        if 'QUALITY_CLUSTER_PROFILE' in self.predictions.columns:
            cluster_dist = self.predictions['QUALITY_CLUSTER_PROFILE'].value_counts()
            insights['quality_patterns'] = {
                'cluster_distribution': cluster_dist.to_dict(),
                'dominant_pattern': cluster_dist.index[0] if len(cluster_dist) > 0 else None,
                'pattern_percentage': cluster_dist.iloc[0] / len(self.predictions) * 100 if len(cluster_dist) > 0 else 0
            }
        
        # Feature importance insights
        if self.feature_importance:
            top_features = []
            for model_name, importance_df in self.feature_importance.items():
                top_features.extend(importance_df.head(3)['feature'].tolist())
            
            insights['efficiency_drivers'] = {
                'top_features': list(set(top_features)),
                'models_analyzed': list(self.feature_importance.keys())
            }
        
        # Anomaly alerts
        if 'COMBINED_ANOMALY' in self.predictions.columns:
            anomalies = self.predictions[self.predictions['COMBINED_ANOMALY'] == True]
            insights['anomaly_alerts'] = {
                'count': len(anomalies),
                'percentage': len(anomalies) / len(self.predictions) * 100,
                'order_indices': anomalies.index.tolist()[:5],  # Top 5
                'requires_investigation': len(anomalies) > 0
            }
        
        # Print summary
        print(f"   📊 High-risk orders identified: {insights['high_risk_orders'].get('count', 0)}")
        print(f"   ⚡ Efficiency improvement opportunities: {insights['improvement_opportunities'].get('count', 0)}")
        print(f"   🚨 Anomalies requiring investigation: {insights['anomaly_alerts'].get('count', 0)}")
        
        return insights

def run_complete_ml_analysis(comprehensive_df, result_folder="result"):
    """
    Run complete machine learning analysis on SAP data
    """
    print("🤖 RUNNING COMPLETE ML ANALYSIS")
    print("=" * 60)
    
    # Initialize ML analyzer
    ml_analyzer = SAPMachineLearningAnalytics(comprehensive_df)
    
    # Prepare features
    features, targets = ml_analyzer.prepare_ml_features()
    
    if features.empty:
        print("❌ No features available for ML analysis")
        return None
    
    # Run all ML analyses
    ml_results = {}
    
    # 1. Quality issue prediction
    quality_results = ml_analyzer.quality_issue_prediction()
    if quality_results:
        ml_results['quality_prediction'] = quality_results
    
    # 2. Production efficiency prediction  
    efficiency_results = ml_analyzer.production_efficiency_prediction()
    if efficiency_results:
        ml_results['efficiency_prediction'] = efficiency_results
    
    # 3. Quality clustering
    clustering_results = ml_analyzer.quality_clustering_analysis()
    if clustering_results:
        ml_results['quality_clustering'] = clustering_results
    
    # 4. Anomaly detection
    anomaly_results = ml_analyzer.anomaly_detection()
    if anomaly_results:
        ml_results['anomaly_detection'] = anomaly_results
    
    # 5. Feature importance analysis
    feature_importance = ml_analyzer.feature_importance_analysis()
    if feature_importance is not None:
        ml_results['feature_importance'] = feature_importance
    
    # 6. Generate predictions
    predictions = ml_analyzer.generate_ml_predictions()
    ml_results['predictions'] = predictions
    
    # 7. Create insights
    insights = ml_analyzer.create_ml_insights()
    ml_results['insights'] = insights
    
    print(f"\n✅ ML Analysis Complete!")
    print(f"   🔧 Models trained: {len([k for k in ml_results.keys() if 'prediction' in k])}")
    print(f"   📊 Predictions generated: {len(predictions.columns) if not predictions.empty else 0}")
    print(f"   💡 Insights created: {len(insights)}")
    
    return {
        'ml_analyzer': ml_analyzer,
        'ml_results': ml_results,
        'features': features,
        'targets': targets,
        'predictions': predictions,
        'insights': insights
    }

# Example usage for the assessment
def tolaram_ml_assessment_workflow(comprehensive_df):
    """
    Specific ML workflow for Tolaram assessment
    """
    print("🎯 TOLARAM ASSESSMENT - ML WORKFLOW")
    print("=" * 50)
    
    # Run complete ML analysis
    ml_analysis = run_complete_ml_analysis(comprehensive_df)
    
    if ml_analysis is None:
        return None
    
    insights = ml_analysis['insights']
    predictions = ml_analysis['predictions']
    
    # Create assessment-specific insights
    assessment_insights = {
        'executive_summary': {},
        'operational_recommendations': [],
        'predictive_capabilities': {},
        'risk_assessment': {},
        'business_impact': {}
    }
    
    # Executive Summary for Assessment
    total_orders = len(comprehensive_df)
    
    assessment_insights['executive_summary'] = {
        'total_orders_analyzed': total_orders,
        'ml_models_developed': len([k for k in ml_analysis['ml_results'].keys() if 'prediction' in k]),
        'high_risk_orders_identified': insights.get('high_risk_orders', {}).get('count', 0),
        'efficiency_opportunities': insights.get('improvement_opportunities', {}).get('count', 0),
        'anomalies_detected': insights.get('anomaly_alerts', {}).get('count', 0)
    }
    
    # Operational Recommendations
    if insights.get('high_risk_orders', {}).get('count', 0) > 0:
        risk_rate = insights['high_risk_orders']['percentage']
        assessment_insights['operational_recommendations'].append({
            'priority': 'HIGH',
            'area': 'Quality Management',
            'recommendation': f'Implement preventive measures for {insights["high_risk_orders"]["count"]} high-risk orders ({risk_rate:.1f}% of total)',
            'expected_impact': 'Reduce quality incidents by 30-50%'
        })
    
    if insights.get('improvement_opportunities', {}).get('count', 0) > 0:
        avg_potential = insights['improvement_opportunities']['avg_potential']
        assessment_insights['operational_recommendations'].append({
            'priority': 'MEDIUM',
            'area': 'Production Efficiency', 
            'recommendation': f'Focus improvement efforts on {insights["improvement_opportunities"]["count"]} orders with {avg_potential:.1f}% average efficiency gain potential',
            'expected_impact': f'Increase overall efficiency by {avg_potential/4:.1f}%'
        })
    
    if insights.get('anomaly_alerts', {}).get('count', 0) > 0:
        assessment_insights['operational_recommendations'].append({
            'priority': 'HIGH',
            'area': 'Process Investigation',
            'recommendation': f'Investigate {insights["anomaly_alerts"]["count"]} anomalous orders for process deviations',
            'expected_impact': 'Identify and eliminate root causes of process variations'
        })
    
    # Predictive Capabilities Assessment
    model_performance = {}
    
    if 'quality_prediction' in ml_analysis['ml_results']:
        best_quality_model = max(ml_analysis['ml_results']['quality_prediction'].keys(),
                               key=lambda x: ml_analysis['ml_results']['quality_prediction'][x]['cv_mean'])
        quality_accuracy = ml_analysis['ml_results']['quality_prediction'][best_quality_model]['cv_mean']
        model_performance['quality_prediction'] = {
            'model_type': best_quality_model,
            'accuracy': quality_accuracy,
            'business_value': 'Predict quality issues before they occur'
        }
    
    if 'efficiency_prediction' in ml_analysis['ml_results']:
        best_efficiency_model = max(ml_analysis['ml_results']['efficiency_prediction'].keys(),
                                  key=lambda x: ml_analysis['ml_results']['efficiency_prediction'][x]['r2'])
        efficiency_r2 = ml_analysis['ml_results']['efficiency_prediction'][best_efficiency_model]['r2']
        model_performance['efficiency_prediction'] = {
            'model_type': best_efficiency_model,
            'r2_score': efficiency_r2,
            'business_value': 'Optimize production planning and resource allocation'
        }
    
    assessment_insights['predictive_capabilities'] = model_performance
    
    # Risk Assessment
    quality_risk_rate = insights.get('high_risk_orders', {}).get('percentage', 0)
    anomaly_rate = insights.get('anomaly_alerts', {}).get('percentage', 0)
    
    assessment_insights['risk_assessment'] = {
        'quality_risk_level': 'HIGH' if quality_risk_rate > 20 else 'MEDIUM' if quality_risk_rate > 10 else 'LOW',
        'quality_risk_percentage': quality_risk_rate,
        'process_stability': 'UNSTABLE' if anomaly_rate > 5 else 'STABLE',
        'anomaly_percentage': anomaly_rate,
        'overall_risk_score': min(100, (quality_risk_rate * 2) + (anomaly_rate * 3))
    }
    
    # Business Impact Estimation
    potential_savings = 0
    
    if insights.get('improvement_opportunities', {}).get('total_potential'):
        # Assume each efficiency point is worth $1000 per order
        potential_savings += insights['improvement_opportunities']['total_potential'] * 1000
    
    if insights.get('high_risk_orders', {}).get('count'):
        # Assume preventing each quality issue saves $5000
        potential_savings += insights['high_risk_orders']['count'] * 5000
    
    assessment_insights['business_impact'] = {
        'estimated_annual_savings': potential_savings,
        'efficiency_improvement_potential': insights.get('improvement_opportunities', {}).get('avg_potential', 0),
        'quality_improvement_potential': insights.get('high_risk_orders', {}).get('percentage', 0),
        'roi_timeline': '6-12 months'
    }
    
    print("\n📋 Assessment Insights Generated:")
    print(f"   • Risk Level: {assessment_insights['risk_assessment']['quality_risk_level']}")
    print(f"   • Potential Savings: ${assessment_insights['business_impact']['estimated_annual_savings']:,.0f}")
    print(f"   • Models Developed: {assessment_insights['executive_summary']['ml_models_developed']}")
    
    return {
        'ml_analysis': ml_analysis,
        'assessment_insights': assessment_insights,
        'comprehensive_predictions': predictions
    }

def save_ml_results_to_folder(ml_analysis, result_folder="result"):
    """
    Save ML analysis results to organized folder structure
    """
    print("\n💾 Saving ML Results to Folder...")
    
    import os
    import json
    from datetime import datetime
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Create ML-specific folder
    ml_folder = os.path.join(result_folder, f"09_machine_learning_{timestamp}")
    os.makedirs(ml_folder, exist_ok=True)
    
    created_files = []
    
    # 1. Save model performance report
    performance_file = os.path.join(ml_folder, f"ml_model_performance_{timestamp}.txt")
    
    with open(performance_file, 'w') as f:
        f.write("MACHINE LEARNING MODEL PERFORMANCE REPORT\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        
        # Model Performance Summary
        if 'ml_results' in ml_analysis:
            ml_results = ml_analysis['ml_results']
            
            f.write("MODEL PERFORMANCE SUMMARY\n")
            f.write("-" * 40 + "\n")
            
            # Quality Prediction Models
            if 'quality_prediction' in ml_results:
                f.write("\nQUALITY ISSUE PREDICTION:\n")
                for model_name, results in ml_results['quality_prediction'].items():
                    f.write(f"  {model_name}:\n")
                    f.write(f"    Accuracy: {results['accuracy']:.3f}\n")
                    f.write(f"    Cross-validation: {results['cv_mean']:.3f} (±{results['cv_std']:.3f})\n")
            
            # Efficiency Prediction Models
            if 'efficiency_prediction' in ml_results:
                f.write("\nPRODUCTION EFFICIENCY PREDICTION:\n")
                for model_name, results in ml_results['efficiency_prediction'].items():
                    f.write(f"  {model_name}:\n")
                    f.write(f"    R² Score: {results['r2']:.3f}\n")
                    f.write(f"    MSE: {results['mse']:.3f}\n")
                    f.write(f"    Cross-validation: {results['cv_mean']:.3f} (±{results['cv_std']:.3f})\n")
            
            # Clustering Results
            if 'quality_clustering' in ml_results:
                f.write("\nQUALITY CLUSTERING ANALYSIS:\n")
                clustering = ml_results['quality_clustering']
                f.write(f"  Optimal clusters: {clustering['optimal_k']}\n")
                f.write("  Cluster profiles:\n")
                for cluster_id, profile in clustering['cluster_profiles'].items():
                    cluster_size = sum(clustering['clusters'] == cluster_id)
                    f.write(f"    Cluster {cluster_id}: {profile} ({cluster_size} orders)\n")
            
            # Anomaly Detection
            if 'anomaly_detection' in ml_results:
                f.write("\nANOMALY DETECTION:\n")
                anomaly = ml_results['anomaly_detection']
                f.write(f"  Total anomalies: {anomaly['total_anomalies']}\n")
                f.write(f"  Anomaly rate: {anomaly['anomaly_rate']:.2f}%\n")
        
        # Feature Importance
        if 'feature_importance' in ml_analysis['ml_results']:
            f.write("\nTOP FEATURE IMPORTANCE:\n")
            f.write("-" * 40 + "\n")
            importance_df = ml_analysis['ml_results']['feature_importance']
            for i, row in importance_df.head(10).iterrows():
                f.write(f"  {i+1:2d}. {row['feature']}: {row['mean']:.3f}\n")
    
    created_files.append(performance_file)
    print(f"   ✓ Model Performance Report: {os.path.basename(performance_file)}")
    
    # 2. Save predictions to Excel
    if 'predictions' in ml_analysis and not ml_analysis['predictions'].empty:
        predictions_file = os.path.join(ml_folder, f"ml_predictions_{timestamp}.xlsx")
        
        with pd.ExcelWriter(predictions_file, engine='openpyxl') as writer:
            # Main predictions
            ml_analysis['predictions'].to_excel(writer, sheet_name='ML_Predictions', index=True)
            
            # Feature importance (if available)
            if 'feature_importance' in ml_analysis['ml_results']:
                ml_analysis['ml_results']['feature_importance'].to_excel(writer, sheet_name='Feature_Importance', index=False)
            
            # High-risk orders
            if 'QUALITY_ISSUE_PROBABILITY' in ml_analysis['predictions'].columns:
                high_risk = ml_analysis['predictions'][ml_analysis['predictions']['QUALITY_ISSUE_PROBABILITY'] > 0.7]
                if not high_risk.empty:
                    high_risk.to_excel(writer, sheet_name='High_Risk_Orders', index=True)
            
            # Efficiency opportunities
            if 'EFFICIENCY_IMPROVEMENT_POTENTIAL' in ml_analysis['predictions'].columns:
                opportunities = ml_analysis['predictions'][ml_analysis['predictions']['EFFICIENCY_IMPROVEMENT_POTENTIAL'] > 20]
                if not opportunities.empty:
                    opportunities.to_excel(writer, sheet_name='Efficiency_Opportunities', index=True)
        
        created_files.append(predictions_file)
        print(f"   ✓ ML Predictions Excel: {os.path.basename(predictions_file)}")
    
    # 3. Save insights as JSON
    if 'insights' in ml_analysis:
        insights_file = os.path.join(ml_folder, f"ml_insights_{timestamp}.json")
        
        with open(insights_file, 'w') as f:
            json.dump(ml_analysis['insights'], f, indent=2, default=str)
        
        created_files.append(insights_file)
        print(f"   ✓ ML Insights JSON: {os.path.basename(insights_file)}")
    
    # 4. Save assessment-specific insights (if available)
    if 'assessment_insights' in ml_analysis:
        assessment_file = os.path.join(ml_folder, f"assessment_insights_{timestamp}.txt")
        
        with open(assessment_file, 'w') as f:
            f.write("TOLARAM ASSESSMENT - ML INSIGHTS REPORT\n")
            f.write("=" * 80 + "\n\n")
            
            insights = ml_analysis['assessment_insights']
            
            # Executive Summary
            if 'executive_summary' in insights:
                f.write("EXECUTIVE SUMMARY\n")
                f.write("-" * 40 + "\n")
                exec_sum = insights['executive_summary']
                f.write(f"Total Orders Analyzed: {exec_sum['total_orders_analyzed']:,}\n")
                f.write(f"ML Models Developed: {exec_sum['ml_models_developed']}\n")
                f.write(f"High-Risk Orders: {exec_sum['high_risk_orders_identified']:,}\n")
                f.write(f"Efficiency Opportunities: {exec_sum['efficiency_opportunities']:,}\n")
                f.write(f"Anomalies Detected: {exec_sum['anomalies_detected']:,}\n\n")
            
            # Risk Assessment
            if 'risk_assessment' in insights:
                f.write("RISK ASSESSMENT\n")
                f.write("-" * 40 + "\n")
                risk = insights['risk_assessment']
                f.write(f"Quality Risk Level: {risk['quality_risk_level']}\n")
                f.write(f"Quality Risk Rate: {risk['quality_risk_percentage']:.1f}%\n")
                f.write(f"Process Stability: {risk['process_stability']}\n")
                f.write(f"Overall Risk Score: {risk['overall_risk_score']:.1f}/100\n\n")
            
            # Business Impact
            if 'business_impact' in insights:
                f.write("BUSINESS IMPACT\n")
                f.write("-" * 40 + "\n")
                impact = insights['business_impact']
                f.write(f"Estimated Annual Savings: ${impact['estimated_annual_savings']:,.0f}\n")
                f.write(f"Efficiency Improvement Potential: {impact['efficiency_improvement_potential']:.1f}%\n")
                f.write(f"Quality Improvement Potential: {impact['quality_improvement_potential']:.1f}%\n")
                f.write(f"ROI Timeline: {impact['roi_timeline']}\n\n")
            
            # Recommendations
            if 'operational_recommendations' in insights:
                f.write("OPERATIONAL RECOMMENDATIONS\n")
                f.write("-" * 40 + "\n")
                for i, rec in enumerate(insights['operational_recommendations'], 1):
                    f.write(f"{i}. {rec['area']} (Priority: {rec['priority']})\n")
                    f.write(f"   Recommendation: {rec['recommendation']}\n")
                    f.write(f"   Expected Impact: {rec['expected_impact']}\n\n")
        
        created_files.append(assessment_file)
        print(f"   ✓ Assessment Insights: {os.path.basename(assessment_file)}")
    
    print(f"   ✅ Saved {len(created_files)} ML analysis files")
    return created_files

# Complete workflow for assessment
def complete_tolaram_assessment_with_ml(df_aufk=None, df_afko=None, df_afpo=None, df_aufm=None,
                                       df_qmel=None, df_qmfe=None, df_qmur=None, df_qmih=None,
                                       df_qpcd=None, df_qpct=None, df_qpgt=None, 
                                       df_crhd_v1=None, df_jest=None, df_plant_description=None):
    """
    Complete assessment workflow including ML analysis
    """
    print("🎯 COMPLETE TOLARAM ASSESSMENT WITH ML")
    print("=" * 80)
    
    try:
        # Step 1: Run basic integration (assuming functions are available)
        print("\n🔄 Step 1: Running Core Integration...")
        
        # This would need to import from your main integration script
        # For the assessment, you'll have these functions available
        comprehensive_df, summary_stats, quality_details = create_comprehensive_sap_view(
            df_aufk, df_afko, df_afpo, df_aufm, df_qmel, df_qmfe, df_qmur, 
            df_qmih, df_qpcd, df_qpct, df_qpgt, df_crhd_v1, df_jest, df_plant_description
        )
        
        # Step 2: Run ML analysis
        print("\n🤖 Step 2: Running ML Analysis...")
        ml_assessment = tolaram_ml_assessment_workflow(comprehensive_df)
        
        if ml_assessment is None:
            print("❌ ML analysis failed")
            return None
        
        # Step 3: Save results
        print("\n💾 Step 3: Saving Results...")
        results = run_complete_analysis_with_results_folder(
            df_aufk, df_afko, df_afpo, df_aufm, df_qmel, df_qmfe, df_qmur, 
            df_qmih, df_qpcd, df_qpct, df_qpgt, df_crhd_v1, df_jest, df_plant_description,
            result_folder="tolaram_assessment"
        )
        
        # Step 4: Save ML-specific results
        ml_files = save_ml_results_to_folder(ml_assessment, "tolaram_assessment")
        
        print("\n✅ ASSESSMENT COMPLETE!")
        print("=" * 80)
        print("📊 DELIVERABLES CREATED:")
        print("1. Executive Summary with ML insights")
        print("2. Comprehensive data analysis")
        print("3. ML model performance reports")
        print("4. Predictive analytics results")
        print("5. Risk assessment with ML validation")
        print("6. Actionable recommendations")
        
        return {
            'comprehensive_results': results,
            'ml_assessment': ml_assessment,
            'ml_files': ml_files,
            'session_folder': results['session_folder'] if results else None
        }
        
    except Exception as e:
        print(f"❌ Assessment failed: {e}")
        return None

# Usage instructions for the assessment
def assessment_usage_guide():
    """
    Guide for using ML analysis in the Tolaram assessment
    """
    print("📚 TOLARAM ASSESSMENT - ML USAGE GUIDE")
    print("=" * 60)
    
    print("\n1. LOAD YOUR DATA:")
    print("```python")
    print("import pandas as pd")
    print("")
    print("# Load the Excel file")
    print("excel_file = 'Project_Assessment_Data.xlsx'")
    print("sheet_names = pd.ExcelFile(excel_file).sheet_names")
    print("")
    print("# Load each sheet")
    print("tables = {}")
    print("for sheet in sheet_names:")
    print("    tables[f'df_{sheet.lower()}'] = pd.read_excel(excel_file, sheet_name=sheet)")
    print("```")
    
    print("\n2. RUN COMPLETE ANALYSIS WITH ML:")
    print("```python")
    print("# Run everything including ML")
    print("assessment_results = complete_tolaram_assessment_with_ml(")
    print("    df_aufk=tables.get('df_aufk'),")
    print("    df_afko=tables.get('df_afko'),")
    print("    df_afpo=tables.get('df_afpo'),")
    print("    df_qmel=tables.get('df_qmel'),")
    print("    # ... other tables as available")
    print(")")
    print("```")
    
    print("\n3. ACCESS ML INSIGHTS:")
    print("```python")
    print("# Get ML predictions")
    print("predictions = assessment_results['ml_assessment']['comprehensive_predictions']")
    print("")
    print("# Get business insights")
    print("insights = assessment_results['ml_assessment']['assessment_insights']")
    print("")
    print("# Print key findings")
    print("print('Risk Level:', insights['risk_assessment']['quality_risk_level'])")
    print("print('Potential Savings:', insights['business_impact']['estimated_annual_savings'])")
    print("```")
    
    print("\n4. ASSESSMENT REPORT SECTIONS:")
    print("• Executive Summary: ML-driven KPIs and risk assessment")
    print("• Predictive Models: Quality issue and efficiency prediction")
    print("• Pattern Analysis: Clustering and anomaly detection")
    print("• Business Impact: Quantified savings and ROI")
    print("• Recommendations: ML-backed improvement actions")
    
    print("\n5. KEY ML CAPABILITIES:")
    print("• Quality Issue Prediction (Classification)")
    print("• Production Efficiency Prediction (Regression)")
    print("• Quality Pattern Clustering (Unsupervised)")
    print("• Anomaly Detection (Outlier Analysis)")
    print("• Feature Importance Analysis")
    print("• Risk Scoring and Prioritization")

if __name__ == "__main__":
    assessment_usage_guide()

📚 TOLARAM ASSESSMENT - ML USAGE GUIDE

1. LOAD YOUR DATA:
```python
import pandas as pd

# Load the Excel file
excel_file = 'Project_Assessment_Data.xlsx'
sheet_names = pd.ExcelFile(excel_file).sheet_names

# Load each sheet
tables = {}
for sheet in sheet_names:
    tables[f'df_{sheet.lower()}'] = pd.read_excel(excel_file, sheet_name=sheet)
```

2. RUN COMPLETE ANALYSIS WITH ML:
```python
# Run everything including ML
assessment_results = complete_tolaram_assessment_with_ml(
    df_aufk=tables.get('df_aufk'),
    df_afko=tables.get('df_afko'),
    df_afpo=tables.get('df_afpo'),
    df_qmel=tables.get('df_qmel'),
    # ... other tables as available
)
```

3. ACCESS ML INSIGHTS:
```python
# Get ML predictions
predictions = assessment_results['ml_assessment']['comprehensive_predictions']

# Get business insights
insights = assessment_results['ml_assessment']['assessment_insights']

# Print key findings
print('Risk Level:', insights['risk_assessment']['quality_risk_level'])
print('Potentia

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')

class SAPBottleneckDetector:
    """
    Advanced Bottleneck Detection and Downtime Analysis for SAP Manufacturing Data
    """
    
    def __init__(self, comprehensive_df):
        self.comprehensive_df = comprehensive_df
        self.bottleneck_analysis = {}
        self.downtime_analysis = {}
        self.throughput_analysis = {}
        self.capacity_analysis = {}
        
    def detect_schedule_bottlenecks(self):
        """
        Detect bottlenecks based on scheduling and timing data
        """
        print("🕐 Analyzing Schedule-Based Bottlenecks...")
        
        df = self.comprehensive_df.copy()
        bottlenecks = {}
        
        # Analyze planned vs actual dates
        schedule_cols = {
            'planned_start': ['GSTRP', 'PLANNED_START_DATE'],
            'planned_end': ['GLTRP', 'PLANNED_END_DATE'], 
            'actual_start': ['GSTRS', 'ACTUAL_START_DATE'],
            'actual_end': ['GLTRS', 'ACTUAL_END_DATE']
        }
        
        date_analysis = {}
        
        for date_type, possible_cols in schedule_cols.items():
            for col in possible_cols:
                if col in df.columns:
                    try:
                        df[f'{date_type}_parsed'] = pd.to_datetime(df[col], errors='coerce')
                        date_analysis[date_type] = f'{date_type}_parsed'
                        break
                    except:
                        continue
        
        if len(date_analysis) >= 2:
            # Calculate delays and lead times
            if 'planned_start' in date_analysis and 'actual_start' in date_analysis:
                df['START_DELAY_DAYS'] = (df[date_analysis['actual_start']] - df[date_analysis['planned_start']]).dt.days
                
            if 'planned_end' in date_analysis and 'actual_end' in date_analysis:
                df['END_DELAY_DAYS'] = (df[date_analysis['actual_end']] - df[date_analysis['planned_end']]).dt.days
                
            if 'planned_start' in date_analysis and 'planned_end' in date_analysis:
                df['PLANNED_CYCLE_TIME'] = (df[date_analysis['planned_end']] - df[date_analysis['planned_start']]).dt.days
                
            if 'actual_start' in date_analysis and 'actual_end' in date_analysis:
                df['ACTUAL_CYCLE_TIME'] = (df[date_analysis['actual_end']] - df[date_analysis['actual_start']]).dt.days
            
            # Identify bottleneck patterns
            bottlenecks['schedule_delays'] = self._analyze_schedule_delays(df)
            
        else:
            print("   ⚠️  Insufficient date columns for schedule analysis")
            
        return bottlenecks
    
    def _analyze_schedule_delays(self, df):
        """
        Analyze schedule delays to identify bottlenecks
        """
        delay_analysis = {}
        
        # Analyze start delays
        if 'START_DELAY_DAYS' in df.columns:
            start_delays = df['START_DELAY_DAYS'].dropna()
            
            delay_analysis['start_delays'] = {
                'avg_delay': start_delays.mean(),
                'median_delay': start_delays.median(),
                'max_delay': start_delays.max(),
                'delayed_orders_count': (start_delays > 0).sum(),
                'delayed_orders_pct': (start_delays > 0).mean() * 100,
                'severe_delays': (start_delays > 7).sum()  # More than 1 week
            }
        
        # Analyze end delays
        if 'END_DELAY_DAYS' in df.columns:
            end_delays = df['END_DELAY_DAYS'].dropna()
            
            delay_analysis['end_delays'] = {
                'avg_delay': end_delays.mean(),
                'median_delay': end_delays.median(),
                'max_delay': end_delays.max(),
                'delayed_orders_count': (end_delays > 0).sum(),
                'delayed_orders_pct': (end_delays > 0).mean() * 100,
                'severe_delays': (end_delays > 7).sum()
            }
        
        # Analyze cycle time variations
        if 'PLANNED_CYCLE_TIME' in df.columns and 'ACTUAL_CYCLE_TIME' in df.columns:
            cycle_variance = df['ACTUAL_CYCLE_TIME'] - df['PLANNED_CYCLE_TIME']
            
            delay_analysis['cycle_time_variance'] = {
                'avg_variance': cycle_variance.mean(),
                'std_variance': cycle_variance.std(),
                'orders_over_planned': (cycle_variance > 0).sum(),
                'avg_overrun': cycle_variance[cycle_variance > 0].mean() if (cycle_variance > 0).any() else 0
            }
        
        return delay_analysis
    
    def detect_plant_bottlenecks(self):
        """
        Detect bottlenecks by analyzing plant performance
        """
        print("🏭 Analyzing Plant-Level Bottlenecks...")
        
        df = self.comprehensive_df.copy()
        plant_bottlenecks = {}
        
        # Find plant column
        plant_col = None
        for col in ['Plant_Name', 'Plant_Code', 'WERKS', 'PWERK']:
            if col in df.columns and df[col].notna().sum() > 0:
                plant_col = col
                break
        
        if not plant_col:
            print("   ⚠️  No plant column found")
            return {}
        
        # Plant performance metrics
        plant_metrics = df.groupby(plant_col).agg({
            'AUFNR': 'count',
            'QUALITY_NOTIF_COUNT': ['sum', 'mean'],
            'DEFECT_COUNT': ['sum', 'mean'],
            'ORDER_ITEM_COUNT': ['sum', 'mean'],
            'GOODS_MOVEMENT_COUNT': ['sum', 'mean'],
            'QUALITY_SCORE': 'mean',
            'PRODUCTION_EFFICIENCY': 'mean'
        }).round(3)
        
        # Flatten column names
        plant_metrics.columns = ['_'.join(col).strip() for col in plant_metrics.columns]
        
        # Calculate bottleneck indicators
        plant_metrics['THROUGHPUT_SCORE'] = plant_metrics['AUFNR_count'] / plant_metrics['AUFNR_count'].max()
        plant_metrics['QUALITY_BURDEN'] = (
            plant_metrics['QUALITY_NOTIF_COUNT_sum'] + plant_metrics['DEFECT_COUNT_sum']
        ) / plant_metrics['AUFNR_count']
        
        # Identify bottleneck plants
        # Low throughput + high quality issues = bottleneck
        plant_metrics['BOTTLENECK_SCORE'] = (
            (1 - plant_metrics['THROUGHPUT_SCORE']) * 0.4 +
            (plant_metrics['QUALITY_BURDEN'] / plant_metrics['QUALITY_BURDEN'].max()) * 0.3 +
            (1 - plant_metrics.get('PRODUCTION_EFFICIENCY_mean', pd.Series([0])) / 100) * 0.3
        )
        
        # Rank plants by bottleneck risk
        bottleneck_ranking = plant_metrics.sort_values('BOTTLENECK_SCORE', ascending=False)
        
        plant_bottlenecks = {
            'plant_metrics': plant_metrics,
            'bottleneck_ranking': bottleneck_ranking,
            'top_bottleneck_plants': bottleneck_ranking.head(3).index.tolist(),
            'bottleneck_indicators': {
                'low_throughput_plants': plant_metrics[plant_metrics['THROUGHPUT_SCORE'] < 0.7].index.tolist(),
                'high_quality_burden_plants': plant_metrics[
                    plant_metrics['QUALITY_BURDEN'] > plant_metrics['QUALITY_BURDEN'].quantile(0.8)
                ].index.tolist(),
                'low_efficiency_plants': plant_metrics[
                    plant_metrics.get('PRODUCTION_EFFICIENCY_mean', pd.Series([100])) < 85
                ].index.tolist()
            }
        }
        
        print(f"   ✓ Analyzed {len(plant_metrics)} plants")
        print(f"   🚨 Top bottleneck plants: {', '.join(plant_bottlenecks['top_bottleneck_plants'][:3])}")
        
        return plant_bottlenecks
    
    def detect_material_bottlenecks(self):
        """
        Detect bottlenecks related to materials and supply chain
        """
        print("📦 Analyzing Material-Related Bottlenecks...")
        
        df = self.comprehensive_df.copy()
        material_bottlenecks = {}
        
        if 'ORDER_MATERIALS' not in df.columns:
            print("   ⚠️  No material information available")
            return {}
        
        # Extract individual materials from comma-separated lists
        all_materials = []
        material_orders = []
        
        for idx, row in df.iterrows():
            if pd.notna(row.get('ORDER_MATERIALS')):
                materials = str(row['ORDER_MATERIALS']).split(', ')
                for material in materials[:3]:  # Top 3 materials per order
                    all_materials.append(material.strip())
                    material_orders.append({
                        'MATERIAL': material.strip(),
                        'ORDER_INDEX': idx,
                        'QUALITY_NOTIF_COUNT': row.get('QUALITY_NOTIF_COUNT', 0),
                        'DEFECT_COUNT': row.get('DEFECT_COUNT', 0),
                        'PRODUCTION_EFFICIENCY': row.get('PRODUCTION_EFFICIENCY', 100),
                        'GOODS_MOVEMENT_COUNT': row.get('GOODS_MOVEMENT_COUNT', 0)
                    })
        
        if not material_orders:
            print("   ⚠️  No material data to analyze")
            return {}
        
        material_df = pd.DataFrame(material_orders)
        
        # Analyze material performance
        material_analysis = material_df.groupby('MATERIAL').agg({
            'ORDER_INDEX': 'count',
            'QUALITY_NOTIF_COUNT': ['sum', 'mean'],
            'DEFECT_COUNT': ['sum', 'mean'],
            'PRODUCTION_EFFICIENCY': 'mean',
            'GOODS_MOVEMENT_COUNT': ['sum', 'mean']
        }).round(3)
        
        # Flatten columns
        material_analysis.columns = ['_'.join(col).strip() for col in material_analysis.columns]
        
        # Calculate material bottleneck scores
        material_analysis['USAGE_FREQUENCY'] = material_analysis['ORDER_INDEX_count']
        material_analysis['QUALITY_ISSUES_PER_ORDER'] = (
            material_analysis['QUALITY_NOTIF_COUNT_sum'] + material_analysis['DEFECT_COUNT_sum']
        ) / material_analysis['ORDER_INDEX_count']
        
        material_analysis['MATERIAL_BOTTLENECK_SCORE'] = (
            (material_analysis['QUALITY_ISSUES_PER_ORDER'] / material_analysis['QUALITY_ISSUES_PER_ORDER'].max()) * 0.5 +
            (1 - material_analysis['PRODUCTION_EFFICIENCY_mean'] / 100) * 0.3 +
            (material_analysis['USAGE_FREQUENCY'] / material_analysis['USAGE_FREQUENCY'].max()) * 0.2
        )
        
        # Identify problematic materials
        high_risk_materials = material_analysis[
            material_analysis['MATERIAL_BOTTLENECK_SCORE'] > material_analysis['MATERIAL_BOTTLENECK_SCORE'].quantile(0.8)
        ].sort_values('MATERIAL_BOTTLENECK_SCORE', ascending=False)
        
        material_bottlenecks = {
            'material_analysis': material_analysis,
            'high_risk_materials': high_risk_materials,
            'top_problematic_materials': high_risk_materials.head(5).index.tolist(),
            'material_quality_issues': material_analysis.sort_values('QUALITY_ISSUES_PER_ORDER', ascending=False).head(10),
            'most_used_materials': material_analysis.sort_values('USAGE_FREQUENCY', ascending=False).head(10)
        }
        
        print(f"   ✓ Analyzed {len(material_analysis)} unique materials")
        print(f"   🚨 High-risk materials: {len(high_risk_materials)}")
        
        return material_bottlenecks
    
    def detect_work_center_bottlenecks(self):
        """
        Detect bottlenecks at work center level
        """
        print("⚙️ Analyzing Work Center Bottlenecks...")
        
        df = self.comprehensive_df.copy()
        wc_bottlenecks = {}
        
        # Find work center related columns
        wc_cols = [col for col in df.columns if any(keyword in col.upper() for keyword in ['ARBPL', 'WORK_CENTER', 'WC'])]
        
        if not wc_cols:
            print("   ⚠️  No work center information available")
            return {}
        
        wc_col = wc_cols[0]  # Use first available work center column
        
        if df[wc_col].notna().sum() == 0:
            print("   ⚠️  Work center column is empty")
            return {}
        
        # Analyze work center performance
        wc_analysis = df.groupby(wc_col).agg({
            'AUFNR': 'count',
            'QUALITY_NOTIF_COUNT': ['sum', 'mean'],
            'DEFECT_COUNT': ['sum', 'mean'], 
            'PRODUCTION_EFFICIENCY': 'mean',
            'ORDER_ITEM_COUNT': ['sum', 'mean'],
            'GOODS_MOVEMENT_COUNT': ['sum', 'mean']
        }).round(3)
        
        # Flatten columns
        wc_analysis.columns = ['_'.join(col).strip() for col in wc_analysis.columns]
        
        # Calculate work center utilization and bottleneck indicators
        wc_analysis['THROUGHPUT'] = wc_analysis['AUFNR_count']
        wc_analysis['UTILIZATION_SCORE'] = wc_analysis['THROUGHPUT'] / wc_analysis['THROUGHPUT'].max()
        wc_analysis['QUALITY_BURDEN'] = (
            wc_analysis['QUALITY_NOTIF_COUNT_sum'] + wc_analysis['DEFECT_COUNT_sum']
        ) / wc_analysis['AUFNR_count']
        
        # Work center bottleneck score
        wc_analysis['WC_BOTTLENECK_SCORE'] = (
            (1 - wc_analysis['UTILIZATION_SCORE']) * 0.3 +
            (wc_analysis['QUALITY_BURDEN'] / wc_analysis['QUALITY_BURDEN'].max()) * 0.4 +
            (1 - wc_analysis.get('PRODUCTION_EFFICIENCY_mean', pd.Series([100])) / 100) * 0.3
        )
        
        # Identify bottleneck work centers
        bottleneck_wcs = wc_analysis.sort_values('WC_BOTTLENECK_SCORE', ascending=False)
        
        wc_bottlenecks = {
            'work_center_analysis': wc_analysis,
            'bottleneck_ranking': bottleneck_wcs,
            'top_bottleneck_work_centers': bottleneck_wcs.head(5).index.tolist(),
            'underutilized_work_centers': wc_analysis[wc_analysis['UTILIZATION_SCORE'] < 0.5].index.tolist(),
            'high_quality_burden_wcs': wc_analysis[
                wc_analysis['QUALITY_BURDEN'] > wc_analysis['QUALITY_BURDEN'].quantile(0.8)
            ].index.tolist()
        }
        
        print(f"   ✓ Analyzed {len(wc_analysis)} work centers")
        print(f"   🚨 Top bottleneck work centers: {', '.join(bottleneck_wcs.head(3).index.astype(str).tolist())}")
        
        return wc_bottlenecks
    
    def detect_downtime_patterns(self):
        """
        Detect downtime patterns and maintenance-related bottlenecks
        """
        print("⏰ Analyzing Downtime Patterns...")
        
        df = self.comprehensive_df.copy()
        downtime_analysis = {}
        
        # Look for maintenance-related indicators
        maintenance_indicators = []
        
        # Check for maintenance notifications
        if 'HAS_MAINTENANCE' in df.columns:
            maintenance_orders = df[df['HAS_MAINTENANCE'] == True]
            maintenance_indicators.append(('maintenance_notifications', len(maintenance_orders)))
        
        # Check for quality issues that might indicate downtime
        if 'QUALITY_NOTIF_COUNT' in df.columns:
            high_quality_issues = df[df['QUALITY_NOTIF_COUNT'] > df['QUALITY_NOTIF_COUNT'].quantile(0.9)]
            maintenance_indicators.append(('high_quality_issues', len(high_quality_issues)))
        
        # Analyze production gaps (if we have date information)
        if 'ERDAT' in df.columns:
            try:
                df['ERDAT_parsed'] = pd.to_datetime(df['ERDAT'], errors='coerce')
                df_sorted = df.sort_values('ERDAT_parsed')
                
                # Calculate gaps between orders
                df_sorted['TIME_GAP'] = df_sorted['ERDAT_parsed'].diff().dt.days
                
                # Identify unusual gaps (potential downtime)
                gap_threshold = df_sorted['TIME_GAP'].quantile(0.95)  # 95th percentile
                unusual_gaps = df_sorted[df_sorted['TIME_GAP'] > gap_threshold]
                
                downtime_analysis['production_gaps'] = {
                    'total_unusual_gaps': len(unusual_gaps),
                    'avg_gap_days': unusual_gaps['TIME_GAP'].mean() if len(unusual_gaps) > 0 else 0,
                    'max_gap_days': unusual_gaps['TIME_GAP'].max() if len(unusual_gaps) > 0 else 0,
                    'gap_threshold': gap_threshold
                }
                
            except Exception as e:
                print(f"   ⚠️  Date analysis failed: {e}")
        
        # Analyze efficiency drops (potential downtime indicators)
        if 'PRODUCTION_EFFICIENCY' in df.columns:
            low_efficiency = df[df['PRODUCTION_EFFICIENCY'] < 70]  # Below 70% efficiency
            
            downtime_analysis['efficiency_drops'] = {
                'low_efficiency_orders': len(low_efficiency),
                'low_efficiency_percentage': len(low_efficiency) / len(df) * 100,
                'avg_efficiency_drop': (100 - low_efficiency['PRODUCTION_EFFICIENCY'].mean()) if len(low_efficiency) > 0 else 0
            }
        
        # Estimate downtime impact
        if 'PLANNED_CYCLE_TIME' in df.columns and 'ACTUAL_CYCLE_TIME' in df.columns:
            cycle_overruns = df[df['ACTUAL_CYCLE_TIME'] > df['PLANNED_CYCLE_TIME']]
            total_overrun_days = (cycle_overruns['ACTUAL_CYCLE_TIME'] - cycle_overruns['PLANNED_CYCLE_TIME']).sum()
            
            downtime_analysis['cycle_time_analysis'] = {
                'orders_with_overruns': len(cycle_overruns),
                'total_overrun_days': total_overrun_days,
                'avg_overrun_per_order': total_overrun_days / len(cycle_overruns) if len(cycle_overruns) > 0 else 0
            }
        
        # Overall downtime risk assessment
        risk_factors = []
        
        if downtime_analysis.get('production_gaps', {}).get('total_unusual_gaps', 0) > 0:
            risk_factors.append('production_gaps')
        
        if downtime_analysis.get('efficiency_drops', {}).get('low_efficiency_percentage', 0) > 10:
            risk_factors.append('efficiency_issues')
        
        if len([ind for ind_name, count in maintenance_indicators if count > 0]) > 0:
            risk_factors.append('maintenance_issues')
        
        downtime_analysis['risk_assessment'] = {
            'risk_factors': risk_factors,
            'risk_level': 'HIGH' if len(risk_factors) >= 2 else 'MEDIUM' if len(risk_factors) == 1 else 'LOW',
            'maintenance_indicators': dict(maintenance_indicators)
        }
        
        print(f"   ✓ Downtime risk level: {downtime_analysis.get('risk_assessment', {}).get('risk_level', 'UNKNOWN')}")
        print(f"   📊 Risk factors identified: {len(risk_factors)}")
        
        return downtime_analysis
    
    def analyze_throughput_bottlenecks(self):
        """
        Analyze throughput bottlenecks across different dimensions
        """
        print("📈 Analyzing Throughput Bottlenecks...")
        
        df = self.comprehensive_df.copy()
        throughput_analysis = {}
        
        # Time-based throughput analysis
        if 'ERDAT' in df.columns:
            try:
                df['ERDAT_parsed'] = pd.to_datetime(df['ERDAT'], errors='coerce')
                df['YEAR_MONTH'] = df['ERDAT_parsed'].dt.to_period('M')
                
                monthly_throughput = df.groupby('YEAR_MONTH').agg({
                    'AUFNR': 'count',
                    'ORDER_ITEM_COUNT': 'sum',
                    'GOODS_MOVEMENT_COUNT': 'sum'
                })
                
                # Identify throughput bottlenecks (low-throughput periods)
                throughput_threshold = monthly_throughput['AUFNR'].quantile(0.25)  # Bottom 25%
                low_throughput_periods = monthly_throughput[monthly_throughput['AUFNR'] < throughput_threshold]
                
                throughput_analysis['temporal_analysis'] = {
                    'monthly_throughput': monthly_throughput,
                    'low_throughput_periods': low_throughput_periods,
                    'throughput_variability': monthly_throughput['AUFNR'].std() / monthly_throughput['AUFNR'].mean()
                }
                
            except Exception as e:
                print(f"   ⚠️  Temporal analysis failed: {e}")
        
        # Order type throughput analysis
        if 'AUART' in df.columns:
            order_type_throughput = df.groupby('AUART').agg({
                'AUFNR': 'count',
                'PRODUCTION_EFFICIENCY': 'mean',
                'QUALITY_NOTIF_COUNT': 'mean'
            })
            
            # Identify slow order types
            slow_order_types = order_type_throughput[
                order_type_throughput['PRODUCTION_EFFICIENCY'] < order_type_throughput['PRODUCTION_EFFICIENCY'].median()
            ]
            
            throughput_analysis['order_type_analysis'] = {
                'order_type_throughput': order_type_throughput,
                'slow_order_types': slow_order_types.index.tolist(),
                'fastest_order_type': order_type_throughput['PRODUCTION_EFFICIENCY'].idxmax(),
                'slowest_order_type': order_type_throughput['PRODUCTION_EFFICIENCY'].idxmin()
            }
        
        # Plant throughput analysis
        plant_col = None
        for col in ['Plant_Name', 'Plant_Code', 'WERKS']:
            if col in df.columns and df[col].notna().sum() > 0:
                plant_col = col
                break
        
        if plant_col:
            plant_throughput = df.groupby(plant_col).agg({
                'AUFNR': 'count',
                'ORDER_ITEM_COUNT': ['sum', 'mean'],
                'PRODUCTION_EFFICIENCY': 'mean',
                'GOODS_MOVEMENT_COUNT': ['sum', 'mean']
            })
            
            # Flatten columns
            plant_throughput.columns = ['_'.join(col).strip() for col in plant_throughput.columns]
            
            # Calculate throughput scores
            plant_throughput['THROUGHPUT_SCORE'] = (
                plant_throughput['AUFNR_count'] / plant_throughput['AUFNR_count'].max() * 0.4 +
                plant_throughput.get('PRODUCTION_EFFICIENCY_mean', pd.Series([0])) / 100 * 0.4 +
                plant_throughput['GOODS_MOVEMENT_COUNT_sum'] / plant_throughput['GOODS_MOVEMENT_COUNT_sum'].max() * 0.2
            )
            
            bottleneck_plants = plant_throughput[plant_throughput['THROUGHPUT_SCORE'] < 0.6]
            
            throughput_analysis['plant_analysis'] = {
                'plant_throughput': plant_throughput,
                'bottleneck_plants': bottleneck_plants.index.tolist(),
                'best_performing_plant': plant_throughput['THROUGHPUT_SCORE'].idxmax(),
                'worst_performing_plant': plant_throughput['THROUGHPUT_SCORE'].idxmin()
            }
        
        print(f"   ✓ Throughput analysis completed")
        
        return throughput_analysis
    
    def generate_bottleneck_recommendations(self):
        """
        Generate specific recommendations based on bottleneck analysis
        """
        print("💡 Generating Bottleneck Recommendations...")
        
        recommendations = {
            'immediate_actions': [],
            'process_improvements': [],
            'capacity_optimizations': [],
            'maintenance_actions': [],
            'supply_chain_actions': []
        }
        
        # Plant bottleneck recommendations
        if 'plant_bottlenecks' in self.bottleneck_analysis:
            plant_data = self.bottleneck_analysis['plant_bottlenecks']
            
            if plant_data.get('top_bottleneck_plants'):
                top_bottleneck = plant_data['top_bottleneck_plants'][0]
                recommendations['immediate_actions'].append({
                    'action': f'Conduct detailed capacity analysis at {top_bottleneck}',
                    'priority': 'HIGH',
                    'timeline': '1 week',
                    'category': 'Plant Optimization'
                })
            
            if plant_data.get('bottleneck_indicators', {}).get('low_efficiency_plants'):
                recommendations['process_improvements'].append({
                    'action': f'Implement lean manufacturing at plants: {", ".join(plant_data["bottleneck_indicators"]["low_efficiency_plants"][:3])}',
                    'priority': 'MEDIUM',
                    'timeline': '2-3 months',
                    'category': 'Efficiency Improvement'
                })
        
        # Material bottleneck recommendations
        if 'material_bottlenecks' in self.bottleneck_analysis:
            material_data = self.bottleneck_analysis['material_bottlenecks']
            
            if material_data.get('top_problematic_materials'):
                problematic_materials = material_data['top_problematic_materials'][:3]
                recommendations['supply_chain_actions'].append({
                    'action': f'Review supplier quality agreements for materials: {", ".join(problematic_materials)}',
                    'priority': 'HIGH',
                    'timeline': '2 weeks',
                    'category': 'Supply Chain'
                })
        
        # Work center bottleneck recommendations
        if 'work_center_bottlenecks' in self.bottleneck_analysis:
            wc_data = self.bottleneck_analysis['work_center_bottlenecks']
            
            if wc_data.get('top_bottleneck_work_centers'):
                bottleneck_wcs = wc_data['top_bottleneck_work_centers'][:2]
                recommendations['capacity_optimizations'].append({
                    'action': f'Increase capacity or redistribute workload for work centers: {", ".join(map(str, bottleneck_wcs))}',
                    'priority': 'HIGH',
                    'timeline': '1 month',
                    'category': 'Capacity Management'
                })
        
        # Downtime recommendations
        if 'downtime_analysis' in self.bottleneck_analysis:
            downtime_data = self.bottleneck_analysis['downtime_analysis']
            
            if downtime_data.get('risk_assessment', {}).get('risk_level') == 'HIGH':
                recommendations['maintenance_actions'].append({
                    'action': 'Implement predictive maintenance program to reduce unplanned downtime',
                    'priority': 'HIGH',
                    'timeline': '3 months',
                    'category': 'Maintenance'
                })
            
            if downtime_data.get('efficiency_drops', {}).get('low_efficiency_percentage', 0) > 10:
                recommendations['process_improvements'].append({
                    'action': f'Investigate root causes of efficiency drops affecting {downtime_data["efficiency_drops"]["low_efficiency_percentage"]:.1f}% of orders',
                    'priority': 'MEDIUM',
                    'timeline': '1 month',
                    'category': 'Process Analysis'
                })
        
        # Throughput recommendations
        if 'throughput_analysis' in self.bottleneck_analysis:
            throughput_data = self.bottleneck_analysis['throughput_analysis']
            
            if throughput_data.get('plant_analysis', {}).get('bottleneck_plants'):
                recommendations['capacity_optimizations'].append({
                    'action': f'Balance workload distribution across plants, reduce load on: {", ".join(throughput_data["plant_analysis"]["bottleneck_plants"][:2])}',
                    'priority': 'MEDIUM',
                    'timeline': '2 months',
                    'category': 'Load Balancing'
                })
        
        print(f"   ✓ Generated {sum(len(recs) for recs in recommendations.values())} recommendations")
        
        return recommendations
    
    def run_complete_bottleneck_analysis(self):
        """
        Run complete bottleneck and downtime analysis
        """
        print("🔍 RUNNING COMPLETE BOTTLENECK ANALYSIS")
        print("=" * 60)
        
        # Run all bottleneck detection methods
        self.bottleneck_analysis['schedule_bottlenecks'] = self.detect_schedule_bottlenecks()
        self.bottleneck_analysis['plant_bottlenecks'] = self.detect_plant_bottlenecks()
        self.bottleneck_analysis['material_bottlenecks'] = self.detect_material_bottlenecks()
        self.bottleneck_analysis['work_center_bottlenecks'] = self.detect_work_center_bottlenecks()
        self.bottleneck_analysis['downtime_analysis'] = self.detect_downtime_patterns()
        self.bottleneck_analysis['throughput_analysis'] = self.analyze_throughput_bottlenecks()
        
        # Generate recommendations
        recommendations = self.generate_bottleneck_recommendations()
        self.bottleneck_analysis['recommendations'] = recommendations
        
        # Create summary
        summary = self.create_bottleneck_summary()
        self.bottleneck_analysis['summary'] = summary
        
        print("\n✅ Bottleneck Analysis Complete!")
        print(f"   🎯 Total bottlenecks identified: {summary['total_bottlenecks']}")
        print(f"   ⚠️  Critical issues: {summary['critical_issues']}")
        print(f"   💡 Recommendations generated: {summary['total_recommendations']}")
        
        return self.bottleneck_analysis
    
    def create_bottleneck_summary(self):
        """
        Create executive summary of bottleneck analysis
        """
        summary = {
            'total_bottlenecks': 0,
            'critical_issues': 0,
            'total_recommendations': 0,
            'bottleneck_categories': {},
            'priority_actions': [],
            'estimated_impact': {}
        }
        
        # Count bottlenecks by category
        categories = ['plant_bottlenecks', 'material_bottlenecks', 'work_center_bottlenecks']
        
        for category in categories:
            if category in self.bottleneck_analysis:
                data = self.bottleneck_analysis[category]
                
                if category == 'plant_bottlenecks':
                    count = len(data.get('top_bottleneck_plants', []))
                elif category == 'material_bottlenecks':
                    count = len(data.get('top_problematic_materials', []))
                elif category == 'work_center_bottlenecks':
                    count = len(data.get('top_bottleneck_work_centers', []))
                else:
                    count = 0
                
                summary['bottleneck_categories'][category] = count
                summary['total_bottlenecks'] += count
        
        # Count critical issues
        if 'downtime_analysis' in self.bottleneck_analysis:
            downtime_data = self.bottleneck_analysis['downtime_analysis']
            if downtime_data.get('risk_assessment', {}).get('risk_level') == 'HIGH':
                summary['critical_issues'] += 1
        
        # Count recommendations
        if 'recommendations' in self.bottleneck_analysis:
            recs = self.bottleneck_analysis['recommendations']
            summary['total_recommendations'] = sum(len(rec_list) for rec_list in recs.values())
            
            # Extract priority actions
            for rec_list in recs.values():
                for rec in rec_list:
                    if rec.get('priority') == 'HIGH':
                        summary['priority_actions'].append(rec['action'])
        
        # Estimate impact
        summary['estimated_impact'] = {
            'potential_efficiency_gain': '15-25%',
            'downtime_reduction': '20-30%',
            'quality_improvement': '10-20%',
            'cost_savings_potential': 'High'
        }
        
        return summary

def create_downtime_prediction_model(comprehensive_df):
    """
    Create ML model to predict potential downtime events
    """
    print("🤖 Creating Downtime Prediction Model...")
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import classification_report, confusion_matrix
    
    df = comprehensive_df.copy()
    
    # Create downtime indicators (target variable)
    downtime_indicators = []
    
    # Low efficiency as downtime indicator
    if 'PRODUCTION_EFFICIENCY' in df.columns:
        downtime_indicators.append(df['PRODUCTION_EFFICIENCY'] < 70)
    
    # High quality issues as downtime indicator
    if 'QUALITY_NOTIF_COUNT' in df.columns:
        downtime_indicators.append(df['QUALITY_NOTIF_COUNT'] > df['QUALITY_NOTIF_COUNT'].quantile(0.8))
    
    # Maintenance issues as downtime indicator
    if 'HAS_MAINTENANCE' in df.columns:
        downtime_indicators.append(df['HAS_MAINTENANCE'] == True)
    
    if not downtime_indicators:
        print("   ⚠️  No downtime indicators available for modeling")
        return None
    
    # Combine indicators
    df['POTENTIAL_DOWNTIME'] = pd.DataFrame(downtime_indicators).T.any(axis=1).astype(int)
    
    # Prepare features
    feature_cols = []
    
    # Numeric features
    numeric_features = ['ORDER_ITEM_COUNT', 'GOODS_MOVEMENT_COUNT', 'QUALITY_NOTIF_COUNT', 'DEFECT_COUNT']
    for col in numeric_features:
        if col in df.columns:
            feature_cols.append(col)
    
    # Categorical features (encoded)
    if 'AUART' in df.columns:
        df['AUART_ENCODED'] = pd.Categorical(df['AUART']).codes
        feature_cols.append('AUART_ENCODED')
    
    # Plant features
    plant_col = None
    for col in ['Plant_Code', 'WERKS', 'PWERK']:
        if col in df.columns and df[col].notna().sum() > 0:
            df['PLANT_ENCODED'] = pd.Categorical(df[col]).codes
            feature_cols.append('PLANT_ENCODED')
            break
    
    if len(feature_cols) < 3:
        print("   ⚠️  Insufficient features for downtime modeling")
        return None
    
    # Prepare data
    X = df[feature_cols].fillna(0)
    y = df['POTENTIAL_DOWNTIME']
    
    # Remove samples where target is NaN
    mask = ~pd.isna(y)
    X, y = X[mask], y[mask]
    
    if len(X) < 50 or y.sum() < 5:
        print(f"   ⚠️  Insufficient data for modeling (samples: {len(X)}, positive cases: {y.sum()})")
        return None
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    accuracy = (y_pred == y_test).mean()
    
    # Feature importance
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Predict for all orders
    downtime_probabilities = model.predict_proba(X)[:, 1]
    
    results = {
        'model': model,
        'accuracy': accuracy,
        'feature_importance': importance_df,
        'downtime_probabilities': downtime_probabilities,
        'high_risk_orders': X.index[downtime_probabilities > 0.7].tolist(),
        'prediction_summary': {
            'total_orders': len(X),
            'high_risk_count': (downtime_probabilities > 0.7).sum(),
            'medium_risk_count': ((downtime_probabilities > 0.4) & (downtime_probabilities <= 0.7)).sum(),
            'low_risk_count': (downtime_probabilities <= 0.4).sum()
        }
    }
    
    print(f"   ✓ Model trained with {accuracy:.3f} accuracy")
    print(f"   📊 High-risk orders identified: {results['prediction_summary']['high_risk_count']}")
    
    return results

def create_capacity_optimization_analysis(comprehensive_df):
    """
    Analyze capacity optimization opportunities
    """
    print("📊 Creating Capacity Optimization Analysis...")
    
    df = comprehensive_df.copy()
    optimization_analysis = {}
    
    # Plant capacity analysis
    plant_col = None
    for col in ['Plant_Name', 'Plant_Code', 'WERKS']:
        if col in df.columns and df[col].notna().sum() > 0:
            plant_col = col
            break
    
    if plant_col:
        # Current utilization
        plant_utilization = df.groupby(plant_col).agg({
            'AUFNR': 'count',
            'ORDER_ITEM_COUNT': 'sum',
            'PRODUCTION_EFFICIENCY': 'mean',
            'QUALITY_NOTIF_COUNT': 'sum'
        })
        
        # Calculate relative utilization
        max_orders = plant_utilization['AUFNR'].max()
        plant_utilization['UTILIZATION_RATE'] = plant_utilization['AUFNR'] / max_orders
        plant_utilization['CAPACITY_AVAILABLE'] = 1 - plant_utilization['UTILIZATION_RATE']
        
        # Identify optimization opportunities
        underutilized_plants = plant_utilization[plant_utilization['UTILIZATION_RATE'] < 0.6]
        overutilized_plants = plant_utilization[plant_utilization['UTILIZATION_RATE'] > 0.9]
        
        optimization_analysis['plant_capacity'] = {
            'utilization_analysis': plant_utilization,
            'underutilized_plants': underutilized_plants.index.tolist(),
            'overutilized_plants': overutilized_plants.index.tolist(),
            'optimization_potential': underutilized_plants['CAPACITY_AVAILABLE'].sum() * 100
        }
    
    # Load balancing opportunities
    if 'AUART' in df.columns and plant_col:
        order_distribution = df.groupby([plant_col, 'AUART']).size().unstack(fill_value=0)
        
        # Calculate distribution efficiency
        distribution_variance = order_distribution.var(axis=0)
        imbalanced_order_types = distribution_variance[distribution_variance > distribution_variance.quantile(0.8)].index.tolist()
        
        optimization_analysis['load_balancing'] = {
            'order_distribution': order_distribution,
            'imbalanced_order_types': imbalanced_order_types,
            'rebalancing_opportunities': len(imbalanced_order_types)
        }
    
    # Efficiency optimization
    if 'PRODUCTION_EFFICIENCY' in df.columns:
        low_efficiency_threshold = df['PRODUCTION_EFFICIENCY'].quantile(0.25)
        improvement_opportunities = df[df['PRODUCTION_EFFICIENCY'] < low_efficiency_threshold]
        
        potential_gain = (85 - improvement_opportunities['PRODUCTION_EFFICIENCY']).sum()  # Target 85% efficiency
        
        optimization_analysis['efficiency_optimization'] = {
            'low_efficiency_orders': len(improvement_opportunities),
            'current_avg_efficiency': improvement_opportunities['PRODUCTION_EFFICIENCY'].mean(),
            'potential_efficiency_gain': potential_gain,
            'target_efficiency': 85
        }
    
    # Resource allocation optimization
    if plant_col and 'ORDER_ITEM_COUNT' in df.columns:
        resource_allocation = df.groupby(plant_col).agg({
            'ORDER_ITEM_COUNT': ['sum', 'mean'],
            'AUFNR': 'count',
            'PRODUCTION_EFFICIENCY': 'mean'
        })
        
        # Calculate resource efficiency
        resource_allocation.columns = ['_'.join(col).strip() for col in resource_allocation.columns]
        resource_allocation['RESOURCE_EFFICIENCY'] = (
            resource_allocation['ORDER_ITEM_COUNT_sum'] / resource_allocation['AUFNR_count'] *
            resource_allocation['PRODUCTION_EFFICIENCY_mean'] / 100
        )
        
        optimization_analysis['resource_allocation'] = {
            'current_allocation': resource_allocation,
            'most_efficient_plant': resource_allocation['RESOURCE_EFFICIENCY'].idxmax(),
            'least_efficient_plant': resource_allocation['RESOURCE_EFFICIENCY'].idxmin(),
            'efficiency_gap': resource_allocation['RESOURCE_EFFICIENCY'].max() - resource_allocation['RESOURCE_EFFICIENCY'].min()
        }
    
    print(f"   ✓ Capacity optimization analysis completed")
    
    return optimization_analysis

def save_bottleneck_analysis_results(bottleneck_analysis, downtime_model, capacity_analysis, result_folder="result"):
    """
    Save bottleneck analysis results to organized folder
    """
    print("\n💾 Saving Bottleneck Analysis Results...")
    
    import os
    import json
    from datetime import datetime
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Create bottleneck-specific folder
    bottleneck_folder = os.path.join(result_folder, f"10_bottleneck_analysis_{timestamp}")
    os.makedirs(bottleneck_folder, exist_ok=True)
    
    created_files = []
    
    # 1. Bottleneck Summary Report
    summary_file = os.path.join(bottleneck_folder, f"bottleneck_summary_report_{timestamp}.txt")
    
    with open(summary_file, 'w') as f:
        f.write("BOTTLENECK AND DOWNTIME ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        
        # Executive Summary
        if 'summary' in bottleneck_analysis:
            summary = bottleneck_analysis['summary']
            f.write("EXECUTIVE SUMMARY\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total Bottlenecks Identified: {summary['total_bottlenecks']}\n")
            f.write(f"Critical Issues: {summary['critical_issues']}\n")
            f.write(f"Recommendations Generated: {summary['total_recommendations']}\n\n")
            
            f.write("BOTTLENECK BREAKDOWN:\n")
            for category, count in summary['bottleneck_categories'].items():
                f.write(f"  {category.replace('_', ' ').title()}: {count}\n")
            f.write("\n")
        
        # Plant Bottlenecks
        if 'plant_bottlenecks' in bottleneck_analysis:
            plant_data = bottleneck_analysis['plant_bottlenecks']
            f.write("PLANT BOTTLENECKS\n")
            f.write("-" * 40 + "\n")
            
            if plant_data.get('top_bottleneck_plants'):
                f.write(f"Top Bottleneck Plants: {', '.join(plant_data['top_bottleneck_plants'][:5])}\n")
            
            indicators = plant_data.get('bottleneck_indicators', {})
            if indicators.get('low_efficiency_plants'):
                f.write(f"Low Efficiency Plants: {', '.join(indicators['low_efficiency_plants'][:5])}\n")
            f.write("\n")
        
        # Material Bottlenecks
        if 'material_bottlenecks' in bottleneck_analysis:
            material_data = bottleneck_analysis['material_bottlenecks']
            f.write("MATERIAL BOTTLENECKS\n")
            f.write("-" * 40 + "\n")
            
            if material_data.get('top_problematic_materials'):
                f.write(f"High-Risk Materials: {', '.join(material_data['top_problematic_materials'][:5])}\n")
            f.write("\n")
        
        # Downtime Analysis
        if 'downtime_analysis' in bottleneck_analysis:
            downtime_data = bottleneck_analysis['downtime_analysis']
            f.write("DOWNTIME ANALYSIS\n")
            f.write("-" * 40 + "\n")
            
            risk_assessment = downtime_data.get('risk_assessment', {})
            f.write(f"Downtime Risk Level: {risk_assessment.get('risk_level', 'UNKNOWN')}\n")
            f.write(f"Risk Factors: {', '.join(risk_assessment.get('risk_factors', []))}\n")
            f.write("\n")
        
        # Recommendations
        if 'recommendations' in bottleneck_analysis:
            f.write("KEY RECOMMENDATIONS\n")
            f.write("-" * 40 + "\n")
            recs = bottleneck_analysis['recommendations']
            
            priority_order = ['immediate_actions', 'process_improvements', 'capacity_optimizations', 'maintenance_actions']
            
            for category in priority_order:
                if category in recs and recs[category]:
                    f.write(f"\n{category.replace('_', ' ').upper()}:\n")
                    for i, rec in enumerate(recs[category][:3], 1):  # Top 3 per category
                        f.write(f"  {i}. {rec.get('action', rec)}\n")
                        if isinstance(rec, dict):
                            f.write(f"     Priority: {rec.get('priority', 'N/A')}, Timeline: {rec.get('timeline', 'N/A')}\n")
    
    created_files.append(summary_file)
    print(f"   ✓ Bottleneck Summary: {os.path.basename(summary_file)}")
    
    # 2. Detailed Analysis Excel
    excel_file = os.path.join(bottleneck_folder, f"bottleneck_detailed_analysis_{timestamp}.xlsx")
    
    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        # Plant analysis
        if 'plant_bottlenecks' in bottleneck_analysis:
            plant_data = bottleneck_analysis['plant_bottlenecks']
            if 'plant_metrics' in plant_data:
                plant_data['plant_metrics'].to_excel(writer, sheet_name='Plant_Bottlenecks')
        
        # Material analysis
        if 'material_bottlenecks' in bottleneck_analysis:
            material_data = bottleneck_analysis['material_bottlenecks']
            if 'material_analysis' in material_data:
                material_data['material_analysis'].to_excel(writer, sheet_name='Material_Bottlenecks')
        
        # Work center analysis
        if 'work_center_bottlenecks' in bottleneck_analysis:
            wc_data = bottleneck_analysis['work_center_bottlenecks']
            if 'work_center_analysis' in wc_data:
                wc_data['work_center_analysis'].to_excel(writer, sheet_name='WorkCenter_Bottlenecks')
        
        # Downtime prediction results
        if downtime_model:
            downtime_df = pd.DataFrame({
                'Order_Index': range(len(downtime_model['downtime_probabilities'])),
                'Downtime_Probability': downtime_model['downtime_probabilities'],
                'Risk_Level': pd.cut(downtime_model['downtime_probabilities'], 
                                   bins=[0, 0.4, 0.7, 1.0], 
                                   labels=['Low', 'Medium', 'High'])
            })
            downtime_df.to_excel(writer, sheet_name='Downtime_Predictions', index=False)
            
            # Feature importance
            if 'feature_importance' in downtime_model:
                downtime_model['feature_importance'].to_excel(writer, sheet_name='Downtime_Features', index=False)
        
        # Capacity optimization
        if capacity_analysis and 'plant_capacity' in capacity_analysis:
            capacity_data = capacity_analysis['plant_capacity']['utilization_analysis']
            capacity_data.to_excel(writer, sheet_name='Capacity_Analysis')
    
    created_files.append(excel_file)
    print(f"   ✓ Detailed Analysis Excel: {os.path.basename(excel_file)}")
    
    # 3. Recommendations Action Plan
    action_plan_file = os.path.join(bottleneck_folder, f"bottleneck_action_plan_{timestamp}.txt")
    
    with open(action_plan_file, 'w') as f:
        f.write("BOTTLENECK RESOLUTION ACTION PLAN\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        
        if 'recommendations' in bottleneck_analysis:
            recs = bottleneck_analysis['recommendations']
            
            # Immediate Actions
            f.write("IMMEDIATE ACTIONS (1-30 DAYS)\n")
            f.write("-" * 40 + "\n")
            for i, action in enumerate(recs.get('immediate_actions', []), 1):
                f.write(f"{i}. {action.get('action', action)}\n")
                if isinstance(action, dict):
                    f.write(f"   Timeline: {action.get('timeline', 'TBD')}\n")
                    f.write(f"   Priority: {action.get('priority', 'Medium')}\n")
                    f.write(f"   Category: {action.get('category', 'General')}\n")
                f.write("\n")
            
            # Process Improvements
            f.write("PROCESS IMPROVEMENTS (1-3 MONTHS)\n")
            f.write("-" * 40 + "\n")
            for i, action in enumerate(recs.get('process_improvements', []), 1):
                f.write(f"{i}. {action.get('action', action)}\n")
                if isinstance(action, dict):
                    f.write(f"   Timeline: {action.get('timeline', 'TBD')}\n")
                    f.write(f"   Priority: {action.get('priority', 'Medium')}\n")
                    f.write(f"   Category: {action.get('category', 'General')}\n")
                f.write("\n")
            
            # Capacity Optimizations
            f.write("CAPACITY OPTIMIZATIONS (1-6 MONTHS)\n")
            f.write("-" * 40 + "\n")
            for i, action in enumerate(recs.get('capacity_optimizations', []), 1):
                f.write(f"{i}. {action.get('action', action)}\n")
                if isinstance(action, dict):
                    f.write(f"   Timeline: {action.get('timeline', 'TBD')}\n")
                    f.write(f"   Priority: {action.get('priority', 'Medium')}\n")
                    f.write(f"   Category: {action.get('category', 'General')}\n")
                f.write("\n")
    
    created_files.append(action_plan_file)
    print(f"   ✓ Action Plan: {os.path.basename(action_plan_file)}")
    
    # 4. JSON export for APIs/dashboards
    json_file = os.path.join(bottleneck_folder, f"bottleneck_analysis_{timestamp}.json")
    
    # Prepare data for JSON serialization
    json_data = {
        'bottleneck_analysis': bottleneck_analysis,
        'downtime_model_summary': {
            'accuracy': downtime_model['accuracy'] if downtime_model else None,
            'high_risk_orders': len(downtime_model['high_risk_orders']) if downtime_model else 0,
            'prediction_summary': downtime_model['prediction_summary'] if downtime_model else {}
        },
        'capacity_analysis': capacity_analysis
    }
    
    with open(json_file, 'w') as f:
        json.dump(json_data, f, indent=2, default=str)
    
    created_files.append(json_file)
    print(f"   ✓ JSON Export: {os.path.basename(json_file)}")
    
    print(f"   ✅ Saved {len(created_files)} bottleneck analysis files")
    return created_files

# Main integration function
def run_complete_bottleneck_analysis(comprehensive_df, result_folder="result"):
    """
    Run complete bottleneck and downtime analysis
    """
    print("🔍 RUNNING COMPLETE BOTTLENECK & DOWNTIME ANALYSIS")
    print("=" * 80)
    
    # Initialize bottleneck detector
    bottleneck_detector = SAPBottleneckDetector(comprehensive_df)
    
    # Run bottleneck analysis
    bottleneck_analysis = bottleneck_detector.run_complete_bottleneck_analysis()
    
    # Create downtime prediction model
    downtime_model = create_downtime_prediction_model(comprehensive_df)
    
    # Create capacity optimization analysis
    capacity_analysis = create_capacity_optimization_analysis(comprehensive_df)
    
    # Save results
    created_files = save_bottleneck_analysis_results(
        bottleneck_analysis, downtime_model, capacity_analysis, result_folder
    )
    
    print(f"\n✅ BOTTLENECK ANALYSIS COMPLETE!")
    print("=" * 80)
    print("📊 ANALYSIS RESULTS:")
    print(f"   • Bottlenecks Identified: {bottleneck_analysis.get('summary', {}).get('total_bottlenecks', 0)}")
    print(f"   • Critical Issues: {bottleneck_analysis.get('summary', {}).get('critical_issues', 0)}")
    print(f"   • High-Risk Orders: {downtime_model['prediction_summary']['high_risk_count'] if downtime_model else 0}")
    print(f"   • Action Items: {bottleneck_analysis.get('summary', {}).get('total_recommendations', 0)}")
    
    return {
        'bottleneck_detector': bottleneck_detector,
        'bottleneck_analysis': bottleneck_analysis,
        'downtime_model': downtime_model,
        'capacity_analysis': capacity_analysis,
        'created_files': created_files
    }

# Usage example for Tolaram assessment
def tolaram_bottleneck_assessment_guide():
    """
    Guide for using bottleneck analysis in Tolaram assessment
    """
    print("📚 TOLARAM ASSESSMENT - BOTTLENECK ANALYSIS GUIDE")
    print("=" * 60)
    
    print("\n1. INTEGRATION WITH MAIN ANALYSIS:")
    print("```python")
    print("# After running main SAP integration")
    print("comprehensive_df, summary_stats, quality_details = create_comprehensive_sap_view(...)")
    print("")
    print("# Run bottleneck analysis")
    print("bottleneck_results = run_complete_bottleneck_analysis(")
    print("    comprehensive_df, result_folder='tolaram_assessment'")
    print(")")
    print("```")
    
    print("\n2. KEY BOTTLENECK INSIGHTS FOR ASSESSMENT:")
    print("• Plant Performance Bottlenecks")
    print("• Material Supply Chain Issues")
    print("• Work Center Capacity Constraints")
    print("• Downtime Risk Prediction")
    print("• Schedule Delay Analysis")
    print("• Throughput Optimization Opportunities")
    
    print("\n3. BUSINESS VALUE DEMONSTRATION:")
    print("• Quantified efficiency improvement potential")
    print("• Predictive downtime prevention")
    print("• Capacity optimization recommendations")
    print("• Cost reduction through bottleneck elimination")
    print("• Data-driven resource allocation")
    
    print("\n4. ASSESSMENT REPORT SECTIONS:")
    print("• Executive Summary: Bottleneck impact on operations")
    print("• Predictive Analytics: Downtime risk modeling")
    print("• Capacity Analysis: Optimization opportunities")
    print("• Action Plan: Prioritized improvement initiatives")
    print("• ROI Estimation: Expected benefits of improvements")

if __name__ == "__main__":
    tolaram_bottleneck_assessment_guide()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
import os
warnings.filterwarnings('ignore')

# Set style for matplotlib
plt.style.use('default')
sns.set_palette("husl")

class SAPVisualizationSuite:
    """
    Comprehensive Visualization Suite for SAP Manufacturing and Quality Analysis
    """
    
    def __init__(self, comprehensive_df, ml_results=None, bottleneck_results=None, result_folder="result"):
        self.comprehensive_df = comprehensive_df
        self.ml_results = ml_results
        self.bottleneck_results = bottleneck_results
        self.result_folder = result_folder
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Create visualization folder
        self.viz_folder = os.path.join(result_folder, f"11_visualizations_{self.timestamp}")
        os.makedirs(self.viz_folder, exist_ok=True)
        
        # Color schemes
        self.colors = {
            'primary': '#1f77b4',
            'secondary': '#ff7f0e', 
            'success': '#2ca02c',
            'warning': '#d62728',
            'info': '#9467bd',
            'quality_good': '#2ca02c',
            'quality_poor': '#d62728',
            'quality_medium': '#ff7f0e'
        }
        
        self.created_files = []
    
    def create_executive_dashboard(self):
        """
        Create executive-level dashboard with key KPIs
        """
        print("📊 Creating Executive Dashboard...")
        
        # Create subplot layout
        fig = make_subplots(
            rows=3, cols=3,
            subplot_titles=[
                'Quality Score Distribution', 'Plant Performance Overview', 'Production Efficiency Trends',
                'Quality Issues by Plant', 'Order Volume Trends', 'Bottleneck Analysis',
                'ML Predictions Summary', 'Downtime Risk Assessment', 'Key Performance Metrics'
            ],
            specs=[
                [{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
                [{"type": "xy"}, {"type": "xy"}, {"type": "xy"}],
                [{"type": "xy"}, {"type": "xy"}, {"type": "xy"}]
            ],
            vertical_spacing=0.12,
            horizontal_spacing=0.1
        )
        
        df = self.comprehensive_df
        
        # 1. Quality Score Distribution
        if 'QUALITY_SCORE' in df.columns:
            quality_scores = df['QUALITY_SCORE'].dropna()
            fig.add_trace(
                go.Histogram(x=quality_scores, nbinsx=20, name="Quality Scores", 
                           marker_color=self.colors['primary'], showlegend=False),
                row=1, col=1
            )
        
        # 2. Plant Performance Overview
        plant_col = self._get_plant_column()
        if plant_col:
            plant_performance = df.groupby(plant_col).agg({
                'QUALITY_SCORE': 'mean',
                'AUFNR': 'count'
            }).reset_index()
            
            fig.add_trace(
                go.Scatter(x=plant_performance[plant_col], 
                          y=plant_performance['QUALITY_SCORE'],
                          mode='markers',
                          marker=dict(size=plant_performance['AUFNR']/10, 
                                    color=self.colors['secondary']),
                          name="Plant Performance", showlegend=False),
                row=1, col=2
            )
        
        # 3. Production Efficiency Trends
        if 'PRODUCTION_EFFICIENCY' in df.columns and 'ERDAT' in df.columns:
            try:
                df_temp = df.copy()
                df_temp['ERDAT'] = pd.to_datetime(df_temp['ERDAT'], errors='coerce')
                df_temp['MONTH'] = df_temp['ERDAT'].dt.to_period('M')
                
                monthly_efficiency = df_temp.groupby('MONTH')['PRODUCTION_EFFICIENCY'].mean().reset_index()
                monthly_efficiency['MONTH_STR'] = monthly_efficiency['MONTH'].astype(str)
                
                fig.add_trace(
                    go.Scatter(x=monthly_efficiency['MONTH_STR'], 
                              y=monthly_efficiency['PRODUCTION_EFFICIENCY'],
                              mode='lines+markers',
                              line=dict(color=self.colors['success']),
                              name="Efficiency Trend", showlegend=False),
                    row=1, col=3
                )
            except:
                pass
        
        # 4. Quality Issues by Plant
        if plant_col and 'QUALITY_NOTIF_COUNT' in df.columns:
            plant_quality = df.groupby(plant_col)['QUALITY_NOTIF_COUNT'].sum().reset_index()
            
            fig.add_trace(
                go.Bar(x=plant_quality[plant_col], 
                       y=plant_quality['QUALITY_NOTIF_COUNT'],
                       marker_color=self.colors['warning'],
                       name="Quality Issues", showlegend=False),
                row=2, col=1
            )
        
        # 5. Order Volume Trends
        if 'ERDAT' in df.columns:
            try:
                df_temp = df.copy()
                df_temp['ERDAT'] = pd.to_datetime(df_temp['ERDAT'], errors='coerce')
                df_temp['MONTH'] = df_temp['ERDAT'].dt.to_period('M')
                
                monthly_orders = df_temp.groupby('MONTH').size().reset_index(name='Order_Count')
                monthly_orders['MONTH_STR'] = monthly_orders['MONTH'].astype(str)
                
                fig.add_trace(
                    go.Bar(x=monthly_orders['MONTH_STR'], 
                           y=monthly_orders['Order_Count'],
                           marker_color=self.colors['info'],
                           name="Order Volume", showlegend=False),
                    row=2, col=2
                )
            except:
                pass
        
        # 6. Bottleneck Analysis
        if self.bottleneck_results and 'bottleneck_analysis' in self.bottleneck_results:
            summary = self.bottleneck_results['bottleneck_analysis'].get('summary', {})
            categories = summary.get('bottleneck_categories', {})
            
            if categories:
                fig.add_trace(
                    go.Bar(x=list(categories.keys()), 
                           y=list(categories.values()),
                           marker_color=self.colors['warning'],
                           name="Bottlenecks", showlegend=False),
                    row=2, col=3
                )
        
        # 7. ML Predictions Summary
        if self.ml_results and 'predictions' in self.ml_results:
            predictions = self.ml_results['predictions']
            
            if 'QUALITY_ISSUE_PROBABILITY' in predictions.columns:
                risk_levels = pd.cut(predictions['QUALITY_ISSUE_PROBABILITY'], 
                                   bins=[0, 0.3, 0.7, 1.0], 
                                   labels=['Low', 'Medium', 'High'])
                risk_counts = risk_levels.value_counts()
                
                fig.add_trace(
                    go.Pie(labels=risk_counts.index, 
                           values=risk_counts.values,
                           hole=0.4,
                           marker=dict(colors=[self.colors['success'], 
                                             self.colors['warning'], 
                                             self.colors['warning']]),
                           showlegend=False),
                    row=3, col=1
                )
        
        # 8. Downtime Risk Assessment
        if self.bottleneck_results and 'downtime_model' in self.bottleneck_results:
            downtime_model = self.bottleneck_results['downtime_model']
            if downtime_model and 'prediction_summary' in downtime_model:
                summary = downtime_model['prediction_summary']
                
                risk_data = {
                    'Risk Level': ['Low', 'Medium', 'High'],
                    'Count': [summary.get('low_risk_count', 0),
                             summary.get('medium_risk_count', 0), 
                             summary.get('high_risk_count', 0)]
                }
                
                fig.add_trace(
                    go.Bar(x=risk_data['Risk Level'], 
                           y=risk_data['Count'],
                           marker_color=[self.colors['success'], 
                                       self.colors['warning'], 
                                       self.colors['warning']],
                           name="Downtime Risk", showlegend=False),
                    row=3, col=2
                )
        
        # 9. Key Performance Metrics
        metrics = self._calculate_key_metrics()
        if metrics:
            fig.add_trace(
                go.Indicator(
                    mode="gauge+number+delta",
                    value=metrics.get('overall_score', 75),
                    domain={'x': [0, 1], 'y': [0, 1]},
                    title={'text': "Overall Performance"},
                    gauge={
                        'axis': {'range': [None, 100]},
                        'bar': {'color': self.colors['primary']},
                        'steps': [
                            {'range': [0, 50], 'color': "lightgray"},
                            {'range': [50, 80], 'color': "gray"}
                        ],
                        'threshold': {
                            'line': {'color': "red", 'width': 4},
                            'thickness': 0.75,
                            'value': 90
                        }
                    }
                ),
                row=3, col=3
            )
        
        # Update layout
        fig.update_layout(
            title={
                'text': "SAP Manufacturing Analytics - Executive Dashboard",
                'x': 0.5,
                'xanchor': 'center',
                'font': {'size': 24}
            },
            height=1200,
            showlegend=False,
            template="plotly_white"
        )
        
        # Save dashboard
        dashboard_file = os.path.join(self.viz_folder, f"executive_dashboard_{self.timestamp}.html")
        fig.write_html(dashboard_file)
        self.created_files.append(dashboard_file)
        
        print(f"   ✓ Executive Dashboard: {os.path.basename(dashboard_file)}")
        return fig
    
    def create_quality_analysis_charts(self):
        """
        Create detailed quality analysis visualizations
        """
        print("🔍 Creating Quality Analysis Charts...")
        
        df = self.comprehensive_df
        
        # Quality Score Distribution with Statistics
        fig1 = go.Figure()
        
        if 'QUALITY_SCORE' in df.columns:
            quality_scores = df['QUALITY_SCORE'].dropna()
            
            # Histogram
            fig1.add_trace(go.Histogram(
                x=quality_scores,
                nbinsx=30,
                name="Quality Score Distribution",
                marker_color=self.colors['primary'],
                opacity=0.7
            ))
            
            # Add mean line
            mean_score = quality_scores.mean()
            fig1.add_vline(x=mean_score, line_dash="dash", line_color="red",
                          annotation_text=f"Mean: {mean_score:.1f}")
            
            fig1.update_layout(
                title="Quality Score Distribution Analysis",
                xaxis_title="Quality Score",
                yaxis_title="Number of Orders",
                template="plotly_white"
            )
        
        # Quality by Plant Comparison
        fig2 = go.Figure()
        plant_col = self._get_plant_column()
        
        if plant_col and 'QUALITY_SCORE' in df.columns:
            plant_quality = df.groupby(plant_col).agg({
                'QUALITY_SCORE': ['mean', 'std', 'count'],
                'QUALITY_NOTIF_COUNT': 'sum'
            }).round(2)
            
            plant_quality.columns = ['_'.join(col).strip() for col in plant_quality.columns]
            plant_quality = plant_quality.reset_index()
            
            # Box plot for quality scores by plant
            for plant in plant_quality[plant_col].unique():
                plant_scores = df[df[plant_col] == plant]['QUALITY_SCORE'].dropna()
                fig2.add_trace(go.Box(
                    y=plant_scores,
                    name=str(plant),
                    boxpoints='outliers'
                ))
            
            fig2.update_layout(
                title="Quality Score Distribution by Plant",
                xaxis_title="Plant",
                yaxis_title="Quality Score",
                template="plotly_white"
            )
        
        # Quality Issues Over Time
        fig3 = go.Figure()
        
        if 'ERDAT' in df.columns and 'QUALITY_NOTIF_COUNT' in df.columns:
            try:
                df_temp = df.copy()
                df_temp['ERDAT'] = pd.to_datetime(df_temp['ERDAT'], errors='coerce')
                df_temp['WEEK'] = df_temp['ERDAT'].dt.to_period('W')
                
                weekly_quality = df_temp.groupby('WEEK').agg({
                    'QUALITY_NOTIF_COUNT': 'sum',
                    'AUFNR': 'count'
                }).reset_index()
                weekly_quality['QUALITY_RATE'] = weekly_quality['QUALITY_NOTIF_COUNT'] / weekly_quality['AUFNR'] * 100
                weekly_quality['WEEK_STR'] = weekly_quality['WEEK'].astype(str)
                
                fig3.add_trace(go.Scatter(
                    x=weekly_quality['WEEK_STR'],
                    y=weekly_quality['QUALITY_RATE'],
                    mode='lines+markers',
                    name="Quality Issue Rate",
                    line=dict(color=self.colors['warning'])
                ))
                
                fig3.update_layout(
                    title="Quality Issue Rate Trend Over Time",
                    xaxis_title="Week",
                    yaxis_title="Quality Issues per 100 Orders",
                    template="plotly_white"
                )
            except:
                pass
        
        # Save quality charts
        quality_file1 = os.path.join(self.viz_folder, f"quality_distribution_{self.timestamp}.html")
        quality_file2 = os.path.join(self.viz_folder, f"quality_by_plant_{self.timestamp}.html")
        quality_file3 = os.path.join(self.viz_folder, f"quality_trends_{self.timestamp}.html")
        
        fig1.write_html(quality_file1)
        fig2.write_html(quality_file2)
        fig3.write_html(quality_file3)
        
        self.created_files.extend([quality_file1, quality_file2, quality_file3])
        
        print(f"   ✓ Quality Analysis Charts: 3 files created")
        return [fig1, fig2, fig3]
    
    def create_production_efficiency_charts(self):
        """
        Create production efficiency analysis visualizations
        """
        print("⚡ Creating Production Efficiency Charts...")
        
        df = self.comprehensive_df
        
        # Efficiency Distribution and Benchmarking
        fig1 = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Efficiency Distribution', 'Efficiency by Plant', 
                           'Efficiency vs Quality Score', 'Monthly Efficiency Trends'],
            specs=[[{"type": "xy"}, {"type": "xy"}],
                   [{"type": "xy"}, {"type": "xy"}]]
        )
        
        if 'PRODUCTION_EFFICIENCY' in df.columns:
            efficiency = df['PRODUCTION_EFFICIENCY'].dropna()
            
            # 1. Efficiency Distribution
            fig1.add_trace(
                go.Histogram(x=efficiency, nbinsx=25, name="Efficiency Distribution",
                           marker_color=self.colors['success'], showlegend=False),
                row=1, col=1
            )
            
            # Add benchmark lines
            fig1.add_vline(x=85, line_dash="dash", line_color="orange",
                          annotation_text="Target: 85%", row=1, col=1)
            fig1.add_vline(x=efficiency.mean(), line_dash="dash", line_color="red",
                          annotation_text=f"Avg: {efficiency.mean():.1f}%", row=1, col=1)
        
        # 2. Efficiency by Plant
        plant_col = self._get_plant_column()
        if plant_col and 'PRODUCTION_EFFICIENCY' in df.columns:
            plant_efficiency = df.groupby(plant_col)['PRODUCTION_EFFICIENCY'].agg(['mean', 'count']).reset_index()
            
            fig1.add_trace(
                go.Bar(x=plant_efficiency[plant_col], 
                       y=plant_efficiency['mean'],
                       marker_color=self.colors['info'],
                       name="Plant Efficiency", showlegend=False),
                row=1, col=2
            )
        
        # 3. Efficiency vs Quality Score
        if 'PRODUCTION_EFFICIENCY' in df.columns and 'QUALITY_SCORE' in df.columns:
            fig1.add_trace(
                go.Scatter(x=df['PRODUCTION_EFFICIENCY'], 
                          y=df['QUALITY_SCORE'],
                          mode='markers',
                          marker=dict(color=self.colors['primary'], opacity=0.6),
                          name="Efficiency vs Quality", showlegend=False),
                row=2, col=1
            )
        
        # 4. Monthly Efficiency Trends
        if 'ERDAT' in df.columns and 'PRODUCTION_EFFICIENCY' in df.columns:
            try:
                df_temp = df.copy()
                df_temp['ERDAT'] = pd.to_datetime(df_temp['ERDAT'], errors='coerce')
                df_temp['MONTH'] = df_temp['ERDAT'].dt.to_period('M')
                
                monthly_eff = df_temp.groupby('MONTH')['PRODUCTION_EFFICIENCY'].mean().reset_index()
                monthly_eff['MONTH_STR'] = monthly_eff['MONTH'].astype(str)
                
                fig1.add_trace(
                    go.Scatter(x=monthly_eff['MONTH_STR'], 
                              y=monthly_eff['PRODUCTION_EFFICIENCY'],
                              mode='lines+markers',
                              line=dict(color=self.colors['success']),
                              name="Monthly Trend", showlegend=False),
                    row=2, col=2
                )
            except:
                pass
        
        fig1.update_layout(
            title="Production Efficiency Analysis",
            height=800,
            template="plotly_white"
        )
        
        # Efficiency Improvement Opportunities
        fig2 = go.Figure()
        
        if 'PRODUCTION_EFFICIENCY' in df.columns:
            # Create efficiency categories
            df_temp = df.copy()
            df_temp['EFFICIENCY_CATEGORY'] = pd.cut(
                df_temp['PRODUCTION_EFFICIENCY'],
                bins=[0, 70, 85, 95, 100],
                labels=['Poor (<70%)', 'Fair (70-85%)', 'Good (85-95%)', 'Excellent (95%+)']
            )
            
            efficiency_dist = df_temp['EFFICIENCY_CATEGORY'].value_counts()
            
            colors = [self.colors['warning'], self.colors['secondary'], 
                     self.colors['success'], self.colors['primary']]
            
            fig2.add_trace(go.Pie(
                labels=efficiency_dist.index,
                values=efficiency_dist.values,
                marker=dict(colors=colors),
                hole=0.4,
                textinfo='label+percent'
            ))
            
            fig2.update_layout(
                title="Production Efficiency Categories",
                template="plotly_white"
            )
        
        # Save efficiency charts
        efficiency_file1 = os.path.join(self.viz_folder, f"production_efficiency_analysis_{self.timestamp}.html")
        efficiency_file2 = os.path.join(self.viz_folder, f"efficiency_categories_{self.timestamp}.html")
        
        fig1.write_html(efficiency_file1)
        fig2.write_html(efficiency_file2)
        
        self.created_files.extend([efficiency_file1, efficiency_file2])
        
        print(f"   ✓ Production Efficiency Charts: 2 files created")
        return [fig1, fig2]
    
    def create_bottleneck_visualizations(self):
        """
        Create bottleneck analysis visualizations
        """
        print("🔍 Creating Bottleneck Analysis Charts...")
        
        if not self.bottleneck_results:
            print("   ⚠️  No bottleneck analysis data available")
            return []
        
        bottleneck_data = self.bottleneck_results['bottleneck_analysis']
        
        # Bottleneck Summary Dashboard
        fig1 = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Bottleneck Categories', 'Plant Bottleneck Scores', 
                           'Work Center Utilization', 'Material Risk Analysis'],
            specs=[[{"type": "xy"}, {"type": "xy"}],
                   [{"type": "xy"}, {"type": "xy"}]]
        )
        
        # 1. Bottleneck Categories
        if 'summary' in bottleneck_data:
            categories = bottleneck_data['summary'].get('bottleneck_categories', {})
            if categories:
                fig1.add_trace(
                    go.Bar(x=list(categories.keys()), 
                           y=list(categories.values()),
                           marker_color=self.colors['warning'],
                           name="Bottleneck Count", showlegend=False),
                    row=1, col=1
                )
        
        # 2. Plant Bottleneck Scores
        if 'plant_bottlenecks' in bottleneck_data:
            plant_data = bottleneck_data['plant_bottlenecks']
            if 'bottleneck_ranking' in plant_data:
                ranking = plant_data['bottleneck_ranking'].head(10)
                
                fig1.add_trace(
                    go.Bar(x=ranking.index.astype(str), 
                           y=ranking['BOTTLENECK_SCORE'],
                           marker_color=self.colors['warning'],
                           name="Bottleneck Score", showlegend=False),
                    row=1, col=2
                )
        
        # 3. Work Center Utilization
        if 'work_center_bottlenecks' in bottleneck_data:
            wc_data = bottleneck_data['work_center_bottlenecks']
            if 'work_center_analysis' in wc_data:
                wc_analysis = wc_data['work_center_analysis'].head(10)
                
                fig1.add_trace(
                    go.Scatter(x=wc_analysis.index.astype(str), 
                              y=wc_analysis['UTILIZATION_SCORE'],
                              mode='markers',
                              marker=dict(size=10, color=self.colors['info']),
                              name="Utilization Score", showlegend=False),
                    row=2, col=1
                )
        
        # 4. Material Risk Analysis
        if 'material_bottlenecks' in bottleneck_data:
            material_data = bottleneck_data['material_bottlenecks']
            if 'high_risk_materials' in material_data:
                risk_materials = material_data['high_risk_materials'].head(10)
                
                fig1.add_trace(
                    go.Bar(x=risk_materials.index, 
                           y=risk_materials['MATERIAL_BOTTLENECK_SCORE'],
                           marker_color=self.colors['warning'],
                           name="Risk Score", showlegend=False),
                    row=2, col=2
                )
        
        fig1.update_layout(
            title="Bottleneck Analysis Dashboard",
            height=800,
            template="plotly_white"
        )
        
        # Downtime Risk Visualization
        fig2 = go.Figure()
        
        if 'downtime_model' in self.bottleneck_results:
            downtime_model = self.bottleneck_results['downtime_model']
            if downtime_model and 'prediction_summary' in downtime_model:
                summary = downtime_model['prediction_summary']
                
                risk_levels = ['Low Risk', 'Medium Risk', 'High Risk']
                counts = [summary.get('low_risk_count', 0),
                         summary.get('medium_risk_count', 0),
                         summary.get('high_risk_count', 0)]
                
                colors = [self.colors['success'], self.colors['warning'], self.colors['warning']]
                
                fig2.add_trace(go.Bar(
                    x=risk_levels,
                    y=counts,
                    marker_color=colors,
                    text=counts,
                    textposition='auto'
                ))
                
                fig2.update_layout(
                    title="Downtime Risk Assessment",
                    xaxis_title="Risk Level",
                    yaxis_title="Number of Orders",
                    template="plotly_white"
                )
        
        # Throughput Analysis
        fig3 = go.Figure()
        
        if 'throughput_analysis' in bottleneck_data:
            throughput_data = bottleneck_data['throughput_analysis']
            
            if 'plant_analysis' in throughput_data:
                plant_throughput = throughput_data['plant_analysis']['plant_throughput']
                
                fig3.add_trace(go.Scatter(
                    x=plant_throughput.index,
                    y=plant_throughput['THROUGHPUT_SCORE'],
                    mode='markers+lines',
                    marker=dict(size=12, color=self.colors['primary']),
                    line=dict(color=self.colors['primary'])
                ))
                
                fig3.update_layout(
                    title="Plant Throughput Performance",
                    xaxis_title="Plant",
                    yaxis_title="Throughput Score",
                    template="plotly_white"
                )
        
        # Save bottleneck charts
        bottleneck_file1 = os.path.join(self.viz_folder, f"bottleneck_dashboard_{self.timestamp}.html")
        bottleneck_file2 = os.path.join(self.viz_folder, f"downtime_risk_{self.timestamp}.html")
        bottleneck_file3 = os.path.join(self.viz_folder, f"throughput_analysis_{self.timestamp}.html")
        
        fig1.write_html(bottleneck_file1)
        fig2.write_html(bottleneck_file2)
        fig3.write_html(bottleneck_file3)
        
        self.created_files.extend([bottleneck_file1, bottleneck_file2, bottleneck_file3])
        
        print(f"   ✓ Bottleneck Analysis Charts: 3 files created")
        return [fig1, fig2, fig3]
    
    def create_ml_model_visualizations(self):
        """
        Create machine learning model performance visualizations
        """
        print("🤖 Creating ML Model Visualizations...")
        
        if not self.ml_results:
            print("   ⚠️  No ML results available")
            return []
        
        # Model Performance Comparison
        fig1 = go.Figure()
        
        if 'ml_results' in self.ml_results:
            ml_data = self.ml_results['ml_results']
            
            # Quality prediction models
            if 'quality_prediction' in ml_data:
                models = []
                accuracies = []
                cv_scores = []
                
                for model_name, results in ml_data['quality_prediction'].items():
                    models.append(model_name)
                    accuracies.append(results['accuracy'])
                    cv_scores.append(results['cv_mean'])
                
                fig1.add_trace(go.Bar(
                    x=models,
                    y=accuracies,
                    name="Test Accuracy",
                    marker_color=self.colors['primary']
                ))
                
                fig1.add_trace(go.Bar(
                    x=models,
                    y=cv_scores,
                    name="CV Score",
                    marker_color=self.colors['secondary']
                ))
                
                fig1.update_layout(
                    title="Quality Prediction Model Performance",
                    xaxis_title="Model",
                    yaxis_title="Accuracy",
                    template="plotly_white",
                    barmode='group'
                )
        
        # Feature Importance Visualization
        fig2 = go.Figure()
        
        if 'feature_importance' in self.ml_results.get('ml_results', {}):
            importance_df = self.ml_results['ml_results']['feature_importance'].head(10)
            
            fig2.add_trace(go.Bar(
                x=importance_df['mean'],
                y=importance_df['feature'],
                orientation='h',
                marker_color=self.colors['info'],
                error_x=dict(type='data', array=importance_df['std'])
            ))
            
            fig2.update_layout(
                title="Feature Importance Analysis",
                xaxis_title="Importance Score",
                yaxis_title="Features",
                template="plotly_white"
            )
        
        # Prediction Distribution
        fig3 = go.Figure()
        
        if 'predictions' in self.ml_results:
            predictions = self.ml_results['predictions']
            
            if 'QUALITY_ISSUE_PROBABILITY' in predictions.columns:
                probabilities = predictions['QUALITY_ISSUE_PROBABILITY'].dropna()
                
                fig3.add_trace(go.Histogram(
                    x=probabilities,
                    nbinsx=30,
                    marker_color=self.colors['warning'],
                    opacity=0.7
                ))
                
                # Add risk thresholds
                fig3.add_vline(x=0.3, line_dash="dash", line_color="orange",
                              annotation_text="Low Risk Threshold")
                fig3.add_vline(x=0.7, line_dash="dash", line_color="red",
                              annotation_text="High Risk Threshold")
                
                fig3.update_layout(
                    title="Quality Issue Probability Distribution",
                    xaxis_title="Probability",
                    yaxis_title="Number of Orders",
                    template="plotly_white"
                )
        
        # Save ML visualization charts
        ml_file1 = os.path.join(self.viz_folder, f"ml_model_performance_{self.timestamp}.html")
        ml_file2 = os.path.join(self.viz_folder, f"feature_importance_{self.timestamp}.html")
        ml_file3 = os.path.join(self.viz_folder, f"prediction_distribution_{self.timestamp}.html")
        
        fig1.write_html(ml_file1)
        fig2.write_html(ml_file2)
        fig3.write_html(ml_file3)
        
        self.created_files.extend([ml_file1, ml_file2, ml_file3])
        
        print(f"   ✓ ML Model Visualizations: 3 files created")
        return [fig1, fig2, fig3]
    
    def create_plant_comparison_dashboard(self):
        """
        Create comprehensive plant comparison dashboard
        """
        print("🏭 Creating Plant Comparison Dashboard...")
        
        df = self.comprehensive_df
        plant_col = self._get_plant_column()
        
        if not plant_col:
            print("   ⚠️  No plant information available")
            return None
        
        # Plant Performance Radar Chart
        fig1 = go.Figure()
        
        # Calculate plant metrics
        plant_metrics = df.groupby(plant_col).agg({
            'QUALITY_SCORE': 'mean',
            'PRODUCTION_EFFICIENCY': 'mean',
            'QUALITY_NOTIF_COUNT': 'mean',
            'DEFECT_COUNT': 'mean',
            'ORDER_ITEM_COUNT': 'mean',
            'AUFNR': 'count'
        }).round(2)
        
        # Normalize metrics for radar chart (0-100 scale)
        normalized_metrics = plant_metrics.copy()
        normalized_metrics['QUALITY_SCORE_NORM'] = normalized_metrics['QUALITY_SCORE']
        normalized_metrics['EFFICIENCY_NORM'] = normalized_metrics['PRODUCTION_EFFICIENCY']
        normalized_metrics['QUALITY_ISSUES_NORM'] = 100 - (normalized_metrics['QUALITY_NOTIF_COUNT'] / normalized_metrics['QUALITY_NOTIF_COUNT'].max() * 100)
        normalized_metrics['DEFECTS_NORM'] = 100 - (normalized_metrics['DEFECT_COUNT'] / normalized_metrics['DEFECT_COUNT'].max() * 100)
        normalized_metrics['THROUGHPUT_NORM'] = normalized_metrics['AUFNR'] / normalized_metrics['AUFNR'].max() * 100
        
        categories = ['Quality Score', 'Efficiency', 'Quality Issues (Inv)', 'Defects (Inv)', 'Throughput']
        
        # Create radar chart for each plant
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']
        
        for i, (plant, row) in enumerate(normalized_metrics.head(5).iterrows()):
            values = [
                row['QUALITY_SCORE_NORM'],
                row['EFFICIENCY_NORM'],
                row['QUALITY_ISSUES_NORM'],
                row['DEFECTS_NORM'],
                row['THROUGHPUT_NORM']
            ]
            
            fig1.add_trace(go.Scatterpolar(
                r=values + [values[0]],  # Close the polygon
                theta=categories + [categories[0]],
                fill='toself',
                name=str(plant),
                line_color=colors[i % len(colors)]
            ))
        
        fig1.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 100]
                )),
            showlegend=True,
            title="Plant Performance Comparison (Radar Chart)",
            template="plotly_white"
        )
        
        # Plant Ranking Matrix
        fig2 = go.Figure()
        
        # Create ranking matrix
        ranking_metrics = ['QUALITY_SCORE', 'PRODUCTION_EFFICIENCY', 'AUFNR']
        plant_rankings = pd.DataFrame()
        
        for metric in ranking_metrics:
            if metric in plant_metrics.columns:
                rankings = plant_metrics[metric].rank(ascending=False)
                plant_rankings[metric] = rankings
        
        if not plant_rankings.empty:
            fig2.add_trace(go.Heatmap(
                z=plant_rankings.T.values,
                x=plant_rankings.index,
                y=plant_rankings.columns,
                colorscale='RdYlGn_r',
                text=plant_rankings.T.values,
                texttemplate="%{text}",
                textfont={"size": 12},
                colorbar=dict(title="Ranking")
            ))
            
            fig2.update_layout(
                title="Plant Ranking Matrix (1 = Best)",
                xaxis_title="Plant",
                yaxis_title="Metric",
                template="plotly_white"
            )
        
        # Plant Efficiency vs Quality Scatter
        fig3 = go.Figure()
        
        if 'QUALITY_SCORE' in plant_metrics.columns and 'PRODUCTION_EFFICIENCY' in plant_metrics.columns:
            fig3.add_trace(go.Scatter(
                x=plant_metrics['PRODUCTION_EFFICIENCY'],
                y=plant_metrics['QUALITY_SCORE'],
                mode='markers+text',
                text=plant_metrics.index,
                textposition="top center",
                marker=dict(
                    size=plant_metrics['AUFNR'] / 10,  # Size by order count
                    color=plant_metrics['QUALITY_NOTIF_COUNT'],
                    colorscale='RdYlGn_r',
                    showscale=True,
                    colorbar=dict(title="Quality Issues")
                )
            ))
            
            # Add benchmark lines
            fig3.add_hline(y=85, line_dash="dash", line_color="orange",
                          annotation_text="Quality Target: 85")
            fig3.add_vline(x=85, line_dash="dash", line_color="orange",
                          annotation_text="Efficiency Target: 85%")
            
            fig3.update_layout(
                title="Plant Performance Matrix: Efficiency vs Quality",
                xaxis_title="Production Efficiency (%)",
                yaxis_title="Quality Score",
                template="plotly_white"
            )
        
        # Save plant comparison charts
        plant_file1 = os.path.join(self.viz_folder, f"plant_radar_comparison_{self.timestamp}.html")
        plant_file2 = os.path.join(self.viz_folder, f"plant_ranking_matrix_{self.timestamp}.html")
        plant_file3 = os.path.join(self.viz_folder, f"plant_performance_matrix_{self.timestamp}.html")
        
        fig1.write_html(plant_file1)
        fig2.write_html(plant_file2)
        fig3.write_html(plant_file3)
        
        self.created_files.extend([plant_file1, plant_file2, plant_file3])
        
        print(f"   ✓ Plant Comparison Dashboard: 3 files created")
        return [fig1, fig2, fig3]
    
    def create_time_series_analysis(self):
        """
        Create time series analysis visualizations
        """
        print("📈 Creating Time Series Analysis...")
        
        df = self.comprehensive_df
        
        if 'ERDAT' not in df.columns:
            print("   ⚠️  No date information available for time series")
            return []
        
        try:
            df_temp = df.copy()
            df_temp['ERDAT'] = pd.to_datetime(df_temp['ERDAT'], errors='coerce')
            df_temp = df_temp.dropna(subset=['ERDAT'])
            
            # Time Series Dashboard
            fig1 = make_subplots(
                rows=3, cols=1,
                subplot_titles=['Order Volume Over Time', 'Quality Metrics Trends', 'Efficiency Trends'],
                shared_xaxes=True,
                vertical_spacing=0.1
            )
            
            # 1. Order Volume Over Time
            df_temp['WEEK'] = df_temp['ERDAT'].dt.to_period('W')
            weekly_orders = df_temp.groupby('WEEK').size().reset_index(name='Order_Count')
            weekly_orders['WEEK_STR'] = weekly_orders['WEEK'].astype(str)
            
            fig1.add_trace(
                go.Scatter(x=weekly_orders['WEEK_STR'], 
                          y=weekly_orders['Order_Count'],
                          mode='lines+markers',
                          name="Order Volume",
                          line=dict(color=self.colors['primary'])),
                row=1, col=1
            )
            
            # 2. Quality Metrics Trends
            if 'QUALITY_SCORE' in df_temp.columns:
                weekly_quality = df_temp.groupby('WEEK').agg({
                    'QUALITY_SCORE': 'mean',
                    'QUALITY_NOTIF_COUNT': 'sum'
                }).reset_index()
                weekly_quality['WEEK_STR'] = weekly_quality['WEEK'].astype(str)
                
                fig1.add_trace(
                    go.Scatter(x=weekly_quality['WEEK_STR'], 
                              y=weekly_quality['QUALITY_SCORE'],
                              mode='lines+markers',
                              name="Quality Score",
                              line=dict(color=self.colors['success'])),
                    row=2, col=1
                )
            
            # 3. Efficiency Trends
            if 'PRODUCTION_EFFICIENCY' in df_temp.columns:
                weekly_efficiency = df_temp.groupby('WEEK')['PRODUCTION_EFFICIENCY'].mean().reset_index()
                weekly_efficiency['WEEK_STR'] = weekly_efficiency['WEEK'].astype(str)
                
                fig1.add_trace(
                    go.Scatter(x=weekly_efficiency['WEEK_STR'], 
                              y=weekly_efficiency['PRODUCTION_EFFICIENCY'],
                              mode='lines+markers',
                              name="Production Efficiency",
                              line=dict(color=self.colors['warning'])),
                    row=3, col=1
                )
            
            fig1.update_layout(
                title="Manufacturing Performance Time Series",
                height=900,
                template="plotly_white"
            )
            
            # Seasonal Analysis
            fig2 = go.Figure()
            
            df_temp['MONTH'] = df_temp['ERDAT'].dt.month
            df_temp['DAY_OF_WEEK'] = df_temp['ERDAT'].dt.dayofweek
            
            # Monthly seasonality
            monthly_pattern = df_temp.groupby('MONTH').agg({
                'AUFNR': 'count',
                'QUALITY_SCORE': 'mean'
            }).reset_index()
            
            fig2.add_trace(go.Bar(
                x=monthly_pattern['MONTH'],
                y=monthly_pattern['AUFNR'],
                name="Order Count",
                marker_color=self.colors['primary']
            ))
            
            fig2.update_layout(
                title="Monthly Order Volume Pattern",
                xaxis_title="Month",
                yaxis_title="Order Count",
                template="plotly_white"
            )
            
            # Save time series charts
            ts_file1 = os.path.join(self.viz_folder, f"time_series_dashboard_{self.timestamp}.html")
            ts_file2 = os.path.join(self.viz_folder, f"seasonal_analysis_{self.timestamp}.html")
            
            fig1.write_html(ts_file1)
            fig2.write_html(ts_file2)
            
            self.created_files.extend([ts_file1, ts_file2])
            
            print(f"   ✓ Time Series Analysis: 2 files created")
            return [fig1, fig2]
            
        except Exception as e:
            print(f"   ⚠️  Time series analysis failed: {e}")
            return []
    
    def create_static_summary_charts(self):
        """
        Create static matplotlib charts for reports
        """
        print("📊 Creating Static Summary Charts...")
        
        df = self.comprehensive_df
        
        # Create figure with subplots
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('SAP Manufacturing Analytics - Summary Report', fontsize=16, fontweight='bold')
        
        # 1. Quality Score Distribution
        if 'QUALITY_SCORE' in df.columns:
            axes[0, 0].hist(df['QUALITY_SCORE'].dropna(), bins=20, alpha=0.7, color=self.colors['primary'])
            axes[0, 0].axvline(df['QUALITY_SCORE'].mean(), color='red', linestyle='--', 
                              label=f'Mean: {df["QUALITY_SCORE"].mean():.1f}')
            axes[0, 0].set_title('Quality Score Distribution')
            axes[0, 0].set_xlabel('Quality Score')
            axes[0, 0].set_ylabel('Frequency')
            axes[0, 0].legend()
        
        # 2. Plant Performance Comparison
        plant_col = self._get_plant_column()
        if plant_col and 'QUALITY_SCORE' in df.columns:
            plant_quality = df.groupby(plant_col)['QUALITY_SCORE'].mean().sort_values(ascending=True)
            plant_quality.plot(kind='barh', ax=axes[0, 1], color=self.colors['secondary'])
            axes[0, 1].set_title('Plant Quality Performance')
            axes[0, 1].set_xlabel('Average Quality Score')
        
        # 3. Production Efficiency vs Quality
        if 'PRODUCTION_EFFICIENCY' in df.columns and 'QUALITY_SCORE' in df.columns:
            axes[0, 2].scatter(df['PRODUCTION_EFFICIENCY'], df['QUALITY_SCORE'], 
                              alpha=0.6, color=self.colors['info'])
            axes[0, 2].set_title('Efficiency vs Quality')
            axes[0, 2].set_xlabel('Production Efficiency (%)')
            axes[0, 2].set_ylabel('Quality Score')
            
            # Add correlation coefficient
            corr = df[['PRODUCTION_EFFICIENCY', 'QUALITY_SCORE']].corr().iloc[0, 1]
            axes[0, 2].text(0.05, 0.95, f'Correlation: {corr:.3f}', 
                           transform=axes[0, 2].transAxes, fontsize=10,
                           bbox=dict(boxstyle="round", facecolor='wheat', alpha=0.5))
        
        # 4. Quality Issues by Plant
        if plant_col and 'QUALITY_NOTIF_COUNT' in df.columns:
            plant_issues = df.groupby(plant_col)['QUALITY_NOTIF_COUNT'].sum().sort_values(ascending=True)
            plant_issues.plot(kind='barh', ax=axes[1, 0], color=self.colors['warning'])
            axes[1, 0].set_title('Quality Issues by Plant')
            axes[1, 0].set_xlabel('Total Quality Notifications')
        
        # 5. Monthly Trends
        if 'ERDAT' in df.columns:
            try:
                df_temp = df.copy()
                df_temp['ERDAT'] = pd.to_datetime(df_temp['ERDAT'], errors='coerce')
                df_temp['MONTH'] = df_temp['ERDAT'].dt.to_period('M')
                
                monthly_orders = df_temp.groupby('MONTH').size()
                monthly_orders.plot(kind='line', ax=axes[1, 1], color=self.colors['success'], marker='o')
                axes[1, 1].set_title('Monthly Order Volume')
                axes[1, 1].set_xlabel('Month')
                axes[1, 1].set_ylabel('Order Count')
                plt.setp(axes[1, 1].xaxis.get_majorticklabels(), rotation=45)
            except:
                axes[1, 1].text(0.5, 0.5, 'Date data not available', 
                               ha='center', va='center', transform=axes[1, 1].transAxes)
                axes[1, 1].set_title('Monthly Trends')
        
        # 6. Key Metrics Summary
        metrics = self._calculate_key_metrics()
        if metrics:
            metrics_data = {
                'Total Orders': metrics.get('total_orders', 0),
                'Avg Quality Score': metrics.get('avg_quality_score', 0),
                'Avg Efficiency': metrics.get('avg_efficiency', 0),
                'Quality Issues': metrics.get('total_quality_issues', 0)
            }
            
            bars = axes[1, 2].bar(range(len(metrics_data)), list(metrics_data.values()), 
                                 color=[self.colors['primary'], self.colors['success'], 
                                       self.colors['info'], self.colors['warning']])
            axes[1, 2].set_title('Key Performance Metrics')
            axes[1, 2].set_xticks(range(len(metrics_data)))
            axes[1, 2].set_xticklabels(list(metrics_data.keys()), rotation=45)
            
            # Add value labels on bars
            for bar, value in zip(bars, metrics_data.values()):
                height = bar.get_height()
                axes[1, 2].text(bar.get_x() + bar.get_width()/2., height,
                               f'{value:.0f}', ha='center', va='bottom')
        
        plt.tight_layout()
        
        # Save static chart
        static_file = os.path.join(self.viz_folder, f"summary_report_charts_{self.timestamp}.png")
        plt.savefig(static_file, dpi=300, bbox_inches='tight')
        plt.close()
        
        self.created_files.append(static_file)
        
        print(f"   ✓ Static Summary Charts: {os.path.basename(static_file)}")
        return static_file
    
    def create_visualization_index(self):
        """
        Create an HTML index page linking all visualizations
        """
        print("📑 Creating Visualization Index...")
        
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>SAP Manufacturing Analytics - Visualization Index</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                h1 {{ color: #1f77b4; }}
                h2 {{ color: #ff7f0e; }}
                .section {{ margin-bottom: 30px; }}
                .viz-link {{ 
                    display: inline-block; 
                    margin: 10px; 
                    padding: 10px 15px; 
                    background-color: #f0f0f0; 
                    text-decoration: none; 
                    border-radius: 5px;
                    color: #333;
                }}
                .viz-link:hover {{ background-color: #e0e0e0; }}
                .summary {{ 
                    background-color: #f9f9f9; 
                    padding: 20px; 
                    border-left: 4px solid #1f77b4; 
                    margin-bottom: 30px;
                }}
            </style>
        </head>
        <body>
            <h1>SAP Manufacturing Analytics - Visualization Dashboard</h1>
            
            <div class="summary">
                <h3>Analysis Summary</h3>
                <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                <p><strong>Total Orders Analyzed:</strong> {len(self.comprehensive_df):,}</p>
                <p><strong>Visualizations Created:</strong> {len(self.created_files)}</p>
            </div>
            
            <div class="section">
                <h2>📊 Executive Dashboard</h2>
                <p>High-level overview of key performance indicators and trends.</p>
        """
        
        # Add links to visualizations by category
        viz_categories = {
            'executive_dashboard': '📊 Executive Dashboards',
            'quality': '🔍 Quality Analysis',
            'production_efficiency': '⚡ Production Efficiency',
            'bottleneck': '🔍 Bottleneck Analysis', 
            'ml': '🤖 Machine Learning Models',
            'plant': '🏭 Plant Comparisons',
            'time_series': '📈 Time Series Analysis',
            'summary': '📊 Summary Reports'
        }
        
        for category, title in viz_categories.items():
            category_files = [f for f in self.created_files if category in os.path.basename(f)]
            
            if category_files:
                html_content += f"""
                <div class="section">
                    <h2>{title}</h2>
                """
                
                for file_path in category_files:
                    filename = os.path.basename(file_path)
                    display_name = filename.replace(f'_{self.timestamp}', '').replace('_', ' ').title()
                    
                    if filename.endswith('.html'):
                        html_content += f'<a href="{filename}" class="viz-link">{display_name}</a>\n'
                    elif filename.endswith('.png'):
                        html_content += f'<a href="{filename}" class="viz-link">{display_name} (Image)</a>\n'
                
                html_content += "</div>\n"
        
        html_content += """
            </div>
            
            <div class="section">
                <h2>📋 How to Use These Visualizations</h2>
                <ul>
                    <li><strong>Executive Dashboard:</strong> Start here for overall performance overview</li>
                    <li><strong>Quality Analysis:</strong> Deep dive into quality metrics and trends</li>
                    <li><strong>Production Efficiency:</strong> Analyze operational efficiency patterns</li>
                    <li><strong>Bottleneck Analysis:</strong> Identify operational constraints and improvements</li>
                    <li><strong>ML Models:</strong> Review predictive model performance and insights</li>
                    <li><strong>Plant Comparisons:</strong> Compare performance across facilities</li>
                    <li><strong>Time Series:</strong> Understand trends and seasonal patterns</li>
                </ul>
            </div>
            
            <div class="section">
                <h2>📞 Support</h2>
                <p>For questions about these visualizations or the underlying data analysis, 
                refer to the comprehensive analysis reports in the parent folder.</p>
            </div>
            
        </body>
        </html>
        """
        
        # Save index file
        index_file = os.path.join(self.viz_folder, f"index_{self.timestamp}.html")
        with open(index_file, 'w') as f:
            f.write(html_content)
        
        self.created_files.append(index_file)
        
        print(f"   ✓ Visualization Index: {os.path.basename(index_file)}")
        return index_file
    
    def _get_plant_column(self):
        """Helper method to find the best plant column"""
        for col in ['Plant_Name', 'Plant_Code', 'WERKS', 'PWERK']:
            if col in self.comprehensive_df.columns and self.comprehensive_df[col].notna().sum() > 0:
                return col
        return None
    
    def _calculate_key_metrics(self):
        """Calculate key performance metrics"""
        df = self.comprehensive_df
        
        metrics = {
            'total_orders': len(df),
            'avg_quality_score': df.get('QUALITY_SCORE', pd.Series([0])).mean(),
            'avg_efficiency': df.get('PRODUCTION_EFFICIENCY', pd.Series([0])).mean(),
            'total_quality_issues': df.get('QUALITY_NOTIF_COUNT', pd.Series([0])).sum(),
            'overall_score': 0
        }
        
        # Calculate overall performance score
        quality_component = min(metrics['avg_quality_score'], 100) * 0.4
        efficiency_component = min(metrics['avg_efficiency'], 100) * 0.4
        issue_penalty = min(metrics['total_quality_issues'] / metrics['total_orders'] * 100, 50) * 0.2
        
        metrics['overall_score'] = quality_component + efficiency_component - issue_penalty
        
        return metrics
    
    def generate_all_visualizations(self):
        """
        Generate all visualization types
        """
        print("🎨 GENERATING COMPLETE VISUALIZATION SUITE")
        print("=" * 60)
        
        visualization_results = {}
        
        # Generate all visualization types
        try:
            visualization_results['executive_dashboard'] = self.create_executive_dashboard()
        except Exception as e:
            print(f"   ❌ Executive dashboard failed: {e}")
        
        try:
            visualization_results['quality_charts'] = self.create_quality_analysis_charts()
        except Exception as e:
            print(f"   ❌ Quality charts failed: {e}")
        
        try:
            visualization_results['efficiency_charts'] = self.create_production_efficiency_charts()
        except Exception as e:
            print(f"   ❌ Efficiency charts failed: {e}")
        
        try:
            visualization_results['bottleneck_charts'] = self.create_bottleneck_visualizations()
        except Exception as e:
            print(f"   ❌ Bottleneck charts failed: {e}")
        
        try:
            visualization_results['ml_charts'] = self.create_ml_model_visualizations()
        except Exception as e:
            print(f"   ❌ ML charts failed: {e}")
        
        try:
            visualization_results['plant_comparison'] = self.create_plant_comparison_dashboard()
        except Exception as e:
            print(f"   ❌ Plant comparison failed: {e}")
        
        try:
            visualization_results['time_series'] = self.create_time_series_analysis()
        except Exception as e:
            print(f"   ❌ Time series failed: {e}")
        
        try:
            visualization_results['static_charts'] = self.create_static_summary_charts()
        except Exception as e:
            print(f"   ❌ Static charts failed: {e}")
        
        try:
            visualization_results['index_page'] = self.create_visualization_index()
        except Exception as e:
            print(f"   ❌ Index page failed: {e}")
        
        print(f"\n✅ VISUALIZATION SUITE COMPLETE!")
        print("=" * 60)
        print(f"📁 Visualization folder: {self.viz_folder}")
        print(f"📊 Total files created: {len(self.created_files)}")
        print(f"🌐 Start with: index_{self.timestamp}.html")
        
        return {
            'visualization_results': visualization_results,
            'created_files': self.created_files,
            'viz_folder': self.viz_folder
        }

# Integration function for complete workflow
def create_complete_visualization_suite(comprehensive_df, ml_results=None, bottleneck_results=None, result_folder="result"):
    """
    Create complete visualization suite for SAP analysis
    """
    print("🎨 CREATING COMPLETE VISUALIZATION SUITE")
    print("=" * 60)
    
    # Initialize visualization suite
    viz_suite = SAPVisualizationSuite(comprehensive_df, ml_results, bottleneck_results, result_folder)
    
    # Generate all visualizations
    results = viz_suite.generate_all_visualizations()
    
    return {
        'viz_suite': viz_suite,
        'results': results,
        'created_files': results['created_files'],
        'viz_folder': results['viz_folder']
    }

# Usage guide for Tolaram assessment
def tolaram_visualization_guide():
    """
    Guide for using visualizations in Tolaram assessment
    """
    print("📚 TOLARAM ASSESSMENT - VISUALIZATION GUIDE")
    print("=" * 60)
    
    print("\n1. INTEGRATION WITH ANALYSIS:")
    print("```python")
    print("# After running complete analysis")
    print("comprehensive_df, summary, quality_details = create_comprehensive_sap_view(...)")
    print("ml_results = run_complete_ml_analysis(comprehensive_df)")
    print("bottleneck_results = run_complete_bottleneck_analysis(comprehensive_df)")
    print("")
    print("# Create all visualizations")
    print("viz_results = create_complete_visualization_suite(")
    print("    comprehensive_df, ml_results, bottleneck_results, 'tolaram_assessment'")
    print(")")
    print("```")
    
    print("\n2. VISUALIZATION CATEGORIES:")
    print("• Executive Dashboard: KPI overview for management")
    print("• Quality Analysis: Detailed quality performance charts")
    print("• Production Efficiency: Operational performance metrics")
    print("• Bottleneck Analysis: Constraint identification visuals")
    print("• ML Model Performance: Predictive analytics results")
    print("• Plant Comparisons: Multi-facility benchmarking")
    print("• Time Series: Trend analysis and seasonality")
    print("• Static Reports: Print-ready summary charts")
    
    print("\n3. ASSESSMENT REPORT INTEGRATION:")
    print("• Use executive dashboard for presentation slides")
    print("• Include static charts in written report")
    print("• Reference interactive charts for detailed analysis")
    print("• Plant comparison radar charts for benchmarking")
    print("• ML model visualizations for technical validation")
    
    print("\n4. BUSINESS VALUE DEMONSTRATION:")
    print("• Visual KPI dashboards show immediate impact")
    print("• Trend analysis demonstrates data-driven insights")
    print("• Comparative charts highlight improvement opportunities")
    print("• Professional presentation-ready outputs")

if __name__ == "__main__":
    tolaram_visualization_guide()