## 1. Import Libraries
**Purpose**: Load all required dependencies for data processing and modeling

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
from math import ceil

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Suppress warnings for cleaner output
# NOTE: the 'ignore' filter below is global - it silences every warning
# category for the rest of the session, including deprecation warnings
# raised by pandas / scikit-learn calls made later in this notebook.
import warnings
warnings.filterwarnings('ignore')

# Display settings
# Reset matplotlib to its default style and select seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")
# Passing None lifts pandas' limits on displayed columns, output width and
# per-cell text width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Confirmation message so the cell output shows that setup has completed.
print("✅ Libraries imported successfully")

✅ Libraries imported successfully


## 2. Functions
**Purpose**: Define core functions for data processing and feature engineering

In [2]:
def load_data(data_path='full_data/'):
    """
    Read every CSV file in a directory and merge them into one time-indexed frame.

    Args:
        data_path: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        pandas.DataFrame: All files stacked row-wise, indexed by the parsed
        'Timestamps' column, sorted chronologically, and with the
        'CM2_PV_VRM01_VIBRATION1' column removed.

    Raises:
        FileNotFoundError: If the directory contains no CSV files.
    """
    pattern = os.path.join(data_path, '*.csv')
    paths = sorted(glob.glob(pattern))

    if len(paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {data_path}")

    print(f"📁 Loading {len(paths)} CSV files...")

    # Read the files in sorted-name order and stack them into one frame
    frames = [pd.read_csv(path) for path in paths]
    combined = pd.concat(frames, ignore_index=True)

    # Timestamps are day-first strings (DD/MM/YYYY HH:MM:SS)
    combined['Timestamps'] = pd.to_datetime(
        combined['Timestamps'], format='%d/%m/%Y %H:%M:%S'
    )
    combined = combined.set_index('Timestamps').sort_index()

    # This vibration channel is discarded before any further processing
    combined = combined.drop(columns='CM2_PV_VRM01_VIBRATION1')

    print(f"✅ Loaded {len(combined):,} rows, {len(combined.columns)} columns")
    return combined


def clean_data(df, remove_outliers=False, outlier_columns=None):
    """
    Clean industrial data by removing invalid values and handling missing data

    Steps, in order:
      1. Pick the first column whose name contains 'VIBRATION' as the target.
      2. Keep only rows whose target value is in the range (0, 12].
      3. Optionally trim each selected column to its 5th-95th percentile band.
      4. Drop columns that are more than 50% missing.
      5. Forward-fill, then backward-fill, the remaining gaps.

    Args:
        df: Raw industrial DataFrame
        remove_outliers: Boolean flag to remove outliers using percentile method (default: False)
        outlier_columns: List of columns to apply outlier removal (default: None = all numeric columns)

    Returns:
        tuple: (cleaned_df, target_column)

    Raises:
        ValueError: If no column name contains 'VIBRATION'.
    """
    df_clean = df.copy()

    # Identify vibration target column
    vibration_cols = [col for col in df_clean.columns if 'VIBRATION' in str(col).upper()]
    if not vibration_cols:
        raise ValueError("No vibration column found in data")

    target_col = vibration_cols[0]  # Use first vibration column as target

    # Keep only plausible vibration readings: strictly positive and at most 12
    # (the unit is not visible in this file; presumably mm/s - confirm).
    max_valid_vibration = 12
    initial_len = len(df_clean)
    valid_target = (df_clean[target_col] > 0) & (df_clean[target_col] <= max_valid_vibration)
    df_clean = df_clean[valid_target]
    filtered_count = initial_len - len(df_clean)

    # Optional outlier removal using percentile method (gentle approach for industrial data)
    outliers_removed = 0
    if remove_outliers:
        # Determine which columns to process
        if outlier_columns is None:
            # Apply to all numeric columns by default
            columns_to_clean = df_clean.select_dtypes(include=[np.number]).columns
        else:
            # Apply only to specified columns that actually exist
            columns_to_clean = [col for col in outlier_columns if col in df_clean.columns]

        initial_outlier_len = len(df_clean)

        # Percentile-based filtering. Columns are processed one after another,
        # so each column's bounds are computed on the rows that survived the
        # previous columns. Rows with NaN in a processed column are dropped
        # too, because NaN fails both comparisons.
        for col in columns_to_clean:
            lower_bound = df_clean[col].quantile(0.05)  # 5th percentile
            upper_bound = df_clean[col].quantile(0.95)  # 95th percentile

            df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

        outliers_removed = initial_outlier_len - len(df_clean)
        print(f"  • Processed {len(columns_to_clean)} columns for outlier removal")

    # Remove columns with >50% missing data
    missing_threshold = 0.5
    missing_ratios = df_clean.isnull().mean()
    cols_to_drop = missing_ratios[missing_ratios > missing_threshold].index
    df_clean = df_clean.drop(columns=cols_to_drop)

    # Fill remaining gaps: carry the last value forward, then back-fill any
    # leading gaps. (.ffill()/.bfill() replace the deprecated fillna(method=...).)
    df_clean = df_clean.ffill().bfill()

    print(f"🧹 Cleaning results:")
    print(f"  • Filtered {filtered_count:,} invalid vibration readings")
    if remove_outliers:
        print(f"  • Removed {outliers_removed:,} outliers using percentile method (5th-95th)")
    print(f"  • Removed {len(cols_to_drop)} columns with >50% missing data")
    print(f"  • Target column: {target_col}")

    return df_clean, target_col


def _interval_minutes(agg):
    """Return the length of a fixed resample frequency in minutes, or None if it cannot be determined."""
    try:
        return pd.Timedelta(agg).total_seconds() / 60
    except (ValueError, TypeError):
        # Calendar-based or unrecognised frequencies have no fixed length
        return None


def resample_aggregate(df, target_col, agg):
    """
    Resample time-indexed data to a coarser interval to reduce noise

    Every numeric column (the target included) is aggregated with the mean;
    non-numeric columns are not carried over to the result.

    Args:
        df: Cleaned DataFrame with a DatetimeIndex (the log messages assume
            30-second source data)
        target_col: Name of target column (aggregated with the mean like
            every other numeric column; kept for interface compatibility)
        agg: Resample frequency accepted by ``DataFrame.resample``,
            e.g. '5min' (or the legacy '5T')

    Returns:
        pandas.DataFrame: Resampled DataFrame with one row per interval
    """
    minutes = _interval_minutes(agg)
    interval_label = f"{minutes:g}min" if minutes is not None else str(agg)

    print(f"📊 Resampling from 30s to {interval_label} intervals...")
    print(f"  • Original shape: {df.shape}")

    # Only numeric columns can be averaged
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Mean of every numeric column within each interval
    df_resampled = df[numeric_cols].resample(agg).mean()

    # Intervals with no source rows come back as all-NaN; drop them
    df_resampled = df_resampled.dropna(how='all')

    # Forward fill any remaining NaN values, then backward fill
    # (.ffill()/.bfill() replace the deprecated fillna(method=...)).
    df_resampled = df_resampled.ffill().bfill()

    rows_removed = len(df) - len(df_resampled)
    # Guard against an empty input frame
    reduction_pct = rows_removed / len(df) * 100 if len(df) else 0.0

    print(f"  • Resampled shape: {df_resampled.shape}")
    print(f"  • Data reduction: {rows_removed:,} rows removed ({reduction_pct:.1f}%)")
    if minutes is not None:
        print(f"  • Time interval: {minutes:g} minutes")
    else:
        print(f"  • Time interval: {agg}")

    return df_resampled


def engineer_features(df, target_col):
    """
    Create engineered features while preventing data leakage

    Adds rolling mean/std columns for a small set of process variables
    (up to two numeric columns per keyword: POWER, PRESSURE, CURRENT, FLOW,
    TEMPERATURE) plus hour / day-of-week / month columns derived from the
    index. No column whose name contains 'VIBRATION' is used as a source.

    Args:
        df: Cleaned DataFrame with a DatetimeIndex
        target_col: Name of target column (only reported in the log; all
            vibration columns are excluded from feature engineering)

    Returns:
        pandas.DataFrame: Copy of ``df`` with the engineered feature columns
        appended and no missing values
    """
    df_features = df.copy()

    # CRITICAL: Exclude ALL vibration columns from feature engineering
    feature_cols = [col for col in df_features.columns if 'VIBRATION' not in str(col).upper()]

    print(f"🔧 Engineering features from {len(feature_cols)} non-vibration columns...")

    # Rolling statistics only make sense for numeric columns
    numeric_cols = set(df_features.select_dtypes(include=[np.number]).columns)

    # Select key process variables by keyword
    key_vars = []
    for pattern in ['POWER', 'PRESSURE', 'CURRENT', 'FLOW', 'TEMPERATURE']:
        pattern_cols = [
            col for col in feature_cols
            if col in numeric_cols and pattern in str(col).upper()
        ]
        key_vars.extend(pattern_cols[:2])  # Take first 2 matches to avoid explosion

    # A column can match several keywords (e.g. '..._FLOW_PRESSURE');
    # keep each one once, preserving first-seen order.
    key_vars = list(dict.fromkeys(key_vars))

    # Window sizes are in rows: 15min / 30min / 1hr only if rows are 5 minutes apart
    windows = [3, 6, 12]

    for col in key_vars:
        for window in windows:
            # Trailing rolling mean and std (current row and the rows before it)
            rolling = df_features[col].rolling(window)
            df_features[f"{col}_rolling_mean_{window}"] = rolling.mean()
            df_features[f"{col}_rolling_std_{window}"] = rolling.std()

    # Add time-based features
    df_features['hour'] = df_features.index.hour
    df_features['day_of_week'] = df_features.index.dayofweek
    df_features['month'] = df_features.index.month

    feature_count = 2 * len(key_vars) * len(windows) + 3

    # Remove infinite values and columns that are more than 70% missing
    df_features = df_features.replace([np.inf, -np.inf], np.nan)
    missing_ratios = df_features.isnull().mean()
    cols_to_drop = missing_ratios[missing_ratios > 0.7].index
    df_features = df_features.drop(columns=cols_to_drop)

    # NOTE: the back-fill copies the first complete rolling value into the
    # warm-up rows at the start of the series, i.e. those rows see values
    # from slightly later timestamps.
    # (.ffill()/.bfill() replace the deprecated fillna(method=...)).
    df_features = df_features.ffill().bfill()

    print(f"✅ Created {feature_count} engineered features")
    print(f"🛡️ Target column '{target_col}' excluded from features")

    return df_features


def prepare_model_data(df, target_col, test_size=0.2):
    """
    Split a feature/target frame chronologically into train and test parts

    The rows are never shuffled: the first ``1 - test_size`` share of the
    rows becomes the training set and the remaining rows the test set.

    Args:
        df: DataFrame with features and target
        target_col: Name of target column
        test_size: Proportion of data for testing

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]

    # Position of the first test row (temporal order is preserved)
    cut = int(len(df) * (1 - test_size))

    X_train, y_train = features.iloc[:cut], target.iloc[:cut]
    X_test, y_test = features.iloc[cut:], target.iloc[cut:]

    print(f"📊 Data split:")
    print(f"  • Training: {len(X_train):,} samples ({len(X_train)/len(features)*100:.1f}%)")
    print(f"  • Testing: {len(X_test):,} samples ({len(X_test)/len(features)*100:.1f}%)")
    print(f"  • Features: {X_train.shape[1]}")

    return X_train, X_test, y_train, y_test


def plot_all_time_series(df, figsize_per_plot=(12, 4), cols_per_row=3, save_path=None, show_plots=True):
    """
    Draw one line plot per numeric column, arranged in a subplot grid.

    Each panel plots the column against the dataframe index and carries a
    small text box with the column's mean and standard deviation.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing time series data
    figsize_per_plot : tuple
        Size of each individual subplot (width, height)
    cols_per_row : int
        Number of columns per row in the subplot grid
    save_path : str, optional
        Path prefix for the saved figure; '_timeseries.png' is appended
    show_plots : bool
        Whether to display the figure (it is closed otherwise)

    Returns:
    --------
    None
    """

    # Only numeric columns can be plotted
    numeric_names = list(df.select_dtypes(include=[np.number]).columns)
    n_cols = len(numeric_names)

    if not numeric_names:
        print("No numeric columns found in dataframe.")
        return

    # Rows needed to fit every column into the grid
    n_rows = ceil(n_cols / cols_per_row)
    figure_size = (figsize_per_plot[0] * cols_per_row, figsize_per_plot[1] * n_rows)

    # squeeze=False always yields a 2D axes array, whatever the grid shape
    fig, axes = plt.subplots(n_rows, cols_per_row, figsize=figure_size, squeeze=False)

    # Row-major flattening: panel i sits at (i // cols_per_row, i % cols_per_row)
    panels = axes.ravel()

    for panel, name in zip(panels, numeric_names):
        series = df[name]

        # Plot time series
        panel.plot(df.index, series, linewidth=0.8, alpha=0.8)
        panel.set_title(f'{name}', fontsize=10, fontweight='bold')
        panel.set_xlabel('Time', fontsize=8)
        panel.set_ylabel('Value', fontsize=8)
        panel.grid(True, alpha=0.3)
        panel.tick_params(axis='both', which='major', labelsize=7)

        # Summary statistics in the top-left corner
        stats_text = f'Mean: {series.mean():.3f}\nStd: {series.std():.3f}'
        panel.text(0.02, 0.98, stats_text, transform=panel.transAxes,
                   fontsize=7, verticalalignment='top',
                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Grid cells beyond the last column stay empty; hide them
    for unused in panels[n_cols:]:
        unused.set_visible(False)

    plt.tight_layout(pad=2.0)
    plt.suptitle(f'Time Series Plots - All {n_cols} Numeric Columns',
                 fontsize=16, fontweight='bold', y=0.995)

    if save_path:
        plt.savefig(f'{save_path}_timeseries.png', dpi=300, bbox_inches='tight')
        print(f"Time series plots saved to {save_path}_timeseries.png")

    if show_plots:
        plt.show()
    else:
        plt.close()

    print(f"✅ Created time series plots for {n_cols} columns")


def plot_all_histograms(df, figsize_per_plot=(12, 4), cols_per_row=3, bins=50, save_path=None, show_plots=True):
    """
    Create histogram plots for all numeric columns in the dataframe.

    Each panel shows a histogram of the column's non-missing values, an
    optional kernel-density overlay on a secondary y-axis (drawn only when
    the column has more than 10 values), and a text box with
    count / mean / std / min / max.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing the data
    figsize_per_plot : tuple
        Size of each individual subplot (width, height)
    cols_per_row : int
        Number of columns per row in the subplot grid
    bins : int or str
        Number of bins (or a binning strategy name) passed to ``Axes.hist``
    save_path : str, optional
        Path prefix for the saved figure; '_histograms.png' is appended
    show_plots : bool
        Whether to display the figure (it is closed otherwise)

    Returns:
    --------
    None
    """

    # Get numeric columns only
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    n_cols = len(numeric_cols)

    if n_cols == 0:
        print("No numeric columns found in dataframe.")
        return

    # Calculate grid dimensions
    n_rows = ceil(n_cols / cols_per_row)

    # squeeze=False always yields a 2D axes array, whatever the grid shape
    fig, axes = plt.subplots(n_rows, cols_per_row,
                             figsize=(figsize_per_plot[0] * cols_per_row,
                                      figsize_per_plot[1] * n_rows),
                             squeeze=False)

    # Row-major flattening: panel i sits at (i // cols_per_row, i % cols_per_row)
    panels = axes.ravel()

    # Plot each column
    for ax, col in zip(panels, numeric_cols):
        # Remove NaN values for plotting
        data = df[col].dropna()

        if len(data) == 0:
            ax.text(0.5, 0.5, 'No data available',
                    transform=ax.transAxes, ha='center', va='center')
            ax.set_title(f'{col}', fontsize=10, fontweight='bold')
            continue

        # Plot histogram
        ax.hist(data, bins=bins, alpha=0.7, edgecolor='black', linewidth=0.5)

        # Add KDE overlay if enough data points
        if len(data) > 10:
            ax2 = ax.twinx()
            try:
                data.plot.density(ax=ax2, color='red', linewidth=2, alpha=0.8)
                ax2.set_ylabel('Density', fontsize=8, color='red')
                ax2.tick_params(axis='y', labelcolor='red', labelsize=7)
            except Exception:
                # The KDE is a best-effort extra (it can fail, e.g. for
                # constant data or when SciPy is unavailable). Hide the
                # secondary axis so no empty scale is left on the panel.
                # Catching Exception, not a bare except, lets
                # KeyboardInterrupt / SystemExit propagate.
                ax2.set_visible(False)

        ax.set_title(f'{col}', fontsize=10, fontweight='bold')
        ax.set_xlabel('Value', fontsize=8)
        ax.set_ylabel('Frequency', fontsize=8)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='both', which='major', labelsize=7)

        # Add statistics text in the top-right corner
        stats_text = (f'Count: {len(data):,}\n'
                      f'Mean: {data.mean():.3f}\n'
                      f'Std: {data.std():.3f}\n'
                      f'Min: {data.min():.3f}\n'
                      f'Max: {data.max():.3f}')
        ax.text(0.98, 0.98, stats_text, transform=ax.transAxes,
                fontsize=7, verticalalignment='top', horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Hide empty subplots at the end of the grid
    for unused in panels[n_cols:]:
        unused.set_visible(False)

    plt.tight_layout(pad=2.0)
    plt.suptitle(f'Histogram Plots - All {n_cols} Numeric Columns',
                 fontsize=16, fontweight='bold', y=0.995)

    if save_path:
        plt.savefig(f'{save_path}_histograms.png', dpi=300, bbox_inches='tight')
        print(f"Histogram plots saved to {save_path}_histograms.png")

    if show_plots:
        plt.show()
    else:
        plt.close()

    print(f"✅ Created histogram plots for {n_cols} columns")


def analyze_all_columns(df, save_path=None, show_plots=True):
    """
    Print a short summary of the dataframe, then draw both plot sets.

    Runs plot_all_time_series followed by plot_all_histograms with the
    same save/show settings.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe to analyze
    save_path : str, optional
        Base path to save the plots (without extension)
    show_plots : bool
        Whether to display the plots
    """

    numeric_count = len(df.select_dtypes(include=[np.number]).columns)

    # Header with basic size information
    print("📊 ANALYZING ALL COLUMNS IN DATAFRAME")
    print("=" * 50)
    print(f"Total columns: {len(df.columns)}")
    print(f"Numeric columns: {numeric_count}")
    print(f"Data shape: {df.shape}")
    print()

    # Each stage announces itself, draws its figure, then prints a spacer line
    stages = (
        ("🔄 Creating time series plots...", plot_all_time_series),
        ("🔄 Creating histogram plots...", plot_all_histograms),
    )
    for announcement, plotter in stages:
        print(announcement)
        plotter(df, save_path=save_path, show_plots=show_plots)
        print()

    print("✅ Analysis complete!")

def create_dummies(df, columns):
    """
    One-hot encode the given categorical columns

    Each listed column is replaced by its indicator columns (named
    '<column>_<value>'); the first category of every column is dropped.
    The indicator columns are appended after the remaining columns, in the
    order the source columns were listed. The input frame is not modified.

    Args:
        df: pandas.DataFrame - Input DataFrame
        columns: list - List of column names to create dummies for

    Returns:
        pandas.DataFrame: DataFrame with dummy variables added and original categorical columns dropped

    Raises:
        ValueError: If any requested column is absent from the DataFrame
    """
    encoded_df = df.copy()

    # Fail early if a requested column does not exist
    absent = [name for name in columns if name not in encoded_df.columns]
    if len(absent) > 0:
        raise ValueError(f"Columns not found in DataFrame: {absent}")

    print(f"🔢 Creating dummy variables for {len(columns)} columns: {columns}")

    for name in columns:
        # Prefix with the source column name to avoid naming conflicts
        indicators = pd.get_dummies(encoded_df[name], prefix=name, drop_first=True)

        # Swap the categorical column for its indicator columns
        remaining = encoded_df.drop(columns=[name])
        encoded_df = pd.concat([remaining, indicators], axis=1)

        print(f"  ✓ Created {len(indicators.columns)} dummy variables for '{name}'")

    added = encoded_df.shape[1] - df.shape[1]
    print(f"📊 Final shape: {encoded_df.shape} (added {added} columns)")

    return encoded_df


print("✅ Core functions defined successfully")

✅ Core functions defined successfully
