## 3. Collecting Data
**Purpose**: Load industrial vibration data from CSV files and perform initial validation

In [None]:
# Cell purpose: load the raw dataset through load_data(), print a short
# overview (index span, size, vibration columns, missing values) and keep a
# small summary dict for later cells.
print("🏭 Loading Industrial Vibration Data")
print("=" * 50)

df_raw = load_data('full_data/')

# Overview of the loaded frame.
# NOTE(review): the `.days` access below assumes a datetime-like index - confirm in load_data.
print("\n📊 Dataset Overview:")
first_ts = df_raw.index.min()
last_ts = df_raw.index.max()
n_rows = len(df_raw)
n_cols = len(df_raw.columns)
print(f"  • Time range: {first_ts} to {last_ts}")
print(f"  • Duration: {(last_ts - first_ts).days} days")
print(f"  • Total data points: {n_rows:,}")

# Only the column count is reported here, not the individual names
print(f"\n📋 Available Columns ({n_cols})")

# Candidate target columns: any column whose name contains "VIBRATION" (case-insensitive)
vibration_cols = [name for name in df_raw.columns if 'VIBRATION' in name.upper()]
print("\n🎯 Target Variables Found:")
for col in vibration_cols:
    print(f"  • {col}")
    series = df_raw[col]
    print(f"    Range: {series.min():.3f} to {series.max():.3f}")
    print(f"    Mean: {series.mean():.3f} ± {series.std():.3f}")

# Missing-value summary: per-column null count, expressed as a percentage of all rows
missing_data = df_raw.isna().sum()
missing_percentage = missing_data / n_rows * 100
columns_with_missing = missing_percentage[missing_percentage > 0]

print("\n🔍 Data Quality Assessment:")
print(f"  • Columns with missing data: {len(columns_with_missing)}/{n_cols}")
print(f"  • Average missing data: {missing_percentage.mean():.1f}%")

if not columns_with_missing.empty:
    # Show the three columns with the highest share of missing values
    print("  • Worst missing data:")
    worst_missing = columns_with_missing.nlargest(3)
    for col, pct in worst_missing.items():
        print(f"    - {col}: {pct:.1f}%")

print("\n✅ Data loading complete - ready for cleaning")

# Summary of the raw dataset, kept for later cells
data_info = {
    'total_rows': n_rows,
    'total_cols': n_cols,
    'vibration_cols': vibration_cols,
    'time_range': (first_ts, last_ts),
}

# The bare expression on the last line is what the notebook renders as the cell output
print("\n📈 Sample data preview:")
df_raw.head(3)

## 4. Cleaning Data

In [None]:
# Cell purpose: clean the raw data with clean_data(), one-hot encode the
# product column, resample to a fixed interval and report target statistics.
import math  # used below to detect a non-finite coefficient of variation

print("\n🧹 Cleaning Industrial Data")
print("="*50)

# clean_data returns the cleaned frame and the name of the target column
df_clean, target_column = clean_data(df_raw, remove_outliers=False)

rows_removed = len(df_raw) - len(df_clean)
# Guard against an empty raw frame so the report prints 0.0% instead of raising ZeroDivisionError
pct_removed = rows_removed / len(df_raw) * 100 if len(df_raw) else 0.0

print(f"\n📊 Cleaning Results:")
print(f"  • Clean dataset shape: {df_clean.shape}")
print(f"  • Data reduction: {rows_removed:,} rows removed ({pct_removed:.1f}%)")
print(f"  • Final columns: {len(df_clean.columns)}")
print(f"  • Target variable: {target_column}")

# Resample to fixed-width intervals to reduce noise.
# The interval is defined once as a number; `agg` is the rule string derived
# from it and keeps its previous value ('5T') for any later cell that reads it.
# NOTE(review): if resample_aggregate forwards `agg` to pandas, the 'T' alias is
# deprecated since pandas 2.2 in favour of 'min' - switching requires updating
# every place that parses `agg[:-1]`.
interval_minutes = 5
agg = f'{interval_minutes}T'
print(f"\n📊 Resampling to {interval_minutes}-minute intervals")
print("="*50)

# Encode the product column before aggregation (semantics live in create_dummies)
df_clean = create_dummies(df_clean, ['CM2_PV_PRODUCT'])

df_resampled = resample_aggregate(df_clean, target_column, agg)

# Update the working dataset to use resampled data
df_clean = df_resampled
print(f"\n✅ Updated working dataset to use {interval_minutes}-minute resampled data")

# Validate target variable after resampling
print(f"\n🎯 Target Variable Statistics (After Resampling):")
target_stats = df_clean[target_column].describe()
print(f"  • Count: {int(target_stats['count']):,} valid readings")
print(f"  • Range: {target_stats['min']:.3f} to {target_stats['max']:.3f}")
print(f"  • Mean: {target_stats['mean']:.3f}")
print(f"  • Std Dev: {target_stats['std']:.3f}")
print(f"  • Median: {target_stats['50%']:.3f}")

# Coefficient of variation (std / mean) is used as a simple noise indicator.
cv = target_stats['std'] / target_stats['mean']
if not math.isfinite(cv):
    # NaN/inf (empty target or zero mean) fails both '>' comparisons below and
    # would otherwise be labelled 'Low'; report it as unknown instead.
    noise_level = 'Unknown'
elif cv > 0.15:
    noise_level = 'High'
elif cv > 0.10:
    noise_level = 'Moderate'
else:
    noise_level = 'Low'
print(f"  • Coefficient of Variation: {cv:.3f} ({noise_level} noise)")

print(f"\n✅ Data cleaning and resampling complete - ready for EDA")

# Store cleaned data info for later cells
clean_data_info = {
    'shape': df_clean.shape,
    'target_col': target_column,
    'target_stats': target_stats,
    'noise_level': noise_level,
    'resampled': True,
    'time_interval': f'{interval_minutes}min'
}

In [None]:
# Cell purpose: plot the target column before and after resampling and report
# how much the coefficient of variation (std / mean) changed.
# Parse the interval once; `agg` is expected to look like '<minutes>T'.
interval_minutes = int(agg[:-1])
print(f"\n📈 Comparing 30-second vs {interval_minutes}-minute Data")
print("="*50)

try:

    # Rebuild the un-resampled baseline with the same cleaning flag that was used
    # for the resampled data, so the two series differ only by the resampling.
    # No time-window filtering is applied: both full series are compared.
    df_clean_30s = clean_data(df_raw, remove_outliers=False)[0]  # Get 30s data
    sample_30s = df_clean_30s
    sample_5min = df_clean

    if len(sample_30s) > 0 and len(sample_5min) > 0:
        # Two stacked panels: baseline on top, resampled below
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

        # Plot 30-second data
        ax1.plot(sample_30s.index, sample_30s[target_column], 'b-', alpha=0.7, linewidth=0.8)
        ax1.set_title(f'Original 30-second Data - {target_column}', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Vibration (mm/s)')
        ax1.grid(True, alpha=0.3)

        # Calculate and show statistics for 30s data
        std_30s = sample_30s[target_column].std()
        mean_30s = sample_30s[target_column].mean()
        cv_30s = std_30s / mean_30s

        # '\n' (not '\\n') so the annotation box shows two lines instead of a literal backslash-n
        ax1.text(0.02, 0.95, f'Std Dev: {std_30s:.3f}\nCV: {cv_30s:.3f}',
                transform=ax1.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

        # Plot resampled data
        ax2.plot(sample_5min.index, sample_5min[target_column], 'r-', alpha=0.8, linewidth=1.5)
        ax2.set_title(f'Resampled {interval_minutes}-minute Data - {target_column}', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Vibration (mm/s)')
        ax2.set_xlabel('Time')
        ax2.grid(True, alpha=0.3)

        # Calculate and show statistics for resampled data
        std_5min = sample_5min[target_column].std()
        mean_5min = sample_5min[target_column].mean()
        cv_5min = std_5min / mean_5min

        ax2.text(0.02, 0.95, f'Std Dev: {std_5min:.3f}\nCV: {cv_5min:.3f}',
                transform=ax2.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8))

        plt.tight_layout()
        plt.show()

        # Print comparison statistics
        print(f"\n📊 Noise Reduction Analysis:")
        print(f"  • 30-second data:")
        print(f"    - Standard deviation: {std_30s:.3f}")
        print(f"    - Coefficient of variation: {cv_30s:.3f}")
        print(f"    - Data points in sample: {len(sample_30s):,}")

        print(f"  • {interval_minutes}-minute data:")
        print(f"    - Standard deviation: {std_5min:.3f}")
        print(f"    - Coefficient of variation: {cv_5min:.3f}")
        print(f"    - Data points in sample: {len(sample_5min):,}")

        # Relative drop in CV, as a percentage of the baseline CV
        noise_reduction = ((cv_30s - cv_5min) / cv_30s) * 100
        print(f"  • Noise reduction: {noise_reduction:.1f}% improvement in CV")

        if noise_reduction > 0:
            print(f"  • ✅ {interval_minutes}-minute resampling successfully reduced noise")
        else:
            print(f"  • ⚠️ Noise reduction less than expected")

    else:
        print("⚠️ Insufficient data for comparison visualization")

except Exception as e:
    # Deliberate best-effort: the plot is optional, so any failure is reported
    # and the notebook continues instead of stopping here.
    print(f"⚠️ Comparison visualization failed: {e}")
    print("This is normal if the data pipeline hasn't been run yet")

print(f"\n✅ Comparison complete!")