## 5. EDA (Exploratory Data Analysis)
**Purpose**: Understand data patterns, relationships, and insights to guide modeling

In [None]:
# Correlation Analysis with Key Process Variables
# Ranks the non-vibration process variables by |Pearson r| against the target,
# prints the top 10 (with the signed r), scatter-plots the top 5 with a
# least-squares trend line, and stores the ranking in `top_features_info`.
print("\n🔗 Correlation Analysis")
print("="*50)

# Identify key process variables (exclude vibration columns to prevent leakage).
# The target is excluded explicitly too, so appending it below can never create a
# duplicate column when its name does not contain "VIBRATION".
non_vibration_cols = [
    col for col in df_clean.columns
    if 'VIBRATION' not in col.upper() and col != target_column
]

# Restrict to numeric/bool columns: DataFrame.corr() raises on non-numeric data in
# recent pandas versions instead of silently skipping it.
corr_input = df_clean[non_vibration_cols + [target_column]].select_dtypes(include=['number', 'bool'])

# Keep the signed coefficients for reporting; rank on the absolute value.
signed_correlations = corr_input.corr()[target_column]
correlations = signed_correlations.abs().sort_values(ascending=False)

# Remove the target itself, drop undefined (NaN) correlations such as constant
# columns, and keep the top 10.
top_corr_features = correlations.drop(target_column).dropna().head(10)

print(f"🎯 Top 10 Features by Correlation with {target_column}:")
for i, feature in enumerate(top_corr_features.index, 1):
    print(f"  {i:2d}. {feature:<40} | r = {signed_correlations[feature]:+.3f}")

# Visualize top correlations
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Top Correlated Features vs Vibration', fontsize=16, fontweight='bold')

top_5_features = top_corr_features.head(5).index.tolist()
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, top_5_features):
    # Use only rows where both values are present: np.polyfit cannot handle NaN.
    pair = df_clean[[feature, target_column]].dropna()
    x_values = pair[feature]
    y_values = pair[target_column]

    # Scatter plot of the raw relationship
    ax.scatter(x_values, y_values, alpha=0.1, s=1)

    # Trend line: a straight line is fully defined by its two end points, so draw
    # it through min/max x instead of through every (unsorted) sample.
    if x_values.nunique() > 1:
        slope, intercept = np.polyfit(x_values, y_values, 1)
        x_ends = np.array([x_values.min(), x_values.max()], dtype=float)
        ax.plot(x_ends, slope * x_ends + intercept, "r--", alpha=0.8)

    ax.set_xlabel(feature)
    ax.set_ylabel('Vibration (mm/s)')
    # Signed r, so the title agrees with the slope of the trend line.
    ax.set_title(f'r = {signed_correlations[feature]:.3f}')
    ax.grid(True, alpha=0.3)

# Remove every subplot that did not receive a feature (not only the sixth one).
for unused_ax in flat_axes[len(top_5_features):]:
    unused_ax.remove()

plt.tight_layout()
plt.show()

# Store top correlated features for later use (values are absolute correlations)
top_features_info = {
    'top_10_features': top_corr_features.to_dict(),
    'strongest_correlation': top_corr_features.iloc[0],
    'weakest_in_top10': top_corr_features.iloc[-1]
}

print(f"\n✅ Correlation analysis complete")
print(f"  • Strongest correlation: {top_corr_features.index[0]} (r = {signed_correlations[top_corr_features.index[0]]:+.3f})")
print(f"  • Average top-10 |correlation|: {top_corr_features.mean():.3f}")

In [None]:
# Target Variable Analysis
# Four-panel overview of the target: raw time series, histogram, hour-of-day box
# plots and 24-hour rolling statistics, followed by a short printed summary.
# The cell only reads df_clean; it no longer adds/drops a temporary 'hour' column,
# so an interrupted run cannot leave df_clean in a modified state.
print("🎯 Target Variable Analysis")
print("="*50)

target_series = df_clean[target_column]
# Hour-of-day label per row (requires a DatetimeIndex), kept local to this cell.
hour_of_day = df_clean.index.hour

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle(f'Target Variable Analysis: {target_column}', fontsize=16, fontweight='bold')

# 1. Time series plot
ax_ts = axes[0, 0]
ax_ts.plot(df_clean.index, target_series, alpha=0.7, linewidth=0.5)
ax_ts.set_title('Vibration Over Time')
ax_ts.set_xlabel('Date')
ax_ts.set_ylabel('Vibration (mm/s)')
ax_ts.grid(True, alpha=0.3)

# 2. Distribution histogram
target_mean = target_series.mean()
target_median = target_series.median()
ax_hist = axes[0, 1]
ax_hist.hist(target_series, bins=50, alpha=0.7, edgecolor='black')
ax_hist.axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.3f}')
ax_hist.axvline(target_median, color='green', linestyle='--', label=f'Median: {target_median:.3f}')
ax_hist.set_title('Vibration Distribution')
ax_hist.set_xlabel('Vibration (mm/s)')
ax_hist.set_ylabel('Frequency')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# 3. Box plot by hour of day
# NaNs are dropped per hour: a single NaN would otherwise blank out that hour's box.
hourly_data = [target_series[hour_of_day == h].dropna().values for h in range(24)]
ax_box = axes[1, 0]
ax_box.boxplot(hourly_data, positions=range(24))
ax_box.set_title('Vibration by Hour of Day')
ax_box.set_xlabel('Hour')
ax_box.set_ylabel('Vibration (mm/s)')
ax_box.set_xticks(range(0, 24, 4))
ax_box.grid(True, alpha=0.3)

# 4. Rolling statistics
# Time-based window: covers 24 hours regardless of the sampling interval or gaps.
# (The previous fixed count of 1440 samples is only 12 hours of 30-second data.)
# Offset windows need a monotonic index, hence the sort; they default to
# min_periods=1, so the first 24 hours are computed from partial windows.
window = '24h'
target_sorted = target_series.sort_index()
rolling_target = target_sorted.rolling(window=window)
rolling_mean = rolling_target.mean()
rolling_std = rolling_target.std()

ax_roll = axes[1, 1]
ax_roll.plot(rolling_mean.index, rolling_mean, label='24h Rolling Mean', color='blue')
ax_roll.fill_between(rolling_mean.index,
                     rolling_mean - rolling_std,
                     rolling_mean + rolling_std,
                     alpha=0.3, color='blue', label='±1 Std Dev')
ax_roll.set_title('24-Hour Rolling Statistics')
ax_roll.set_xlabel('Date')
ax_roll.set_ylabel('Vibration (mm/s)')
ax_roll.legend()
ax_roll.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print key insights
hourly_groups = target_series.groupby(hour_of_day)
print(f"\n📊 Key Insights:")
print(f"  • Vibration range: {target_series.min():.3f} - {target_series.max():.3f} mm/s")
print(f"  • Distribution skewness: {target_series.skew():.3f}")
print(f"  • Hourly variation: {hourly_groups.std().mean():.3f} mm/s average std")
# A daily pattern is reported when the hourly means spread by more than 0.1 (std).
print(f"  • Daily pattern visible: {'Yes' if hourly_groups.mean().std() > 0.1 else 'No'}")

In [None]:
# Run the per-column analysis helper on df_clean.
# NOTE(review): `analyze_all_columns` is not defined in this section; presumably an
# earlier cell or an imported module provides it. Confirm it is defined before this
# cell when the notebook is executed top to bottom.
analyze_all_columns(df_clean)

In [None]:
# Hand-picked process variables (column names) to analyse against the target.
# Entries that are commented out are currently excluded from the selection.
manual_selected_features = [
    'CM2_PV_DA01_POSITION', # gas draft (controls the opening for the gas outlet)
   # 'CM2_PV_WI01_WATER_INJECTION', # just adding it makes results much worse, because it swings from 0 to 4000 very quickly (min, max)
   # 'CM2_PV_VRM01_POWER',
    'CM2_PV_VRM01_DIFF_PRESSURE', # VRM is the motor (per the original author's note)
    'CM2_PV_BF01_DIF_PRESSURE1', # BF is the bag filter
    'CM2_SP_RB01_SPA_TOTAL_FEED', # total mill feed
    'CM2_PV_HG01_TEMPERATURE2', # HG is hot gas
    'CM2_PV_BF01_OUT_TEMPERATURE', # bag filter outlet temperature
    'CM2_PV_VRM01_INLET_TEMPERATURE', # motor inlet temperature
    'CM2_PV_HYS01_PRESSURE1', # meaning of HYS unknown (?) - TODO confirm
    'CM2_PV_HYS01_PRESSURE',
    'CM2_PV_VRM01_OUTLET_TEMPERATURE', # motor outlet temperature
    'CM2_PV_DA02_POSITION', # gas draft position
    'CM2_PV_HG01_TEMPERATURE1' # hot gas temperature
]

In [None]:
# Correlation Matrix Analysis for Manual Selected Features
# Draws a lower-triangle Pearson correlation heatmap of the manually selected
# features plus the target, then lists each feature's correlation with the target
# and buckets them into strong / moderate / weak.
print("📊 Correlation Analysis: Manual Selected Features vs Target")
print("=" * 60)

# Analyse the cleaned frame so the numbers are comparable with the rest of this
# EDA section (which works on df_clean throughout).
# NOTE(review): this cell previously read df_raw; switch `corr_source` back if the
# raw frame was intended.
corr_source = df_clean

# Create correlation matrix with available features and target
features_for_corr = [
    f for f in manual_selected_features
    if f in corr_source.columns and f != target_column
]
missing_features = [f for f in manual_selected_features if f not in corr_source.columns]
print(f"Analyzing correlations for {len(features_for_corr)} features with {target_column}")
if missing_features:
    # Make silently skipped features visible instead of just shrinking the count.
    print(f"⚠️  Skipped (not in data): {missing_features}")

# Calculate correlation matrix
corr_data = corr_source[features_for_corr + [target_column]].corr()

# Create the correlation matrix plot (mask hides the redundant upper triangle,
# keeping the diagonal)
fig, ax = plt.subplots(figsize=(12, 10))
mask = np.triu(np.ones_like(corr_data, dtype=bool), k=1)
sns.heatmap(corr_data,
            annot=True,
            cmap='RdBu_r',
            center=0,
            square=True,
            mask=mask,
            fmt='.3f',
            cbar_kws={"shrink": .8},
            ax=ax)

ax.set_title(f'Correlation Matrix: Manual Selected Features vs {target_column}',
             fontsize=14, fontweight='bold', pad=20)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

# Correlation summary with target: keep the signed values for the direction marker
# and rank on the absolute value. Undefined correlations (NaN, e.g. a constant
# column) are reported separately so they are not mislabelled as negative/weak.
signed_target_corr = corr_data[target_column].drop(target_column)
undefined_corrs = signed_target_corr[signed_target_corr.isna()]
target_correlations = signed_target_corr.dropna().abs().sort_values(ascending=False)

print(f"\n🎯 Feature Correlations with {target_column} (sorted by absolute value):")
print("-" * 70)
for i, (feature, abs_corr) in enumerate(target_correlations.items(), 1):
    direction = "📈" if signed_target_corr.loc[feature] > 0 else "📉"
    strength = "🔴" if abs_corr >= 0.5 else "🟡" if abs_corr >= 0.3 else "🟢"
    print(f"  {i:2d}. {strength} {direction} {feature:35s} | {abs_corr:6.3f}")

print(f"\n💡 Correlation Insights:")
strong_corrs = target_correlations[target_correlations >= 0.5]
moderate_corrs = target_correlations[(target_correlations >= 0.3) & (target_correlations < 0.5)]
weak_corrs = target_correlations[target_correlations < 0.3]

print(f"   • Strong correlations (≥0.5): {len(strong_corrs)} features")
print(f"   • Moderate correlations (0.3-0.5): {len(moderate_corrs)} features") 
print(f"   • Weak correlations (<0.3): {len(weak_corrs)} features")
if len(undefined_corrs) > 0:
    print(f"   • Undefined correlations (NaN): {len(undefined_corrs)} features")

if len(strong_corrs) > 0:
    print(f"   • Top correlated feature: {strong_corrs.index[0]} ({strong_corrs.iloc[0]:.3f})")

print("✅ Correlation analysis complete\n")