In [1]:
# Essential imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# --- 1. Load the dataset ---
try:
    # Load the dataset
    df = pd.read_csv('Database.csv')
except FileNotFoundError:
    print("Error: 'Database.csv' not found. Please ensure the file is in the current directory.")
    # Re-raise instead of falling through: every later step reads `df`, so
    # continuing here would only surface as an unrelated NameError further down.
    raise

# --- 2. Outlier Detection & Removal using 3.5*IQR ---

def remove_outliers_iqr(data, columns, factor=3.5):
    """Drop rows whose values lie outside ``factor`` * IQR fences.

    For each name in ``columns`` that exists in ``data`` and has a numeric
    dtype, the fences ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` are
    computed and rows outside them (fences inclusive) are removed. Names
    that are absent or non-numeric are skipped without error.

    Columns are processed in order, and each column's quartiles are computed
    on the rows that survived the previous columns, so the order of
    ``columns`` can influence the result.

    Missing values (NaN) are not outliers: such rows are kept so the caller
    can decide how to handle them, and they are not counted as removed.

    Args:
        data: Input DataFrame; it is not modified.
        columns: Iterable of column names to screen.
        factor: Multiplier applied to the IQR to build the fences
            (default 3.5).

    Returns:
        A ``(clean_data, rows_removed)`` tuple: the filtered DataFrame and
        the number of rows dropped.
    """
    clean_data = data.copy()
    initial_rows = len(clean_data)

    for col in columns:
        if col not in clean_data.columns:
            continue
        series = clean_data[col]
        if not pd.api.types.is_numeric_dtype(series):
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        in_range = (series >= lower_bound) & (series <= upper_bound)
        # Comparisons against NaN are always False; OR-ing with isna() keeps
        # rows with missing values instead of silently discarding them.
        clean_data = clean_data[in_range | series.isna()]

    rows_removed = initial_rows - len(clean_data)
    return clean_data, rows_removed

# Columns screened for outliers
num_cols = [
    'DHI',
    'DNI',
    'GHI',
    'Wind_speed',
    'Humidity',
    'Temperature',
    'PV_production',
    'Wind_production',
    'Electric_demand',
]

# Filter the loaded frame and keep the count of dropped rows
df_clean, rows_removed = remove_outliers_iqr(data=df, columns=num_cols)

# Report the row/column counts before and after filtering
print(f"Original shape: {df.shape}")
print(f"After outlier removal: {df_clean.shape}")
print(f"Total rows removed by IQR: {rows_removed}")

# --- 3. Save Processed Data (RAW/UNSCALED) ---
# Drop the 'Unnamed: 0' and 'Time' columns when present; errors='ignore'
# makes this a no-op for any that are absent.
df_clean = df_clean.drop(
    columns=['Unnamed: 0', 'Time'],
    errors='ignore',
)

# Write the filtered frame as-is (no scaling applied), without the index column.
df_clean.to_csv("preprocess_data.csv", index=False)

print("\nProcessing complete! 'preprocess_data.csv' now contains clean, UNSCALED data.")

Original shape: (38879, 13)
After outlier removal: (38879, 13)
Total rows removed by IQR: 0

Processing complete! 'preprocess_data.csv' now contains clean, UNSCALED data.
