In [3]:
import pandas as pd
import numpy as np
import os

# =========================
# CONFIGURATION
# =========================

# Directory that holds both the raw input CSV and the cleaned output CSV.
DATA_DIR = "/workspaces/TeamCPH/data"

# Source file to clean, and the destination for the cleaned copy.
INPUT_CSV = f"{DATA_DIR}/combined_df_corrected.csv"
OUTPUT_CSV = f"{DATA_DIR}/combined_df_corrected_clean_for_nn.csv"

# Name of the column treated as the prediction target throughout the cell.
TARGET_COL = "Revenue"

# =========================
# CHECK FILE EXISTS
# =========================
# Fail fast with a clear message when the input is not available.
# `isfile` (rather than `exists`) also rejects a directory that happens to sit
# at this path, so the problem is reported here instead of surfacing later as
# a less obvious read error.
if not os.path.isfile(INPUT_CSV):
    raise FileNotFoundError(f"Could not find file at: {INPUT_CSV}")

# =========================
# LOAD DATA
# =========================
# Read the raw CSV into `df`, then echo its dimensions and per-column dtypes
# so the state of the data before any cleaning is visible in the cell output.
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)

raw_shape, raw_dtypes = df.shape, df.dtypes
print("Original shape:", raw_shape)
print("Original dtypes:\n", raw_dtypes)

# =========================
# 1) DATE HANDLING (if Date column exists)
# =========================
if "Date" in df.columns:
    # Coerce so that every unparseable (or missing) entry becomes NaT and can
    # be reported in one error instead of failing on the first bad value.
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    unparsed = df["Date"].isna()
    if unparsed.any():
        bad_rows = df[unparsed]
        raise ValueError(
            f"Some Date values could not be parsed. Example bad rows:\n{bad_rows.head(5)}"
        )
    # Use a stable sort so rows that share the same Date keep their original
    # relative order. The default algorithm (quicksort) gives no such
    # guarantee, which would make any later row-order-dependent step
    # (e.g. forward/backward filling) depend on an arbitrary tie order.
    df = df.sort_values("Date", kind="mergesort").reset_index(drop=True)

# =========================
# 2) TARGET COLUMN SANITY
# =========================
# The target column has to be present before anything else can be validated.
target_missing = TARGET_COL not in df.columns
if target_missing:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataset columns: {df.columns.tolist()}")

# Coerce to numeric: entries that cannot be converted turn into NaN, so one
# NaN test afterwards catches both unconvertible and originally-missing values.
df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors="coerce")
target_fully_populated = df[TARGET_COL].notna().all()
if not target_fully_populated:
    raise ValueError(f"Target column '{TARGET_COL}' contains non-numeric or missing values after conversion.")

# =========================
# 3) CONVERT BOOLS TO INTS
# =========================
# Collect every column with boolean dtype and recast all of them to integers
# (True -> 1, False -> 0) in a single `astype` call; other columns are
# left as they are.
bool_cols = list(df.select_dtypes(include="bool").columns)
df = df.astype({name: int for name in bool_cols})

# =========================
# 4) FORCE NON-DATE FEATURES TO NUMERIC
# =========================
# Try to turn every object-typed column (except Date) into a numeric one.
# A column is converted only if ALL of its values parse; otherwise it is left
# exactly as it was. This is done with an explicit try/except because
# `pd.to_numeric(..., errors="ignore")` is deprecated in recent pandas
# releases and is not accepted by newer ones.
for col in df.columns:
    if col == "Date" or df[col].dtype != "object":
        continue
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Not fully numeric: keep the original column; it is reported below.
        pass

# Anything still object-typed (other than Date) could not be converted.
object_cols = df.select_dtypes(include=["object"]).columns.tolist()
object_cols = [c for c in object_cols if c != "Date"]

if len(object_cols) > 0:
    raise ValueError(
        f"These columns are still non-numeric (object type): {object_cols}\n\n"
        "Neural networks generally require numeric inputs.\n"
        "Fix options:\n"
        "1) One-hot encode categorical columns\n"
        "2) Drop them if they are not needed\n"
        "3) Convert them to numeric if possible"
    )

# =========================
# 5) HANDLE MISSING VALUES IN FEATURES
# =========================
# Columns whose dtype is one of the four listed numeric dtypes; everything in
# that set except the target is treated as a model feature.
numeric_cols = list(
    df.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns
)
feature_cols = [name for name in numeric_cols if name != TARGET_COL]

# Impute in three passes: carry the previous row's value forward, then fill
# any leading gaps from the next available row, and finally put 0 wherever a
# value is still missing (e.g. a column with no values at all).
# NOTE(review): both fills copy values between neighbouring rows in the
# frame's current row order, and the backward fill takes values from later
# rows — confirm this is acceptable for the downstream model.
filled_features = df[feature_cols].ffill().bfill().fillna(0)
df[feature_cols] = filled_features

# =========================
# 6) FINAL SAFETY CHECKS
# =========================
# No feature value may still be missing after the imputation step.
if df[feature_cols].isna().any().any():
    raise ValueError("There are still NaNs in feature columns after cleaning.")

# Downcast float64 -> float32 (Date is skipped) BEFORE looking for infinities:
# a float64 value outside the float32 range becomes +/-inf during the cast,
# and checking afterwards is the only way to catch that.
for col in df.columns:
    if col == "Date":
        continue
    if df[col].dtype == "float64":
        df[col] = df[col].astype("float32")

# Guard on a non-empty feature list so an empty selection is simply skipped.
if feature_cols and np.isinf(df[feature_cols].to_numpy()).any():
    raise ValueError("Infinite values found in feature columns.")

# The target was validated for NaN earlier in the cell, but an infinite value
# is numeric and not NaN, so it needs its own check.
if np.isinf(df[TARGET_COL].to_numpy()).any():
    raise ValueError(f"Infinite values found in target column '{TARGET_COL}'.")

# Summarise the cleaned frame: its shape, the ten columns with the most
# remaining NaNs (highest count first), and the final dtypes.
nan_counts = df.isna().sum()
worst_nan_columns = nan_counts.sort_values(ascending=False).head(10)

print("\nCleaned shape:", df.shape)
print("Remaining NaNs per column:\n", worst_nan_columns)
print("Cleaned dtypes:\n", df.dtypes)

# =========================
# 7) SAVE CLEAN FILE
# =========================
# Write the cleaned frame without the index column, then report where it was
# saved and which column downstream code should use as the target.
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print(
    f"\nSaved cleaned dataset to: {OUTPUT_CSV}",
    f"Target column for model: {TARGET_COL}",
    sep="\n",
)


Original shape: (10896, 22)
Original dtypes:
 Date               object
Holiday              bool
NextDayHoliday       bool
IsWeekend            bool
Month               int64
KielerWeek           bool
IsNewYearsEve        bool
IsHalloween          bool
t                   int64
lag_1             float64
roll7_mean        float64
roll28_mean       float64
year_sin1         float64
year_cos1         float64
year_sin2         float64
year_cos2         float64
Revenue           float64
Product_2            bool
Product_3            bool
Product_4            bool
Product_5            bool
Product_6            bool
dtype: object

Cleaned shape: (10896, 22)
Remaining NaNs per column:
 Date              0
Holiday           0
NextDayHoliday    0
IsWeekend         0
Month             0
KielerWeek        0
IsNewYearsEve     0
IsHalloween       0
t                 0
lag_1             0
dtype: int64
Cleaned dtypes:
 Date              datetime64[ns]
Holiday                    int64
NextDayHoliday  