Isolation Forest Training for Weather Data Anomaly Detection

In [1]:
# 1. Imports & Konfiguration
import pandas as pd
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import os

# 🔧 Define paths
# DATA_DIR: directory the input Excel files are read from (see load_data).
# MODEL_DIR: default directory the trained model and scaler are written to
# (see save_model). Both are relative paths as written here.
DATA_DIR = "data"
MODEL_DIR = "model"


# 2. Daten einlesen & aufbereiten
def load_data(filenames):
    """Read several Excel files from DATA_DIR into one cleaned DataFrame.

    The files are concatenated in the given order with a fresh index, the
    columns are renamed positionally, ``humidity`` and ``pressure`` are
    converted from text to float, and rows missing any of the four weather
    measurements are removed.

    Args:
        filenames: File names (relative to ``DATA_DIR``) of the Excel sheets.

    Returns:
        The combined DataFrame with the ten renamed columns.
    """
    paths = [os.path.join(DATA_DIR, name) for name in filenames]
    frames = []
    for path in paths:
        frames.append(pd.read_excel(path))
    df = pd.concat(frames, ignore_index=True)

    # Positional rename: the sheets are assumed to have exactly these ten
    # columns in this order -- TODO confirm against the input files.
    column_names = [
        "source",
        "date",
        "time",
        "city",
        "lat",
        "lon",
        "temperature",
        "wind_speed",
        "pressure",
        "humidity",
    ]
    df.columns = column_names

    # Humidity: remove the percent sign before converting to float.
    humidity_text = df["humidity"].astype(str)
    humidity_text = humidity_text.str.replace("%", "")
    df["humidity"] = humidity_text.str.strip().astype(float)

    # Pressure: remove both apostrophe variants (presumably thousands
    # separators -- verify against the data) before converting to float.
    pressure_text = df["pressure"].astype(str)
    for apostrophe in ("’", "'"):
        pressure_text = pressure_text.str.replace(apostrophe, "")
    df["pressure"] = pressure_text.str.strip().astype(float)

    required = ["temperature", "wind_speed", "pressure", "humidity"]
    df.dropna(subset=required, inplace=True)
    return df


# 3. Feature-Auswahl
def prepare_features(df):
    """Return only the model input columns of ``df``, in a fixed order.

    Args:
        df: DataFrame containing at least the six feature columns.

    Returns:
        A DataFrame with the columns temperature, wind_speed, pressure,
        humidity, lat and lon, in that order.

    Raises:
        KeyError: If any of the feature columns is missing from ``df``.
    """
    weather_columns = ("temperature", "wind_speed", "pressure", "humidity")
    location_columns = ("lat", "lon")
    selected = [*weather_columns, *location_columns]
    return df[selected]


# 4. Modelltraining
def train_model(X, *, n_estimators=100, contamination=0.02, random_state=42):
    """Fit a StandardScaler and an IsolationForest on the feature matrix.

    The scaler is fitted on ``X`` and the forest is trained on the scaled
    values, so new data must be transformed with the returned scaler before
    it is passed to the returned model.

    Args:
        X: Feature matrix accepted by ``StandardScaler.fit_transform``; the
            script's entry point passes the result of ``prepare_features``.
        n_estimators: Number of trees in the isolation forest.
        contamination: Expected proportion of anomalies in ``X``; forwarded
            unchanged to ``IsolationForest``.
        random_state: Seed forwarded to ``IsolationForest`` so repeated
            training runs on the same data are reproducible.

    Returns:
        A ``(model, scaler)`` tuple with the fitted IsolationForest and the
        fitted StandardScaler.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Defaults reproduce the previously hard-coded configuration.
    model = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=random_state,
    )
    model.fit(X_scaled)
    return model, scaler


# 5. Speichern im Ordner 'model/'
def save_model(model, scaler, output_dir=MODEL_DIR):
    """Persist the model and the scaler with joblib and report the location.

    Args:
        model: Object written to ``isolation_forest_model.pkl``.
        scaler: Object written to ``scaler.pkl``.
        output_dir: Target directory; created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Model first, then scaler.
    artifacts = (
        (model, "isolation_forest_model.pkl"),
        (scaler, "scaler.pkl"),
    )
    for artifact, filename in artifacts:
        target = os.path.join(output_dir, filename)
        joblib.dump(artifact, target)

    print("✅ Modell und Scaler gespeichert in:", output_dir)


# 6. Ausführen
# Script entry point: load the three daily Excel sheets, select the feature
# columns, train the scaler and isolation forest, and write both to MODEL_DIR.
if __name__ == "__main__":
    # Input files are looked up inside DATA_DIR by load_data.
    files = ["2022_05_01.xlsx", "2022_05_02.xlsx", "2022_05_03.xlsx"]
    df = load_data(files)
    X = prepare_features(df)
    model, scaler = train_model(X)
    # Uses the default output directory (MODEL_DIR).
    save_model(model, scaler)

✅ Modell und Scaler gespeichert in: model
