In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
# -----------------------------
# 1) SETTINGS
# -----------------------------
# Input CSV, expected in the same folder as this notebook.
DATA_PATH = "weather.csv"

# Folder that receives the trained model files. It is created on the first
# run and left as-is when it already exists, so the cell can be re-run.
OUT_DIR = Path("weather_model_artifacts")
OUT_DIR.mkdir(exist_ok=True)

In [4]:
# -----------------------------
# 2) LOAD DATA
# -----------------------------
# Read from DATA_PATH (section 1) so the input file is configured in exactly
# one place instead of being hard-coded a second time here.
df = pd.read_csv(DATA_PATH)

# Fail fast if a column that every later section relies on is absent.
required_cols = ["city", "month", "weather_condition"]
for c in required_cols:
    if c not in df.columns:
        raise ValueError(f"Missing required column: {c}")

In [5]:
# -----------------------------
# 3) AUTO-SELECT TARGET COLUMNS (temperature, windspeed)
# -----------------------------
def _first_present(candidates, columns):
    """Return the first name in `candidates` found in `columns`, else None."""
    for name in candidates:
        if name in columns:
            return name
    return None


# Candidates are listed in priority order; the first one present wins.
temp_candidates = ["temperature_2m_mean", "temperature_2m_max", "temperature_2m_min", "temperature"]
temp_col = _first_present(temp_candidates, df.columns)
if temp_col is None:
    raise ValueError(f"No temperature column found. Expected one of: {temp_candidates}")

wind_candidates = ["windspeed_10m_max", "windspeed_10m_mean", "windspeed", "wind_speed"]
wind_col = _first_present(wind_candidates, df.columns)
if wind_col is None:
    raise ValueError(f"No windspeed column found. Expected one of: {wind_candidates}")

# Report which columns were picked so the choice is visible in the output.
print("✅ Using temperature column:", temp_col)
print("✅ Using windspeed column:", wind_col)

✅ Using temperature column: temperature_2m_mean
✅ Using windspeed column: windspeed_10m_max


In [6]:
# -----------------------------
# 4) CLEAN DATA
# -----------------------------
text_cols = ["city", "month", "weather_condition"]

# Drop incomplete rows BEFORE casting to str: astype(str) can turn a missing
# value into the literal string "nan", which dropna() no longer recognises as
# missing, so such rows would otherwise survive as a bogus "nan" category.
df = df.dropna(subset=text_cols + [temp_col, wind_col]).copy()

# Normalise the text columns: force str and trim surrounding whitespace so
# that e.g. "Colombo " and "Colombo" are treated as the same category.
for col in text_cols:
    df[col] = df[col].astype(str).str.strip()


In [7]:
# -----------------------------
# 5) FEATURES + TARGETS
# -----------------------------
# Inputs: the two categorical predictors used by all three models.
X = df.loc[:, ["city", "month"]]

# Targets: one label column for the classifier and two numeric columns
# (cast to float) for the regressors.
y_condition = df["weather_condition"]
y_temp = df[temp_col].astype("float64")
y_wind = df[wind_col].astype("float64")

In [9]:
# -----------------------------
# 6) PREPROCESSING: OneHotEncode city + month
# -----------------------------
# Both features are categorical, so they are one-hot encoded; any other
# column is dropped. handle_unknown="ignore" keeps prediction from raising
# on a category that was not seen during fit.
preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), ["city", "month"])],
    remainder="drop",
)

# Map weather_condition strings to integer class ids; the fitted encoder is
# kept so predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_condition)
y_condition_enc = label_encoder.transform(y_condition)

In [10]:
# -----------------------------
# 7) SPLIT DATA
# -----------------------------
# Stratify on the condition label only when more than one class exists;
# otherwise fall back to a plain random split.
n_condition_classes = len(np.unique(y_condition_enc))
stratify_labels = y_condition_enc if n_condition_classes > 1 else None

# A single call splits the features and all three targets with the same row
# assignment, so every *_train / *_test variable refers to the same rows.
split_parts = train_test_split(
    X, y_condition_enc, y_temp, y_wind,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)
(X_train, X_test,
 yc_train, yc_test,
 yt_train, yt_test,
 yw_train, yw_test) = split_parts

In [15]:
# -----------------------------
# 8) MODEL 1: WEATHER CONDITION (Classifier)
# -----------------------------
# All imports live in the notebook's first cell; nothing is re-imported here.

# Fresh preprocessor for this pipeline (one-hot encode city + month).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["city", "month"])
    ]
)

X = df[["city", "month"]]
y = label_encoder.fit_transform(df["weather_condition"])

# Stratified split so each condition keeps its share in train and test.
X_train, X_test, yc_train, yc_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# FAST model
# Parallelism is handled by RandomizedSearchCV below (n_jobs=-1), so the
# estimator keeps its default n_jobs to avoid nesting worker pools.
clf_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=2000))
])

# LIGHT tuning (small search space)
clf_params = {
    "model__C": [0.1, 0.5, 1, 2, 5],
    "model__solver": ["lbfgs", "saga"]
}

clf_search = RandomizedSearchCV(
    clf_pipe,
    clf_params,
    n_iter=6,      # samples 6 of the 10 possible (C, solver) combinations
    cv=3,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

clf_search.fit(X_train, yc_train)
best_clf = clf_search.best_estimator_

yc_pred = best_clf.predict(X_test)

print("\n===== WEATHER CONDITION MODEL (FAST) =====")
print("Best Params:", clf_search.best_params_)
print("Accuracy:", accuracy_score(yc_test, yc_pred))
# labels= pins one report row per encoder class, so target_names stays aligned
# even if a rare class is absent from the test fold. zero_division=0 reports
# 0.0 for classes that are never predicted without emitting a warning per metric.
print(classification_report(
    yc_test,
    yc_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Fitting 3 folds for each of 6 candidates, totalling 18 fits

===== WEATHER CONDITION MODEL (FAST) =====
Best Params: {'model__solver': 'lbfgs', 'model__C': 0.5}
Accuracy: 0.27939381611065905
                  precision    recall  f1-score   support

       Clear sky       0.00      0.00      0.00       205
   Dense drizzle       0.00      0.00      0.00      1559
      Heavy rain       0.00      0.00      0.00       585
   Light drizzle       0.28      0.70      0.40      7512
    Mainly clear       0.13      0.05      0.07      1888
Moderate drizzle       0.32      0.06      0.10      4462
   Moderate rain       0.30      0.46      0.36      5495
        Overcast       0.00      0.00      0.00      1175
   Partly cloudy       0.17      0.03      0.06      2077
     Slight rain       0.00      0.00      0.00      4538

        accuracy                           0.28     29496
       macro avg       0.12      0.13      0.10     29496
    weighted avg       0.19      0.28      0.19     29496

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
# -----------------------------
# 9) MODEL 2: TEMPERATURE (Regressor)
# -----------------------------
# All imports live in the notebook's first cell; nothing is re-imported here.

# -----------------------------
# Preprocessor (same as before)
# -----------------------------
# A fresh instance per pipeline: Pipeline.fit fits its steps in place, so
# each model gets its own encoder object.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["city", "month"])
    ],
    remainder="drop"
)

# -----------------------------
# Train/Test Split (temperature)
# -----------------------------
X = df[["city", "month"]]
y_temp = df[temp_col].astype(float)   # temp_col = your selected temp column

# This split is NOT stratified, so its rows differ from the classifier's
# split. Dedicated Xt_* names keep X_train / X_test paired with
# yc_train / yc_test instead of silently overwriting them.
Xt_train, Xt_test, yt_train, yt_test = train_test_split(
    X, y_temp, test_size=0.2, random_state=42
)

# -----------------------------
# FAST RandomForest Regressor (Option B)
# -----------------------------
best_temp = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=200,       # ✅ smaller than 600 (faster)
        max_depth=25,           # ✅ restrict depth (faster)
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ))
])

# Train
best_temp.fit(Xt_train, yt_train)

# Predict + Evaluate
yt_pred = best_temp.predict(Xt_test)

print("\n===== TEMPERATURE MODEL (FAST RF) =====")
print("MAE:", mean_absolute_error(yt_test, yt_pred))
print("R2:", r2_score(yt_test, yt_pred))


===== TEMPERATURE MODEL (FAST RF) =====
MAE: 0.5897595080625133
R2: 0.8324689035117326


In [None]:
# -----------------------------
# 10) MODEL 3: WINDSPEED (Regressor)
# -----------------------------
# All imports live in the notebook's first cell; nothing is re-imported here.

# -----------------------------
# Preprocessor (same as before)
# -----------------------------
# A fresh instance per pipeline: Pipeline.fit fits its steps in place, so
# each model gets its own encoder object.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["city", "month"])
    ],
    remainder="drop"
)

# -----------------------------
# Train/Test Split (windspeed)
# -----------------------------
X = df[["city", "month"]]
y_wind = df[wind_col].astype(float)   # wind_col = your selected wind column

# This split is NOT stratified, so its rows differ from the classifier's
# split. Dedicated Xw_* names keep X_train / X_test paired with
# yc_train / yc_test instead of silently overwriting them.
Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X, y_wind,
    test_size=0.2,
    random_state=42
)

# -----------------------------
# FAST RandomForest Regressor (Option B)
# -----------------------------
best_wind = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=200,       # ✅ fewer trees (faster)
        max_depth=25,           # ✅ limit depth
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ))
])

# Train model
best_wind.fit(Xw_train, yw_train)

# Predict
yw_pred = best_wind.predict(Xw_test)

# Evaluate
print("\n===== WINDSPEED MODEL (FAST RF) =====")
print("MAE:", mean_absolute_error(yw_test, yw_pred))
print("R2:", r2_score(yw_test, yw_pred))


===== WINDSPEED MODEL (FAST RF) =====
MAE: 2.9793548140903976
R2: 0.6074974925578274


In [18]:
# -----------------------------
# 11) SAVE ARTIFACTS
# -----------------------------
# File name -> fitted object. The label encoder is saved with the models so
# that class ids predicted by the condition model can be decoded later.
artifacts = {
    "condition_model.joblib": best_clf,
    "temp_model.joblib": best_temp,
    "wind_model.joblib": best_wind,
    "label_encoder.joblib": label_encoder,
}

for filename, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, OUT_DIR / filename)

# Summarise what was written and where.
print("\n✅ Models saved to folder:", OUT_DIR)
for filename in artifacts:
    print(f" - {filename}")


✅ Models saved to folder: weather_model_artifacts
 - condition_model.joblib
 - temp_model.joblib
 - wind_model.joblib
 - label_encoder.joblib


In [19]:
# -----------------------------
# 12) QUICK TEST PREDICTION FUNCTION
# -----------------------------
def predict_all(city: str, month: str):
    """Predict weather condition, temperature and windspeed for one city/month.

    Uses the notebook-level fitted objects `best_clf`, `best_temp`,
    `best_wind` and `label_encoder`.

    Args:
        city: City name. Cast to str and stripped of surrounding whitespace.
        month: Month value. Cast to str and stripped of surrounding whitespace.

    Returns:
        A tuple `(condition_label, temperature, windspeed)`: the decoded
        condition label followed by the two regressors' predictions as floats.
    """
    # Apply the same normalisation as the training data (section 4: str +
    # strip). Otherwise an int month or a padded city name would not match any
    # fitted category and the one-hot encoder would silently ignore it.
    X_in = pd.DataFrame([{
        "city": str(city).strip(),
        "month": str(month).strip(),
    }])

    # The classifier predicts an encoded class id; decode it to its label.
    cond_idx = best_clf.predict(X_in)[0]
    cond = label_encoder.inverse_transform([cond_idx])[0]

    temp = best_temp.predict(X_in)[0]
    wind = best_wind.predict(X_in)[0]

    return cond, float(temp), float(wind)

# Smoke test: predict for the city/month of the first row in the data.
print("\nExample prediction:")
sample_city = df["city"].iloc[0]
sample_month = df["month"].iloc[0]
print(predict_all(sample_city, sample_month))


Example prediction:
('Light drizzle', 26.335359422296882, 12.568451273922438)
