In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef, classification_report
from xgboost import XGBClassifier

In [49]:
# ----------------------------
# Utility Functions
# ----------------------------

def calculate_performance_metrics(y_test, y_pred):
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_test, y_pred)
    metrics['precision'] = precision_score(y_test, y_pred, average='weighted')
    metrics['recall'] = recall_score(y_test, y_pred, average='weighted')
    metrics['f1_score'] = f1_score(y_test, y_pred, average='weighted')
    metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
    metrics['mcc'] = matthews_corrcoef(y_test, y_pred)
    metrics['classification_report'] = classification_report(y_test, y_pred)
    return metrics

def print_performance_metrics(metrics):
    print("Accuracy:", metrics.get('accuracy', "Not computed"))
    print("Precision:", metrics.get('precision', "Not computed"))
    print("Recall:", metrics.get('recall', "Not computed"))
    print("F1 Score:", metrics.get('f1_score', "Not computed"))
    print("Confusion Matrix:\n", metrics.get('confusion_matrix', "Not computed"))
    print("Matthews Correlation Coefficient (MCC):", metrics.get('mcc', "Not computed"))
    print("Classification Report:\n", metrics.get('classification_report', "Not computed"))

def feature_importance(model, X, feature_names):
    feature_importances = model.feature_importances_
    feature_importances_list = [(feature_names[j], importance) for j, importance in enumerate(feature_importances)]
    feature_importances_list.sort(key=lambda x: x[1], reverse=True)
    print("Feature Importances:")
    for feature, importance in feature_importances_list:
        print(f"{feature}: {importance}")


In [50]:
# ----------------------------
# Train XGBoost Model
# ----------------------------
def train_model(X_train, y_train, X_test, y_test, feature_names):
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='mlogloss',
        n_jobs=-1,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics_xgb = calculate_performance_metrics(y_test, y_pred)
    print_performance_metrics(metrics_xgb)
    feature_importance(model, X_train, feature_names)

    return model, y_pred

In [51]:
# ----------------------------
# Prepare Data and Run Model
# ----------------------------
def create_model():
    df = pd.read_csv("./data/final.csv")

    df = df.drop(columns=["time"])

    # ----------------------------
    # Filter out rare classes (<2 samples) [Requirement of XGBOOST]
    # ----------------------------
    target_name = "weather_code (wmo code)"
    counts = df[target_name].value_counts()
    valid_classes = counts[counts >= 2].index
    df = df[df[target_name].isin(valid_classes)]

    # ----------------------------
    # Encode target to consecutive integers
    # ----------------------------
    le = LabelEncoder()
    y_encoded = le.fit_transform(df[target_name].values)

    # Features
    X = df.drop(columns=[target_name]).values
    feature_names = df.drop(columns=[target_name]).columns.tolist()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    print("Data ready. Training XGBoost model...")
    model, y_pred = train_model(X_train, y_train, X_test, y_test, feature_names)


    y_pred_original = le.inverse_transform(y_pred)
    print("\nSample predictions (original codes):", y_pred_original[:10])

    return model, le

# ----------------------------
# Run
# ----------------------------
def run_model_training():
    create_model()

run_model_training()


Data ready. Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8746177370030581
Precision: 0.8509975632500982
Recall: 0.8746177370030581
F1 Score: 0.8587109505848174
Confusion Matrix:
 [[ 30   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  3   4   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  12   4   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   2 172   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   3   2   2   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   1   1   1   1   0   0   0   0   0   0   0   1   0   0   0]
 [  0   0   0   0   0   0  17   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   2   2   0   0   0   0   0   0   2   0   0]
 [  0   0   0   0   0   0   0   1   0   1   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0  16   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   5   0   0   0   2   0]
 [  0   0   0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [52]:
# output is encoded, so while predicting will have to decode first and then show the weather codes