In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dữ liệu (giả sử bạn đã có sẵn df)
df = pd.read_csv("/kaggle/input/data-full-features-ai/weather_data_nghean (1).csv")

# Kiểm tra thông tin tổng quát
print(df.info())
print(df.describe())

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388493 entries, 0 to 388492
Data columns (total 38 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   x         388493 non-null  float64
 1   y         388493 non-null  float64
 2   B04B      388493 non-null  float64
 3   B05B      388493 non-null  float64
 4   B06B      388493 non-null  float64
 5   B09B      388493 non-null  float64
 6   B10B      388493 non-null  float64
 7   B11B      388493 non-null  float64
 8   B12B      388493 non-null  float64
 9   B14B      388493 non-null  float64
 10  B16B      388493 non-null  float64
 11  I2B       388493 non-null  float64
 12  I4B       388493 non-null  float64
 13  IRB       388493 non-null  float64
 14  VSB       388493 non-null  float64
 15  WVB       388493 non-null  float64
 16  CAPE      388493 non-null  float64
 17  CIN       388493 non-null  float64
 18  EWSS      388493 non-null  float64
 19  IE        388493 non-null  float64
 20  ISOR

Unnamed: 0,x,y,B04B,B05B,B06B,B09B,B10B,B11B,B12B,B14B,...,SSHF,TCLW,TCW,TCWV,U250,U850,V250,V850,Radar,datetime
0,104.9,19.96,0.498362,0.352224,0.236776,255.42627,260.7911,279.25586,259.7476,281.53525,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00
1,104.94,19.96,0.498362,0.352224,0.236776,255.42627,260.7911,279.25586,259.7476,281.53525,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00
2,104.98,19.96,0.572723,0.384196,0.249166,255.3,260.9037,280.62646,260.546,283.249,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00
3,104.86,19.92,0.532949,0.360718,0.238078,255.81377,260.79684,278.82367,259.354,280.84116,...,-272124.0,0.550171,32.744827,32.179337,26.195923,-4.334152,6.599442,3.6866,0.0,2019-04-01 08:00:00
4,104.9,19.92,0.532949,0.360718,0.238078,255.81377,260.79684,278.82367,259.354,280.84116,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00


In [2]:
HIMA_BANDS = ['B04B', 'B05B', 'B06B', 'B09B', 'B10B', 'B11B', 'B12B', 'B14B', 'B16B', 'I2B', 'I4B', 'IRB', 'VSB', 'WVB']
ERA5_PARAMS = ['CAPE', 'CIN', 'EWSS', 'IE', 'ISOR', 'KX', 'PEV', 'R250', 'R500', 'R850', 'SLHF', 'SLOR', 'SSHF', 'TCLW', 'TCW', 'TCWV', 'U250', 'U850', 'V250', 'V850']
SELECTED_HIMA_BANDS = [ 'B05B', 'B06B',  'B10B', 'B11B', 'B12B',  'I4B', 'IRB']
SELECTED_ERA5_PARAMS = ['CAPE', 'CIN', 'EWSS', 'IE', 'ISOR', 'KX', 'PEV', 'R250', 'R500', 'R850', 'SLHF', 'SLOR', 'SSHF', 'TCLW', 'TCW', 'TCWV', 'U250', 'U850', 'V250', 'V850']
SELECTED_FEATURES = SELECTED_HIMA_BANDS + SELECTED_ERA5_PARAMS
HEIGHT, WIDTH = 90, 250

In [3]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from cuml.ensemble import RandomForestRegressor

class BaseModel(ABC):
    def __init__(self, selected_features):
        self.selected_features = selected_features
        self.feature_names = None

    def _filter_features(self, X):
        if isinstance(X, pd.DataFrame):
            missing = set(self.selected_features) - set(X.columns)
            if missing:
                raise ValueError(f"Missing features: {missing}")
            return X[self.selected_features]
        elif isinstance(X, np.ndarray):
            if self.feature_names is None:
                raise ValueError("Feature names chưa được định nghĩa cho numpy array")
            X_df = pd.DataFrame(X, columns=self.feature_names)
            return X_df[self.selected_features].values
        else:
            raise TypeError("Đầu vào phải là DataFrame hoặc numpy array")

    @abstractmethod
    def fit(self, X, y): pass

    @abstractmethod
    def predict(self, X): pass

class XGBModel(BaseModel):
    def __init__(self, selected_features, params=None, n_splits=5, early_stopping_rounds=20):
        super().__init__(selected_features)
        default_params = {
            "objective": "reg:squarederror",
            "tree_method": "hist",
            "device" : "cuda",
            "n_estimators": 1000,
            "learning_rate": 0.05,
            "max_depth": 8,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": 42
        }
        self.params = {**default_params, **(params or {})}
        self.n_splits = n_splits
        self.early_stopping_rounds = early_stopping_rounds
        self.model = None
        self.eval_metric = "rmse"

    def fit(self, X, y, eval_set=None, eval_metric=None, verbose=False):
        # Store feature names
        if isinstance(X, pd.DataFrame):
            self.feature_names = X.columns.tolist()
        X_filtered = self._filter_features(X)

        # Determine train/val for early stopping
        fit_X, fit_y = X_filtered, y
        ev_set = None
        if eval_set is None:
            kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
            train_idx, val_idx = next(kf.split(X_filtered))
            fit_X = X_filtered.iloc[train_idx]
            fit_y = y.iloc[train_idx] if isinstance(y, pd.Series) else y[train_idx]
            X_val = X_filtered.iloc[val_idx]
            y_val = y.iloc[val_idx] if isinstance(y, pd.Series) else y[val_idx]
            ev_set = [(X_val, y_val)]
        else:
            X_val, y_val = eval_set
            X_val_filtered = self._filter_features(X_val)
            ev_set = [(X_val_filtered, y_val)]

        # Initialize and train
        self.model = xgb.XGBRegressor(**self.params)
        fit_kwargs = {
            "eval_metric": eval_metric or self.eval_metric,
            "verbose": verbose
        }
        if ev_set is not None:
            fit_kwargs.update({
                "eval_set": ev_set,
                "early_stopping_rounds": self.early_stopping_rounds
            })
        self.model.fit(fit_X, fit_y, **fit_kwargs)

    def predict(self, X):
        X_filtered = self._filter_features(X)
        return self.model.predict(X_filtered)

class DNNModel(BaseModel):
    def __init__(self, selected_features, layers=(64, 32)):
        super().__init__(selected_features)
        self.input_dim = len(selected_features)
        self.layers = layers
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(self.layers[0], activation="relu", input_dim=self.input_dim))
        for units in self.layers[1:]:
            model.add(Dense(units, activation="relu"))
        model.add(Dense(1))
        return model

    def fit(self, X, y, epochs=50, batch_size=32):
        if isinstance(X, pd.DataFrame):
            self.feature_names = X.columns.tolist()
        X_filtered = self._filter_features(X)
        self.model.compile(optimizer="adam", loss="mse")
        self.model.fit(X_filtered, y, epochs=epochs, batch_size=batch_size, verbose=0)

    def predict(self, X):
        X_filtered = self._filter_features(X)
        return self.model.predict(X_filtered).flatten()

class ERTModel(BaseModel):
    def __init__(self, selected_features, params=None):
        super().__init__(selected_features)
        default_params = {"n_estimators": 100, "max_features": 1.0, "n_streams": 1}
        self.params = {**default_params, **(params or {})}
        self.model = RandomForestRegressor(**self.params)

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            self.feature_names = X.columns.tolist()
        X_filtered = self._filter_features(X)
        self.model.fit(X_filtered, y)

    def predict(self, X):
        X_filtered = self._filter_features(X)
        return self.model.predict(X_filtered)

class StackingModel:
    def __init__(self, level1_models, level2_model, n_folds=5):
        self.n_folds = n_folds
        self.level1_models = level1_models
        self.level2_model = level2_model

    def _generate_meta_features(self, X, y):
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        meta = np.zeros((X.shape[0], len(self.level1_models)))
        for i, model in enumerate(self.level1_models):
            for train_idx, val_idx in kf.split(X):
                Xi_tr, Xi_val = X[train_idx], X[val_idx]
                yi_tr = y[train_idx]
                df_tr = pd.DataFrame(Xi_tr, columns=model.feature_names)
                df_val = pd.DataFrame(Xi_val, columns=model.feature_names)
                model.fit(df_tr, yi_tr)
                meta[val_idx, i] = model.predict(df_val)
        return meta

    def fit(self, X, y):
        meta = self._generate_meta_features(X.values, y.values)
        self.level2_model.fit(meta, y.values)

    def predict(self, X):
        preds = []
        for m in self.level1_models:
            dfX = pd.DataFrame(X, columns=m.feature_names)
            preds.append(m.predict(dfX))
        meta_test = np.column_stack(preds)
        return self.level2_model.predict(meta_test)

# ========== Main pipeline ==========
df = pd.read_csv("/kaggle/input/data-full-features-ai/weather_data_nghean (1).csv")
df.fillna(df.mean(numeric_only=True), inplace=True)

TARGET_COL = "Radar"
HIMA_SELECTED = ['B05B','B06B','B11B','B12B','I2B','I4B','IRB']
ERA5_SELECTED = ['CAPE','CIN','EWSS','IE','ISOR','KX','PEV','R250','R500','R850','SLHF','SLOR','SSHF','TCLW','TCW','TCWV','U250','U850','V250','V850']
FOR_XG = HIMA_SELECTED + ERA5_SELECTED
ALL_FEATURES = FOR_XG

X = df[ALL_FEATURES]
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBModel(selected_features=FOR_XG, params={"n_estimators":200,"max_depth":6}, n_splits=5, early_stopping_rounds=20)
dnn_model = DNNModel(selected_features=ALL_FEATURES, layers=(128,64,32))
ert_model = ERTModel(selected_features=ALL_FEATURES, params={"n_estimators":100})

# Train
xgb_model.fit(X_train, y_train)
dnn_model.fit(X_train, y_train)
ert_model.fit(X_train, y_train)

# Evaluate level-1 models
def evaluate_model(m, X_t, y_t):
    dfX = X_t if isinstance(X_t, pd.DataFrame) else pd.DataFrame(X_t, columns=m.feature_names)
    preds = m.predict(dfX)
    return mean_absolute_error(y_t, preds), mean_squared_error(y_t, preds), np.sqrt(mean_squared_error(y_t, preds)), r2_score(y_t, preds)

metrics = {}
for name, model in [('XGB', xgb_model), ('DNN', dnn_model), ('ERT', ert_model)]:
    metrics[name] = evaluate_model(model, X_test, y_test)
print("Level-1 metrics:", metrics)

# Stacking
meta_model = ElasticNet(alpha=0.01, l1_ratio=0.7)
stacker = StackingModel([xgb_model, dnn_model, ert_model], meta_model, n_folds=5)
stacker.fit(X_train, y_train)
preds_stack = stacker.predict(X_test)
mae_s, mse_s, rmse_s, r2_s = mean_absolute_error(y_test, preds_stack), mean_squared_error(y_test, preds_stack), np.sqrt(mean_squared_error(y_test, preds_stack)), r2_score(y_test, preds_stack)
print("Stacking metrics: MAE=%.4f, RMSE=%.4f, R2=%.4f" % (mae_s, rmse_s, r2_s))


2025-05-05 02:27:42.269553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746412062.451512      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746412062.503654      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1746412083.928707      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1746412083.929372      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> 

[1m2429/2429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Level-1 metrics: {'XGB': (0.1657905744887403, 0.580675357452941, 0.762020575478734, 0.6678814111779581), 'DNN': (0.3667747476753957, 1.7485716650856988, 1.3223356854769135, -9.953308468735322e-05), 'ERT': (0.12144270672808426, 0.46118816314524935, 0.6791083589128095, 0.7362223831968917)}




[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m2429/2429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Stacking metrics: MAE=0.1300, RMSE=0.7099, R2=0.7118
