In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the weather dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/data-full-features-ai/weather_data_nghean (1).csv")

# General overview of the data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()
print(df.describe())

# Last expression of the cell: rendered as the cell's output.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388493 entries, 0 to 388492
Data columns (total 38 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   x         388493 non-null  float64
 1   y         388493 non-null  float64
 2   B04B      388493 non-null  float64
 3   B05B      388493 non-null  float64
 4   B06B      388493 non-null  float64
 5   B09B      388493 non-null  float64
 6   B10B      388493 non-null  float64
 7   B11B      388493 non-null  float64
 8   B12B      388493 non-null  float64
 9   B14B      388493 non-null  float64
 10  B16B      388493 non-null  float64
 11  I2B       388493 non-null  float64
 12  I4B       388493 non-null  float64
 13  IRB       388493 non-null  float64
 14  VSB       388493 non-null  float64
 15  WVB       388493 non-null  float64
 16  CAPE      388493 non-null  float64
 17  CIN       388493 non-null  float64
 18  EWSS      388493 non-null  float64
 19  IE        388493 non-null  float64
 20  ISOR

Unnamed: 0,x,y,B04B,B05B,B06B,B09B,B10B,B11B,B12B,B14B,...,SSHF,TCLW,TCW,TCWV,U250,U850,V250,V850,Radar,datetime
0,104.9,19.96,0.498362,0.352224,0.236776,255.42627,260.7911,279.25586,259.7476,281.53525,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00
1,104.94,19.96,0.498362,0.352224,0.236776,255.42627,260.7911,279.25586,259.7476,281.53525,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00
2,104.98,19.96,0.572723,0.384196,0.249166,255.3,260.9037,280.62646,260.546,283.249,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00
3,104.86,19.92,0.532949,0.360718,0.238078,255.81377,260.79684,278.82367,259.354,280.84116,...,-272124.0,0.550171,32.744827,32.179337,26.195923,-4.334152,6.599442,3.6866,0.0,2019-04-01 08:00:00
4,104.9,19.92,0.532949,0.360718,0.238078,255.81377,260.79684,278.82367,259.354,280.84116,...,-137404.0,0.601746,35.61592,35.00551,25.895142,-4.906418,6.482254,5.172928,0.0,2019-04-01 08:00:00


In [2]:
# Column names grouped under the HIMA_ prefix (all band columns of the dataset).
HIMA_BANDS = [
    'B04B', 'B05B', 'B06B', 'B09B', 'B10B', 'B11B', 'B12B',
    'B14B', 'B16B', 'I2B', 'I4B', 'IRB', 'VSB', 'WVB',
]

# Column names grouped under the ERA5_ prefix.
ERA5_PARAMS = [
    'CAPE', 'CIN', 'EWSS', 'IE', 'ISOR', 'KX', 'PEV',
    'R250', 'R500', 'R850', 'SLHF', 'SLOR', 'SSHF',
    'TCLW', 'TCW', 'TCWV', 'U250', 'U850', 'V250', 'V850',
]

# Subset of the band columns retained as model inputs.
SELECTED_HIMA_BANDS = ['B05B', 'B06B', 'B10B', 'B11B', 'B12B', 'I4B', 'IRB']

# Every ERA5 column is retained; kept as an independent copy of the list.
SELECTED_ERA5_PARAMS = list(ERA5_PARAMS)

# Final feature order: selected bands first, then the ERA5 columns.
SELECTED_FEATURES = [*SELECTED_HIMA_BANDS, *SELECTED_ERA5_PARAMS]

# NOTE(review): presumably the spatial grid size (rows x columns); not used in
# the visible cells — confirm against the code that consumes them.
HEIGHT = 90
WIDTH = 250

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import xgboost as xgb
import lightgbm as lgb
from cuml.ensemble import RandomForestRegressor

#========== Các hàm tiện ích chung ==========
def filter_features(X, selected_features, feature_names=None):
    """Return only the ``selected_features`` columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature table. A DataFrame is selected by column label; an array is
        labelled with ``feature_names`` first.
    selected_features : list of str
        Names of the columns to keep, in the order they should be returned.
    feature_names : list of str, optional
        Column names of ``X`` when ``X`` is a numpy array. Ignored for
        DataFrames.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        Same container kind as the input, restricted to ``selected_features``.

    Raises
    ------
    ValueError
        If ``X`` is an array and ``feature_names`` is None, or if any selected
        feature is not available (for both DataFrames and arrays).
    TypeError
        If ``X`` is neither a DataFrame nor a numpy array.
    """
    if isinstance(X, pd.DataFrame):
        available = X.columns
    elif isinstance(X, np.ndarray):
        if feature_names is None:
            raise ValueError("Feature names chưa được định nghĩa cho numpy array")
        available = feature_names
    else:
        raise TypeError("Đầu vào phải là DataFrame hoặc numpy array")

    # Validate once for both input kinds so a missing column always surfaces
    # as the same ValueError instead of a KeyError on the array path.
    missing = set(selected_features) - set(available)
    if missing:
        raise ValueError(f"Missing features: {missing}")

    if isinstance(X, pd.DataFrame):
        return X[selected_features]
    # Label the array columns temporarily to select by name, then drop labels.
    return pd.DataFrame(X, columns=feature_names)[selected_features].values

#========== Hàm cho XGBoost ==========
def train_xgb(X, y, selected_features, params=None, sample_weight=None, n_splits=5):
    """Train an ``xgboost.XGBRegressor`` with early stopping on a held-out fold.

    Only the first fold of a shuffled K-fold split (``random_state=42``) is
    used: it yields one train/validation partition, and the validation part
    drives early stopping.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. A bare numpy array is rejected by ``filter_features``
        because no column names are available for it here.
    y : pandas.Series or array-like
        Target values aligned positionally with the rows of ``X``.
    selected_features : list of str
        Columns of ``X`` used for training.
    params : dict, optional
        Keyword arguments for ``XGBRegressor``; they override the defaults
        (``n_estimators=500``, ``early_stopping_rounds=10``).
    sample_weight : array-like, optional
        Per-row weights aligned positionally with ``X``.
    n_splits : int, default 5
        Number of K-fold splits; the validation fold holds ``1 / n_splits`` of
        the rows.

    Returns
    -------
    dict
        ``{'model', 'feature_names', 'selected_features'}`` as expected by
        ``predict_xgb``.
    """
    # Early stopping is configured on the estimator rather than passed to
    # fit(): the fit() keyword is deprecated and rejected by newer XGBoost
    # releases. Requires an XGBoost version whose constructor accepts it.
    default_params = {
        'n_estimators': 500,
        'early_stopping_rounds': 10,
    }
    final_params = {**default_params, **(params or {})}

    feature_names = X.columns.tolist() if isinstance(X, pd.DataFrame) else None
    X_filtered = filter_features(X, selected_features, feature_names)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    train_idx, val_idx = next(kf.split(X_filtered))

    # Split the weights with the same positional indices as the rows.
    # np.asarray guards against label-based lookup when a pandas Series with a
    # non-default index is supplied.
    if sample_weight is not None:
        weights = np.asarray(sample_weight)
        train_weight = weights[train_idx]
        eval_weights = [weights[val_idx]]
    else:
        train_weight = eval_weights = None

    X_train = X_filtered.iloc[train_idx] if isinstance(X, pd.DataFrame) else X_filtered[train_idx]
    y_train = y.iloc[train_idx] if isinstance(y, pd.Series) else y[train_idx]
    X_val = X_filtered.iloc[val_idx] if isinstance(X, pd.DataFrame) else X_filtered[val_idx]
    y_val = y.iloc[val_idx] if isinstance(y, pd.Series) else y[val_idx]

    model = xgb.XGBRegressor(**final_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        sample_weight=train_weight,
        # Weight the validation metric like the training loss, so early
        # stopping optimises the same weighted objective.
        sample_weight_eval_set=eval_weights,
        verbose=False
    )

    return {
        'model': model,
        'feature_names': feature_names,
        'selected_features': selected_features
    }


def predict_xgb(model_dict, X):
    """Predict with a model bundle returned by ``train_xgb``.

    ``X`` is first reduced to the bundle's ``selected_features`` (using the
    bundle's ``feature_names`` if ``X`` is a numpy array), then passed to the
    stored model's ``predict``.
    """
    columns = model_dict['selected_features']
    known_names = model_dict['feature_names']
    features = filter_features(X, columns, known_names)
    return model_dict['model'].predict(features)

#========== Hàm cho LightGBM ==========
def train_lgb(X, y, selected_features, params=None, n_splits=5):
    """Train a LightGBM booster with early stopping on a held-out fold.

    Only the first fold of a shuffled K-fold split (``random_state=42``) is
    used: it yields one train/validation partition, and the validation part
    drives early stopping (10 rounds without improvement).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. A bare numpy array is rejected by ``filter_features``
        because no column names are available for it here.
    y : pandas.Series or array-like
        Target values aligned positionally with the rows of ``X``.
    selected_features : list of str
        Columns of ``X`` used for training.
    params : dict, optional
        LightGBM training parameters. The number of boosting rounds may be
        given under any LightGBM alias (e.g. ``n_estimators``,
        ``num_iterations``); it defaults to 500.
    n_splits : int, default 5
        Number of K-fold splits; the validation fold holds ``1 / n_splits`` of
        the rows.

    Returns
    -------
    dict
        ``{'model', 'feature_names', 'selected_features'}`` as expected by
        ``predict_lgb``.
    """
    # LightGBM lets an iteration-count alias found in `params` take precedence
    # over the `num_boost_round` argument (with a warning). Resolve the count
    # here once so there is a single, explicit source of truth.
    round_aliases = (
        'num_iterations', 'num_iteration', 'n_iter', 'num_tree', 'num_trees',
        'num_round', 'num_rounds', 'nrounds', 'num_boost_round',
        'n_estimators', 'max_iter',
    )
    final_params = dict(params or {})
    supplied_rounds = [
        final_params.pop(alias) for alias in round_aliases if alias in final_params
    ]
    num_boost_round = supplied_rounds[0] if supplied_rounds else 500

    feature_names = X.columns.tolist() if isinstance(X, pd.DataFrame) else None
    X_filtered = filter_features(X, selected_features, feature_names)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    train_idx, val_idx = next(kf.split(X_filtered))

    X_train = X_filtered.iloc[train_idx] if isinstance(X, pd.DataFrame) else X_filtered[train_idx]
    y_train = y.iloc[train_idx] if isinstance(y, pd.Series) else y[train_idx]
    X_val = X_filtered.iloc[val_idx] if isinstance(X, pd.DataFrame) else X_filtered[val_idx]
    y_val = y.iloc[val_idx] if isinstance(y, pd.Series) else y[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    # `reference` makes the validation set reuse the training set's binning.
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        final_params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=num_boost_round,
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
    )

    return {
        'model': model,
        'feature_names': feature_names,
        'selected_features': selected_features
    }

def predict_lgb(model_dict, X):
    """Predict with a model bundle returned by ``train_lgb``.

    Restricts ``X`` to the bundle's ``selected_features`` and forwards the
    result to the stored booster's ``predict``.
    """
    subset = filter_features(
        X, model_dict['selected_features'], model_dict['feature_names']
    )
    booster = model_dict['model']
    return booster.predict(subset)

#========== Random Forest helpers (named *_ert, but backed by cuML RandomForestRegressor) ==========
def train_ert(X, y, selected_features, params=None):
    """Fit the ``RandomForestRegressor`` imported from ``cuml.ensemble``.

    Despite the ``ert`` name, the estimator built here is a random forest.
    The model is fitted on all rows of ``X`` (no validation split).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. A bare numpy array is rejected by ``filter_features``
        because no column names are available for it here.
    y : array-like
        Target values aligned with the rows of ``X``.
    selected_features : list of str
        Columns of ``X`` used for training.
    params : dict, optional
        Estimator keyword arguments; they override the default
        ``n_estimators=500``.

    Returns
    -------
    dict
        ``{'model', 'feature_names', 'selected_features'}`` as expected by
        ``predict_ert``.
    """
    overrides = params or {}
    final_params = {'n_estimators': 500, **overrides}

    feature_names = None
    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
    training_table = filter_features(X, selected_features, feature_names)

    forest = RandomForestRegressor(**final_params)
    forest.fit(training_table, y)

    bundle = {}
    bundle['model'] = forest
    bundle['feature_names'] = feature_names
    bundle['selected_features'] = selected_features
    return bundle

def predict_ert(model_dict, X):
    """Predict with a model bundle returned by ``train_ert``.

    Selects the bundle's ``selected_features`` from ``X`` and returns the
    stored model's predictions for them.
    """
    wanted, all_names = model_dict['selected_features'], model_dict['feature_names']
    inputs = filter_features(X, wanted, all_names)
    return model_dict['model'].predict(inputs)

#========== Hàm cho Stacking ==========
def generate_meta_features(X, y, base_models, n_folds=5):
    """Build out-of-fold predictions of every base model (stacking level 1).

    For each base model and each of the ``n_folds`` shuffled K-fold splits
    (``random_state=42``), a fresh model is trained on the training part and
    its predictions for the held-out part are written into that model's
    column of the result.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_columns)
        Feature matrix; columns must be ordered like each model's
        ``feature_names``.
    y : array-like of shape (n_samples,)
        Target values.
    base_models : list of dict
        One dict per base model with keys ``'type'`` (containing ``'xgb'``,
        ``'lgb'`` or ``'ert'``), ``'feature_names'``, ``'selected_features'``
        and optionally ``'params'``.
    n_folds : int, default 5
        Number of K-fold splits.

    Returns
    -------
    numpy.ndarray of shape (n_samples, len(base_models))
        Out-of-fold predictions, one column per base model.

    Raises
    ------
    ValueError
        If a base model's ``'type'`` matches none of the supported kinds.
    """
    supported_kinds = ('xgb', 'lgb', 'ert')

    # Fail fast: resolve every model kind before any (expensive) training.
    # Without this, an unknown type would silently reuse the predictions of
    # the previously processed model (or crash on an unbound variable).
    kinds = []
    for model_dict in base_models:
        kind = next((k for k in supported_kinds if k in model_dict['type']), None)
        if kind is None:
            raise ValueError(
                f"Unknown base model type {model_dict['type']!r}; "
                f"expected one containing {supported_kinds}"
            )
        kinds.append(kind)

    # Positional fold indices are applied below, so work on plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    handlers = {
        'xgb': (train_xgb, predict_xgb),
        'lgb': (train_lgb, predict_lgb),
        'ert': (train_ert, predict_ert),
    }

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    meta = np.zeros((X.shape[0], len(base_models)))

    for i, (model_dict, kind) in enumerate(zip(base_models, kinds)):
        train_fn, predict_fn = handlers[kind]
        columns = model_dict['feature_names']
        for train_idx, val_idx in kf.split(X):
            # The train/predict helpers select features by name, so the fold
            # arrays are wrapped back into labelled DataFrames.
            fold_train = pd.DataFrame(X[train_idx], columns=columns)
            fold_val = pd.DataFrame(X[val_idx], columns=columns)

            fitted = train_fn(
                fold_train,
                y[train_idx],
                model_dict['selected_features'],
                params=model_dict.get('params', None)
            )
            meta[val_idx, i] = predict_fn(fitted, fold_val)

    return meta

def train_stacking(X, y, base_models, level2_model, n_folds=5):
    """Fit the level-2 model on out-of-fold predictions of the base models.

    Parameters
    ----------
    X, y : pandas objects
        Training features and target; their ``.values`` arrays are used.
    base_models : list of dict
        Base model descriptions forwarded to ``generate_meta_features``.
    level2_model : estimator
        Object with a ``fit(features, target)`` method; fitted in place.
    n_folds : int, default 5
        Number of folds used to build the meta-features.

    Returns
    -------
    tuple
        ``(level2_model, feature_names_per_base_model)``.
    """
    features, targets = X.values, y.values
    oof_predictions = generate_meta_features(features, targets, base_models, n_folds)
    level2_model.fit(oof_predictions, targets)

    names_per_model = []
    for base in base_models:
        names_per_model.append(base['feature_names'])
    return level2_model, names_per_model

def predict_stacking(stacking_model, base_models_info, X):
    """Predict with the stacked ensemble.

    Each base model predicts on ``X``; the predictions are stacked column-wise
    (one column per base model, in list order) and fed to ``stacking_model``.

    Parameters
    ----------
    stacking_model : estimator
        Fitted level-2 model exposing ``predict``.
    base_models_info : list of dict
        Trained base model bundles, each with a ``'type'`` of ``'xgb'``,
        ``'lgb'`` or ``'ert'`` plus the keys required by the matching
        ``predict_*`` helper.
    X : pandas.DataFrame or numpy.ndarray
        Features to predict on.

    Returns
    -------
    Output of ``stacking_model.predict`` on the stacked base predictions.

    Raises
    ------
    ValueError
        If a bundle has an unsupported ``'type'``.
    """
    meta_test = []
    for model_info in base_models_info:
        model_type = model_info['type']
        if model_type == 'xgb':
            preds = predict_xgb(model_info, X)
        elif model_type == 'lgb':
            preds = predict_lgb(model_info, X)
        elif model_type == 'ert':
            preds = predict_ert(model_info, X)
        else:
            # Previously an unknown type fell through and either crashed on an
            # unbound `preds` or silently duplicated the previous model's
            # predictions.
            raise ValueError(
                f"Unknown base model type {model_type!r}; expected 'xgb', 'lgb' or 'ert'"
            )
        meta_test.append(preds)
    return stacking_model.predict(np.column_stack(meta_test))

#========== Main pipeline ==========
# Load the data and fill missing numeric values with the column mean.
df = pd.read_csv("/kaggle/input/data-full-features-ai/weather_data_nghean (1).csv")
df = df.fillna(df.mean(numeric_only=True))

# Target column and the feature subsets used from here on.
# NOTE: HIMA_BANDS / ERA5_PARAMS are reassigned here with shorter lists than
# the ones defined in the earlier cell.
TARGET_COL = "Radar"
HIMA_BANDS = [
    'B09B', 'B10B', 'B11B', 'B12B', 'B14B', 'B16B',
    'I2B', 'I4B', 'IRB', 'VSB', 'WVB',
]
ERA5_PARAMS = ['R500', 'TCLW', 'TCW', 'TCWV']
FOR_XG = [*HIMA_BANDS, *ERA5_PARAMS]
ALL_FEATURES = FOR_XG  # same list object: every model sees the same columns

X, y = df[ALL_FEATURES], df[TARGET_COL]

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the level-1 models.
# Rows with a positive target get 5x the weight of the others (XGBoost only).
train_weight = np.where(y_train > 0, 5.0, 1.0)
xgb_model = train_xgb(
    X_train, y_train, selected_features=FOR_XG, params={}, sample_weight=train_weight
)
lgb_model = train_lgb(X_train, y_train, selected_features=ALL_FEATURES, params={})
ert_model = train_ert(X_train, y_train, selected_features=ALL_FEATURES)

# Đánh giá các model
def evaluate_model(predict_func, model_dict, X_test, y_test):
    """Score one model on a test set.

    Parameters
    ----------
    predict_func : callable
        Called as ``predict_func(model_dict, X_test)`` to obtain predictions.
    model_dict : dict
        Model bundle passed through to ``predict_func``.
    X_test, y_test
        Test features and the matching true target values.

    Returns
    -------
    dict
        ``'MAE'``, ``'RMSE'``, ``'R2'`` and ``'CC'`` (Pearson correlation
        coefficient between ``y_test`` and the predictions).
    """
    predictions = predict_func(model_dict, X_test)
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    correlation = pearsonr(y_test, predictions)[0]

    scores = {}
    scores['MAE'] = mean_absolute_error(y_test, predictions)
    scores['RMSE'] = np.sqrt(mean_squared_error(y_test, predictions))
    scores['R2'] = r2_score(y_test, predictions)
    scores['CC'] = correlation
    return scores

# Score every level-1 model on the held-out test split.
metrics = {
    label: evaluate_model(predictor, bundle, X_test, y_test)
    for label, predictor, bundle in (
        ('XGB', predict_xgb, xgb_model),
        ('LGB', predict_lgb, lgb_model),
        ('ERT', predict_ert, ert_model),
    )
}

# Train the stacking model: tag each trained bundle with its model type.
base_models_info = [
    {'type': kind, **bundle}
    for kind, bundle in (('xgb', xgb_model), ('lgb', lgb_model), ('ert', ert_model))
]

level2_model = ElasticNet(alpha=0.01, l1_ratio=0.7)
stacking_model, feature_names_list = train_stacking(
    X_train, y_train, base_models=base_models_info, level2_model=level2_model
)

# Predict with the stacked ensemble and score it with the same four metrics.
test_preds = predict_stacking(stacking_model, base_models_info, X_test)

stacking_metrics = dict(
    MAE=mean_absolute_error(y_test, test_preds),
    RMSE=np.sqrt(mean_squared_error(y_test, test_preds)),
    R2=r2_score(y_test, test_preds),
    CC=pearsonr(y_test, test_preds)[0],  # Pearson correlation for the stack
)

print("Level-1 Metrics:", metrics)
print("\nStacking Metrics:", stacking_metrics)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 248635, number of used features: 15
[LightGBM] [Info] Start training from score 0.206359




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 198908, number of used features: 15
[LightGBM] [Info] Start training from score 0.206243
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 198908, number of used features: 15
[LightGBM] [Info] Start training from score 0.208835
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015932 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 198908, number of used features: 15
[LightGBM] [Info] Start 