In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# === Load data ===
file_path = "D:/ML/project-soccer/data/85_Soccer_ETR_LGBR_COA_BWO.xlsx"
df = pd.read_excel(file_path, sheet_name="DATA after VIF")

y = df["markat value"]
X = df.drop(columns=["markat value"])


# Config
n_splits = 5
fold_size = len(X) // n_splits  # Each fold = 20%

# Store results
rmse_scores = []
r2_scores = []




# ETR K-Fold


In [16]:
from sklearn.ensemble import ExtraTreesRegressor


# Manual K-Fold in order
for i in range(n_splits):
    start = i * fold_size
    end = (i + 1) * fold_size

    # Test set = current fold
    X_test = X[start:end]
    y_test = y[start:end]

    # Train set = all except current fold
    X_train = np.concatenate([X[:start], X[end:]], axis=0)
    y_train = np.concatenate([y[:start], y[end:]], axis=0)

    # Train Extra Trees model
    model = ExtraTreesRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save metrics
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {i+1}: RMSE={rmse:.2f}, R2={r2:.2f}")




rmse_scores=np.array(rmse_scores)
r2_scores=np.array(r2_scores)
print("\nAll RMSEs:", rmse_scores)
print("All R²s:", r2_scores)




AttributeError: 'numpy.ndarray' object has no attribute 'append'

# LGBR K-Fold


In [9]:
from lightgbm import LGBMRegressor

for i in range(n_splits):
    start = i * fold_size
    end = (i + 1) * fold_size

    # Test set = current fold
    X_test = X[start:end]
    y_test = y[start:end]

    # Train set = all except current fold
    X_train = np.concatenate([X[:start], X[end:]], axis=0)
    y_train = np.concatenate([y[:start], y[end:]], axis=0)

    # Train LightGBM model
    model = LGBMRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save metrics
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {i+1}: RMSE={rmse:.2f}, R2={r2:.2f}")

print("\nAll RMSEs:", rmse_scores)
print("All R²s:", r2_scores)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 131
[LightGBM] [Info] Number of data points in the train set: 366, number of used features: 9
[LightGBM] [Info] Start training from score 17538592.896175
Fold 1: RMSE=275029148457113.22, R2=0.37
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 135
[LightGBM] [Info] Number of data points in the train set: 366, number of used features: 9
[LightGBM] [Info] Start training from score 18168579.234973
Fold 2: RMSE=131413525658401.14, R2=0.43
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000056 secon

 # XGBR K-Fold

In [3]:
from xgboost import XGBRegressor

for i in range(n_splits):
    start = i * fold_size
    end = (i + 1) * fold_size

    # Test set = current fold
    X_test = X[start:end]
    y_test = y[start:end]

    # Train set = all except current fold
    X_train = np.concatenate([X[:start], X[end:]], axis=0)
    y_train = np.concatenate([y[:start], y[end:]], axis=0)

    # Train XGBoost model
    model = XGBRegressor(random_state=42, verbosity=0)  # verbosity=0 to suppress warnings
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save metrics
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {i+1}: RMSE={rmse:.2f}, R2={r2:.2f}")

print("\nAll RMSEs:", rmse_scores)
print("All R²s:", r2_scores)

from xgboost import XGBRegressor
import numpy as np
from sklearn.datasets import load_diabetes

# Load dataset
data = load_diabetes()
X = data.data
y = data.target

# Split settings
n_splits = 5
fold_size = len(X) // n_splits
i = 3  # 4th fold (0-based index)

# Get test and train data from iteration 4
start = i * fold_size
end = (i + 1) * fold_size

X_test_final = X[start:end]
y_test_final = y[start:end]

X_train_final = np.concatenate([X[:start], X[end:]], axis=0)
y_train_final = np.concatenate([y[:start], y[end:]], axis=0)





Fold 1: RMSE=280048879676325.84, R2=0.36
Fold 2: RMSE=142996793414198.50, R2=0.38
Fold 3: RMSE=199436327532940.53, R2=0.48
Fold 4: RMSE=118481158659078.98, R2=0.69
Fold 5: RMSE=129565161085740.22, R2=0.34

All RMSEs: [280048879676325.84, 142996793414198.5, 199436327532940.53, 118481158659078.98, 129565161085740.22]
All R²s: [0.36192606239306624, 0.3751502112652635, 0.4773420975595889, 0.689802524068287, 0.3353233780264161]
