In [2]:
import pandas as pd
import numpy as np

# ostateczne dane do przewidywania ceny
dane = pd.read_pickle("dane_onehot.pkl")

In [3]:
from sklearn.model_selection import train_test_split

# przekształcamy na numeryczne (na wszelki wypadek)
dane = dane.apply(pd.to_numeric)

# target variable - cena lotu (zł)
X = dane.drop(columns = "Price")
y = dane["Price"]

In [4]:
import random 
random.seed(123) # ziarenko dla powtarzalności wyników

# podział na zbiór treningowy, walidacyjny i testowy - proporcje 80% : 20% : 20% 
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# konwersja do macierzy numpy
X_train_np = X_train.to_numpy().astype(np.float64)
y_train_np = y_train.to_numpy().astype(np.float64)
X_test_np = X_test.to_numpy().astype(np.float64)
y_test_np = y_test.to_numpy().astype(np.float64)

In [8]:
import itertools
import pandas as pd
import time
import numpy as np
from glob import glob
from xgboost_regressor import XGBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# funkcja licząca metryki
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Evaluate model performance on train and test sets
    """
    metrics = {}
    
    # Training metrics
    train_pred = model.predict(X_train)
    metrics['train_MSE'] = mean_squared_error(y_train, train_pred)
    metrics['train_MAE'] = mean_absolute_error(y_train, train_pred)
    metrics['train_MAPE'] = mean_absolute_percentage_error(y_train, train_pred)
    metrics['train_R2'] = r2_score(y_train, train_pred)
    
    # Test metrics
    test_pred = model.predict(X_test)
    metrics['test_MSE'] = mean_squared_error(y_test, test_pred)
    metrics['test_MAE'] = mean_absolute_error(y_test, test_pred)
    metrics['test_MAPE'] = mean_absolute_percentage_error(y_test, test_pred)
    metrics['test_R2'] = r2_score(y_test, test_pred)
    
    return metrics

# Configuration
NUM_REPETITIONS = 5  # Number of repetitions for each parameter combination
RANDOM_STATE = 42
filename = None  # Custom filename if needed

# XGBoost parameters to test
baseline_params = {
    "n_estimators": [100],
    "learning_rate": [0.05],
    "max_depth": [5],
    "min_child_weight": [1],
    "reg_lambda": [1],
    "gamma": [0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.8]
}

# Set up filename
if not filename:
    files = glob("xgb_*.xlsx")
    filename = f"xgb_{len(files)+1}.xlsx"
if ".xlsx" not in filename: 
    filename += ".xlsx"

# Set random seed
np.random.seed(RANDOM_STATE)

print(f"Starting parameter testing with {NUM_REPETITIONS} repetitions per combination")
print(f"Data shape: {X_train.shape}")

# Generate parameter combinations
keys = list(baseline_params.keys())
combinations = list(itertools.product(*(baseline_params[key] for key in keys)))

# Create parameter dataframe
params_df = pd.DataFrame(data=combinations, columns=keys)

print(f"Testing {len(params_df)} parameter combinations")
print(f"Total experiments: {len(params_df) * NUM_REPETITIONS}")

# Results storage
results = []

# Main testing loop
for i, row in params_df.iterrows():
    try:
        print(f"\nTesting combination {i+1}/{len(params_df)}")
        
        # Extract parameters
        params = {
            "n_estimators": int(row["n_estimators"]),
            "learning_rate": float(row["learning_rate"]),
            "max_depth": int(row["max_depth"]),
            "min_child_weight": float(row["min_child_weight"]),
            "reg_lambda": float(row["reg_lambda"]),
            "gamma": float(row["gamma"]),
            "subsample": float(row["subsample"]),
            "colsample_bytree": float(row["colsample_bytree"])
        }
        
        print("Parameters:", ", ".join(f"{k}={v}" for k, v in params.items()))
        
        # Lists to store results from repetitions
        training_times = []
        test_metrics_list = []
        
        # Multiple repetitions for statistical significance
        for rep in range(NUM_REPETITIONS):
            print(f"  Repetition {rep+1}/{NUM_REPETITIONS}")
            
            # Train model
            start_time = time.time()
            model = XGBoostRegressor(**params, random_state=RANDOM_STATE)
            model.fit(X_train, y_train)
            training_time = time.time() - start_time
            training_times.append(training_time)
            
            # Evaluate model
            metrics = evaluate_model(model, X_train, y_train, 
                                    X_test, y_test)
            test_metrics_list.append(metrics)
            
            print(f"    Test R²: {metrics['test_R2']:.4f}")
            print(f"    Test MSE: {metrics['test_MSE']:.4f}")
        
        # Calculate average metrics across repetitions
        avg_results = params.copy()
        
        # Training time
        avg_results["train_time"] = np.mean(training_times)
        
        # Test metrics - calculate mean across repetitions
        metric_names = ['train_MSE', 'train_MAE', 'train_MAPE', 'train_R2',
                        'test_MSE', 'test_MAE', 'test_MAPE', 'test_R2']
        
        for metric in metric_names:
            values = [m[metric] for m in test_metrics_list]
            avg_results[metric] = np.mean(values)
        
        # Add summary metrics
        avg_results['repetitions'] = NUM_REPETITIONS
        
        results.append(avg_results)
        
        # Print summary for this combination
        print(f"  Average Test R²: {avg_results['test_R2']:.4f}")
        print(f"  Average Test MSE: {avg_results['test_MSE']:.4f}")
        
    except KeyboardInterrupt:
        print("\nInterrupted by user. Saving current results...")
        break
    # except Exception as e:
    #     print(f"Error with combination {i+1}: {str(e)}")
    #     continue

# Save results
try:
    result_df = pd.DataFrame(results)
    result_df.to_excel(filename, index=False)
    print(f"\nResults saved to {filename}")
    
    # Display summary of best results
    if not result_df.empty:
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        # Best R² score
        best_r2_idx = result_df['test_R2'].idxmax()
        best_r2_row = result_df.iloc[best_r2_idx]
        print(f"Best Test R²: {best_r2_row['test_R2']:.4f}")
        print("Parameters:")
        for param in baseline_params.keys():
            print(f"  {param}: {best_r2_row[param]}")
        
        # Lowest MSE
        best_mse_idx = result_df['test_MSE'].idxmin()
        best_mse_row = result_df.iloc[best_mse_idx]
        print(f"\nLowest Test MSE: {best_mse_row['test_MSE']:.4f}")
        print("Parameters:")
        for param in baseline_params.keys():
            print(f"  {param}: {best_mse_row[param]}")
        
        # Performance statistics
        print(f"\nPerformance Statistics:")
        print(f"Average training time per model: {result_df['train_time'].mean():.2f}s")
        print(f"Total combinations tested: {len(result_df)}")
        print(f"Total models trained: {len(result_df) * NUM_REPETITIONS}")
        
except Exception as e:
    print(f"Error saving results: {str(e)}")
    # Try to save as CSV as backup
    try:
        result_df = pd.DataFrame(results)
        result_df.to_excel(filename, index=False)
        print(f"Results saved as CSV to {filename}")
    except:
        print("Could not save results")


Starting parameter testing with 5 repetitions per combination
Data shape: (115450, 37)
Testing 1 parameter combinations
Total experiments: 5

Testing combination 1/1
Parameters: n_estimators=100, learning_rate=0.05, max_depth=5, min_child_weight=1.0, reg_lambda=1.0, gamma=0.1, subsample=0.8, colsample_bytree=0.8
  Repetition 1/5


KeyError: "None of [Index([ 14556,  55306, 103047,    127,  53381,  31813,  21793,  43865,  74817,\n        19057,\n       ...\n        51298,  50333,  80192,  41876,  34311,  69838,    610,   7946,  38491,\n         6518],\n      dtype='int32', length=92360)] are in the [columns]"