# Gradient Boosting

In [1]:
# Data Loading and Preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

# Load the pre-split data
train_df = pd.read_csv('csv/pokemon_train.csv')
test_df = pd.read_csv('csv/pokemon_test.csv')

# Columns used as model inputs
features = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Height(m)', 'Weight(kg)']
X_train, X_test = train_df[features], test_df[features]

# Single-label targets
y_train_primary, y_test_primary = train_df['Primary_Type'], test_df['Primary_Type']
y_train_generation, y_test_generation = train_df['Generation'], test_df['Generation']


def _type_labels(frame):
    """Return one label list per row: [primary, secondary], or [primary] when the secondary is missing."""
    type_pairs = frame[['Primary_Type', 'Secondary_Type']].values.tolist()
    return [
        [primary] if pd.isna(secondary) else [primary, secondary]
        for primary, secondary in type_pairs
    ]


# Multi-label targets: the set of types carried by each row
y_train_both = _type_labels(train_df)
y_test_both = _type_labels(test_df)

# Scale features; the scaler is fitted on the training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Binarize the label lists; the binarizer is likewise fitted on the training split only
mlb = MultiLabelBinarizer()
y_train_both_bin = mlb.fit_transform(y_train_both)
y_test_both_bin = mlb.transform(y_test_both)

In [11]:
# Baseline without CV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.ensemble import GradientBoostingClassifier

# Primary Type: fit with default hyper-parameters and score on the held-out split
gb_primary = GradientBoostingClassifier(random_state=42).fit(X_train_scaled, y_train_primary)
y_pred_primary = gb_primary.predict(X_test_scaled)
print(
    "Gradient Boosting - Primary Type Accuracy:",
    accuracy_score(y_true=y_test_primary, y_pred=y_pred_primary),
)

# Both Types: the booster is wrapped so it can be fitted on the binarized label matrix
gb_both = MultiOutputClassifier(
    GradientBoostingClassifier(random_state=42)
).fit(X_train_scaled, y_train_both_bin)
y_pred_both = gb_both.predict(X_test_scaled)
print(
    "Gradient Boosting - Both Types Hamming Loss:",
    hamming_loss(y_true=y_test_both_bin, y_pred=y_pred_both),
)

# Generation: same default-parameter baseline as for the primary type
gb_generation = GradientBoostingClassifier(random_state=42).fit(X_train_scaled, y_train_generation)
y_pred_generation = gb_generation.predict(X_test_scaled)
print(
    "Gradient Boosting - Generation Accuracy:",
    accuracy_score(y_true=y_test_generation, y_pred=y_pred_generation),
)

Gradient Boosting - Primary Type Accuracy: 0.2
Gradient Boosting - Both Types Hamming Loss: 0.08699186991869919
Gradient Boosting - Generation Accuracy: 0.2


In [2]:
# Model Implementation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier

# Single-label target: primary type
gb_primary = GradientBoostingClassifier(random_state=42)

# Single-label target: generation
gb_generation = GradientBoostingClassifier(random_state=42)

# Multi-label target: the booster is wrapped so it can be fitted on the binarized type matrix
gb_both = MultiOutputClassifier(estimator=GradientBoostingClassifier(random_state=42))

In [7]:
# Results 
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, hamming_loss, accuracy_score, f1_score, jaccard_score

def neg_hamming_loss(y_true, y_pred):
    """Return the Hamming loss with its sign flipped, so that larger values mean better predictions."""
    loss = hamming_loss(y_true, y_pred)
    return -loss


# Scorer object built from the negated loss; used for model selection on the multi-label task.
hamming_scorer = make_scorer(neg_hamming_loss)

def train_and_evaluate(model, X_train, y_train, X_test, y_test, param_grid, is_multilabel=False,
                       cv=5, n_jobs=None):
    """Cross-validate a classifier, tune it with a grid search, and print test-set metrics.

    The same scorer is used for the untuned cross-validation and for the grid
    search, so the "before" and "after" numbers are directly comparable:
    ``hamming_scorer`` (negated Hamming loss) when ``is_multilabel`` is true,
    ``'accuracy'`` otherwise.

    Parameters
    ----------
    model : estimator
        scikit-learn classifier to evaluate and tune.
    X_train, y_train
        Training features and targets used for both cross-validation passes.
    X_test, y_test
        Held-out features and targets used only for the final report.
    param_grid : dict
        Grid handed to ``GridSearchCV``.
    is_multilabel : bool, default False
        When true, ``y_train``/``y_test`` are treated as binary label matrices
        and Hamming loss, exact-match ratio, sample-averaged Jaccard and
        sample-averaged F1 are reported instead of accuracy.
    cv : default 5
        Cross-validation setting forwarded to ``cross_val_score`` and
        ``GridSearchCV``.
    n_jobs : int or None, default None
        Parallelism setting forwarded to ``cross_val_score`` and
        ``GridSearchCV``; ``None`` keeps the library default.

    Returns
    -------
    estimator
        The ``best_estimator_`` of the fitted grid search.

    Notes
    -----
    Read the multi-label Hamming loss together with the exact-match ratio,
    Jaccard and F1 values printed alongside it rather than on its own.
    """
    # One scorer for both passes keeps the before/after comparison like-for-like.
    scoring = hamming_scorer if is_multilabel else 'accuracy'

    # Before tuning
    scores_before = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=n_jobs)
    spread = scores_before.std() * 2
    if is_multilabel:
        # The scorer returns the negated loss; flip the sign back for display.
        print(f"Before tuning - Mean CV Hamming Loss: {-scores_before.mean():.3f} (+/- {spread:.3f})")
    else:
        print(f"Before tuning - Mean CV score: {scores_before.mean():.3f} (+/- {spread:.3f})")

    # After tuning
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    print("Best parameters:", grid_search.best_params_)
    best_cv_score = -grid_search.best_score_ if is_multilabel else grid_search.best_score_
    print(f"After tuning - Best cross-validation score: {best_cv_score:.3f}")

    # Evaluate on test set
    y_pred = grid_search.predict(X_test)
    if is_multilabel:
        test_hamming_loss = hamming_loss(y_test, y_pred)
        print(f"Test set Hamming Loss: {test_hamming_loss:.3f}")

        # A row counts as an exact match only when every label column agrees.
        exact_matches = np.all(y_pred == y_test, axis=1)
        exact_match_ratio = np.mean(exact_matches)
        print(f"Exact Match Ratio: {exact_match_ratio:.3f}")

        jaccard_score_value = jaccard_score(y_test, y_pred, average='samples')
        print(f"Jaccard Similarity Score: {jaccard_score_value:.3f}")

        f1_score_value = f1_score(y_test, y_pred, average='samples')
        print(f"F1 Score: {f1_score_value:.3f}")
    else:
        test_accuracy = accuracy_score(y_test, y_pred)
        print(f"Test set accuracy: {test_accuracy:.3f}")

    return grid_search.best_estimator_

# Define parameter grids
# Search space for a bare GradientBoostingClassifier
param_grid_primary = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[3, 5, 7],
)

# Same search space, addressed through the wrapper's `estimator__` prefix;
# each value list is copied so the two grids do not share list objects.
param_grid_both = {
    f'estimator__{name}': list(values)
    for name, values in param_grid_primary.items()
}

# The generation model reuses the primary-type grid (same dict object)
param_grid_generation = param_grid_primary

### Train and evaluate models

In [None]:
# Primary Type - took around 6 minutes to run
print("Gradient Boosting - Primary Type Prediction:")
best_gb_primary = train_and_evaluate(
    model=gb_primary,
    X_train=X_train_scaled,
    y_train=y_train_primary,
    X_test=X_test_scaled,
    y_test=y_test_primary,
    param_grid=param_grid_primary,
)

Gradient Boosting - Primary Type Prediction:
Before tuning - Mean CV score: 0.187 (+/- 0.043)
Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
After tuning - Best cross-validation score: 0.212
Test set accuracy: 0.161


In [None]:
# Both Types - took around 9 minutes and 30 seconds
print("\nGradient Boosting - Both Types Prediction:")
best_gb_both = train_and_evaluate(
    model=gb_both,
    X_train=X_train_scaled,
    y_train=y_train_both_bin,
    X_test=X_test_scaled,
    y_test=y_test_both_bin,
    param_grid=param_grid_both,
    is_multilabel=True,
)


Gradient Boosting - Both Types Prediction:
Before tuning - Mean CV Hamming Loss: 0.093 (+/- 0.006)
Best parameters: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 7, 'estimator__n_estimators': 50}
After tuning - Best cross-validation score: 0.085
Test set Hamming Loss: 0.081
Exact Match Ratio: 0.000
Jaccard Similarity Score: 0.000
F1 Score: 0.000


In [None]:
# Generation - took around 5 minutes
print("\nGradient Boosting - Generation Prediction:")
best_gb_generation = train_and_evaluate(
    model=gb_generation,
    X_train=X_train_scaled,
    y_train=y_train_generation,
    X_test=X_test_scaled,
    y_test=y_test_generation,
    param_grid=param_grid_generation,
)


Gradient Boosting - Generation Prediction:
Before tuning - Mean CV score: 0.166 (+/- 0.052)
Best parameters: {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 100}
After tuning - Best cross-validation score: 0.196
Test set accuracy: 0.220
