<a href="https://colab.research.google.com/github/mazenbuk/ppm/blob/main/horsss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV, KFold
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [None]:
# Read the training and test CSVs (in that order) straight from the GitHub repository.
data_train, data_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/train.csv',
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame to sanity-check the load.
data_train.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [None]:
# Show per-column dtypes and non-null counts; this is where the missing values become visible.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1196 non-null   object 
 8   peripheral_pulse       1175 non-null   object 
 9   mucous_membrane        1214 non-null   object 
 10  capillary_refill_time  1229 non-null   object 
 11  pain                   1191 non-null   object 
 12  peristalsis            1215 non-null   object 
 13  abdominal_distention   1212 non-null   object 
 14  nasogastric_tube       1155 non-null   object 
 15  naso

In [None]:
# Separate the label column from the model inputs.
# The feature frame excludes the id, the target itself, and the lesion / cp_data columns.
y_train = data_train['outcome']
X_train = data_train.drop(
    columns=['id', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data']
)

In [None]:
# Partition the feature columns by dtype so each group gets its own preprocessing.
# NOTE(review): hospital_number is int64, so it lands in the numerical group and is
# scaled like a measurement — confirm that treating it as a numeric feature is intended.
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Preprocessing sub-pipelines, one per column type.

# Categorical columns: fill gaps with the most frequent category, then one-hot encode
# (handle_unknown='ignore' keeps transform from failing on categories unseen during fit).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [None]:
# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [None]:
# Stratified K-Fold cross-validator
# stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# K-Fold cross-validator
# 10 shuffled folds with a fixed seed, so tuning and evaluation reuse the same splits.
# NOTE(review): plain KFold does not stratify by class; the StratifiedKFold variant is
# commented out in the previous cell — confirm that choice is intentional.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Candidate classifiers and the hyperparameter grid searched for each one.
# Each entry maps a display name (also used in the submission file name) to
#   'model'  - an unfitted estimator, and
#   'params' - the grid for that estimator (keys are plain estimator parameter names).
# Stochastic estimators are seeded so that reports and submissions are reproducible,
# matching the seed already used by the cross-validator.
classifiers = {
    'SVM': {
        # random_state matters here because probability=True adds an internal randomised CV.
        'model': SVC(probability=True, random_state=42),
        'params': {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': np.arange(2, 30, 1),
            # 'minkowski' is not listed: with the default p=2 it is the same distance as
            # 'euclidean'. leaf_size is not searched: it only tunes tree build/query speed,
            # not the predictions, so searching it multiplied the grid without changing
            # the selected model.
            'metric': ['euclidean', 'manhattan'],
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(-10, -7, 10)
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': np.arange(2, 50, 1),
            'min_samples_split': np.arange(2, 10, 1),
            'min_samples_leaf': np.arange(2, 5, 1)
        }
    }
}

In [None]:
# The test features do not depend on the classifier, so build them once
# (same feature columns removed as for X_train).
X_test = data_test.drop(columns=['id', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data'])

# For every candidate: tune by grid search, report out-of-fold metrics, write a submission file.
for name, config in classifiers.items():
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', config['model'])
    ])

    # Prefix each hyperparameter so it is routed to the pipeline's 'classifier' step.
    param_grid = {f'classifier__{key}': value for key, value in config['params'].items()}
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=kfold, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Out-of-fold predictions of the tuned pipeline for the report below.
    # NOTE(review): the hyperparameters were selected on these same folds, so the report
    # is somewhat optimistic. The former predict_proba pass was removed: it cost a second
    # full 10-fold refit, kept only column 1 of a 3-class matrix, and was never used.
    y_pred = cross_val_predict(best_model, X_train, y_train, cv=kfold, method='predict')

    # Print classification report
    print(f"Classifier: {name}")
    print(classification_report(y_train, y_pred))
    print("-" * 30)

    # Predict the test set with the tuned pipeline and write one submission per classifier.
    predictions = best_model.predict(X_test)
    submission = pd.DataFrame({
        'id': data_test['id'],
        'outcome': predictions
    })
    submission.to_csv(f'{name}_submission.csv', index=False)

Classifier: SVM
              precision    recall  f1-score   support

        died       0.65      0.74      0.69       410
  euthanized       0.67      0.53      0.59       251
       lived       0.73      0.72      0.72       574

    accuracy                           0.69      1235
   macro avg       0.68      0.67      0.67      1235
weighted avg       0.69      0.69      0.69      1235

------------------------------
Classifier: KNN
              precision    recall  f1-score   support

        died       0.64      0.71      0.67       410
  euthanized       0.67      0.58      0.62       251
       lived       0.73      0.71      0.72       574

    accuracy                           0.69      1235
   macro avg       0.68      0.67      0.67      1235
weighted avg       0.69      0.69      0.68      1235

------------------------------
Classifier: NaiveBayes
              precision    recall  f1-score   support

        died       0.50      0.93      0.65       410
  euthanized

# Menambah Algoritma Boosting

In [None]:
pip install CatBoost

Collecting CatBoost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: CatBoost
Successfully installed CatBoost-1.2.5


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV, KFold
from sklearn.metrics import classification_report, mean_squared_error
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import numpy as np

# Read the training and test CSVs (in that order) from the GitHub repository.
data_train, data_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/train.csv',
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/test.csv',
    )
)

# Separate the label column from the model inputs.
y_train = data_train['outcome']
X_train = data_train.drop(
    columns=['id', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data']
)

# Integer-encode the target; the fitted encoder is kept to decode predictions later.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

# Partition the feature columns by dtype so each group gets its own preprocessing.
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Categorical columns: most-frequent imputation, then one-hot encoding
# (handle_unknown='ignore' keeps transform from failing on categories unseen during fit).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: mean imputation, then standardisation.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# 10 shuffled folds with a fixed seed, shared by tuning and evaluation.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate classifiers and the hyperparameter grid searched for each one.
# Each entry maps a display name (also used in the submission file name) to
#   'model'  - an unfitted estimator, and
#   'params' - the grid for that estimator (keys are plain estimator parameter names).
# Stochastic estimators are seeded so that reports and submissions are reproducible,
# matching the seed already used by the cross-validator.
classifiers = {
    'SVM': {
        # random_state matters here because probability=True adds an internal randomised CV.
        'model': SVC(probability=True, random_state=42),
        'params': {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': np.arange(2, 30, 1),
            # 'minkowski' is not listed: with the default p=2 it is the same distance as
            # 'euclidean'. leaf_size is not searched: it only tunes tree build/query speed,
            # not the predictions, so searching it multiplied the grid without changing
            # the selected model.
            'metric': ['euclidean', 'manhattan'],
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(-10, -7, 10)
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': np.arange(2, 50, 1),
            'min_samples_split': np.arange(2, 10, 1),
            'min_samples_leaf': np.arange(2, 5, 1)
        }
    },
    # NOTE(review): this grid has 5 * 17 * 9 * 5 = 3825 candidates, each fitted on 10 folds;
    # consider narrowing it if the cell takes too long to run.
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': np.arange(3, 20, 1),
            'min_samples_split': np.arange(2, 11, 1),
            'min_samples_leaf': np.arange(1, 6, 1)
        }
    },  # this comma was missing, which made the whole cell a SyntaxError
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'LGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'CatBoost': {
        'model': CatBoostClassifier(verbose=0, random_seed=42),
        'params': {
            'iterations': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'depth': np.arange(3, 10, 1)
        }
    }
}

# The test features do not depend on the classifier, so build them once
# (same feature columns removed as for X_train).
X_test = data_test.drop(columns=['id', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data'])

# For every candidate: tune by grid search, report out-of-fold metrics, write a submission file.
for name, config in classifiers.items():
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', config['model'])
    ])

    # Prefix each hyperparameter so it is routed to the pipeline's 'classifier' step.
    param_grid = {f'classifier__{key}': value for key, value in config['params'].items()}
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=kfold, n_jobs=-1)
    grid_search.fit(X_train, y_train_encoded)
    best_model = grid_search.best_estimator_

    # Out-of-fold predictions of the tuned pipeline for the report below.
    # NOTE(review): the hyperparameters were selected on these same folds, so the report
    # is somewhat optimistic. The former predict_proba pass was removed: it cost a second
    # full 10-fold refit, kept only column 1 of a 3-class matrix, and was never used.
    y_pred = cross_val_predict(best_model, X_train, y_train_encoded, cv=kfold, method='predict')

    # Print classification report, labelled with the original class names instead of 0/1/2.
    print(f"Classifier: {name}")
    print(classification_report(y_train_encoded, y_pred, target_names=label_encoder.classes_))
    print("-" * 30)

    # Predict the test set with the tuned pipeline and write one submission per classifier.
    predictions = best_model.predict(X_test)
    submission = pd.DataFrame({
        'id': data_test['id'],
        'outcome': label_encoder.inverse_transform(predictions)  # Decode predictions back to original labels
    })
    submission.to_csv(f'{name}_submission.csv', index=False)

Classifier: SVM
              precision    recall  f1-score   support

           0       0.65      0.74      0.69       410
           1       0.67      0.53      0.59       251
           2       0.73      0.72      0.72       574

    accuracy                           0.69      1235
   macro avg       0.68      0.67      0.67      1235
weighted avg       0.69      0.69      0.69      1235

------------------------------
Classifier: KNN
              precision    recall  f1-score   support

           0       0.64      0.71      0.67       410
           1       0.67      0.58      0.62       251
           2       0.73      0.71      0.72       574

    accuracy                           0.69      1235
   macro avg       0.68      0.67      0.67      1235
weighted avg       0.69      0.69      0.68      1235

------------------------------
Classifier: NaiveBayes
              precision    recall  f1-score   support

           0       0.50      0.93      0.65       410
           1

# Menggunakan Imputasi KNN

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, KFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.base import BaseEstimator
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Read the training and test CSVs (in that order) from the GitHub repository.
data_train, data_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/train.csv',
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/test.csv',
    )
)

# Separate the label column from the model inputs.
y_train = data_train['outcome']
X_train = data_train.drop(
    columns=['id', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data']
)

# Integer-encode the target; the fitted encoder is kept to decode predictions later.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

# Partition the feature columns by dtype so each group gets its own preprocessing.
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Categorical columns: most-frequent imputation, then one-hot encoding
# (handle_unknown='ignore' keeps transform from failing on categories unseen during fit).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: this variant imputes from the 5 nearest neighbours instead of
# the column mean, then standardises.
numerical_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# 10 shuffled folds with a fixed seed, shared by tuning and evaluation.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate classifiers and the hyperparameter grid searched for each one.
# Each entry maps a display name (also used in the submission file name) to
#   'model'  - an unfitted estimator, and
#   'params' - the grid for that estimator (keys are plain estimator parameter names).
# Stochastic estimators are seeded so that reports and submissions are reproducible,
# matching the seed already used by the cross-validator.
classifiers = {
    'SVM': {
        # random_state matters here because probability=True adds an internal randomised CV.
        'model': SVC(probability=True, random_state=42),
        'params': {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': np.arange(2, 30, 1),
            # 'minkowski' is not listed: with the default p=2 it is the same distance as
            # 'euclidean'. leaf_size is not searched: it only tunes tree build/query speed,
            # not the predictions, so searching it multiplied the grid without changing
            # the selected model.
            'metric': ['euclidean', 'manhattan'],
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(-10, -7, 10)
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': np.arange(2, 50, 1),
            'min_samples_split': np.arange(2, 10, 1),
            'min_samples_leaf': np.arange(2, 5, 1)
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'bootstrap': [True],
            'n_estimators': [50, 100, 200],
            'max_depth': [50, 70, 100, 110],
            'min_samples_split': [8, 10, 12],
            'min_samples_leaf': np.arange(2, 5, 1)
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'LGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'CatBoost': {
        'model': CatBoostClassifier(verbose=0, random_seed=42),
        'params': {
            'iterations': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'depth': np.arange(3, 10, 1)
        }
    }
}

# The test features do not depend on the classifier, so build them once
# (same feature columns removed as for X_train).
X_test = data_test.drop(columns=['id', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data'])

# For every candidate: tune by grid search, report out-of-fold metrics, write a submission file.
for name, config in classifiers.items():
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', config['model'])
    ])

    # Prefix each hyperparameter so it is routed to the pipeline's 'classifier' step.
    param_grid = {f'classifier__{key}': value for key, value in config['params'].items()}
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=kfold, n_jobs=-1)
    grid_search.fit(X_train, y_train_encoded)
    best_model = grid_search.best_estimator_

    # Out-of-fold predictions of the tuned pipeline for the report below.
    # NOTE(review): the hyperparameters were selected on these same folds, so the report
    # is somewhat optimistic. The former predict_proba pass was removed: it cost a second
    # full 10-fold refit, kept only column 1 of a 3-class matrix, and was never used.
    y_pred = cross_val_predict(best_model, X_train, y_train_encoded, cv=kfold, method='predict')

    # Print classification report, labelled with the original class names instead of 0/1/2.
    print(f"Classifier: {name}")
    print(classification_report(y_train_encoded, y_pred, target_names=label_encoder.classes_))
    print("-" * 30)

    # Predict the test set with the tuned pipeline and write one submission per classifier.
    predictions = best_model.predict(X_test)
    submission = pd.DataFrame({
        'id': data_test['id'],
        'outcome': label_encoder.inverse_transform(predictions)  # Decode predictions back to original labels
    })
    submission.to_csv(f'{name}_submission.csv', index=False)

Classifier: RandomForest
              precision    recall  f1-score   support

           0       0.65      0.75      0.69       410
           1       0.72      0.59      0.65       251
           2       0.75      0.72      0.73       574

    accuracy                           0.70      1235
   macro avg       0.70      0.69      0.69      1235
weighted avg       0.71      0.70      0.70      1235

------------------------------


# Menambahkan Oversampling

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, KFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Read the training and test CSVs (in that order) from the GitHub repository.
data_train, data_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/train.csv',
        'https://raw.github.com/mazenbuk/ppm/main/ppm-predict-health-outcomes-of-horses/test.csv',
    )
)

# Separate the label column from the model inputs.
y_train = data_train['outcome']
X_train = data_train.drop(
    columns=['id', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data']
)

# Integer-encode the target; the fitted encoder is kept to decode predictions later.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

# Partition the feature columns by dtype so each group gets its own preprocessing.
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# NOTE: in this cell `Pipeline` is imblearn's Pipeline (it is the last Pipeline import
# above, and this cell has no sklearn.pipeline import of its own).

# Categorical columns: most-frequent imputation, then one-hot encoding
# (handle_unknown='ignore' keeps transform from failing on categories unseen during fit).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: impute from the 5 nearest neighbours, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# 10 shuffled folds with a fixed seed, shared by tuning and evaluation.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate classifiers and the hyperparameter grid searched for each one.
# Each entry maps a display name (also used in the submission file name) to
#   'model'  - an unfitted estimator, and
#   'params' - the grid for that estimator (keys are plain estimator parameter names).
# Stochastic estimators are seeded so that reports and submissions are reproducible,
# matching the seed already used by the cross-validator and by SMOTE.
classifiers = {
    'SVM': {
        # random_state matters here because probability=True adds an internal randomised CV.
        'model': SVC(probability=True, random_state=42),
        'params': {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': np.arange(2, 30, 1),
            # 'minkowski' is not listed: with the default p=2 it is the same distance as
            # 'euclidean'. leaf_size is not searched: it only tunes tree build/query speed,
            # not the predictions, so searching it multiplied the grid without changing
            # the selected model.
            'metric': ['euclidean', 'manhattan'],
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(-10, -7, 10)
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': np.arange(2, 50, 1),
            'min_samples_split': np.arange(2, 10, 1),
            'min_samples_leaf': np.arange(2, 5, 1)
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    # RandomForest is disabled in this experiment; the entry is kept for reference.
    # 'RandomForest': {
    #     'model': RandomForestClassifier(),
    #     'params': {
    #         'bootstrap': [True],
    #         'n_estimators': [50, 100, 200],
    #         'max_depth': [50, 70, 100, 110],
    #         'min_samples_split': [8, 10, 12],
    #         'min_samples_leaf': np.arange(2, 5, 1)
    #     }
    # },
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'LGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': np.arange(3, 10, 1)
        }
    },
    'CatBoost': {
        'model': CatBoostClassifier(verbose=0, random_seed=42),
        'params': {
            'iterations': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1],
            'depth': np.arange(3, 10, 1)
        }
    }
}

# The test features do not depend on the classifier, so build them once
# (same feature columns removed as for X_train).
X_test = data_test.drop(columns=['id', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data'])

# Running the training and evaluation for each classifier:
# tune by grid search, report out-of-fold metrics, write a submission file.
for name, config in classifiers.items():
    # SMOTE sits inside the imblearn pipeline, after preprocessing and before the classifier.
    model_pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', config['model'])
    ])

    # Prefix each hyperparameter so it is routed to the pipeline's 'classifier' step.
    param_grid = {f'classifier__{key}': value for key, value in config['params'].items()}
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=kfold, n_jobs=-1)
    grid_search.fit(X_train, y_train_encoded)
    best_model = grid_search.best_estimator_

    # Out-of-fold predictions of the tuned pipeline for the report below.
    # NOTE(review): the hyperparameters were selected on these same folds, so the report
    # is somewhat optimistic. The former predict_proba pass was removed: it cost a second
    # full 10-fold refit, kept only column 1 of a 3-class matrix, and was never used.
    y_pred = cross_val_predict(best_model, X_train, y_train_encoded, cv=kfold, method='predict')

    # Print classification report, labelled with the original class names instead of 0/1/2.
    print(f"Classifier: {name}")
    print(classification_report(y_train_encoded, y_pred, target_names=label_encoder.classes_))
    print("-" * 30)

    # Predict the test set with the tuned pipeline and write one submission per classifier.
    predictions = best_model.predict(X_test)
    submission = pd.DataFrame({
        'id': data_test['id'],
        'outcome': label_encoder.inverse_transform(predictions)
    })
    submission.to_csv(f'{name}_submission.csv', index=False)


Classifier: SVM
              precision    recall  f1-score   support

           0       0.64      0.74      0.69       410
           1       0.61      0.64      0.63       251
           2       0.76      0.66      0.71       574

    accuracy                           0.68      1235
   macro avg       0.67      0.68      0.67      1235
weighted avg       0.69      0.68      0.68      1235

------------------------------
Classifier: KNN
              precision    recall  f1-score   support

           0       0.63      0.71      0.67       410
           1       0.56      0.70      0.62       251
           2       0.78      0.61      0.68       574

    accuracy                           0.66      1235
   macro avg       0.65      0.68      0.66      1235
weighted avg       0.68      0.66      0.67      1235

------------------------------
Classifier: NaiveBayes
              precision    recall  f1-score   support

           0       0.51      0.92      0.66       410
           1

  y = column_or_1d(y, warn=True)
