In [2]:
import warnings

import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt import space_eval
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# NOTE(review): this silences every warning for the rest of the session,
# not just the ones expected from the code below.
warnings.filterwarnings("ignore")

# Load the datasets
# NOTE(review): both paths are absolute and specific to one machine; the
# notebook will not run elsewhere without editing them.
X = pd.read_csv(r"C:\Users\sanjay p\OneDrive\Desktop\bitcode breakers\train.csv")
final = pd.read_csv(r"C:\Users\sanjay p\OneDrive\Desktop\bitcode breakers\test.csv")

# Target as a single-column DataFrame. The explicit copy makes y independent of
# X, so the recoding below is a plain column assignment rather than an
# assignment into a frame derived from X.
y = X[['LABEL']].copy()
# Replace label values for binary classification (1 -> 0, 2 -> 1)
y['LABEL'] = y['LABEL'].replace({1: 0, 2: 1})

# Keep the test-set identifiers for the submission file
ID = final[['ID']]

# Drop unnecessary columns from test data
# ('Unnamed: 0' is presumably a saved index column -- confirm against the CSVs)
final = final.drop(columns=['Unnamed: 0', 'ID'])

# Drop unnecessary columns from training data
X = X.drop(columns=['Unnamed: 0', 'LABEL'])

# Define transformation pipeline for preprocessing.
# The mean imputer leaves complete numeric data unchanged, but prevents PCA from
# raising when a train or test column contains missing values.
# NOTE(review): PCA is applied to the unscaled features and the scaler then
# standardizes the resulting components -- confirm this order is intended.
tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('pca', PCA(n_components=9)),                 # Perform PCA with 9 components
    ('scaler', StandardScaler())                  # Standardize the PCA components
])

# Every remaining training column is treated as a feature
features = X.columns
# Define preprocessor to handle preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('t', tran, features)  # Apply transformation pipeline to features
    ])

# Define stratified k-fold cross-validator (shuffled, fixed seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
# Hyperparameter search space for the BaggingClassifier / XGBClassifier ensemble.
# Top-level entries are BaggingClassifier settings; the nested 'estimator' dict
# holds the settings for the XGBClassifier base estimator.
space = {
    'max_samples': hp.uniform('max_samples', 0.7, 1.0),
    'max_features': hp.uniform('max_features', 0.7, 1.0),
    'estimator': {
        # Integer depths 2..9. hp.choice is reported by fmin as the index of the
        # selected option, not the depth itself.
        'max_depth': hp.choice('max_depth', range(2, 10)),
        'learning_rate': hp.uniform('learning_rate', 0.0003, 0.007),
        'min_child_weight': hp.uniform('min_child_weight', 1, 12),
        # Log-uniform between 0.001 and 5
        'gamma': hp.loguniform('gamma', np.log(0.001), np.log(5)),
        # Log-uniform between 0.0001 and 0.8
        'reg_alpha': hp.loguniform('reg_alpha', np.log(0.0001), np.log(0.8)),
        # Log-uniform between 1 and 5
        'reg_lambda': hp.loguniform('reg_lambda', np.log(1), np.log(5)),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'colsample_bynode': hp.uniform('colsample_bynode', 0.6, 1.0),
        'colsample_bylevel': hp.uniform('colsample_bylevel', 0.6, 1.0),
    },
}

In [9]:
def objective(space):
    """Hyperopt objective: cross-validated accuracy of a bagged XGBoost model.

    Parameters
    ----------
    space : dict
        One sampled configuration with keys 'max_samples', 'max_features' and a
        nested 'estimator' dict of XGBClassifier settings.

    Returns
    -------
    dict
        {'loss': negative mean fold accuracy, 'status': STATUS_OK}.

    Reads the module-level X, y, skf and preprocessor.
    """
    xgb_space = space['estimator']
    xgb_params = {
        name: xgb_space[name]
        for name in (
            'max_depth', 'learning_rate', 'gamma', 'min_child_weight',
            'reg_alpha', 'reg_lambda', 'colsample_bytree',
            'colsample_bynode', 'colsample_bylevel',
        )
    }

    # Flattened target; with 0/1 labels the sum is the number of positives, so
    # the ratio below is negatives / positives over the whole training set.
    labels = y.values.ravel()
    n_positive = labels.sum()
    scale_pos_weight = (len(labels) - n_positive) / n_positive

    # Bagging ensemble with an XGBClassifier as the base estimator
    model_bagging = BaggingClassifier(
        estimator=XGBClassifier(
            n_estimators=300,
            scale_pos_weight=scale_pos_weight,
            objective="binary:logistic",
            random_state=42,
            **xgb_params,
        ),
        n_estimators=70,
        max_samples=space['max_samples'],
        max_features=space['max_features'],
        bootstrap=True,
        random_state=42,  # fixed seed for reproducibility
    )

    fold_accuracies = []
    for train_index, test_index in skf.split(X, y):
        # The preprocessor is re-fitted on each training fold only
        X_train_skf = preprocessor.fit_transform(X.iloc[train_index])
        X_test_skf = preprocessor.transform(X.iloc[test_index])

        model_bagging.fit(X_train_skf, labels[train_index])
        y_pred_skf = model_bagging.predict(X_test_skf)
        fold_accuracies.append(accuracy_score(labels[test_index], y_pred_skf))

    # Hyperopt minimizes, so the mean accuracy is negated
    return {'loss': -np.mean(fold_accuracies), 'status': STATUS_OK}

# Run hyperparameter optimization using Hyperopt
trials = Trials()
best_params = fmin(objective, space, rstate=np.random.default_rng(42), algo=tpe.suggest,
                   max_evals=40, trials=trials)
# Print best hyperparameters found.
# NOTE: for the hp.choice dimension ('max_depth') this raw result holds the index
# of the chosen option within range(2, 10), not the depth itself.
print("Best hyperparameters:", best_params)
# Map the raw result back onto the search space so every entry (including
# 'max_depth') is the value that was actually passed to the model.
best_config = space_eval(space, best_params)
print("Resolved hyperparameters:", best_config)


100%|██████████| 40/40 [1:17:11<00:00, 115.79s/trial, best loss: -0.992726788022092]
Best hyperparameters: {'colsample_bylevel': 0.7550550688041309, 'colsample_bynode': 0.8422597946469974, 'colsample_bytree': 0.8035651858278754, 'gamma': 0.01906530906714543, 'learning_rate': 0.006046234261242088, 'max_depth': 7, 'max_features': 0.9853464615314544, 'max_samples': 0.9864341340067273, 'min_child_weight': 3.951572290815913, 'reg_alpha': 0.00014489149306128374, 'reg_lambda': 1.0574422551197646}


In [3]:
# Best hyperparameters obtained from hyperparameter tuning, split by consumer:
# `best_params` is unpacked into XGBClassifier, `bp` into BaggingClassifier.
best_params = {
    'colsample_bylevel': 0.7550550688041309,
    'colsample_bynode': 0.8422597946469974,
    'colsample_bytree': 0.8035651858278754,
    'gamma': 0.01906530906714543,
    'learning_rate': 0.006046234261242088,
    # The tuning run reported index 7 for this hp.choice over range(2, 10),
    # which corresponds to a depth of 9.
    'max_depth': 9,
    'min_child_weight': 3.951572290815913,
    'reg_alpha': 0.00014489149306128374,
    'reg_lambda': 1.0574422551197646,
}
bp = {
    'max_features': 0.9853464615314544,
    'max_samples': 0.9864341340067273,
}

In [4]:

def evaluate():
    """Cross-validate the tuned bagged XGBoost model and print its mean accuracy.

    Builds the model from the module-level `best_params` (XGBClassifier) and
    `bp` (BaggingClassifier), scores it on each split produced by the
    module-level `skf`, and prints the mean fold accuracy. Returns None.
    """
    # Flattened target; with 0/1 labels the ratio is negatives / positives
    labels = y.values.ravel()
    n_positive = labels.sum()
    scale_pos_weight = (len(labels) - n_positive) / n_positive

    # BaggingClassifier around an XGBClassifier, both using the best hyperparameters
    base_estimator = XGBClassifier(
        n_estimators=300,
        scale_pos_weight=scale_pos_weight,
        objective="binary:logistic",
        random_state=42,
        **best_params,
    )
    model_bagging = BaggingClassifier(
        estimator=base_estimator,
        n_estimators=70,
        random_state=42,
        bootstrap=True,
        **bp,
    )

    fold_accuracies = []
    for train_index, test_index in skf.split(X, y):
        # The preprocessor is re-fitted on each training fold only
        X_train_skf = preprocessor.fit_transform(X.iloc[train_index])
        X_test_skf = preprocessor.transform(X.iloc[test_index])

        model_bagging.fit(X_train_skf, labels[train_index])
        y_pred_skf = model_bagging.predict(X_test_skf)
        fold_accuracies.append(accuracy_score(labels[test_index], y_pred_skf))

    # Report the mean accuracy over all folds
    print("mean_accuracy:", np.mean(fold_accuracies))


# Call the evaluate function
print("call evaluate function")
evaluate()

call evaluate function
mean_accuracy: 0.992726788022092


In [5]:

# Fit the preprocessor on the full training set, then apply it to the test set
X_pre = preprocessor.fit_transform(X)
final_pre = preprocessor.transform(final)

# Flattened target and the class weight derived from it
# (with 0/1 labels this is negatives / positives)
y_data = y.values.ravel()
scale_pos_weight = (len(y_data) - y_data.sum()) / y_data.sum()

# BaggingClassifier around an XGBClassifier, both using the best hyperparameters
best_model = BaggingClassifier(
    estimator=XGBClassifier(
        n_estimators=300,
        scale_pos_weight=scale_pos_weight,
        objective="binary:logistic",
        random_state=42,
        **best_params,
    ),
    n_estimators=70,
    random_state=42,
    bootstrap=True,
    **bp,
)
# Fit the best_model on the preprocessed training data
best_model.fit(X_pre, y_data)

# Second column of predict_proba for the test set (not used further in this cell)
y_pred_prob = best_model.predict_proba(final_pre)[:, 1]

# Predict on the preprocessed test data and map the binary classes back to the
# original label values (0 -> 1, otherwise 2)
final_predictions = np.where(best_model.predict(final_pre) == 0, 1, 2)

# Create a DataFrame for the predictions, keyed by the test-set IDs
ID_array = ID['ID'].to_numpy()
predictions_df = pd.DataFrame({'ID': ID_array, 'label': final_predictions})

# Save results to CSV file
predictions_df.to_csv("submission.csv", index=False)
