In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import BaggingClassifier, GradientBoostingRegressor
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from fairlearn.metrics import demographic_parity_ratio
from sklearn.metrics import f1_score, classification_report
from fairlearn.postprocessing import ThresholdOptimizer, plot_threshold_optimizer
# Load the datasets
X = pd.read_csv("train.csv")
unexpected_values = ['0', '3']
X['MARITAL_STATUS'] = X['MARITAL_STATUS'].replace(unexpected_values, np.nan)
X.dropna(subset=['MARITAL_STATUS'],inplace=True)

# Separate features and target variable
y = X[['DEFAULT.PAYMENT.NEXT.MONTH']]

# Drop 'ID',  'DEFAULT.PAYMENT.NEXT.MONTH' and 'EDUCATION', 'EDUCATION' is dropped due to its low feature imporatnce score
X = X.drop(['ID', 'DEFAULT.PAYMENT.NEXT.MONTH', 'EDUCATION'], axis=1)
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
space = {
        'n_estimators': hp.choice('n_estimators', [400, 450, 500, 550, 600, 650]),
        'max_depth': hp.choice('max_depth', range(3, 12)),
        'learning_rate': hp.uniform('learning_rate', 0.0003, 0.008),
        'min_child_weight': hp.uniform('min_child_weight', 1, 12),
        'gamma': hp.loguniform('gamma', low=np.log(0.01), high=np.log(5)),  # Gamma from 0 to 5 (log-uniform)
        'reg_alpha': hp.loguniform('reg_alpha', low=np.log(0.001), high=np.log(0.81)),  # Alpha from 0 to 0.8 (log-uniform)
        'reg_lambda': hp.loguniform('reg_lambda', low=np.log(1), high=np.log(5)),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'colsample_bynode': hp.uniform('colsample_bynode', 0.6, 1.0),
        'colsample_bylevel': hp.uniform('colsample_bylevel', 0.6, 1.0)
        }

def objective(space):

    mean_f1_scores = []

    # Flatten the target variable y
    y_data  = y.values.ravel()

    # Calculate scale_pos_weight for XGBClassifier
    scale_pos_weight = (len(y_data) - y_data.sum()) / y_data.sum()
    model_xgb = XGBClassifier(scale_pos_weight=scale_pos_weight, objective="binary:logistic",
                                    random_state=42, **space)
    # Perform stratified k-fold cross-validation
    for train_index, test_index in skf.split(X, y):
        X_train_skf = preprocessor.fit_transform(X.iloc[train_index])
        X_test_skf = preprocessor.transform(X.iloc[test_index])
        y_train_skf, y_test_skf = y.iloc[train_index], y.iloc[test_index]
        y_train_skf = y_train_skf.values.ravel()
        y_test_skf = y_test_skf.values.ravel()
        sensitive_features_test = X.iloc[test_index]['SEX'].values
        sensitive_features_train = X.iloc[train_index]['SEX'].values
        # Initialize ThresholdOptimizer for fairness constraints
        threshold_optimizer = ThresholdOptimizer(
         estimator=model_xgb,
         constraints="demographic_parity",
         prefit=False,objective='balanced_accuracy_score',
         predict_method="predict"
     )

        # Fit the model
        threshold_optimizer.fit(X_train_skf, y_train_skf, sensitive_features=sensitive_features_train)

        # Predict on the test set
        y_pred_skf = threshold_optimizer.predict(X_test_skf, sensitive_features=sensitive_features_test, random_state=42)

        # Calculate F1 score
        f1 = f1_score(y_test_skf, y_pred_skf, average='binary')
        mean_f1_scores.append(f1)

    # Calculate mean F1 score
    mean_f1 = np.mean(mean_f1_scores)
    # Return loss (negative mean F1 score) and optimization status
    return {'loss': -mean_f1, 'status': STATUS_OK}

# Run hyperparameter optimization using Hyperopt
trials = Trials()
best_params = fmin(objective, space, rstate=np.random.default_rng(42), algo=tpe.suggest,
                   max_evals=150, trials=trials)
# Print best hyperparameters found
print("Best hyperparameters:", best_params)



100%|██████████| 150/150 [23:05<00:00,  9.24s/trial, best loss: -0.5342588526835975]
Best hyperparameters: {'colsample_bylevel': 0.7455126867976273, 'colsample_bynode': 0.8023057861840502, 'colsample_bytree': 0.9237560913738425, 'gamma': 0.19318393866652434, 'learning_rate': 0.0008519957113860956, 'max_depth': 1, 'min_child_weight': 2.5367614847690176, 'n_estimators': 2, 'reg_alpha': 0.0392827338916632, 'reg_lambda': 2.7656564834187147}


In [4]:
best_params = {'colsample_bylevel': 0.7455126867976273, 'colsample_bynode': 0.8023057861840502, 
               'colsample_bytree': 0.9237560913738425, 'gamma': 0.19318393866652434, 'learning_rate': 0.0008519957113860956,
               'max_depth': 4, 'min_child_weight': 2.5367614847690176, 'n_estimators': 500, 
               'reg_alpha': 0.0392827338916632, 'reg_lambda': 2.7656564834187147}
def evaluate():

    mean_f1_scores = []
    mean_parity_ratio = []
    # Flatten the target variable y
    y_data  = y.values.ravel()
    # Calculate scale_pos_weight for XGBClassifier
    scale_pos_weight = (len(y_data) - y_data.sum()) / y_data.sum()
    # Initialize XGBClassifier with best hyperparameters
    model_xgb=XGBClassifier (scale_pos_weight=scale_pos_weight, objective="binary:logistic",
                                    random_state=42, **best_params)
    # Perform stratified k-fold cross-validation
    for train_index, test_index in skf.split(X, y):
        X_train_skf = preprocessor.fit_transform(X.iloc[train_index])
        X_test_skf = preprocessor.transform(X.iloc[test_index])
        sensitive_features_test = X.iloc[test_index]['SEX'].values
        sensitive_features_train = X.iloc[train_index]['SEX'].values
        y_train_skf, y_test_skf = y.iloc[train_index], y.iloc[test_index]
        y_train_skf = y_train_skf.values.ravel() 
        y_test_skf = y_test_skf.values.ravel()
        # Initialize ThresholdOptimizer for fairness constraints
        threshold_optimizer = ThresholdOptimizer(
         estimator=model_xgb,
         constraints="demographic_parity",
         prefit=False,objective='balanced_accuracy_score',
         predict_method="predict"
        )
        # Fit the model
        threshold_optimizer.fit(X_train_skf, y_train_skf, sensitive_features=sensitive_features_train)

        # Predict on the test set
        y_pred_skf = threshold_optimizer.predict(X_test_skf,sensitive_features=sensitive_features_test, random_state=42)

        # Evaluate the model using F1 score and demographic parity ratio
        f1 = f1_score(y_test_skf, y_pred_skf, average='binary')
        mean_f1_scores.append(f1)
        parity_ratio = demographic_parity_ratio(y_test_skf, y_pred_skf, sensitive_features=sensitive_features_test)
        mean_parity_ratio.append(parity_ratio)

    # Print mean F1 score and mean demographic parity ratio
    print("Mean_f1 score", np.mean(mean_f1_scores))
    print("Mean Parity ratio", np.mean(mean_parity_ratio))
# Call the evaluate function
print("call evaluate function")
evaluate()

call evaluate function
Mean_f1 score 0.5342588526835975
Mean Parity ratio 0.9799749520231481


In [6]:
final = pd.read_csv("test.csv")

# Extract 'ID' for later use
i = final['ID']

# Remove 'ID'
final = final.drop(['ID'], axis=1)

# Transform features in the test set using the preprocessor fitted on the training data
X_pre = preprocessor.fit_transform(X)
final_pre = preprocessor.transform(final)

# Extract sensitive features from the test and training data
sensitive_features_final = final['SEX']
sensitive_features_X = X['SEX']

# Flatten the target variable y
y_data = y.values.ravel()

# Calculate scale_pos_weight for XGBClassifier
scale_pos_weight = (len(y_data) - y_data.sum()) / y_data.sum()

# Initialize XGBClassifier with best hyperparameters
best_model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    random_state=42,
    **best_params
)

# Initialize ThresholdOptimizer for fairness constraints
threshold_optimizer = ThresholdOptimizer(
    estimator=best_model,
    constraints="demographic_parity",
    prefit=False,
    objective='balanced_accuracy_score',
    predict_method="predict"
)
# Fit the threshold optimizer on the preprocessed training data
threshold_optimizer.fit(X_pre, y_data, sensitive_features=sensitive_features_X)
# Make predictions on the preprocessed test data
final_predictions = threshold_optimizer.predict(final_pre, sensitive_features=sensitive_features_final,
                                                random_state=42)
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({
    'ID': i,
    'DEFAULT.PAYMENT.NEXT.MONTH': final_predictions
})
# Save results to CSV file
predictions_df.to_csv("/kaggle/working/submission_enigma.csv",index=False)