In [3]:
import pandas as pd

#Import DataSets

# Load the labelled training data and the competition rows to be scored.
df = pd.read_csv('data_train.csv')
# NOTE: despite its name, `y_test` is the competition frame that is later
# passed to predict_proba (and whose 'ID' column is read); it is not a label vector.
y_test = pd.read_csv('test_competition.csv')

# Split the training frame: every column except the target becomes a feature,
# and the target column becomes the label series. `df` itself is left untouched.
X_train = df.drop("default.payment.next.month", axis="columns")
Y_train = df.loc[:, "default.payment.next.month"]




In [4]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def bestModelFindingWithXGBoost(param_grid, scoring, X_train, Y_train,
                                cv=5, n_jobs=-1, random_state=42):
    """Grid-search an XGBClassifier and return the best estimator found.

    Parameters
    ----------
    param_grid : dict
        Hyperparameter grid forwarded unchanged to ``GridSearchCV``.
    scoring : str or callable
        Scoring specification forwarded unchanged to ``GridSearchCV``.
    X_train, Y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    n_jobs : int, default -1
        Parallelism setting forwarded to ``GridSearchCV``.
    random_state : int, default 42
        Seed given to ``XGBClassifier``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted ``GridSearchCV``.

    Notes
    -----
    Progress messages and the winning parameters/score are printed to stdout.
    The printed score is ``best_score_ * 100``, which only reads as a
    percentage for metrics that live in [0, 1] (e.g. ``"roc_auc"``).
    """
    # Loading XGBoost model
    print("Loading XGBoostClassifier...")
    model = xgb.XGBClassifier(random_state=random_state)

    print("Loading GridSearch model...")
    # Every combination in param_grid is evaluated once per CV split, so the
    # total number of fits grows multiplicatively with the grid size.
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)

    print("Fitting GridSearch model...")
    # Fit the GridSearch model to find the best hyperparameters
    grid_search.fit(X_train, Y_train)

    print("GridSearch fitted ")

    # Best parameters and best score found
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_ * 100  # In percentage

    print(f"Best params : {best_params} with the score of : {best_score}")

    return grid_search.best_estimator_

# Define the parameter grid for XGBoost.
# An exhaustive grid search fits every combination once per CV fold, so the
# cost is (product of the list lengths) x (number of folds).
# The previous grid was 20 x 3 x 3 x 4 x 3 = 2160 combinations, i.e. 10,800
# fits with 5-fold CV; that run had to be interrupted before it finished.
# This grid keeps a subset of the same values: 4 x 3 x 3 x 2 x 2 = 144
# combinations, i.e. 720 fits with 5-fold CV. Widen it around the winning
# values in a second pass if needed.
param_grid = {
    'max_depth': [5, 7, 9, 11],  # Coarse sweep of tree depth
    'n_estimators': [100, 200, 300],  # Test different numbers of estimators
    'learning_rate': [0.01, 0.1, 0.2],  # Try different learning rates
    'subsample': [0.8, 1.0],  # Row subsampling vs. no subsampling
    'colsample_bytree': [0.8, 1.0]  # Column subsampling vs. no subsampling
}

# Call the function to find the best model
bestModel = bestModelFindingWithXGBoost(param_grid, "roc_auc", X_train, Y_train)


Loading XGBoostClassifier...
Loading GridSearch model...
Fitting GridSearch model...



KeyboardInterrupt



In [21]:
from sklearn.model_selection import cross_validate
import numpy as np

# Performance testing with cross-validation.
# Report the actual estimator class rather than a hard-coded model name.
print(f"Fitting {type(bestModel).__name__} model")
# Refit on the full training set so `bestModel` is fitted for the later
# prediction step regardless of where it came from.
bestModel.fit(X_train, Y_train)

# Announce the step before it runs, not after it has finished.
print("Performance testing with Cross  Validate...")
# No `scoring` is passed, so cross_validate falls back to the estimator's own
# `score` method (mean accuracy for scikit-learn style classifiers). This is
# therefore NOT the ROC AUC that was used to select the hyperparameters.
scores = cross_validate(bestModel, X_train, Y_train, cv=5)

# Mean of the per-fold test scores, expressed as a percentage
avg_score = np.mean(scores['test_score']) * 100

print(f"Cross validate Avg Score : {avg_score:.2f}%")

Fitting RandomForestClassifier model
Performance testing with Cross  Validate...
Cross validate Avg Score : 84.06%


In [None]:
print("Predicting...")
# One row per competition sample, one column per class; the column order
# follows bestModel.classes_, which is printed below so it can be checked.
predicted_survival = bestModel.predict_proba(y_test)

print(bestModel.classes_)

# Pair each ID with ITS OWN column-0 probability first, then sort whole rows.
# Sorting the probability array on its own and attaching it to the unsorted
# ID column would assign every ID a probability that belongs to another row.
# NOTE(review): column 0 is assumed to be the "no default" class (label 0);
# confirm against the printed classes_.
predicted_df = (
    pd.DataFrame({
        'ID': y_test['ID'].to_numpy(),       # identifier of each competition row
        'PAYED': predicted_survival[:, 0],   # probability of the first class (not a 0/1 label)
    })
    .sort_values('PAYED', ascending=False)   # highest probability first
    .reset_index(drop=True)
)

# Kept for compatibility with later cells: the column-0 probabilities in
# descending order, now row-aligned with predicted_df.
sorted_arr = predicted_df['PAYED'].to_numpy()

# IDs only, in ranked order
predictedIds = predicted_df.drop(columns="PAYED")

print(predicted_df.head(10))

print(predictedIds.head(1000))

