In [32]:
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency, ttest_ind, pearsonr
from sklearn import set_config

# Ask scikit-learn transformers to return pandas DataFrames rather than NumPy
# arrays, so transformed data can be selected by column name.
set_config(transform_output="pandas")

# link to Kaggle competition
# https://www.kaggle.com/competitions/mlpro-classification-bank-churn-2025


In [33]:
# Dataset columns (as described by the competition):
#   Customer ID     - identifier for each customer
#   Surname         - customer's family name
#   Credit Score    - numeric value representing the customer's credit score
#   Geography       - country where the customer lives (France, Spain or Germany)
#   Gender          - customer's gender (Male or Female)
#   Age             - customer's age
#   Tenure          - number of years the customer has been with the bank
#   Balance         - customer's account balance
#   NumOfProducts   - number of bank products the customer uses
#                     (e.g. savings account, credit card)
#   HasCrCard       - whether the customer holds a credit card (1 = yes, 0 = no)
#   IsActiveMember  - whether the customer is an active member (1 = yes, 0 = no)
#   EstimatedSalary - customer's estimated salary
#   Exited          - whether the customer churned (1 = yes, 0 = no)

# Labelled training set first, then the test set to predict on.
df = pd.read_csv(filepath_or_buffer='data/train_data.csv')
df_test = pd.read_csv(filepath_or_buffer='data/test_data.csv')


In [34]:
# Encode Geography/Gender, rescale the numeric features, train a
# gradient-boosting churn classifier and write the submission file.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# StandardScaler's public home is sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works through an incidental internal import.
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Per-column preprocessing:
#   - one-hot encode the categorical columns (first category of each is dropped),
#   - min-max scale Tenure / NumOfProducts / Balance / EstimatedSalary,
#   - standardise CreditScore / Age,
#   - pass every other column through unchanged.
# verbose_feature_names_out=False keeps the output column names unprefixed.
column_transformer = ColumnTransformer(
    transformers=[
        ("Encoder", OneHotEncoder(sparse_output=False, drop="first"), ["Geography", "Gender"]),
        ("MinMaxScaler", MinMaxScaler(), ['Tenure', 'NumOfProducts', 'Balance', "EstimatedSalary"]),
        ("StandardScaler", StandardScaler(), ['CreditScore', 'Age']),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Remove age outliers: only rows with Age < 90 are kept for training.
# (Re-applying this filter on a re-run is harmless.)
df = df.query('Age < 90')
training_df = df.drop('Exited', axis=1)

# Fit the preprocessing on the training features only.
pipeline = Pipeline(steps=[("Encoding + Normalisation", column_transformer)])
training_df = pipeline.fit_transform(training_df)

# Columns fed to the model, named as they appear after preprocessing.
predictions_columns = [
    "Geography_Germany",
    "Geography_Spain",
    "Gender_Male",
    "Tenure",
    "IsActiveMember",
    "NumOfProducts",
    "Balance",
    "Age",
    "HasCrCard",
    "EstimatedSalary",
    "CreditScore",
]

X_train = training_df[predictions_columns]
y_train = df["Exited"]

# Rows with Exited == 1 get weight 2, every other row gets weight 1.
sample_weight = y_train.eq(1).astype(int).add(1).to_numpy()

# random_state is fixed so that re-running the notebook rebuilds the same model.
model = GradientBoostingClassifier(
    learning_rate=0.2,
    n_estimators=250,
    max_depth=3,
    min_samples_split=3,
    random_state=42,
)
model.fit(X_train, y_train, sample_weight=sample_weight)

# Store the transformed test set under its own name: overwriting df_test would
# make this cell fail on a second run, because the raw Geography/Gender columns
# the pipeline expects would already be gone.
df_test_transformed = pipeline.transform(df_test)

X_test = df_test_transformed[predictions_columns]
predictions = model.predict(X_test)

# One row per test ID with the predicted Exited label, indexed by ID.
results = pd.DataFrame({
    "ID": df_test_transformed["ID"],
    "Exited": predictions,
})
results = results.set_index("ID")
results.to_csv("submission.csv")


Test with GradientBoostingClassifier :
- learning_rate=0.2
- n_estimators=250
- max_depth=3
- min_samples_split=3
- clean outliers on Age >= 90
- sample_weight on Exited