In [1]:
%store -r df

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

df_copy = df.copy()

df['Age_NumOfProducts'] = df['Age'] * df['NumOfProducts']  
df['Age_IsActiveMemeber'] = df['Age'] * df['IsActiveMember']
df['NumOfProducts_IsActive'] = df['NumOfProducts'] * df['IsActiveMember']

# Preparing data for training
X = df.drop(columns=['Exited', 'CustomerId', 'Surname'])  # Features
X1 = X.copy()
y = df['Exited']  # Target

In [53]:
# Spliting the data into train and test sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.2, random_state=99, stratify=y)

In [57]:
# Building the model
model = XGBClassifier(random_state=99, n_estimators=200, learning_rate=0.05)  # XGBoost classifier

# Training the model
model.fit(X1_train, y1_train)

# Predictions
y1_pred = model.predict(X1_test)

# Evaluating the model
print(classification_report(y1_test, y1_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.78      0.49      0.60       407

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



----
#### Precision for churners have a big improve
----

In [59]:
X2 = X.copy()

In [63]:
X2 = X2.drop(columns=['CreditScore','HasCrCard','EstimatedSalary','HasZeroBalance'])

In [65]:
# Spliting the data into train and test sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.2, random_state=99, stratify=y)
# Building the model
model = XGBClassifier(random_state=99, n_estimators=200, learning_rate=0.05)  # XGBoost classifier

# Training the model
model.fit(X2_train, y2_train)

# Predictions
y2_pred = model.predict(X2_test)

# Evaluating the model
print(classification_report(y2_test, y2_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.77      0.50      0.61       407

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



----
#### Eliminating columns CreditScore, HasCrCard, EstimatedSalary, and HasZeroBalance didn't improve the model
----

In [151]:
X3 = X1.drop(columns='Age_IsActiveMemeber')

# Spliting the data into train and test sets
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.2, random_state=99, stratify=y)
# Building the model
model = XGBClassifier(random_state=99, n_estimators=200, learning_rate=0.05)  # XGBoost classifier

# Training the model
model.fit(X3_train, y3_train)

# Predictions
y3_pred = model.predict(X3_test)

# Evaluating the model
print(classification_report(y3_test, y3_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.80      0.49      0.61       407

    accuracy                           0.87      2000
   macro avg       0.84      0.73      0.77      2000
weighted avg       0.87      0.87      0.86      2000



----
#### Eliminating new feature Age_IsActiveMember improved precision by 3%
----

In [135]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameters to search
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [100, 200, 300, 400],
    'subsample': [0.7, 0.8, 1.0]
}

# Create the XGBClassifier
xgb_model = XGBClassifier(random_state=99)

# Set up RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, cv=5, verbose=1, random_state=99)

# Fit the model
random_search.fit(X3_train, y3_train)

# Get the best parameters
best_params_random = random_search.best_params_
print("Best parameters found by RandomizedSearchCV: ", best_params_random)

# Retrain the model with the best parameters
best_xgb_random = random_search.best_estimator_

# Evaluate the model
y_pred_random = best_xgb_random.predict(X3_test)
print(classification_report(y3_test, y3_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found by RandomizedSearchCV:  {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05}
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.80      0.49      0.61       407

    accuracy                           0.87      2000
   macro avg       0.84      0.73      0.77      2000
weighted avg       0.87      0.87      0.86      2000



In [169]:
X4 = X3.copy()
X4['Tenure_Age'] = X4['Tenure'] * X4['Age']

# Spliting the data into train and test sets
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y, test_size=0.2, random_state=99, stratify=y)
# Building the model
model = XGBClassifier(random_state=99, n_estimators=300, learning_rate=0.05)  # XGBoost classifier

# Training the model
model.fit(X4_train, y4_train)

# Predictions
y4_pred = model.predict(X4_test)

# Evaluating the model
print(classification_report(y4_test, y4_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.81      0.49      0.61       407

    accuracy                           0.87      2000
   macro avg       0.84      0.73      0.77      2000
weighted avg       0.87      0.87      0.86      2000



----
#### Using the best parameters yields the same result. Adding the new feature Tenure_Age improved the precision of churners by 1%. The model has an accuracy of 87%, with both churners and non-churners being at least 80% precise. However, churners' recall remains low and needs improvement.
----