# Margo Lowell BANA 4080 Final Project (Fall 2025)
## Customer Churn Prediction at Regork Telecom
The critical challenge is customer churn. The goal of the project is to help identify customers that are likely to churn and to build models that predict churn probability. These insights should help Regork with retention tactics and ultimately save Regork from revenue loss.

In [79]:
#import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

## Part 1 : Exploratory Data Analysis
Understanding the data and identifying "signal" vs "noise."

In [80]:
# Load the raw training data and preview the first rows.
df = pd.read_csv(filepath_or_buffer="customer_retention_training.csv")
df.head()

Unnamed: 0,Status,Gender,SeniorCitizen,Partner,Dependents,Tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,...,NumInternationalCalls,NumAppCrashes,AppSessionLengthAvg,NumPaymentFailures,DataOverageCount,InternalCodeFlag,RandomIDHash,SurveyVersion,LegacySystemScore,PromoCategory
0,Current,Male,No,No,No,27,One year,Yes,Credit card (automatic),81.92,...,11,4,9.631306,2,1,B,X1,V2,Medium,
1,Current,Female,No,Yes,Yes,27,Month-to-month,Yes,Electronic check,102.1,...,18,0,11.521527,0,2,B,X3,V2,Medium,
2,Left,Female,No,No,No,72,Month-to-month,No,Mailed check,104.55,...,15,1,12.498201,1,1,C,X4,V2,Low,App
3,Current,Male,No,Yes,No,53,Month-to-month,Yes,Electronic check,91.3,...,6,4,11.63912,2,1,A,X3,V2,Medium,App
4,Current,Male,No,Yes,No,62,Month-to-month,Yes,Mailed check,84.06,...,11,2,7.133205,1,0,B,X4,V1,High,App


In [81]:
# Question 1: overall churn rate = share of rows whose Status is 'Left'
churn_rate = df['Status'].eq('Left').mean()
print("Overall Churn Rate:", round(churn_rate, 4))

Overall Churn Rate: 0.0992


In [82]:
# Question 2: average tenure for churned vs. staying customers
tenure_churn = df['Tenure'].groupby(df['Status']).mean()
print(f"Mean Tenure (by status):{tenure_churn}")

Mean Tenure (by status):Status
Current    36.564107
Left       34.349580
Name: Tenure, dtype: float64


In [83]:
# Question 3: average monthly charges within each contract type
monthly_charges_contract = df['MonthlyCharges'].groupby(df['Contract']).agg('mean')
print(f"Mean Monthly Charges (by contract):{monthly_charges_contract}")

Mean Monthly Charges (by contract):Contract
Month-to-month    81.520549
One year          77.260549
Two year          71.874439
Name: MonthlyCharges, dtype: float64


In [84]:
# Question 4: churn rate (share of 'Left') within each payment method
churn_rate_payment = df['Status'].eq('Left').groupby(df['PaymentMethod']).mean()
print(f"Churn Rate (by payment method):{churn_rate_payment}")

Churn Rate (by payment method):PaymentMethod
Bank transfer (automatic)    0.082584
Credit card (automatic)      0.085202
Electronic check             0.132057
Mailed check                 0.078032
Name: Status, dtype: float64


In [85]:
# Question 5: average app logins last month for churned vs. staying customers
app_usage = df['NumAppLoginsLastMonth'].groupby(df['Status']).mean()
print(f"Mean App Usage (by status):{app_usage}")

Mean App Usage (by status):Status
Current    24.607031
Left       23.897479
Name: NumAppLoginsLastMonth, dtype: float64


In [86]:
# Question 6: investigating noise -- churn rate within each RandomIDHash value
noise_investigation = df.groupby('RandomIDHash')['Status'].agg(
    lambda status: status.eq('Left').mean()
)
print(f"Churn Rate (by RandomIDHash):{noise_investigation}")

# There is no significant variation in these churn rates.

Churn Rate (by RandomIDHash):RandomIDHash
X1    0.093586
X2    0.106623
X3    0.102113
X4    0.094682
Name: Status, dtype: float64


In [87]:
# Question 7: dependents and churn -- churn rate with and without dependents
dependants_churn = df['Status'].eq('Left').groupby(df['Dependents']).mean()
print(f"Churn Rate by Dependents:{dependants_churn}")


Churn Rate by Dependents:Dependents
No     0.098462
Yes    0.100845
Name: Status, dtype: float64


In [88]:
# Question 8: average email open rate for churned vs. staying customers
email_engagement = df['EmailOpenRate'].groupby(df['Status']).agg('mean')
print(f"Mean Email Open Rate (by status):{email_engagement}")

Mean Email Open Rate (by status):Status
Current    0.201111
Left       0.196562
Name: EmailOpenRate, dtype: float64


In [89]:
# Question 9: churn rate (share of 'Left') within each contract type
churn_rate_contract = df['Status'].eq('Left').groupby(df['Contract']).mean()
print(f"Churn Rate by Contract:{churn_rate_contract}")

Churn Rate by Contract:Contract
Month-to-month    0.160243
One year          0.022103
Two year          0.028053
Name: Status, dtype: float64


In [90]:
# Question 10: Tenure Group Comparison

# Bucket tenure into three groups. Intervals are closed on the right, so the
# groups are [0, 12], (12, 24] and (24, inf).
# include_lowest=True keeps customers with Tenure == 0 in the first group;
# without it they fall outside every bin and become NaN.
# np.inf as the top edge keeps the bins increasing even if no customer has
# more than 24 months of tenure.
df['TenureGroup'] = pd.cut(
    df['Tenure'],
    bins=[0, 12, 24, np.inf],
    labels=['<12 months', '12-24 months', '>24 months'],
    right=True,
    include_lowest=True)

# Churn rate (share of 'Left') within each tenure group.
churn_rate_tenure = df.groupby('TenureGroup', observed=False)['Status'].apply(lambda x: (x == 'Left').mean())
print(f"Churn Rate by Tenure Group:{churn_rate_tenure}")

Churn Rate by Tenure Group:TenureGroup
<12 months      0.105368
12-24 months    0.106426
>24 months      0.095798
Name: Status, dtype: float64


In [91]:
# Question 11: Top Categorical Predictors
# Score each categorical column by the largest absolute gap between a
# category's share among churned customers and its share among current ones.

cat_cols = df.select_dtypes(include=['object', 'category']).columns
cat_result = []

# Split once instead of re-filtering df for every column.
left_rows = df[df['Status'] == 'Left']
current_rows = df[df['Status'] == 'Current']

for col in cat_cols:
    if col == 'Status':
        continue  # the target itself is not a predictor
    churned = left_rows[col].value_counts(normalize=True)
    stayed = current_rows[col].value_counts(normalize=True)
    # fill_value=0: a category seen in only one group has share 0 in the other,
    # so its whole share counts as the gap. A plain subtraction yields NaN
    # there, and filling that NaN with 0 would hide the difference.
    diff_series = churned.sub(stayed, fill_value=0).abs()
    cat_result.append((col, diff_series.max()))

cat_ranked = sorted(cat_result, key=lambda x: x[1], reverse=True)
print(f"Top Categorical Predictors: {cat_ranked}")

Top Categorical Predictors: [('Contract', 0.3754615629785679), ('InternetService', 0.30816937320719223), ('TechSupport', 0.13770940383553987), ('OnlineBackup', 0.1252932625409091), ('PaymentMethod', 0.12499630749617147), ('OnlineSecurity', 0.11680128110448618), ('DeviceProtection', 0.11441164810049828), ('StreamingTV', 0.11441164810049828), ('StreamingMovies', 0.11441164810049828), ('SeniorCitizen', 0.06598931894682017), ('PromoCategory', 0.05212111782114992), ('SurveyVersion', 0.05079486003467065), ('InternalCodeFlag', 0.04182090967746954), ('Partner', 0.031475680003731354), ('LegacySystemScore', 0.0270555585786581), ('TenureGroup', 0.0251276828955449), ('RandomIDHash', 0.02100451651520921), ('MultipleLines', 0.017465931793624045), ('PhoneService', 0.010631301549296868), ('PaperlessBilling', 0.008603922605119774), ('Dependents', 0.005558189973491778), ('Gender', 0.002767434448339934)]


In [92]:
# Question 12: Top Numeric Predictors
# Score each numeric column by the relative gap between its mean among churned
# customers and its mean among current customers.

num_cols = df.select_dtypes(include=['int64', 'float64']).columns
num_result = []

# Compute both groups' column means once instead of re-filtering df per column.
left_means = df.loc[df['Status'] == 'Left', num_cols].mean()
current_means = df.loc[df['Status'] == 'Current', num_cols].mean()

for col in num_cols:
    churn = left_means[col]
    stayed = current_means[col]
    gap = abs(churn - stayed)
    # Divide by the magnitude of the baseline so a negative mean cannot flip
    # the sign of the score; fall back to the absolute gap if the baseline is 0.
    percent_diff = gap if stayed == 0 else gap / abs(stayed)
    # Plain float keeps the printed list free of np.float64(...) wrappers.
    num_result.append((col, float(percent_diff)))

num_ranked = sorted(num_result, key=lambda x: x[1], reverse=True)
print(f"Top Numeric Predictors: {num_ranked}")

Top Numeric Predictors: [('MonthlyCharges', np.float64(0.15566231383601184)), ('TotalCharges', np.float64(0.09687013656884823)), ('Tenure', np.float64(0.0605656103527486)), ('DataOverageCount', np.float64(0.03537332245559726)), ('ReferralCount', np.float64(0.03324175824175826)), ('NumInternationalCalls', np.float64(0.03286305157305632)), ('NumAppLoginsLastMonth', np.float64(0.02883531740678761)), ('NumPaymentFailures', np.float64(0.02699256834016998)), ('EmailOpenRate', np.float64(0.022618993425641314)), ('NumPageViews', np.float64(0.016052160323755758)), ('AvgCallDuration', np.float64(0.015156871036710838)), ('AppSessionLengthAvg', np.float64(0.005225276266089869)), ('NumAppCrashes', np.float64(0.0043851789714861224))]


In [93]:
# Question 13: data quality check -- columns with at least one missing value
df.isna().sum().loc[lambda counts: counts > 0]

Unnamed: 0,0
PromoCategory,1469


# Part 1 Conceptual and Essay Questions

Question 14: Identifying noisy features is important because they introduce randomness into the data. Noisy features can create patterns that are not meaningful and lead to inaccurate models. When they are left in the dataset they add unnecessary variance and lead to overfitting.



Question 15: From the EDA, a feature that is Signal is Contract. Churn Rate by Contract shows a much higher churn rate for Month-to-Month contracts than for One-Year or Two-Year contracts. A really clear Noise feature from the EDA is RandomIDHash because when computing the churn rate by RandomIDHash, the groups showed little difference and no patterns. I believe this is because the column itself has no meaningful relation to churn rates.

Question 16:
The two features with the strongest relationship to churn were the type of Contract and the Tenure Groups. For Contract, the month-to-month churn rate was far higher than for the other two contract types. The month-to-month customers have a churn rate of 16.02%. This is much higher than the 2.21% for one-year customers and the 2.81% for two-year customers. Month-to-month customers also had the highest average monthly charges at $81.52. The business context is that the customers with the lowest commitment and the highest bills are the most likely to leave. Tenure showed a similar pattern to Contract. Tenure is related to churn, as the churn rate by tenure group is lower for customers who have been with Regork for more months. Customers with less than 12 months had a churn rate of 10.54%, while customers with more than 24 months had 9.58%. Customers with fewer months had higher churn rates than those with more months. In conclusion, newer customers with shorter contracts are more likely to churn. A recommendation is to target these groups with strategies to increase retention. This could be incentives to switch plans or discounts for longer commitments.

Question 17: One limitation is that the data set can't tell us the reason why month-to-month customers have higher churn rates. The data can't prove that the contract type is the reason for higher churn rates. Additionally, we aren't able to see what the competitors are offering or other outside factors. So ultimately it's hard to see what is actually causing the customers to leave. Another limitation is that some of the features overlap. Contract type and tenure are automatically related because if a customer has a longer contract, they will spend more time at Regork. This makes it difficult to determine whether the features are independent of each other. A next step would be to build a predictive model, such as Logistic Regression or a Random Forest, to measure feature importance and coefficients. You could also split the data into training and test sets to see if the patterns shown above generalize.

## Part 2: Structured Predictive Modeling
Reproducible machine learning models following industry best practices.

In [111]:
# Step 1
# Question 18: use a binary target variable (Current -> 0, Left -> 1) for training and reliability
df = pd.read_csv('customer_retention_training.csv').assign(
    Status=lambda frame: frame['Status'].map({'Current': 0, 'Left': 1})
)

# Features are every column except the target.
y = df['Status']
x = df.drop(columns=['Status'])

print(f"Shape of x: {x.shape}")
print(f"Shape of y: {y.shape}")
display(df.head())

Shape of x: (6000, 34)
Shape of y: (6000,)


Unnamed: 0,Status,Gender,SeniorCitizen,Partner,Dependents,Tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,...,NumInternationalCalls,NumAppCrashes,AppSessionLengthAvg,NumPaymentFailures,DataOverageCount,InternalCodeFlag,RandomIDHash,SurveyVersion,LegacySystemScore,PromoCategory
0,0,Male,No,No,No,27,One year,Yes,Credit card (automatic),81.92,...,11,4,9.631306,2,1,B,X1,V2,Medium,
1,0,Female,No,Yes,Yes,27,Month-to-month,Yes,Electronic check,102.1,...,18,0,11.521527,0,2,B,X3,V2,Medium,
2,1,Female,No,No,No,72,Month-to-month,No,Mailed check,104.55,...,15,1,12.498201,1,1,C,X4,V2,Low,App
3,0,Male,No,Yes,No,53,Month-to-month,Yes,Electronic check,91.3,...,6,4,11.63912,2,1,A,X3,V2,Medium,App
4,0,Male,No,Yes,No,62,Month-to-month,Yes,Mailed check,84.06,...,11,2,7.133205,1,0,B,X4,V1,High,App


In [112]:
# Step 2: 70/30 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")

# Question 19: training set 4200
# Question 20: test set 1800
# Question 21: keep the same churn vs stay ratio, needed to repeat

X_train shape: (4200, 34)
X_test shape: (1800, 34)
y_train shape: (4200,)
y_test shape: (1800,)


In [113]:
# Step 3: split the feature columns by dtype and build one preprocessing branch for each
numeric_cols = x.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = x.select_dtypes(include=['object', 'category']).columns
for kind, cols in (("Numeric", numeric_cols), ("Categorical", categorical_cols)):
    print(f"{kind} Columns: {cols}")

# Numeric branch: mean-impute, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: impute the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on unseen categories.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

preprocessor
# Question 22: Preprocessing ensures that all steps are consistent, preventing data leakage. It also reduces mistakes.


Numeric Columns: Index(['Tenure', 'MonthlyCharges', 'TotalCharges', 'NumAppLoginsLastMonth',
       'EmailOpenRate', 'ReferralCount', 'NumPageViews', 'AvgCallDuration',
       'NumInternationalCalls', 'NumAppCrashes', 'AppSessionLengthAvg',
       'NumPaymentFailures', 'DataOverageCount'],
      dtype='object')
Categorical Columns: Index(['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'InternalCodeFlag',
       'RandomIDHash', 'SurveyVersion', 'LegacySystemScore', 'PromoCategory'],
      dtype='object')


In [114]:
# Step 4: five shuffled, stratified folds with a fixed seed
cross_val = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)
cross_val

# Question 23: We use cross-validation instead of a single train/test split because cross-validation tests many folds. This can help get a more reliable estimate of a model's performance.


StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [115]:
# Step 5: tune three candidate models with the same preprocessing, folds and metric.
# NOTE(review): only about 10% of customers churn (see the Question 1 output), and
# none of these estimators is told about the imbalance. Adding
# class_weight='balanced' to the grids may be worth trying -- confirm it is
# allowed by the assignment before changing the models.


def _tune_model(step_name, estimator, param_grid):
    """Grid-search one estimator behind the shared preprocessor.

    Parameters
    ----------
    step_name : str
        Name of the estimator step in the pipeline; the keys of ``param_grid``
        must be prefixed with ``f"{step_name}__"``.
    estimator : sklearn estimator
        Unfitted classifier placed after the preprocessor.
    param_grid : dict
        Hyperparameter grid passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(pipeline, search)``: the unfitted pipeline and the ``GridSearchCV``
        fitted on ``X_train`` / ``y_train`` with F1 scoring.
    """
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (step_name, estimator),
    ])
    # n_jobs=-1 runs the folds in parallel (as the Part 3 search does);
    # it does not change the scores.
    search = GridSearchCV(pipeline, param_grid, cv=cross_val, scoring='f1', n_jobs=-1)
    search.fit(X_train, y_train)
    return pipeline, search


# Best cross-validated F1 per model name.
models = {}

# Logistic Regression
param_grid_lr = {
    'log_reg__C': [0.01, 0.1, 1, 10],
    'log_reg__penalty': ['l2'],
    'log_reg__solver': ['lbfgs']
}
pipeline_lr, grid_lr = _tune_model('log_reg', LogisticRegression(random_state=42), param_grid_lr)
models['Logistic Regression'] = grid_lr.best_score_
print(f"Logistic Regression Best F1: {grid_lr.best_score_:.4f}")

# Decision Tree
param_grid_dt = {
    'decision_tree__max_depth': [5, 10],
    'decision_tree__min_samples_split': [2, 10],
    'decision_tree__min_samples_leaf': [1, 5]
}
pipeline_dt, grid_dt = _tune_model('decision_tree', DecisionTreeClassifier(random_state=42), param_grid_dt)
models['Decision Tree'] = grid_dt.best_score_
print(f"Decision Tree Best F1: {grid_dt.best_score_:.4f}")

# Random Forest
param_grid_rf = {
    'random_forest__n_estimators': [200, 500],
    'random_forest__max_depth': [5, 10, 20],
    'random_forest__min_samples_split': [2, 5]
}
pipeline_rf, grid_rf = _tune_model('random_forest', RandomForestClassifier(random_state=42), param_grid_rf)
models['Random Forest'] = grid_rf.best_score_
print(f"Random Forest Best F1: {grid_rf.best_score_:.4f}")


Logistic Regression Best F1: 0.1690
Decision Tree Best F1: 0.2003
Random Forest Best F1: 0.0141


Step 6 and Question 27: The Decision Tree achieved the best cross-validated F1 score (0.2003), but overall performance remained low across all models.





In [116]:
# Step 7: permutation importance of the tuned Decision Tree on the test set
best_dt_model = grid_dt.best_estimator_

# Pull the two fitted pipeline steps apart so importance is measured on the
# transformed (scaled / one-hot encoded) columns.
preprocessor_fitted = best_dt_model['preprocessor']
final_estimator = best_dt_model['decision_tree']

feature_names = preprocessor_fitted.get_feature_names_out()
X_test_transformed = preprocessor_fitted.transform(X_test)

perm_importance = permutation_importance(
    estimator=final_estimator,
    X=X_test_transformed,
    y=y_test,
    scoring='f1',
    n_repeats=10,
    random_state=42,
)

# One row per transformed feature, most important first.
results = pd.DataFrame({
    'feature': feature_names,
    'importance_mean': perm_importance.importances_mean,
    'importance_std': perm_importance.importances_std,
}).sort_values(by='importance_mean', ascending=False)
print(results.head(15))

# Question 28: Top 5 important features are cat__InternetService_Fiber optic, cat__Contract_Month-to-month, num__NumPaymentFailures, cat__PaymentMethod_Electronic check, cat__RandomIDHash_X1.


                                       feature  importance_mean  \
36            cat__InternetService_Fiber optic         0.045628   
21                cat__Contract_Month-to-month         0.041042   
11                     num__NumPaymentFailures         0.016383   
28         cat__PaymentMethod_Electronic check         0.013490   
59                        cat__RandomIDHash_X1         0.006845   
15                       cat__SeniorCitizen_No         0.003453   
0                                  num__Tenure         0.002209   
69                      cat__PromoCategory_App         0.002190   
27  cat__PaymentMethod_Credit card (automatic)         0.001837   
58                     cat__InternalCodeFlag_C         0.001794   
67                  cat__LegacySystemScore_Low         0.000678   
6                            num__NumPageViews         0.000079   
62                        cat__RandomIDHash_X4         0.000000   
1                          num__MonthlyCharges         0.00000

In [117]:
# Step 8: score each tuned model on the held-out test set
best_lr_model = grid_lr.best_estimator_
best_dt_model = grid_dt.best_estimator_
best_rf_model = grid_rf.best_estimator_

tuned_models = {
    "Logistic Regression": best_lr_model,
    "Decision Tree": best_dt_model,
    "Random Forest": best_rf_model,
}

# Metric name -> scorer; zero_division=0 scores a model that predicts no
# churners as 0 instead of emitting a warning.
metric_fns = {
    "Accuracy": lambda truth, pred: accuracy_score(truth, pred),
    "Precision": lambda truth, pred: precision_score(truth, pred, zero_division=0),
    "Recall": lambda truth, pred: recall_score(truth, pred, zero_division=0),
    "F1 Score": lambda truth, pred: f1_score(truth, pred, zero_division=0),
}

models_test_results = {}
for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    models_test_results[name] = {
        metric: score(y_test, y_pred) for metric, score in metric_fns.items()
    }

# Rows = models, columns = metrics.
results_df = pd.DataFrame(models_test_results).T
print(results_df.round(4))

# Question 29: 0.8694

                     Accuracy  Precision  Recall  F1 Score
Logistic Regression    0.9006     0.4667  0.0393    0.0725
Decision Tree          0.8694     0.1724  0.0843    0.1132
Random Forest          0.9011     0.5000  0.0056    0.0111


# Part 2 Conceptual Questions and Essay

In [118]:
# Compare the Decision Tree's cross-validated F1 with its test-set F1.
test_f1 = results_df.at['Decision Tree', 'F1 Score']
print(f"CV F1 Score (Decision Tree): {grid_dt.best_score_:.4f}")
print(f"Test F1 Score (Decision Tree): {test_f1:.4f}")

CV F1 Score (Decision Tree): 0.2003
Test F1 Score (Decision Tree): 0.1132


Step 9: Interpretation

*   Question 33: The Decision Tree’s F1 score during cross-validation was 0.2003, but the F1 score on the test set dropped to a much lower score of 0.1132. The difference of 0.0871 suggests overfitting: the model didn't perform well on the new data because it was fit too specifically to the training data.
*   Question 34: Generalization deals with a model's ability to accurately predict, in this case, customers that it hasn't seen before. Generalization is being able to predict future customers, not just past customers. With the low F1 scores, this model will not do well in predicting the churn of future customers. One concern I have for the future performance is that the data is imbalanced. Most of the customers stay, so customers who churn are rarer and it is harder for the model to learn from them.


Question 35: The top two features of the Permutation Importance are cat__InternetService_Fiber optic and cat__Contract_Month-to-month. The first feature, cat__InternetService_Fiber optic, means that the customer is subscribed to fiber optic internet. The second feature, cat__Contract_Month-to-month, means that the customer has the shorter, month-to-month contract. This means that customers with fiber optic internet and the shorter contract were more likely to churn. Regork should focus its retention efforts on these two groups, specifically the short-term contract holders with fiber optic internet. Discounts or rewards for switching to a one-year or two-year contract could help with retention. For the fiber optic internet, Regork should focus on lowering the cost or increasing the service quality for customers, as this is a more premium internet experience.


Question 36: I recommend the Decision Tree model to Regork. Even though its test accuracy of about 87% is slightly lower than the Logistic Regression and Random Forest models, Recall and F1 Score are more important for churn prediction because the business cares most about identifying customers who are likely to leave. The Decision Tree achieved the highest Recall with 0.0843 and the highest F1 Score of 0.1132. Ultimately the Decision Tree is detecting more churners than the others. The Random Forest model was the most stable across cross-validation and testing. This means there is less risk of overfitting. However, the Recall of the Random Forest was extremely low, with a score of 0.0056. So even though the Random Forest is stable, it isn't good at predicting which customers are likely to leave. The ultimate goal of Regork is to reduce the number of churned customers, so it is more important to be able to identify customers who are likely to leave.

## Part 3: Open-Ended Modeling Competition

Building the best churn model.

I'll be using XGBoost because it is well suited to this type of data and can account for the class imbalance (through `scale_pos_weight`). It often achieves higher ROC-AUC performance than Logistic Regression or a Decision Tree.

In [119]:
# Part 3: tune an XGBoost pipeline on the full training file, then write churn
# probabilities for the unlabeled test file to predictions.csv.

# Data
train_df = pd.read_csv("customer_retention_training.csv")
test_df = pd.read_csv("test_features.csv")

# Drop the columns treated as noise from both files.
noise_cols = ["InternalCodeFlag", "RandomIDHash"]
train_df = train_df.drop(columns=noise_cols)
test_df = test_df.drop(columns=noise_cols)

# Encode target (Current -> 0, Left -> 1)
train_df['Status'] = train_df['Status'].map({'Current': 0, 'Left': 1})
X = train_df.drop("Status", axis=1)
y = train_df["Status"]

print("Train shape:", X.shape)
print("Test shape:", test_df.shape)

# Fail before the expensive search, not after it, if the test file lacks a
# feature the pipeline is about to be trained on.
missing_in_test = X.columns.difference(test_df.columns)
if len(missing_in_test) > 0:
    raise ValueError(f"test_features.csv is missing columns: {list(missing_in_test)}")

# Imbalance: ratio of negatives to positives, used as XGBoost's scale_pos_weight.
neg = int((y == 0).sum())
pos = int((y == 1).sum())
scale = neg / pos
print("Class imbalance ratio:", scale)

# Preprocessing
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# XGBoost model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        eval_metric="logloss",
        random_state=42,
        tree_method="hist",
        scale_pos_weight=scale
    ))
])

# Hyperparameter tuning: 12 random draws from this grid, scored by ROC-AUC.
param_dist = {
    "model__n_estimators": [450, 550, 650],
    "model__max_depth": [3, 4],
    "model__learning_rate": [0.03, 0.05],
    "model__subsample": [0.7, 0.9],
    "model__colsample_bytree": [0.7, 0.9],
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=12,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

search.fit(X, y)

print("Best Params:", search.best_params_)
print("Best CV ROC-AUC:", search.best_score_)

# With the default refit=True the search has already refit the best pipeline on
# all of X, y, so best_estimator_ is the final model; a second .fit would only
# repeat that work.
best_model = search.best_estimator_
final_model = best_model

# Predict test probabilities (column 1 = probability of class 1, 'Left').
test_pred_probs = final_model.predict_proba(test_df)[:, 1]

# Create submission CSV
# NOTE(review): ids are generated as 1..n in row order -- confirm this matches
# the id convention the competition expects for test_features.csv.
submission = pd.DataFrame({
    "id": range(1, len(test_pred_probs) + 1),
    "prediction": test_pred_probs
})

submission.to_csv("predictions.csv", index=False)

print("Saved", submission.shape)

# Force file download in Colab. Outside Colab the module does not exist, so
# skip the download instead of failing after the model has been trained.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("predictions.csv")


Train shape: (6000, 32)
Test shape: (1000, 32)
Class imbalance ratio: 9.084033613445378
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params: {'model__subsample': 0.9, 'model__n_estimators': 450, 'model__max_depth': 3, 'model__learning_rate': 0.03, 'model__colsample_bytree': 0.9}
Best CV ROC-AUC: 0.7810181982136056
Saved (1000, 2)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>