<a href="https://colab.research.google.com/github/michaelhenr/michael/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for clean output


In [3]:
# Read the three competition CSVs (training rows, test rows, and the
# sample submission layout) from the Colab working directory.
train_data, test_data, submission_template = (
    pd.read_csv('/content/train (1).csv'),
    pd.read_csv('/content/test (1).csv'),
    pd.read_csv('/content/sample_submission (1).csv'),
)

# Preview the first five rows of each frame as a quick sanity check
# on the columns that were loaded.
print("Training Data Overview:")
display(train_data.head(5))

print("\nTest Data Overview:")
display(test_data.head(5))

Training Data Overview:


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15585961.0,Hs?,684.0,France,Male,41.0,10.0,0.0,2.0,1.0,1.0,173948.4,1.0
1,1,15643378.0,Bellucci,807.0,France,Male,32.0,2.0,0.0,2.0,1.0,0.0,144532.85,0.0
2,2,15651022.0,O'Donnell,553.0,Germany,Male,53.0,9.0,102278.52,1.0,1.0,0.0,158816.03,1.0
3,3,15676521.0,Chiang,587.0,France,Female,34.0,6.0,0.0,1.0,1.0,0.0,167984.72,1.0
4,4,15772650.0,Kambinachi,732.0,Germany,Female,30.0,5.0,135070.92,1.0,1.0,1.0,116097.26,0.0



Test Data Overview:


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15000,15627859.0,Onyekachukwu,759.0,Spain,Male,37.0,5.0,0.0,1.0,1.0,0.0,56875.76
1,15001,15791131.0,Bazarova,645.0,Germany,Male,28.0,5.0,121404.64,2.0,0.0,0.0,27359.19
2,15002,15682548.0,Nwankwo,741.0,France,Male,32.0,7.0,138120.07,1.0,1.0,1.0,147931.32
3,15003,15681705.0,Onio,765.0,Spain,Female,45.0,6.0,0.0,1.0,1.0,1.0,161465.31
4,15004,15754574.0,Y?,719.0,France,Male,39.0,2.0,88826.07,2.0,1.0,1.0,183542.08


In [4]:
# Drop identifier-like columns that are not used as model inputs
train_data_cleaned = train_data.drop(columns=['CustomerId', 'Surname'])
test_data_cleaned = test_data.drop(columns=['CustomerId', 'Surname'])

# Separate features and target
X_train = train_data_cleaned.drop(columns=['Exited'])
y_train = train_data_cleaned['Exited']

# Identify categorical and numerical columns.
# Any column not listed here (e.g. 'Unnamed: 0', 'id') is discarded by the
# ColumnTransformer below, whose default is remainder='drop'.
categorical_features = ['Geography', 'Gender']
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance',
                      'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Split the RAW rows into train and validation sets *before* fitting any
# preprocessing. Fitting the scaler/encoder on the full training frame would
# let validation rows influence the learned means/variances/categories
# (data leakage) and make the validation metrics optimistic.
# The split settings (test_size, random_state, stratify) are unchanged.
X_train_raw_split, X_val_raw, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Scale numerical features
        ('cat', OneHotEncoder(drop='first'), categorical_features)  # One-hot encode categorical features
    ])

# Fit on the training portion only, then apply the fitted transform everywhere else
X_train_split = preprocessor.fit_transform(X_train_raw_split)
X_val = preprocessor.transform(X_val_raw)
X_test_processed = preprocessor.transform(test_data_cleaned)

# Kept for backward compatibility with any cell that expects the fully
# transformed training matrix; it uses statistics learned from the training
# portion only.
X_train_processed = preprocessor.transform(X_train)

print("Preprocessing complete. Shapes:")
print(f"Training data: {X_train_split.shape}, Validation data: {X_val.shape}")


Preprocessing complete. Shapes:
Training data: (12000, 11), Validation data: (3000, 11)


In [5]:
# Linear baseline: fit() returns the estimator itself, so construction and
# training are chained into a single statement.
log_reg = LogisticRegression(max_iter=500, random_state=42).fit(X_train_split, y_train_split)

# Hard class labels feed the F1 score; the probability column at index 1
# (the second class in the estimator's class ordering) feeds ROC AUC.
y_val_pred_log = log_reg.predict(X_val)
f1_log = f1_score(y_true=y_val, y_pred=y_val_pred_log)
roc_auc_log = roc_auc_score(y_true=y_val, y_score=log_reg.predict_proba(X_val)[:, 1])

# Report both validation metrics, one per line.
print(
    "Logistic Regression Results:",
    f"ROC AUC: {roc_auc_log:.4f}",
    f"F1 Score: {f1_log:.4f}",
    sep="\n",
)


Logistic Regression Results:
ROC AUC: 0.8783
F1 Score: 0.6321


In [6]:
# Single depth-limited tree (max depth 10, at least 20 samples to split a node);
# fit() returns the estimator, so it is trained in the same statement.
decision_tree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=20,
    random_state=42,
).fit(X_train_split, y_train_split)

# Hard class labels feed the F1 score; the probability column at index 1
# (the second class in the estimator's class ordering) feeds ROC AUC.
y_val_pred_tree = decision_tree.predict(X_val)
f1_tree = f1_score(y_true=y_val, y_pred=y_val_pred_tree)
roc_auc_tree = roc_auc_score(y_true=y_val, y_score=decision_tree.predict_proba(X_val)[:, 1])

# Report both validation metrics, one per line.
print(
    "Decision Tree Results:",
    f"ROC AUC: {roc_auc_tree:.4f}",
    f"F1 Score: {f1_tree:.4f}",
    sep="\n",
)


Decision Tree Results:
ROC AUC: 0.8872
F1 Score: 0.6900


In [7]:
# Ensemble of 100 depth-limited trees (max depth 10); fit() returns the
# estimator, so it is trained in the same statement.
random_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train_split, y_train_split)

# Hard class labels feed the F1 score; the probability column at index 1
# (the second class in the estimator's class ordering) feeds ROC AUC.
y_val_pred_rf = random_forest.predict(X_val)
f1_rf = f1_score(y_true=y_val, y_pred=y_val_pred_rf)
roc_auc_rf = roc_auc_score(y_true=y_val, y_score=random_forest.predict_proba(X_val)[:, 1])

# Report both validation metrics, one per line.
print(
    "Random Forest Results:",
    f"ROC AUC: {roc_auc_rf:.4f}",
    f"F1 Score: {f1_rf:.4f}",
    sep="\n",
)


Random Forest Results:
ROC AUC: 0.9265
F1 Score: 0.7140


In [8]:
# Side-by-side summary of the validation metrics computed in the three
# model cells; a single print call emits one line per argument.
print(
    "Model Comparison:",
    f"Logistic Regression - ROC AUC: {roc_auc_log:.4f}, F1 Score: {f1_log:.4f}",
    f"Decision Tree       - ROC AUC: {roc_auc_tree:.4f}, F1 Score: {f1_tree:.4f}",
    f"Random Forest       - ROC AUC: {roc_auc_rf:.4f}, F1 Score: {f1_rf:.4f}",
    sep="\n",
)


Model Comparison:
Logistic Regression - ROC AUC: 0.8783, F1 Score: 0.6321
Decision Tree       - ROC AUC: 0.8872, F1 Score: 0.6900
Random Forest       - ROC AUC: 0.9265, F1 Score: 0.7140


In [9]:
# Score the preprocessed test rows with the random forest and keep the
# probability column at index 1 as the submitted value.
test_predictions = random_forest.predict_proba(X_test_processed)[:, 1]

# Write the scores into the template's 'Exited' column (this modifies the
# loaded template in place) and save it without the DataFrame index.
# NOTE(review): assumes the template rows are in the same order as the test
# rows — confirm against the 'id' column.
submission_template['Exited'] = test_predictions
submission_template.to_csv(path_or_buf='submission.csv', index=False)
print("Submission file created: submission.csv")


Submission file created: submission.csv
