In [48]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, roc_curve, auc, f1_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from joblib import dump, load

# Load train and test datasets
train_data = pd.read_csv('../training/datasets/train_data.csv')
test_data = pd.read_csv('../training/datasets/test_data.csv')

columns = ['tenure', 'TotalCharges', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'Contract', 'Churn']
train_data = train_data[columns]
test_data = test_data[columns]

# Separate features and target variable
X_train = train_data.drop('Churn', axis=1)
y_train = train_data['Churn']
y_train = [1 if label.lower() == 'yes' else 0 for label in y_train]
X_test = test_data.drop('Churn', axis=1)
y_test = test_data['Churn']
y_test = [1 if label.lower() == 'yes' else 0 for label in y_test]

X_train_new = pd.DataFrame(X_train).copy(deep=True)
y_train_new = pd.DataFrame(y_train).copy(deep=True)

# Identify categorical and numerical features
categorical_features = train_data.select_dtypes(include=['object']).columns.tolist()
categorical_features.remove('Churn')
numerical_features = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_transformer = Pipeline(
    steps=[("num_imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)


categorical_transformer = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ("encoder", OrdinalEncoder())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Build pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier())   # Classifier
])

# Define hyperparameters for tuning
param_grid = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best F1 score on validation set:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Calculate ROC curve
y_prob = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Find the best threshold
best_threshold_index = np.argmax(tpr - fpr)
best_threshold = thresholds[best_threshold_index]
print("Best Threshold (Maximizing TPR - FPR):", best_threshold)

# Save the pipeline
dump(best_model, 'best_model_pipeline.joblib')

# Load the saved pipeline
loaded_model = load('best_model_pipeline.joblib')

# Predict on test set using the loaded pipeline
y_pred_test = loaded_model.predict(X_test)

# Print classification report for the test set
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred_test))

# Calculate F1 score with the best threshold
y_pred_threshold = (y_prob >= best_threshold).astype(int)
print("F1 Score with Best Threshold:", f1_score(y_test, y_pred_threshold))

Best parameters found: {'clf__max_depth': 10, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}
Best F1 score on validation set: 0.5450926669547639
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1036
           1       0.65      0.51      0.57       373

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Best Threshold (Maximizing TPR - FPR): 0.2578142203205028
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1036
           1       0.65      0.51      0.57       373

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

F1 Score with Best Threshold: 0.6233502538071066
<class 'pandas.core.fra

In [56]:
X_sample_test = pd.DataFrame.from_dict([{'tenure': 41, 'TotalCharges': 996.45, 'OnlineSecurity': 'No internet service', 'OnlineBackup': 'No internet service', 'TechSupport': 'No internet service', 'Contract': 'Month-to-month'}])
y_pred_test = loaded_model.predict(X_sample_test)

In [58]:
y_pred_test

array([0])

In [59]:
y_pred_test = loaded_model.predict_proba(X_sample_test)

In [60]:
y_pred_test

array([[0.92901212, 0.07098788]])