In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score 
import pandas as pd 
import joblib
import mlflow

In [2]:
def process(file, target='churn', categorical_cols=('country', 'gender'),
            test_size=0.2, random_state=42):
    """Load a CSV, one-hot encode categorical columns and split it into train/test sets.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    target : str, default 'churn'
        Name of the label column. It is removed from the features and
        returned separately as ``y``.
    categorical_cols : sequence of str, default ('country', 'gender')
        Columns expanded into indicator columns by ``pd.get_dummies``.
    test_size : float, default 0.2
        Fraction of rows held out for the test split.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target`` or any of ``categorical_cols`` is not a column of the CSV.
    """
    df = pd.read_csv(file)
    df = df.drop_duplicates()

    # Fail early with a message that names the missing column(s) instead of
    # relying on whatever error pandas raises further down.
    missing = [col for col in (*categorical_cols, target) if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {file!r}: {missing}")

    df = pd.get_dummies(df, columns=list(categorical_cols))

    # NOTE(review): every remaining column becomes a feature, including any
    # identifier-like column the CSV may contain - confirm that is intended.
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print the shapes of the arrays
    print("Shape of X_train:", X_train.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of X_test:", X_test.shape)
    print("Shape of y_test:", y_test.shape)

    return X_train, X_test, y_train, y_test

In [3]:
# Absolute, machine-specific location of the churn CSV.
DATA_FILE = r'C:\Users\mussie\Music\final pro\Bank Customer Churn Prediction.csv'

# Build the train/test partitions that the modelling cells below rely on.
X_train, X_test, y_train, y_test = process(DATA_FILE)

Shape of X_train: (8000, 14)
Shape of y_train: (8000,)
Shape of X_test: (2000, 14)
Shape of y_test: (2000,)


In [43]:
# All runs started below are grouped under one MLflow experiment.
# (A SQLite tracking store can be enabled by uncommenting the next line.)
#mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('bank-experiment')

def train_model(X_train, y_train, run_name='second', model_path="model_exp_1.joblib"):
    """Tune and fit a logistic regression on standardised features inside an MLflow run.

    The features are standardised, a 5-fold grid search over ``C`` (L2 penalty)
    is run, and the winning estimator is saved with joblib and returned.

    Parameters
    ----------
    X_train : array-like
        Training features; they are standardised with ``StandardScaler``
        before fitting.
    y_train : array-like
        Training labels.
    run_name : str, default 'second'
        Name given to the MLflow run that wraps the search.
    model_path : str, default "model_exp_1.joblib"
        File the fitted model is written to with ``joblib.dump``.

    Returns
    -------
    LogisticRegression
        The estimator refitted on the full (scaled) training data with the
        best hyperparameters found by the search.

    Notes
    -----
    NOTE(review): the fitted scaler is local to this function; it is neither
    returned nor saved, so callers have to reproduce the same scaling before
    calling ``predict`` on the returned/saved model. The scaler is also fitted
    on all of ``X_train`` before cross-validation, so each validation fold has
    contributed to the scaling statistics.
    """
    # Scale the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Define the grid of hyperparameters to search over
    param_grid = {
        'C': [0.1, 1.0, 10.0],
        'penalty': ['l2']
    }

    # Start an MLflow run
    with mlflow.start_run(run_name=run_name):
        # Autologging is switched on before any fit so the search is recorded.
        mlflow.autolog()

        # Perform a grid search with 5-fold cross-validation
        grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
        grid_search.fit(X_train_scaled, y_train)

        # GridSearchCV refits the best configuration on the whole training
        # set by default (refit=True), so a second manual fit is unnecessary.
        lr_model = grid_search.best_estimator_

        # Save model to file
        joblib.dump(lr_model, model_path)

    # Return the trained model
    return lr_model

In [44]:
# Run the scaled logistic-regression baseline on the training split.
train_model(X_train=X_train, y_train=y_train)

2023/07/11 06:03:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/07/11 06:03:33 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


In [6]:
# Logistic regression: 5-fold, accuracy-scored grid search over the
# regularisation strength C (L2 penalty, liblinear solver).
lr = LogisticRegression(random_state=1)
lr_param_grid = {
        'C': [0.1, 1.0, 10.0],
        'penalty': ['l2'],
        'solver': ['liblinear']
    }

lr_gs = GridSearchCV(
    estimator=lr,
    param_grid=lr_param_grid,
    cv=5,
    n_jobs=1,
    scoring='accuracy',
    verbose=0
)

# Fit the search object rather than the bare estimator: fitting `lr` directly
# would skip the grid entirely and leave `lr_model` without
# `best_params_` / `best_score_`.
lr_model = lr_gs.fit(X_train, y_train)



In [8]:
# Decision tree: 5-fold, accuracy-scored grid search over tree depth and
# split criterion (6 depths x 2 criteria = 12 candidates).
dt = DecisionTreeClassifier(random_state=1)

dt_param_grid = dict(
    max_depth=list(range(3, 14, 2)),  # 3, 5, 7, 9, 11, 13
    criterion=['gini', 'entropy'],
)

dt_gs = GridSearchCV(dt, dt_param_grid, scoring='accuracy', cv=5, n_jobs=1, verbose=0)

# fit() hands back the search object itself, so `dt_model` and `dt_gs`
# refer to the same fitted search.
dt_gs.fit(X_train, y_train)
dt_model = dt_gs

In [9]:
# Random forest: 5-fold, accuracy-scored grid search.
# 2 x 3 x 2 x 2 = 24 candidates, evaluated sequentially (n_jobs=1).
rf = RandomForestClassifier(random_state=1)

rf_param_grid = dict(
    n_estimators=[400, 700],
    max_depth=[15, 20, 25],
    criterion=['gini', 'entropy'],
    max_leaf_nodes=[50, 100],
)

rf_gs = GridSearchCV(rf, rf_param_grid, scoring='accuracy', cv=5, n_jobs=1, verbose=0)

# fit() hands back the search object itself, so `rf_model` and `rf_gs`
# refer to the same fitted search.
rf_gs.fit(X_train, y_train)
rf_model = rf_gs



In [11]:
def model_metrics(actual, pred, plot_path='plot/roc_curve.png'):
    """Compute accuracy, F1 and ROC AUC for a set of predictions and save the ROC plot.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted values for the same rows as ``actual``.
    plot_path : str, default 'plot/roc_curve.png'
        Where the ROC figure is written. The parent directory is created
        when it does not exist yet.

    Returns
    -------
    tuple
        ``(accuracy, f1, auc)``.

    Notes
    -----
    NOTE(review): the caller in this notebook passes hard class labels from
    ``model.predict``, so the curve is built from label predictions rather
    than scores; pass probabilities/scores as ``pred`` if a threshold sweep
    is wanted (accuracy/F1 would then need labels computed separately).
    """
    import os  # local import: only needed to create the plot directory

    # Every metric is computed against `actual`, never against a notebook
    # global, so the function is correct for any split passed in.
    accuracy = metrics.accuracy_score(actual, pred)
    f1 = metrics.f1_score(actual, pred, pos_label=1)
    fpr, tpr, _ = metrics.roc_curve(actual, pred)
    auc = metrics.auc(fpr, tpr)

    # savefig does not create missing directories.
    plot_dir = os.path.dirname(plot_path)
    if plot_dir:
        os.makedirs(plot_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(fpr, tpr, color='blue', label='ROC curve area = %0.2f' % auc)
    ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([-0.1, 1.1])
    ax.set_xlabel('False positive rate', size=14)
    ax.set_ylabel('True positve Rate', size=14)
    ax.legend(loc='lower right')

    fig.savefig(plot_path)

    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return (accuracy, f1, auc)
    
    

In [23]:
def mlflow_logs(model, X, y, name):
    """Evaluate a fitted model on ``(X, y)`` and record the results in an MLflow run.

    Parameters
    ----------
    model : fitted estimator or fitted search object
        Must provide ``predict``. When it also exposes ``best_params_`` and
        ``best_score_`` (a fitted grid search), those are logged as well.
    X : array-like
        Features to predict on.
    y : array-like
        True labels for ``X``.
    name : str
        Used both as the MLflow run name and as the artifact path of the
        logged model.

    Returns
    -------
    None
    """
    mlflow.set_experiment('bank-experiment')

    # The context manager ends the run on exit, so no explicit end_run() call
    # is needed inside the block.
    with mlflow.start_run(run_name=name) as run:
        run_id = run.info.run_id
        mlflow.set_tag('run_id', run_id)

        pred = model.predict(X)

        (accuracy, f1, auc) = model_metrics(y, pred)

        # Search-specific results exist only on fitted search objects; a plain
        # fitted estimator is still logged with its metrics and model.
        if hasattr(model, 'best_params_'):
            mlflow.log_params(model.best_params_)
            mlflow.log_metric('Mean cv score', model.best_score_)

        mlflow.log_metric('Accuracy', accuracy)
        mlflow.log_metric('f1-score', f1)
        mlflow.log_metric('AUC', auc)

        # Must match the file name written by model_metrics exactly:
        # a different letter case is a different file on case-sensitive
        # filesystems.
        mlflow.log_artifact("plot/roc_curve.png")
        mlflow.sklearn.log_model(model, name)

# Log one MLflow run per tuned model, evaluated on the held-out test split.
for fitted_search, run_label in (
    (dt_model, 'DecisionTreeClassifier'),
    (rf_model, 'RandomForestClassifier'),
):
    mlflow_logs(fitted_search, X_test, y_test, run_label)

# LogisticRegression logging is currently disabled:
#mlflow_logs(lr_model,X_test,y_test,"LogisticRegression")