In [1]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import pandas as pd 
import joblib

In [23]:
def process(file, target='churn', categorical_cols=('country', 'gender'),
            test_size=0.2, random_state=42):
    """Load a CSV, one-hot encode categorical columns and split into train/test sets.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pd.read_csv``.
    target : str, default 'churn'
        Name of the label column; it is removed from the features and returned as ``y``.
    categorical_cols : iterable of str, default ('country', 'gender')
        Columns to one-hot encode with ``pd.get_dummies``.
    test_size : float, default 0.2
        Passed through to ``train_test_split``.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The four objects returned by ``train_test_split``, in that order.

    Notes
    -----
    Exact duplicate rows are dropped before encoding. Every column other than
    ``target`` is kept as a feature.
    NOTE(review): if the file carries an identifier column it ends up in the
    features too - confirm that is intended.
    """
    raw = pd.read_csv(file)
    # list(...) so that any iterable (tuple default, list from a caller) is accepted.
    encoded = pd.get_dummies(raw.drop_duplicates(), columns=list(categorical_cols))

    X = encoded.drop(columns=target)
    y = encoded[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print the shapes of the arrays
    print("Shape of X_train:", X_train.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of X_test:", X_test.shape)
    print("Shape of y_test:", y_test.shape)

    return X_train, X_test, y_train, y_test

In [24]:
# Load the bank churn CSV and split it into train/test partitions.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
csv_path = r'C:\Users\mussie\Music\final pro\Bank Customer Churn Prediction.csv'
X_train, X_test, y_train, y_test = process(csv_path)

Shape of X_train: (8000, 14)
Shape of y_train: (8000,)
Shape of X_test: (2000, 14)
Shape of y_test: (2000,)


In [25]:
def train(x_train,y_train ,xtest,ytest):
    """Fit a default ``LogisticRegression``, print its test accuracy and save it.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``LogisticRegression.fit``.
    xtest, ytest
        Held-out features and labels used for the printed accuracy.

    Returns
    -------
    None
        The fitted model is written to ``model_1.joblib`` in the working directory.
    """
    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_train)
    lr_pred = lr_model.predict(xtest)

    # Score against the labels handed to this function, not a notebook-global
    # ``y_test``, so the result is correct for whatever split the caller passes.
    print("accuracy : ", accuracy_score(ytest, lr_pred))

    # Save model to file
    joblib.dump(lr_model, "model_1.joblib")
    

In [43]:
import mlflow
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


# Optional: uncomment to point MLflow at a local SQLite tracking store (mlflow.db).
#mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Make 'bank-experiment' the active MLflow experiment for the runs started below.
mlflow.set_experiment('bank-experiment')

def train_model(X_train, y_train, model_path="model_exp_1.joblib", scaler_path=None):
    """
    Train a logistic regression model using cross-validation and grid search for parameter tuning.

    The features are standardised with a ``StandardScaler`` fitted on ``X_train``
    before the search, so the returned (and saved) model expects inputs that
    were transformed by that same scaler.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    model_path : str, default "model_exp_1.joblib"
        Where the fitted model is written with ``joblib.dump``.
    scaler_path : str or None, default None
        If given, the fitted scaler is also written there so that new data can
        be transformed consistently at prediction time. Nothing extra is
        written when it is None.

    Returns
    -------
    The best estimator found by the grid search, refit on the full scaled
    training set.

    Notes
    -----
    NOTE(review): the scaler is fitted on all of ``X_train`` before the
    cross-validation split, so validation folds contribute to the scaling
    statistics; the reported CV scores may be slightly optimistic.
    """
    # Scale the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Define the grid of hyperparameters to search over
    param_grid = {
        'C': [0.1, 1.0, 10.0],
        'penalty': ['l2']
    }

    # Start an MLflow run
    with mlflow.start_run(run_name='second'):
        mlflow.autolog()

        # Perform a grid search with 5-fold cross-validation
        grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
        grid_search.fit(X_train_scaled, y_train)

        # GridSearchCV refits the best configuration on the whole training set
        # by default (refit=True), so a second manual fit is unnecessary.
        lr_model = grid_search.best_estimator_

        # Save model to file
        joblib.dump(lr_model, model_path)
        if scaler_path is not None:
            joblib.dump(scaler, scaler_path)

    # Return the trained model
    return lr_model

In [44]:
train_model(X_train,y_train)

2023/07/11 06:03:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/07/11 06:03:33 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


In [26]:
train(X_train,y_train,X_test,y_test)

accuracy :  0.8035
