In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
import os
print(os.getcwd())

C:\Users\MSI\Desktop\Folders\S5\BI-big data\big data\project\kafka-customers-prediction-project\training


In [3]:
df = pd.read_csv("data/customer_churn.csv")

In [4]:
df.head()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1


In [5]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import numpy as np


# Separate numeric and non-numeric columns
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
non_numeric_columns = X_train.select_dtypes(exclude=[np.number]).columns

# Create transformers for each type of column
numeric_transformer = StandardScaler()

# Create a custom transformer for non-numeric columns
def label_encode_column(X):
    for column in X.columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])
    return X

non_numeric_transformer = FunctionTransformer(label_encode_column)

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('non_num', non_numeric_transformer, non_numeric_columns),
    ]
)

# Apply the transformations
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [8]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Use the best parameters to initialize the model
model = LogisticRegression(**best_params)



Best Hyperparameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}




In [10]:
# Initialize Logistic Regression model
model = LogisticRegression()

In [11]:
# Train the model
model.fit(X_train_processed, y_train)

LogisticRegression()

In [12]:
# Make predictions on the test set
y_pred = model.predict(X_test_processed)

In [13]:
# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[142   6]
 [ 12  20]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       148
           1       0.77      0.62      0.69        32

    accuracy                           0.90       180
   macro avg       0.85      0.79      0.82       180
weighted avg       0.89      0.90      0.90       180



In [14]:
# Now you can save the trained model for later use
import joblib
joblib.dump(model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']