In [1]:
#1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [2]:
#2. Load the Dataset

# Load the dataset
df = pd.read_csv('Customer_Churn.csv')

# Display the first few rows of the dataset
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [3]:
#3. Data Cleaning and Preparation

#3.1. Handle Missing Values
print(df.isnull().sum())

# Handle missing values for TotalCharges
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')  # Convert to numeric, force errors to NaN
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)  # Fill missing values with median

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)  # Fill missing values with median


In [4]:
#3.2. Convert Target Variable
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [5]:
#4. Feature Engineering and Preprocessing

# Define features and target
X = df.drop(columns=['Churn', 'customerID'])  # Drop target and any non-predictive columns
y = df['Churn']

# Define categorical and numerical features
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Create preprocessing pipelines
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

In [6]:
#4. Preprocessing Pipeline
#4.1 Define Categorical and Numerical Features

# Define categorical and numerical features
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [7]:
#4.2 Create Preprocessing Pipelines
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values by replacing with median
    ('scaler', StandardScaler())  # Scale numerical features
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values by replacing with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Convert categorical features to one-hot encoded format
])

# Combine pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),  # Apply numerical pipeline to numerical features
        ('cat', categorical_pipeline, categorical_features)  # Apply categorical pipeline to categorical features
    ])

In [8]:
#Using the Preprocessing Pipeline
#Fit and Transform Data
# Separate features and target
X = df.drop(columns=['Churn', 'customerID'])  # Drop target and non-predictive columns
y = df['Churn']

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

In [9]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.7963094393186657
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.66      0.47      0.55       373

    accuracy                           0.80      1409
   macro avg       0.74      0.69      0.71      1409
weighted avg       0.78      0.80      0.78      1409



In [11]:
#5. Prepare Data for Modeling

import pandas as pd

# Load the dataset
df = pd.read_csv('Customer_Churn.csv')

# Separate features and target variable
X = df.drop(columns=['Churn', 'customerID'])  # Drop the target column and non-predictive column
y = df['Churn']  # Target variable

In [12]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('Customer_Churn.csv')

# Separate features and target variable
X = df.drop(columns=['Churn', 'customerID'])  # Drop the target column and non-predictive column
y = df['Churn']  # Target variable

# Check and convert data types for numerical features
numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
for col in numerical_features:
    X[col] = pd.to_numeric(X[col], errors='coerce')  # Convert to numeric, forcing errors to NaN

# Define categorical features
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

# Numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler())  # Feature scaling
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Convert categorical data to numerical
])

# Combine pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),  # Apply numerical pipeline to numerical features
        ('cat', categorical_pipeline, categorical_features)  # Apply categorical pipeline to categorical features
    ])

# Apply preprocessing
X_preprocessed = preprocessor.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Build the model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if necessary for convergence

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8218594748048261
Classification Report:
               precision    recall  f1-score   support

          No       0.86      0.90      0.88      1036
         Yes       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409



In [13]:
#6. Hyperparameter Tuning

#Hyperparameter Tuning for Logistic Regression and GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],  # Optimization algorithms
    'penalty': ['l1', 'l2']  # Regularization types
}

# Create the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Score: 0.8017399499764382


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import joblib

# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],  # Optimization algorithms
    'penalty': ['l1', 'l2']  # Regularization types
}

# Create the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Check if best_estimator_ exists and print the best parameters
if hasattr(grid_search, 'best_estimator_'):
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)

    # Save the best model to a file
    joblib.dump(grid_search.best_estimator_, 'best_model.pkl')

    # Load the model back to verify
    loaded_model = joblib.load('best_model.pkl')

    # Optional: Verify that the loaded model works correctly
    # Evaluate the loaded model on the test set
    y_pred = loaded_model.predict(X_test)
    print("Test Accuracy Score of Loaded Model:", accuracy_score(y_test, y_pred))
    print("Test Classification Report of Loaded Model:\n", classification_report(y_test, y_pred))
else:
    print("The grid search did not produce a best estimator.")

Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Score: 0.8017399499764382
Test Accuracy Score of Loaded Model: 0.8225691980127751
Test Classification Report of Loaded Model:
               precision    recall  f1-score   support

          No       0.86      0.91      0.88      1036
         Yes       0.70      0.58      0.64       373

    accuracy                           0.82      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409



In [15]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Create label encoders
le_gender = LabelEncoder()
le_gender.fit(['Male', 'Female'])

le_partner = LabelEncoder()
le_partner.fit(['Yes', 'No'])

le_dependents = LabelEncoder()
le_dependents.fit(['Yes', 'No'])

le_internet_service = LabelEncoder()
le_internet_service.fit(['DSL', 'Fiber optic', 'No'])

# Save the label encoders as a dictionary
label_encoders = {
    'gender': le_gender,
    'Partner': le_partner,
    'Dependents': le_dependents,
    'InternetService': le_internet_service
}

joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']

In [16]:
import joblib

label_encoders = joblib.load('label_encoders.pkl')
print(label_encoders)

{'gender': LabelEncoder(), 'Partner': LabelEncoder(), 'Dependents': LabelEncoder(), 'InternetService': LabelEncoder()}
