In [1]:
!conda list

# packages in environment at C:\Users\admin\anaconda3:
#
# Name                    Version                   Build  Channel
_anaconda_depends         2024.06             py312_mkl_2  
abseil-cpp                20211102.0           hd77b12b_0  
aext-assistant            4.0.15          py312haa95532_jl4_0  
aext-assistant-server     4.0.15          py312haa95532_0  
aext-core                 4.0.15          py312haa95532_jl4_0  
aext-core-server          4.0.15          py312haa95532_1  
aext-panels               4.0.15          py312haa95532_0  
aext-panels-server        4.0.15          py312haa95532_0  
aext-share-notebook       4.0.15          py312haa95532_0  
aext-share-notebook-server 4.0.15          py312haa95532_0  
aext-shared               4.0.15          py312haa95532_0  
aiobotocore               2.12.3          py312haa95532_0  
aiohttp                   3.9.5           py312h2bbff1b_0  
aioitertools              0.7.1              pyhd3eb1b0_0  
aiosignal                 1

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import SMOTE,ADASYN,SMOTENC


In [17]:

# Read the churn dataset from disk
data = pd.read_csv("C:\\Users\\admin\\Downloads\\Churn_Modelling.csv")

# Number of consecutive groups each class is cut into
splits = 6


def _shuffle_rows(frame):
    """Return `frame` with its rows in a seeded random order and a fresh 0..n-1 index."""
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


def _cut_into_groups(frame, n_groups, group_size):
    """Slice `frame` into `n_groups` consecutive row blocks.

    Every block holds `group_size` rows except the final one, which runs to
    the end of `frame` and therefore also absorbs the division remainder.
    """
    # Block k starts at k * group_size; the closing boundary is the frame length.
    boundaries = [k * group_size for k in range(n_groups)] + [len(frame)]
    return [frame.iloc[lo:hi] for lo, hi in zip(boundaries[:-1], boundaries[1:])]


# Rows with Exited == 0 form the majority frame, rows with Exited == 1 the
# minority frame; each is shuffled independently with the same seed.
majority_class = _shuffle_rows(data[data['Exited'] == 0])
minority_class = _shuffle_rows(data[data['Exited'] == 1])

# Nominal rows per group (floor division; the last group takes the leftover rows)
majority_group_size = len(majority_class) // splits
minority_group_size = len(minority_class) // splits

# `splits` groups per class
majority_groups = _cut_into_groups(majority_class, splits, majority_group_size)
minority_groups = _cut_into_groups(minority_class, splits, minority_group_size)

# Stack every minority group except the final one into a single frame
combined_minority_groups = pd.concat(minority_groups[:-1], axis=0)



In [19]:
# Step 2: Fit one classifier per majority group, for the first splits-1 groups

# A single undersampler instance is shared and re-fitted on every group.
resampling = RandomUnderSampler(random_state=42)
# Alternative kept for reference (needs the commented-out imblearn.over_sampling import):
# resampling = SMOTENC(random_state=42,k_neighbors=5,categorical_features=[8, 9,10])


def _train_on_group(group_no, majority_part, minority_part, sampler):
    """Fit a RandomForestClassifier on one majority group plus the shared minority rows.

    Parameters:
        group_no: index used only in the printed progress lines.
        majority_part: rows of one majority group.
        minority_part: minority rows appended to that group.
        sampler: object exposing `fit_resample(X, y)`, applied before fitting.

    Returns:
        The classifier fitted on the resampled, one-hot encoded data.
    """
    # Stack both parts, then shuffle with a fixed seed and rebuild the index.
    merged = pd.concat([majority_part, minority_part], axis=0)
    merged = merged.sample(frac=1, random_state=42).reset_index(drop=True)

    # Identifier columns and the target are excluded from the features.
    features = merged.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)
    target = merged['Exited']
    n_class0, n_class1 = (target == 0).sum(), (target == 1).sum()
    print(f"Before Resampling Group {group_no}: Class 0 = {n_class0}, Class 1 = {n_class1}")

    # One-hot encode the categorical columns, dropping the first level of each.
    features = pd.get_dummies(features, drop_first=True, dtype=int)

    # Rebalance the classes before fitting.
    balanced_X, balanced_y = sampler.fit_resample(features, target)
    n_class0, n_class1 = (balanced_y == 0).sum(), (balanced_y == 1).sum()
    print(f"After Resampling Group {group_no}: Class 0 = {n_class0}, Class 1 = {n_class1}")

    # The model is fitted on the resampled data, not on the raw group.
    model = RandomForestClassifier(random_state=42)
    model.fit(balanced_X, balanced_y)
    return model


# Every classifier sees the same combined minority rows but a different majority group.
classifiers = [
    _train_on_group(i, majority_groups[i], combined_minority_groups, resampling)
    for i in range(splits - 1)
]



Before Resampling Group 0: Class 0 = 1327, Class 1 = 1695
After Resampling Group 0: Class 0 = 1327, Class 1 = 1327
Before Resampling Group 1: Class 0 = 1327, Class 1 = 1695
After Resampling Group 1: Class 0 = 1327, Class 1 = 1327
Before Resampling Group 2: Class 0 = 1327, Class 1 = 1695
After Resampling Group 2: Class 0 = 1327, Class 1 = 1327
Before Resampling Group 3: Class 0 = 1327, Class 1 = 1695
After Resampling Group 3: Class 0 = 1327, Class 1 = 1327
Before Resampling Group 4: Class 0 = 1327, Class 1 = 1695
After Resampling Group 4: Class 0 = 1327, Class 1 = 1327


In [73]:
# Step 3: Perform majority voting on the holdout group (the last majority/minority split)

if not classifiers:
    # Averaging over zero classifiers yields NaN votes, which would silently
    # turn into an all-zero prediction vector below.
    raise ValueError("No trained classifiers available for voting")

holdout_group = pd.concat([majority_groups[-1], minority_groups[-1]], axis=0)

# Shuffle the test dataset with a fixed seed and rebuild the index
holdout_group = holdout_group.sample(frac=1, random_state=42).reset_index(drop=True)

holdout_features = holdout_group.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)
y_holdout = holdout_group['Exited']

# One-hot encode EVERY category level (no drop_first). Which level counts as
# "first" depends on the categories present in the frame being encoded, so an
# independent drop_first here could drop a different column than training did.
holdout_encoded = pd.get_dummies(holdout_features, drop_first=False, dtype=int)


def _align_to_classifier(encoded, fitted_clf):
    """Return `encoded` restricted and ordered to the columns `fitted_clf` was fitted on.

    Columns the classifier never saw are discarded; columns it expects but the
    holdout lacks (a category absent from the holdout rows) are added as zeros.
    Relies on `feature_names_in_`, which scikit-learn sets when an estimator is
    fitted on a DataFrame with string column names.
    """
    return encoded.reindex(columns=fitted_clf.feature_names_in_, fill_value=0)


# Kept under its original name; matches the layout expected by the first classifier.
X_holdout = _align_to_classifier(holdout_encoded, classifiers[0])

# Collect predictions from each classifier: one column per classifier
predictions = np.zeros((X_holdout.shape[0], len(classifiers)))

for idx, clf in enumerate(classifiers):
    # Align per classifier: each one was encoded from its own training group.
    predictions[:, idx] = clf.predict(_align_to_classifier(holdout_encoded, clf))

# Vote by averaging the predicted labels across classifiers. Assuming the
# labels are 0/1, mean >= 0.5 is a majority vote in which an exact tie
# (possible only with an even number of classifiers) resolves to class 1.
final_predictions = (np.mean(predictions, axis=1) >= 0.5).astype(int)



In [75]:
# Step 4: Evaluate the model

# Confusion matrix of the voted predictions against the holdout labels
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_holdout, final_predictions)

print("Confusion Matrix:")
print(cm)

# Reuse the matrix computed above instead of recomputing it on the same inputs;
# `matrix` stays bound so anything that refers to it by that name still works.
matrix = cm

# Diagonal / row sums = share of each true class that was predicted correctly,
# i.e. the per-class recall.
print("Test accuracy class-wise: ", matrix.diagonal() / matrix.sum(axis=1))

Confusion Matrix:
[[1042  286]
 [  78  264]]
Test accuracy class-wise:  [0.78463855 0.77192982]
