In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the customer data and keep every column except the last two.
# NOTE(review): presumably the trailing two columns are auxiliary fields that
# are not needed for the analysis -- confirm against the dataset description.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv').iloc[:, :-2]

# Preview the first two rows.
df.head(2)

In [None]:
# Structural overview of the frame: columns, dtypes and non-null counts.
print("Dataset Info:")
df.info(buf=None)  # buf=None is the default: the report goes to stdout

In [None]:
# Show the leading rows under a heading (one print call, newline-separated).
print("\nFirst Few Rows:", df.head(), sep="\n")

In [None]:
# Count missing entries per column (isna is the same check as isnull).
print("\nMissing Values:", df.isna().sum(), sep="\n")

In [None]:
# Descriptive statistics for the columns that describe() summarises by default.
print("\nBasic Statistics:", df.describe(), sep="\n")

In [None]:
# Handle missing values
# Numeric columns: impute with the column mean.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Categorical (object-dtype) columns: impute with the most frequent value.
# The filled Series is assigned back to the frame rather than calling
# fillna(..., inplace=True) on df[col]: that chained in-place form acts on an
# intermediate object, triggers a FutureWarning in recent pandas and does not
# update df at all once Copy-on-Write is enabled.
categorical_cols = df.select_dtypes(include=[object]).columns
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # mode() is empty when every value in the column is missing
        df[col] = df[col].fillna(modes.iloc[0])

# Cap outliers using the IQR rule.
# CLIENTNUM is left out of the capping step: it is treated as an identifier
# later in the notebook (dropped before modelling), so it is not a measurement
# that should be clipped.
outlier_cols = numeric_cols.drop('CLIENTNUM', errors='ignore')
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

# Outlier bounds: 1.5 * IQR beyond the first / third quartile.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Values outside the bounds are capped (winsorised) to the nearest bound;
# no rows are removed.
for col in outlier_cols:
    df[col] = df[col].clip(lower=lower_bound[col], upper=upper_bound[col])

# Verify changes
print("\nPost-Processing: Missing Values")
print(df.isnull().sum())

print("\nPost-Processing: Basic Statistics")
print(df.describe())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE overlay) of customer ages, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Customer_Age'], kde=True, bins=20, color='blue', ax=ax)
ax.set(
    title="Age Distribution of Customers",
    xlabel="Customer Age",
    ylabel="Frequency",
)
plt.show()

In [None]:

# Bar chart of customers per income category, most frequent category first.
income_order = df['Income_Category'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Income_Category', order=income_order, palette='coolwarm', ax=ax)
ax.set(
    title="Income Category Distribution",
    xlabel="Income Category",
    ylabel="Count",
)
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

In [None]:

# Bar chart of customers per marital status, most frequent status first.
marital_order = df['Marital_Status'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Marital_Status', order=marital_order, palette='viridis', ax=ax)
ax.set(
    title="Marital Status Distribution",
    xlabel="Marital Status",
    ylabel="Count",
)
plt.show()

In [None]:
# Box plot comparing the credit-limit distribution across genders.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Gender', y='Credit_Limit', palette='Set2', ax=ax)
ax.set(
    title="Credit Limit by Gender",
    xlabel="Gender",
    ylabel="Credit Limit",
)
plt.show()

In [None]:

# Box plot of total revolving balance within each income category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Income_Category', y='Total_Revolving_Bal', palette='mako', ax=ax)
ax.set(
    title="Total Revolving Balance by Income Category",
    xlabel="Income Category",
    ylabel="Total Revolving Balance",
)
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

In [None]:
# Mean credit limit and revolving balance per income category,
# ordered from the highest to the lowest mean credit limit.
income_analysis = (
    df.groupby('Income_Category')[['Credit_Limit', 'Total_Revolving_Bal']]
    .mean()
    .sort_values(by='Credit_Limit', ascending=False)
)
print(
    "\nAverage Credit Limit and Revolving Balance by Income Category:",
    income_analysis,
    sep="\n",
)


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Select relevant features for clustering
features = df[['Customer_Age', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Utilization_Ratio']]

# Standardize the data so every feature contributes on the same scale
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Choose the number of clusters by silhouette score.
# Only k = 2 and k = 3 are evaluated, so "optimal" means the better of those
# two candidates; widen the range to search more values.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across environments.
silhouette_scores = {}
fitted_models = {}
for k in range(2, 4):
    candidate = KMeans(n_clusters=k, n_init=10, random_state=42)
    candidate.fit(scaled_features)
    fitted_models[k] = candidate
    silhouette_scores[k] = silhouette_score(scaled_features, candidate.labels_)

optimal_k = max(silhouette_scores, key=silhouette_scores.get)
print(f"Optimal number of clusters: {optimal_k}")

# Reuse the model already fitted for the winning k instead of fitting again:
# with the same seed and data a second fit would reproduce the same labels.
kmeans = fitted_models[optimal_k]
df['Cluster'] = kmeans.labels_


In [None]:
# Per-cluster means of the features that were used for clustering.
cluster_summary = (
    df.groupby('Cluster')[
        ['Customer_Age', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Utilization_Ratio']
    ]
    .mean()
)
print("\nCluster Summary:", cluster_summary, sep="\n")

# Visualize clusters (e.g., Credit Limit vs Age)
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of age against credit limit, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='Customer_Age', y='Credit_Limit', hue='Cluster', palette='viridis', ax=ax)
ax.set(
    title="Customer Segments: Age vs Credit Limit",
    xlabel="Customer Age",
    ylabel="Credit Limit",
)
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pandas import get_dummies

# Record which columns are still non-numeric before encoding
non_numeric_cols = df.select_dtypes(include=['object']).columns

# Label Encode Gender (binary)
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])  # 'F' -> 0, 'M' -> 1

# One-Hot Encode other categorical variables.
# Only columns still present in df are encoded, so running this cell a second
# time does not fail with a KeyError after df has already been replaced by its
# encoded version.
categorical_cols = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
pending_cols = [col for col in categorical_cols if col in df.columns]
if pending_cols:
    df = pd.get_dummies(df, columns=pending_cols, drop_first=True)

# Define features and target.
# Dropped from X: the target itself, the client identifier and the cluster label.
X = df.drop(columns=['Attrition_Flag', 'CLIENTNUM', 'Cluster'])
# Target: 1 for 'Attrited Customer', 0 for every other value (vectorised comparison)
y = df['Attrition_Flag'].eq('Attrited Customer').astype(int)

# Every remaining feature must be numeric/boolean before it reaches the models
assert X.select_dtypes(include=['object']).empty, "non-numeric feature columns remain in X"

# Train-test split (stratified so both parts keep the class ratio of y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Logistic Regression.
# Features are standardised inside a pipeline: the scaler is fitted on the
# training split only, and the solver and the L2 penalty see features on a
# comparable scale instead of raw columns of very different magnitude.
log_reg = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42),
)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]  # probability of class 1 (attrited)

# Random Forest (tree-based, so no scaling step)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]  # probability of class 1 (attrited)


In [None]:
# Report accuracy, ROC-AUC and the per-class classification report for each
# fitted model: logistic regression first, then random forest.
model_outputs = [
    ("Logistic Regression Performance:", y_pred_lr, y_proba_lr),
    ("\nRandom Forest Performance:", y_pred_rf, y_proba_rf),
]
for heading, predicted_labels, positive_scores in model_outputs:
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, predicted_labels):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_test, positive_scores):.4f}")
    print(classification_report(y_test, predicted_labels))
