Customer Segmentation Using K-means Clustering and K-means Clustering with PCA

Data Preprocessing

In [None]:
#import libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt



In [None]:

# Provide the relative file path to the bank_transactions dataset
file_path = "bank_transactions2.csv"

# Load the bank_transactions dataset.
# The frame has to be bound to `dataset`, the name every statement below
# reads; binding it only to `data` made this cell fail with NameError on a
# fresh kernel.
dataset = pd.read_csv(file_path)
data = dataset  # keep the original name bound in case another cell reads it

# Check for missing values
print(dataset.isnull().sum())

# Explore the dataset
print("First few rows of the dataset:")
print(dataset.head())
print("\nInformation about the dataset:")
# DataFrame.info() writes its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
dataset.info()

# Preprocess the data: drop rows with missing values. dropna() returns a new
# frame, so the raw `dataset` stays untouched.
cleaned_dataset = dataset.dropna()

# Remove TransactionAmount outliers with the 1.5 * IQR rule
Q1 = cleaned_dataset['TransactionAmount'].quantile(0.25)
Q3 = cleaned_dataset['TransactionAmount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# between() is inclusive on both ends; .copy() makes the filtered frame
# independent so the column assignments below do not raise
# SettingWithCopyWarning.
cleaned_dataset = cleaned_dataset[
    cleaned_dataset['TransactionAmount'].between(lower_bound, upper_bound)
].copy()

# Parse the date columns 'CustomerDOB' and 'TransactionDate' as datetimes
# NOTE(review): the date format of the CSV is not visible here - confirm that
# pd.to_datetime infers day/month order and the century as intended.
cleaned_dataset['CustomerDOB'] = pd.to_datetime(cleaned_dataset['CustomerDOB'])
cleaned_dataset['TransactionDate'] = pd.to_datetime(cleaned_dataset['TransactionDate'])

# Convert CustGender to 1.0 for 'F' and 0.0 for every other value
cleaned_dataset['CustGender'] = cleaned_dataset['CustGender'].eq('F').astype(float)



Grouping Data

In [None]:
# Collapse the transaction-level rows to one row per customer, keeping the
# first non-null value of every column for each CustomerID
grouped_data = cleaned_dataset.groupby('CustomerID').first().reset_index()

# Calculate the age based on CustomerDOB.
# Only the year is used, so the age can be off by one for customers whose
# birthday has not yet occurred this year.
current_year = datetime.now().year
grouped_data['CustomerDOB'] = pd.to_datetime(grouped_data['CustomerDOB'])
grouped_data['Age'] = current_year - grouped_data['CustomerDOB'].dt.year

# Keep only customers aged 18 to 75 (inclusive). The .copy() makes the
# filtered frame independent, so adding the 'Location' column below does not
# raise SettingWithCopyWarning.
grouped_data = grouped_data[grouped_data['Age'].between(18, 75)].copy()

# Assign a unique number (starting at 1) to each city in CustLocation
grouped_data['Location'] = pd.Categorical(grouped_data['CustLocation']).codes + 1

# Check the cleaned data
print(grouped_data.head())

# Show the description table
description_table = grouped_data.describe()
print(description_table)

Number of Clusters (k)

In [None]:
# Work on a 1% random sample of the customers (fixed seed for reproducibility)
sample_size = len(grouped_data) // 100
data_sample = grouped_data.sample(n=sample_size, random_state=42)

# Select the relevant columns for clustering
columns_for_clustering = ['CustAccountBalance','TransactionAmount', 'Age', 'Location']
X = data_sample[columns_for_clustering]

# Standardize the data so every feature contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Determine the optimal number of clusters using the elbow method
inertia = []
k_values = range(1, 11)  # Trying k values from 1 to 10
for k in k_values:
    # n_init is set explicitly, matching the final model fit later in the
    # notebook, so the curve does not depend on the installed scikit-learn
    # version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(k_values, inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Curve')
plt.show()

K-means clustering with k=5

In [None]:

# Feature matrix: the four customer attributes used for clustering
columns_for_clustering = ['CustAccountBalance','TransactionAmount', 'Age', 'Location']
X = data_sample[columns_for_clustering]

# Rescale every feature to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit k-means with five clusters and take the cluster label of each row
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_scaled)

# Store the label next to each sampled customer
data_sample['Cluster'] = labels

# Report how many customers fell into each cluster
print(data_sample['Cluster'].value_counts())

In [None]:
# Analyze each cluster

In [None]:
# Group the sampled customers by their assigned cluster
by_cluster = data_sample.groupby('Cluster')

# Per-cluster average of every numeric column
cluster_means = by_cluster.mean(numeric_only=True)

# Show the resulting table
print(cluster_means)


Calculate the mean value and the standard deviation of each feature in each cluster group

In [None]:

# Restrict to numeric columns before aggregating: data_sample also holds
# string and datetime columns, and a plain groupby().mean() / .std() raises
# TypeError on them in pandas >= 2.0. 'Cluster' is an integer label, so it
# is kept and can be used as the grouping key.
numeric_sample = data_sample.select_dtypes(include=[np.number])

# Calculate the mean value of each feature in each cluster group
cluster_means = numeric_sample.groupby('Cluster').mean()

# Calculate the standard deviation of each feature in each cluster group
cluster_stds = numeric_sample.groupby('Cluster').std()

# Draw one bar chart per statistic with identical formatting
for stats_table, chart_title, y_label in (
    (cluster_means, 'Mean Values of Features by Cluster', 'Mean Value'),
    (cluster_stds, 'Standard Deviation of Features by Cluster', 'Standard Deviation'),
):
    stats_table.plot(kind='bar', figsize=(10, 6))
    plt.title(chart_title)
    plt.xlabel('Cluster')
    plt.ylabel(y_label)
    plt.legend(loc='upper right')
    plt.xticks(rotation=0)
    plt.show()


plot CustAccountBalance-Age

In [None]:
# Define the colors (one per cluster label 0-4)
cluster_colors = ['red', 'blue', 'green', 'purple','black']

# Plot CustAccountBalance (x) against Age (y), colored by cluster
plt.scatter(data_sample['CustAccountBalance'], data_sample['Age'], c=[cluster_colors[i] for i in data_sample['Cluster']])
plt.xlabel('CustAccountBalance')
# The y-axis shows Age; the label previously read 'CustGender'
plt.ylabel('Age')
plt.title('Scatter Plot with Clusters')

# Add a legend: empty scatter calls create one legend entry per cluster color
for i, color in enumerate(cluster_colors):
    plt.scatter([], [], c=color, label=f'Cluster {i}')
plt.legend()

plt.show()

Box Plot Cluster-Age

In [None]:
# Age distribution within each cluster, drawn as one box per cluster
ax = sns.boxplot(x='Cluster', y='Age', data=data_sample)
ax.set(xlabel='Cluster', ylabel='Age', title='Box Plot by Cluster')
plt.show()

K-means clustering with PCA

Explained Variance Ratio - Number of Components

In [None]:
# Numeric feature columns only. 'Cluster' is the label produced by the earlier
# k-means step, not a customer feature, so it is excluded from the PCA input
# (errors='ignore' keeps this cell runnable if the label column is absent).
numeric_columns = data_sample.select_dtypes(include=[np.number]).columns.drop('Cluster', errors='ignore')
data_numeric = data_sample[numeric_columns]

# Perform PCA for dimensionality reduction.
# PCA is driven by variance, so the features are standardized first;
# otherwise the column with the largest magnitude would dominate the
# explained-variance curve.
pca = PCA()
pca.fit(StandardScaler().fit_transform(data_numeric))

# Calculate the cumulative explained variance ratio
explained_variance_ratio_cumulative = np.cumsum(pca.explained_variance_ratio_)

# Plot the cumulative explained variance against the number of components
plt.plot(range(1, len(explained_variance_ratio_cumulative) + 1), explained_variance_ratio_cumulative, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Components')
plt.show()


Explained Variance Ratio per Component - K-means clustering with PCA

In [None]:
# Standardize the data.
# The k-means label column 'Cluster' is dropped first: it is an output of the
# earlier clustering step, not an input feature (errors='ignore' makes this a
# no-op when the column is not present).
scaler = StandardScaler()
data_numeric_scaled = scaler.fit_transform(
    data_numeric.drop(columns='Cluster', errors='ignore')
)

# Perform PCA for dimensionality reduction
pca = PCA()
X_reduced = pca.fit_transform(data_numeric_scaled)

# Calculate the explained variance ratio for each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Plot the explained variance ratio of each individual component
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA - Explained Variance Ratio')
plt.show()

Output of K-means clustering with PCA

In [None]:
# Work on a copy so the cluster labels from the earlier k-means run that are
# stored in data_sample are not overwritten
data_sample_copy = data_sample.copy()

# Perform PCA for dimensionality reduction: project the standardized
# clustering features onto their first two principal components
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_scaled)

# Perform k-means clustering on the reduced data.
# n_init and random_state are set, as for the other k-means fits in this
# notebook, so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_reduced)

data_sample_copy['Cluster'] = kmeans.labels_

# Plot the clusters in the plane of the two principal components
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=data_sample_copy['Cluster'])
plt.xlabel(' Component 1')
plt.ylabel('Component 2')
plt.title('K-means Clustering with PCA')
plt.show()

# PCA Components Heatmap: loading of each original feature on each component.
# X_scaled was built from columns_for_clustering, so those names label the
# columns of pca.components_ directly.
df_pca_comp=pd.DataFrame(data=pca.components_,columns=columns_for_clustering,index=['component1','component2'])
sns.heatmap(df_pca_comp, annot=True, cmap='coolwarm')
plt.xlabel('Features')
plt.ylabel('Principal Components')
plt.title('PCA Components Heatmap')
plt.show()
df_pca_comp