In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn import metrics
from numpy import unique
from sklearn.preprocessing import LabelEncoder
import matplotlib as mpl
import matplotlib.cm as cm
import warnings

warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv("/content/dataset-new (2).csv")

print("Number of datapoints:", len(data))

Number of datapoints: 135


# Section 1: Data Preprocessing


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 18 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   Unnamed: 0                                 135 non-null    int64 
 1   Age                                        135 non-null    object
 2   Gender                                     135 non-null    object
 3   Occupation                                 135 non-null    object
 4   Location                                   135 non-null    object
 5   Shopping Frequency                         135 non-null    object
 6   Shopping Methods                           135 non-null    object
 7   Online, instore preferences                135 non-null    object
 8   Influence Factors                          135 non-null    object
 9   Types of Clothing                          135 non-null    object
 10  Brand Preference                      

In [None]:
missing_values = data.isnull().sum()
print(missing_values)

Unnamed: 0                                    0
Age                                           0
Gender                                        0
Occupation                                    0
Location                                      0
Shopping Frequency                            0
Shopping Methods                              0
Online, instore preferences                   0
Influence Factors                             0
Types of Clothing                             0
Brand Preference                              0
Brand Choice Factors                          0
Likeliness of Promotion or Discount           0
Prefered Types of Promotions or Discounts     0
Communication Method                          0
Average Spent                                 0
Payment Methods                               0
Comments                                     93
dtype: int64


In [None]:
# check for non numeric values

non_numeric_columns =data.select_dtypes(exclude=['int64', 'float64']).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['Age', 'Gender', 'Occupation', 'Location', 'Shopping Frequency',
       'Shopping Methods', 'Online, instore preferences', 'Influence Factors',
       'Types of Clothing', 'Brand Preference', 'Brand Choice Factors',
       'Likeliness of Promotion or Discount',
       'Prefered Types of Promotions or Discounts', 'Communication Method',
       'Average Spent', 'Payment Methods', 'Comments'],
      dtype='object')


In [None]:
# drop the timestamp column when the loaded file has one
# errors='ignore' keeps this cell re-runnable on a file without 'Timestamp',
# where a plain drop raises KeyError
data = data.drop(columns=['Timestamp'], errors='ignore')

KeyError: "['Timestamp'] not found in axis"

In [None]:
# pre-process user locations: trim whitespace, then collapse the free-text
# answers into 'Piliyandala', 'Colombo' and 'Other' (unlisted values are kept as-is)
data['Location (City/Region)'] = (
    data['Location (City/Region)']
    .str.strip()
    .replace(['piliyandala', 'piliyandala.', 'Piliyandala / buddist'], 'Piliyandala')
    .replace(['colombo grandpass', 'Kotahena', 'Kirulapana', 'Colombo 05'], 'Colombo')
    .replace(
        [
            'Singapore', 'Ella', 'Bandaragama', 'Kurunegala', 'Mount lavinia',
            'Al khail gate', 'sri lanka , sinhala',
            'Battaramulla', 'Kaluthara', 'Melbourne, Australia', 'Ratmalana',
            'Awissawella', 'Galle', 'Mathara', 'Nugegoda', 'Mulleriyawa New Town',
            'Horana', 'Rathnapura', 'Kegalle', 'Kaduwela', 'Kohuwala', 'Gampaha',
            'Balapitiya', 'Thalawathugoda', 'Rajagiriya', 'Panadura', 'Kandy',
            'Wattala', 'Ratnapura', 'Panagoda',
        ],
        'Other',
    )
)
data['Location (City/Region)'].value_counts()

In [None]:
# replace the long survey-question headers with short column names
short_column_names = {
    'Location (City/Region)': 'Location',
    'How often do you shop for clothing and apparel?': 'Shopping Frequency',
    'Where do you usually shop for clothing and fashion products?': 'Shopping Methods',
    'Do you prefer shopping in physical retail stores or online fashion stores?': 'Online, instore preferences',
    'What factors influence your choice of shopping?': 'Influence Factors',
    'What types of clothing do you typically buy?': 'Types of Clothing',
    'Do you tend to stick with specific fashion brands when shopping?': 'Brand Preference',
    # the source header for this question ends with a trailing space
    'What factors influence your brand choices when shopping for clothing? ': 'Brand Choice Factors',
    'How likely are you to make a purchase if there is a special promotion or discount offered?': 'Likeliness of Promotion or Discount',
    'What types of promotions or discounts do you find most appealing?': 'Preferred Types of Promotions or Discounts',
    'How do you prefer to receive communication from fashion and apparel stores?': 'Communication Methods',
    'On average, how much do you spend on fashion and apparel items per month?': 'Average Spent',
    'What payment methods do you primarily use when making fashion and apparel purchases?': 'Payment Methods',
    'Any additional comments or thoughts you would like to share about your shopping preferences and experiences?': 'Comments',
}
data = data.rename(columns=short_column_names)

In [None]:
# remove comments column
data = data.drop(['Comments'],axis=1)

In [None]:
# export the pre-processed dataset
# index=False keeps the row index out of the file; when it is written, it comes
# back as an extra 'Unnamed: 0' column the next time the CSV is read
data.to_csv("/content/pp-dataset.csv", index=False)

In [None]:
raw_data = data.copy()
data.columns

In [None]:
def pp_types_of_clothing(x):
  """Split a 'Types of Clothing' answer into its comma-separated parts.

  The literal text '(e.g.bags,jewellary' is removed first, so the comma inside
  it does not create an extra part. Parts are not stripped.
  """
  return x.replace('(e.g.bags,jewellary', "").split(",")

In [None]:
# split each multiple-choice answer (a comma-separated string) into a list
for multi_choice_column in [
    'Influence Factors',
    'Brand Choice Factors',
    'Preferred Types of Promotions or Discounts',
    'Payment Methods',
    'Shopping Methods',
    'Communication Methods',
]:
  data[multi_choice_column] = data[multi_choice_column].apply(lambda answer: answer.split(","))

# this column needs its own splitter (see pp_types_of_clothing)
data['Types of Clothing'] = data['Types of Clothing'].apply(pp_types_of_clothing)

In [None]:
# data['Shopping Methods'] = data['Shopping Methods'].apply(lambda x: x.split(","))

def make_boolean(x):
  """Return True when the text before the first comma, stripped, is exactly "Yes"."""
  first_part = x.split(",")[0]
  return first_part.strip() == "Yes"

data['Brand Preference'] = data['Brand Preference'].apply(make_boolean)

In [None]:
# meter
# very likely => 5
# likely => 4
# neutral => 3
# unlikely => 2
# very unlikely => 1

def make_likeliness(x):
  """Map a likeliness answer to its score on the 1-5 scale.

  The answer is stripped before the lookup; any answer that is not one of the
  five known labels gives 0.
  """
  scale = {
      "Very likely": 5,
      "Likely": 4,
      "Neutral": 3,
      "Unlikely": 2,
      "Very unlikely": 1,
  }
  return scale.get(x.strip(), 0)

data['Likeliness of Promotion or Discount'] = data['Likeliness of Promotion or Discount'].apply(make_likeliness)

# Section 2: Visualizations

In [None]:
data.head()

In [None]:
# Gender Difference
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
gender_count = data.groupby('Gender')['Gender'].count()
gender_count.plot.pie(figsize=(5, 5), autopct='%.2f%%', title='Gender Difference')

In [None]:
data.groupby('Location')["Gender"].count()

In [None]:
# Location Difference
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
location_count = data.groupby('Location')['Location'].count()
location_count.plot.pie(figsize=(5, 5), autopct='%.2f%%', title='Location Difference')

In [None]:
# Occupation Difference
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
occupation_count = data.groupby('Occupation')['Occupation'].count()
occupation_count.plot.pie(figsize=(5, 5), autopct='%.2f%%', title='Occupation Difference')

In [None]:
# Age Difference
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
age_count = data.groupby('Age')['Age'].count()
age_count.plot.pie(figsize=(5, 5), autopct='%.2f%%', title='Age Difference')

In [None]:
# Online, instore preferences
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
online_instore_count = data.groupby('Online, instore preferences')['Online, instore preferences'].count()
online_instore_count.plot.pie(figsize=(5, 5), autopct='%.2f%%', title='Online, instore preferences')

In [None]:
# Average Spent
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
avg_spent_count = data.groupby('Average Spent')['Average Spent'].count()
avg_spent_count.plot.pie(figsize=(5, 5), autopct='%.2f%%', title='Average Spent')

In [None]:
# Likeliness of Promotion or Discount
# (no y= argument: it only selects a column when plotting a DataFrame; this is a Series)
likeliness_scale_count = data.groupby('Likeliness of Promotion or Discount')['Likeliness of Promotion or Discount'].count()

likeliness_scale_count.plot.barh(figsize=(5, 5), title='Likeliness of Promotion or Discount')

In [None]:
data.info()

# Section 3: Data Encoding

In [None]:
data.index = range(len(data))

In [None]:
# preprocess lists

# As mentioned in the pre-processing section, each column that holds lists is expanded here: every distinct list item becomes a new column named "<column name>_<item>". A row gets 1 in that column if the item is present in its list and 0 if it is not.
# It is similar to one-hot encoding.
# refer: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

def get_uniques(x):
  """Return the distinct stripped items found across all lists in `x`, in first-seen order."""
  stripped_items = (item.strip() for items in x for item in items)
  # dict keys keep insertion order and drop repeats
  return list(dict.fromkeys(stripped_items))

def add_columns(new_columns, prefix):
  """Add a zero-filled column named `<prefix>_<item>` to the global `data` for each item."""
  for item in new_columns:
    indicator_column = "_".join((prefix, item))
    data[indicator_column] = 0

def fill(prefix):
  """Set the indicator columns of the global `data` from the lists in column `prefix`.

  For every row, each item of the row's list is stripped and the column
  `<prefix>_<item>` is set to 1 for that row. Rows are addressed by the labels
  0..len(data)-1, and the indicator columns are expected to exist already
  (add_columns creates them).
  """
  for i in range(len(data[prefix])):
    for item in data.loc[i, prefix]:
      # one .loc assignment writes into `data` itself; the chained form
      # data[col][i] = 1 assigns to a temporary under pandas copy-on-write
      data.loc[i, prefix + "_" + item.strip()] = 1

def preprocess_lists(column_name):
  """Expand a list-valued column of the global `data` into 0/1 indicator columns.

  One `<column_name>_<item>` column is created per distinct item, the rows are
  filled in, and the original list column is then dropped from `data` in place.
  """
  distinct_items = get_uniques(data[column_name])
  add_columns(distinct_items, column_name)
  fill(column_name)
  data.drop(columns=[column_name], inplace=True)


preprocess_lists('Types of Clothing')
preprocess_lists('Shopping Methods')
preprocess_lists('Influence Factors')
preprocess_lists('Brand Choice Factors')
preprocess_lists('Preferred Types of Promotions or Discounts')
preprocess_lists('Communication Methods')
preprocess_lists('Payment Methods')

data

In [None]:
# then we label encode the other columns that hold string (or boolean) values.
# LabelEncoder is already imported in the first cell.

# single source of truth for the encoded columns (the list used to be written
# twice, once on each side of the assignment)
label_encoded_columns = ['Age', 'Gender', 'Occupation', 'Location', 'Shopping Frequency',
                         'Online, instore preferences', 'Average Spent', 'Brand Preference']

# DataFrame.apply calls fit_transform once per column, so each column gets its
# own integer codes
data[label_encoded_columns] = data[label_encoded_columns].apply(LabelEncoder().fit_transform)
data

In [None]:
def graph_component_silhouette(n_clusters, lim_x, mat_size, sample_silhouette_values, clusters):
    """Draw a silhouette plot with one horizontal band per cluster.

    n_clusters: number of clusters; bands are drawn for labels 0..n_clusters-1.
    lim_x: two-item sequence giving the lower and upper x-axis limits.
    mat_size: number of samples, used to size the y-axis.
    sample_silhouette_values: per-sample silhouette scores (array indexable by a boolean mask).
    clusters: per-sample cluster labels, aligned with sample_silhouette_values.
    """
    # global matplotlib styling; these settings stay in effect for later figures
    plt.rcParams["patch.force_edgecolor"] = True
    plt.style.use('fivethirtyeight')
    mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(8, 8)
    x_low, x_high = lim_x[0], lim_x[1]
    ax1.set_xlim([x_low, x_high])
    # room for every sample plus a 10-unit gap around each band
    ax1.set_ylim([0, mat_size + (n_clusters + 1) * 10])

    spectral = mpl.colormaps["Spectral"]
    band_bottom = 10
    for label in range(n_clusters):
        # scores of the samples in this cluster, ascending
        band_scores = np.sort(sample_silhouette_values[clusters == label])
        band_height = band_scores.shape[0]
        band_top = band_bottom + band_height
        band_color = spectral(float(label) / n_clusters)
        ax1.fill_betweenx(np.arange(band_bottom, band_top), 0, band_scores,
                          facecolor=band_color, edgecolor=band_color, alpha=0.8)

        # cluster number, placed at the vertical middle of its band
        ax1.text(-0.03, band_bottom + 0.5 * band_height, str(label), color = 'red', fontweight = 'bold',
                 bbox=dict(facecolor='white', edgecolor='black', boxstyle='round, pad=0.3'))

        # leave a 10-unit gap before the next band
        band_bottom = band_top + 10
    plt.show()

# Section 4: Customer Segmentation using K Means

In [None]:
def graph_comp(matrix, clusters, n_clusters):
  """Compute per-sample silhouette scores for `matrix` and draw the silhouette plot.

  The x-axis is fixed to [-0.1, 1] and the y-axis is sized from len(matrix).
  """
  per_sample_scores = silhouette_samples(matrix, clusters)
  graph_component_silhouette(
      n_clusters=n_clusters,
      lim_x=[-0.1, 1],
      mat_size=len(matrix),
      sample_silhouette_values=per_sample_scores,
      clusters=clusters,
  )

In [None]:
# Define function to benchmarking the performance of the different number of clusters

def benchmark_K_means(X, max_n_clusters, draw_graph_comp=False):
  """Fit K-Means on X for k = 2 .. max_n_clusters - 1 and plot WCSS and mean silhouette per k.

  X: samples to cluster (passed to KMeans.fit / predict and silhouette_score).
  max_n_clusters: exclusive upper bound on the number of clusters tried.
  draw_graph_comp: when True, also draw the per-sample silhouette plot for each k.

  Shows the elbow plot and the average-silhouette plot, then prints both lists.
  Returns None.
  """
  cluster_counts = range(2, max_n_clusters)
  wcss = []  # within-cluster sum of squares (KMeans inertia_) for each k
  silhouette = []  # mean silhouette score for each k
  for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=4, n_init="auto")
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    clusters = kmeans.predict(X)
    if draw_graph_comp:
      print("N clusters:" + str(i))
      # score the samples that were clustered (X), not the notebook-level
      # `data` frame, which can differ from X in rows and columns
      graph_comp(X, clusters, i)
      print("---")
    silhouette_avg = silhouette_score(X, clusters)
    silhouette.append(silhouette_avg)

  plt.plot(cluster_counts, wcss)
  plt.title('The Elbow Method')
  plt.xlabel('no of clusters')
  plt.ylabel('wcss')
  plt.show()

  plt.plot(cluster_counts, silhouette)
  plt.title('Silhouette Avg')
  plt.xlabel('no of clusters')
  plt.ylabel('Avg')
  plt.show()
  print(wcss)
  print(silhouette)

In [None]:
benchmark_K_means(data, 11, True)

In [None]:
data.describe()

In [None]:
data.groupby(["Age"]).count()

# Section 4: Customer Segmentation using K Means (With Dimension Reduction)

In [None]:
# data2 = data.drop(['Cluster', 'cluster'], axis=1)

In [None]:
# PCA
# Reduce the encoded data to 2 dimensions before clustering.
# NOTE(review): the features are not scaled before PCA (the StandardScaler lines
# below are commented out) - confirm this is intended.

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# data_scaled = scaler.fit_transform(data)

pca = PCA(n_components=2)
reduced_data = pca.fit_transform(data)


# Then benchmark the results.
# The first graph is the elbow-method analysis, plotted for n_clusters = 2 to 10.
# WCSS is the within-cluster sum of squares.
# Looking at the elbow method (ref: https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans) and the average silhouette scores, we can determine that n_clusters = 4 is best here.
benchmark_K_means(reduced_data, 11)

In [None]:
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=4, n_init="auto")
data_copy = data.copy()
kmeans.fit(reduced_data)
data_copy['cluster'] = kmeans.labels_

In [None]:
data_copy['cluster'].describe()

In [None]:
# share of the total variance captured by each principal component
# (explained_variance_ratio_ matches the axis label; explained_variance_ holds
# the absolute variances)
plt.plot(np.arange(len(pca.explained_variance_ratio_)), pca.explained_variance_ratio_)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.show()

In [None]:
# NOTE(review): this repeats the graph_comp definition from Section 4 with the
# same behavior; one of the two definitions can be removed.
def graph_comp(matrix, clusters, n_clusters):
  """Compute per-sample silhouette scores for `matrix` and draw the silhouette plot.

  The x-axis is fixed to [-0.1, 1] and the y-axis is sized from len(matrix).
  """
  per_sample_scores = silhouette_samples(matrix, clusters)
  graph_component_silhouette(
      n_clusters=n_clusters,
      lim_x=[-0.1, 1],
      mat_size=len(matrix),
      sample_silhouette_values=per_sample_scores,
      clusters=clusters,
  )

In [None]:
def k_means_plot(n, random_state=4):
  """Cluster the notebook-level `reduced_data` into `n` groups and plot the result.

  n: number of clusters.
  random_state: seed passed to KMeans. Defaults to 4, the seed used by the
    other KMeans fits in this notebook, so re-running gives the same plots.

  Prints the mean silhouette score, draws the per-sample silhouette plot via
  graph_comp, then draws the cluster regions over columns 0 and 1 of
  `reduced_data` with the samples as black dots and the centroids as white X
  marks.
  """
  print("________________________________________________________")
  print(f"Each of this indicate the silhouette analysis when the n_clusters = {n}.")
  print(f"Number of Clusters = {n}" )
  kmeans = KMeans(init="k-means++", n_clusters=n, n_init='auto', random_state=random_state)
  C = kmeans.fit_predict(reduced_data)
  silhouette_avg = silhouette_score(reduced_data, C)
  print(f"score: {silhouette_avg}")

  graph_comp(reduced_data, C, n)

  # Step size of the mesh. Decrease to increase the quality of the VQ.
  h = 0.02

  # Mesh covering the data range with a margin of 1 on every side.
  x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
  y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
  xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

  # Label every mesh point with the model fitted above, then fold the labels
  # back into the mesh shape so they can be shown as an image.
  Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
  Z = Z.reshape(xx.shape)

  plt.figure(1)
  plt.clf()
  plt.imshow(
      Z,
      interpolation="nearest",
      extent=(xx.min(), xx.max(), yy.min(), yy.max()),
      cmap=plt.cm.Paired,
      aspect="auto",
      origin="lower",
  )

  plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
  # Plot the centroids as a white X
  centroids = kmeans.cluster_centers_
  plt.scatter(
      centroids[:, 0],
      centroids[:, 1],
      marker="x",
      s=169,
      linewidths=3,
      color="w",
      zorder=10,
  )
  plt.title(str(n) + " clusters")
  plt.xlim(x_min, x_max)
  plt.ylim(y_min, y_max)
  plt.xticks(())
  plt.yticks(())
  plt.show()
  print("________________________________________________________")

In [None]:
def cluster_area_plot(data, model, n, h = 0.02):
  """Draw the regions a fitted model assigns to each cluster over a 2-D point set.

  data: 2-D array; columns 0 and 1 are used as the x and y coordinates.
  model: fitted estimator providing predict() and cluster_centers_.
  n: number shown in the plot title ("<n> clusters").
  h: step size of the mesh on which the model is evaluated.
  """
  xs, ys = data[:, 0], data[:, 1]
  # plot window: data range with a margin of 1 on every side
  x_min, x_max = xs.min() - 1, xs.max() + 1
  y_min, y_max = ys.min() - 1, ys.max() + 1
  grid_x, grid_y = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

  # label every mesh point, then fold the labels back into the mesh shape
  mesh_points = np.c_[grid_x.ravel(), grid_y.ravel()]
  region_labels = model.predict(mesh_points).reshape(grid_x.shape)

  plt.figure(1)
  plt.clf()
  plt.imshow(
      region_labels,
      interpolation="nearest",
      extent=(grid_x.min(), grid_x.max(), grid_y.min(), grid_y.max()),
      cmap=plt.cm.Paired,
      aspect="auto",
      origin="lower",
  )

  # samples as small black dots, centroids as white circles on top
  plt.plot(xs, ys, "k.", markersize=2)
  centers = model.cluster_centers_
  plt.scatter(centers[:, 0], centers[:, 1], marker="o", s=169, linewidths=3, color="w", zorder=10)

  plt.title(str(n) + " clusters")
  plt.xlim(x_min, x_max)
  plt.ylim(y_min, y_max)
  plt.xticks(())
  plt.yticks(())
  plt.show()

In [None]:
for n in range(2, 11):
  k_means_plot(n)

In [None]:
# Section 5: Demographic Customer Segmentation using K Means (With Dimension Reduction)

reduced_demographic_data = PCA(n_components=2).fit_transform(data[['Age', 'Gender', 'Occupation']])

benchmark_K_means(reduced_demographic_data, 11)

# Demographic Segmentation (Without PCA)

In [None]:
demographic_data = data[["Age", "Gender", "Occupation", "Location", "Shopping Frequency"]] # select the features that applicable for demographic segmentation

print(demographic_data.groupby(["Age"]).count())
benchmark_K_means(demographic_data, 11)

In [None]:
# By looking at the benchmark_K_means results n_clusters = 4 will be the optimum

# Perform customer segmentation using k-means clustering

demographic_best_n_clusters = 4

kmeans = KMeans(n_clusters=demographic_best_n_clusters, random_state=4, n_init="auto")  # Adjust the number of clusters based on your analysis
raw_data["Demographic Cluster"] = kmeans.fit_predict(demographic_data)

# Create visualizations to explore segmentation results

# 1. Distribution of customers across clusters
# value_counts() is ordered by frequency, so re-order it by cluster id (0 for a
# cluster with no members) to keep every bar under its own label
demographic_cluster_sizes = (
    raw_data["Demographic Cluster"]
    .value_counts()
    .reindex(range(demographic_best_n_clusters), fill_value=0)
)
plt.bar([f"{i}" for i in range(demographic_best_n_clusters)], demographic_cluster_sizes)
plt.xlabel("Demographic Cluster")
plt.ylabel("Number of customers")
plt.title("Customer Distribution Across Demographic Clusters")
plt.show()


In [None]:
def draw_plot(n_clusters, xlabel, xlim=3):
  """Histogram of raw_data[xlabel] for each demographic cluster, one subplot per cluster.

  n_clusters: number of clusters; cluster ids 0..n_clusters-1 are plotted.
  xlabel: column of the notebook-level `raw_data` to plot; also the x-axis label.
  xlim: right-hand x-axis limit applied to every subplot.

  Subplots are laid out two per row in cluster-id order, so 4 clusters give
  the 2x2 grid; any other count gets as many rows as it needs.
  """
  n_cols = 2
  n_rows = max(1, -(-n_clusters // n_cols))  # ceiling division
  # squeeze=False keeps `ax` two-dimensional even when there is a single row
  fig, ax = plt.subplots(n_rows, n_cols, figsize=(8, 10), squeeze=False)

  for cluster_id in range(n_clusters):
    panel = ax[cluster_id // n_cols, cluster_id % n_cols]
    panel.set_xlim(right=xlim)
    panel.set_ylim(0, 45)
    panel.hist(raw_data.loc[raw_data["Demographic Cluster"] == cluster_id, xlabel])
    panel.set_xlabel(xlabel,  fontsize='small')
    panel.set_ylabel("Number of customers",  fontsize='small')
    panel.set_title("Cluster {}".format(cluster_id),  fontsize='small')

  # hide the panels that have no cluster (odd cluster counts)
  for spare in range(n_clusters, n_rows * n_cols):
    ax[spare // n_cols, spare % n_cols].set_visible(False)
  plt.show()

In [None]:
# 2. Age distribution within each cluster
draw_plot(4, "Age")

In [None]:
draw_plot(4, "Gender", 1)

In [None]:
draw_plot(4, "Location")

# Demographic Segmentation (With PCA)

In [None]:
demographic_data = data[["Age", "Gender", "Occupation", "Location", "Shopping Frequency"]] # select the features that applicable for demographic segmentation

pca = PCA(n_components=2)
reduced_demographic_data = pca.fit_transform(demographic_data)

benchmark_K_means(reduced_demographic_data, 11)

In [None]:
N_CLUSTERS = 4

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=4, n_init="auto")  # Adjust the number of clusters based on your analysis
demographic_data["Demographic Cluster PCA"] = kmeans.fit_predict(reduced_demographic_data)

# 1. Distribution of customers across clusters
# value_counts() is ordered by frequency, so re-order it by cluster id (0 for a
# cluster with no members) to keep every bar under its own label
demographic_pca_cluster_sizes = (
    demographic_data["Demographic Cluster PCA"]
    .value_counts()
    .reindex(range(N_CLUSTERS), fill_value=0)
)
plt.bar([f"{i}" for i in range(N_CLUSTERS)], demographic_pca_cluster_sizes)
plt.xlabel("Demographic Cluster PCA")
plt.ylabel("Number of customers")
plt.title("Customer Distribution Across Demographic Clusters")
plt.show()

In [None]:
# 2. Gender distribution within each PCA-based demographic cluster
# (one histogram per cluster on a 2x2 grid; cluster id = 2 * row + column)
# NOTE(review): the loops use demographic_best_n_clusters while the clustering
# above used N_CLUSTERS; both are 4 here - confirm they are meant to stay equal.
fig, ax = plt.subplots(2, 2, figsize=(8, 10))

for c1 in range(int(demographic_best_n_clusters / 2)):
    for c2 in range(int(demographic_best_n_clusters / 2)):
      ax[c1, c2].set_xlim(right=3)
      ax[c1, c2].set_ylim(0, 45)
      ax[c1, c2].hist(demographic_data.loc[demographic_data["Demographic Cluster PCA"] == (2 * c1) + c2, "Gender"], range=(-1, 1))
      ax[c1, c2].set_xlabel("Gender",  fontsize='small')
      ax[c1, c2].set_ylabel("Number of customers",  fontsize='small')
      ax[c1, c2].set_title("Cluster {}".format((2 * c1) + c2),  fontsize='small')
plt.show()

In [None]:
cluster_area_plot(reduced_demographic_data, kmeans, N_CLUSTERS)

In [None]:
data

In [None]:
# Brand Choice Factors
# Influence Factors
# Shopping Frequency
# Shopping Methods
# Online, In-store Preferences
# Types of Clothing
# Average Spent
# Prefered Types of Promotions or Discounts
# Likeliness of Promotion or Discount

psychographic_data = data[['Online, instore preferences', 'Shopping Frequency', 'Brand Preference',
       'Likeliness of Promotion or Discount', 'Average Spent',
       'Types of Clothing_Casual Wear',
       'Types of Clothing_Accessories',
       'Types of Clothing_Formal Wear',
       'Types of Clothing_Athletic/Activewear',
       'Types of Clothing_Ethnic or traditional wear',
       'Shopping Methods_Physical retail stores',
       'Shopping Methods_Online fashion stores/ web-sites',
       'Shopping Methods_Department stores', 'Shopping Methods_Boutiques',
       'Shopping Methods_Thrift/ Second Hand shops',
       'Influence Factors_Availability of a wide range of options',
       'Influence Factors_Competitive pricing',
       'Influence Factors_Ability to try on products before purchasing',
       'Influence Factors_Convenience',
       'Influence Factors_Personalized shopping experience',
       'Influence Factors_Environmental considerations (eg. reduce carbon emission)',
       'Brand Choice Factors_Price and affordability',
       'Brand Choice Factors_Trendiness and style',
       'Brand Choice Factors_Promotions and Discounts',
       'Brand Choice Factors_Brand reputation and quality',
       'Brand Choice Factors_Brand\'s commitment to sustainability',
       'Preferred Types of Promotions or Discounts_Percentage discounts',
       'Preferred Types of Promotions or Discounts_Seasonal sales',
       'Preferred Types of Promotions or Discounts_Other',
       'Preferred Types of Promotions or Discounts_Buy-one-get-one(BOGO) deals',
       'Preferred Types of Promotions or Discounts_Free shipping',
       'Preferred Types of Promotions or Discounts_Loyalty rewards programs']]

print(len(psychographic_data.columns))
psychographic_data.columns

In [None]:
benchmark_K_means(psychographic_data, 11)

In [None]:
# psychographic_data.columns
pca = PCA(n_components=2)
reduced_psychographic_data = pca.fit_transform(psychographic_data)

benchmark_K_means(reduced_psychographic_data, 11)

In [None]:
N_CLUSTERS = 5

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=4, n_init="auto")  # Adjust the number of clusters based on your analysis
psychographic_data["Psychographic Cluster PCA"] = kmeans.fit_predict(reduced_psychographic_data)

# 1. Distribution of customers across clusters
# value_counts() is ordered by frequency, so re-order it by cluster id (0 for a
# cluster with no members) to keep every bar under its own label
psychographic_cluster_sizes = (
    psychographic_data["Psychographic Cluster PCA"]
    .value_counts()
    .reindex(range(N_CLUSTERS), fill_value=0)
)
plt.bar([f"{i}" for i in range(N_CLUSTERS)], psychographic_cluster_sizes)
plt.xlabel("Psychographic Cluster PCA")
plt.ylabel("Number of customers")
plt.title("Customer Distribution Across Psychographic Clusters")
plt.show()

In [None]:
cluster_area_plot(reduced_psychographic_data, kmeans, N_CLUSTERS)

In [None]:
# Online, In-store Preferences
# Communication Method
# Payment Methods

technographic_data = data[['Payment Methods_Cash', 'Payment Methods_Credit/Debit Card',
       'Payment Methods_Mobile Payment Apps',
       'Payment Methods_Online Payment Platforms', 'Communication Methods_SMS/text messages',
       'Communication Methods_Social media updates',
       'Communication Methods_In store notifications',
       'Communication Methods_Leaflets',
       'Communication Methods_Email newsletters', 'Online, instore preferences']]

len(technographic_data.columns)

In [None]:
benchmark_K_means(technographic_data, 11)

In [None]:
pca = PCA(n_components=2)
reduced_technographic_data = pca.fit_transform(technographic_data)

benchmark_K_means(reduced_technographic_data, 11)

In [None]:
N_CLUSTERS = 8

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=4, n_init="auto")  # Adjust the number of clusters based on your analysis
technographic_data["Technographic Cluster PCA"] = kmeans.fit_predict(reduced_technographic_data)

# 1. Distribution of customers across clusters
# value_counts() is ordered by frequency, so re-order it by cluster id (0 for a
# cluster with no members) to keep every bar under its own label
technographic_cluster_sizes = (
    technographic_data["Technographic Cluster PCA"]
    .value_counts()
    .reindex(range(N_CLUSTERS), fill_value=0)
)
plt.bar([f"{i}" for i in range(N_CLUSTERS)], technographic_cluster_sizes)
plt.xlabel("Technographic Cluster PCA")
plt.ylabel("Number of customers")
plt.title("Customer Distribution Across Technographic Clusters")
plt.show()

In [None]:
cluster_area_plot(reduced_technographic_data, kmeans, N_CLUSTERS)

In [None]:
# Shopping Frequency
# Shopping Methods
# Online, In-store Preferences
# Types of Clothing
# Brand Preference
# Average Spent
# Payment Methods
# Prefered Types of Promotions or Discounts
data.columns

In [None]:
behavioral_data = data[['Shopping Frequency', 'Shopping Methods_Physical retail stores',
       'Shopping Methods_Online fashion stores/ web-sites',
       'Shopping Methods_Department stores', 'Shopping Methods_Boutiques',
       'Shopping Methods_Thrift/ Second Hand shops', 'Online, instore preferences', 'Types of Clothing_Casual Wear', 'Types of Clothing_Accessories',
       'Types of Clothing_Formal Wear',
       'Types of Clothing_Athletic/Activewear',
       'Types of Clothing_Ethnic or traditional wear', 'Brand Preference', 'Payment Methods_Cash',
       'Payment Methods_Credit/Debit Card',
       'Payment Methods_Mobile Payment Apps',
       'Payment Methods_Online Payment Platforms', 'Average Spent', 'Preferred Types of Promotions or Discounts_Percentage discounts',
       'Preferred Types of Promotions or Discounts_Seasonal sales',
       'Preferred Types of Promotions or Discounts_Other',
       'Preferred Types of Promotions or Discounts_Buy-one-get-one(BOGO) deals',
       'Preferred Types of Promotions or Discounts_Free shipping',
       'Preferred Types of Promotions or Discounts_Loyalty rewards programs',]]

behavioral_data.shape

In [None]:
benchmark_K_means(behavioral_data, 11)

In [None]:
pca = PCA(n_components=2)
reduced_behavioral_data = pca.fit_transform(behavioral_data)

benchmark_K_means(reduced_behavioral_data, 11)

In [None]:
N_CLUSTERS = 5

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=4, n_init="auto")  # Adjust the number of clusters based on your analysis
behavioral_data["Behavioral Cluster PCA"] = kmeans.fit_predict(reduced_behavioral_data)

# 1. Distribution of customers across clusters
# value_counts() is ordered by frequency, so re-order it by cluster id (0 for a
# cluster with no members) to keep every bar under its own label
behavioral_cluster_sizes = (
    behavioral_data["Behavioral Cluster PCA"]
    .value_counts()
    .reindex(range(N_CLUSTERS), fill_value=0)
)
plt.bar([f"{i}" for i in range(N_CLUSTERS)], behavioral_cluster_sizes)
plt.xlabel("Behavioral Cluster PCA")
plt.ylabel("Number of customers")
plt.title("Customer Distribution Across Behavioral Clusters")
plt.show()

In [None]:
cluster_area_plot(reduced_behavioral_data, kmeans, N_CLUSTERS)

In [None]:
# attach the four PCA-based cluster labels to the encoded data, one column per
# segmentation ("Segment" is now spelled the same way in all four names)
data['Demographic Segment'] = demographic_data["Demographic Cluster PCA"]
data['Psychographic Segment'] = psychographic_data["Psychographic Cluster PCA"]
data['Technographic Segment'] = technographic_data["Technographic Cluster PCA"]
data['Behavioral Segment'] = behavioral_data["Behavioral Cluster PCA"]

In [None]:
data