# 1. Data Preparation
Import the data file and prepare it for analysis.

In [None]:
import pandas as pd

In [None]:
# Load the raw transactions. Forward slashes resolve on Windows, macOS, and
# Linux alike; a backslash-separated relative path only resolves on Windows.
df = pd.read_excel('../Datasets/Online_Retail.xlsx')

In [None]:
# Display the loaded DataFrame for a first look at its contents.
df

In [None]:
# Data Cleaning
# NOTE(review): forward-filling CustomerID assigns every row with a missing ID
# to whichever customer appears on the preceding row. Confirm this is intended;
# dropping rows without a CustomerID may be the safer choice.
df['CustomerID'] = df['CustomerID'].ffill()

# Reassign instead of mutating in place, and tolerate an already-removed
# column, so that re-running this cell does not raise KeyError.
df = df.drop(columns='Description', errors='ignore')
df = df.dropna(subset=['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'Country'])

# Remaining null count per column after cleaning.
df.isnull().sum()

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
# Column labels available after cleaning.
df.columns

---
# 2. Feature Engineering
We extract features that reflect customer behavior: order frequency (the number of distinct invoices per customer) and total spending. Additional metrics, such as recency of purchase, could be added to capture behavior more fully.

In [None]:
# Value of each invoice line: Quantity multiplied by UnitPrice.
df['TotalAmount'] = df['Quantity'].mul(df['UnitPrice'])

# Collapse the invoice lines to one row per customer. Named aggregation sets
# the output column names directly, so no separate rename step is needed.
df_customer = (
    df.groupby('CustomerID')
    .agg(
        OrderFrequency=('InvoiceNo', 'nunique'),   # distinct invoices per customer
        TotalSpending=('TotalAmount', 'sum'),      # summed TotalAmount per customer
    )
    .reset_index()
)

df_customer

---
# 3. Dimensionality Reduction with PCA
PCA is applied to reduce the dimensionality of the feature space.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Columns that feed the PCA.
features = ['OrderFrequency', 'TotalSpending']
x = df_customer[features]

# Standardize the features before projecting them.
scaler = StandardScaler()
x_scl = scaler.fit_transform(x)

# Fit PCA with every component kept, to inspect how the variance is split.
pca = PCA()
pca_comp = pca.fit_transform(x_scl)
component_names = pca.get_feature_names_out()
df_pca = pd.DataFrame(pca_comp, columns=component_names)

pca.explained_variance_ratio_

In [None]:
# Refit PCA, this time asking explicitly for two components.
pca = PCA(n_components=2)
pca_comp = pca.fit_transform(x_scl)
component_names = pca.get_feature_names_out()
df_pca = pd.DataFrame(pca_comp, columns=component_names)

# Attach the customer identifiers next to the component scores; both frames
# are aligned on their row index.
customer_ids = df_customer[['CustomerID']]
df_final = pd.concat([df_pca, customer_ids], axis=1)

df_final

---
# 4. Determining Optimal Number of Clusters:
We use the Elbow method to find the optimal number of clusters for K-means clustering.

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Candidate cluster counts, defined once so the fitting loop and the plot's
# x-axis cannot drift apart.
k_values = range(1, 20)

# Fit one K-means model per candidate k and record its inertia.
list_of_inirtias = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=29)
    km.fit(df_pca)
    list_of_inirtias.append(km.inertia_)
    print(f'inertia {k}: {km.inertia_}')

# Plot inertia against k; the "elbow" marks where adding clusters stops paying off.
plt.figure(figsize=(12, 6))
plt.plot(k_values, list_of_inirtias, 'r-o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

## 3 seems to be the best k-value

---
# 5. K-means Clustering

With dimensionality reduced, K-means clustering is performed to assign each customer to a cluster based on their feature values.

In [None]:
km = KMeans(n_clusters=3, random_state=29)

# fit_predict both fits the model and returns the cluster label of every row,
# so a separate km.fit() beforehand would only train the same model twice.
predictions = km.fit_predict(df_pca)

df_final['Cluster'] = predictions

df_final

---
# 6. Cluster Profiling:

Each cluster is profiled to get a better sense of what each cluster represents.

In [None]:
# Bring each customer's cluster label back alongside the original features.
cluster_labels = df_final[['CustomerID', 'Cluster']]
df_cluster = df_customer.merge(cluster_labels, on='CustomerID')

# Average feature values per cluster.
prof_cluster = (
    df_cluster
    .groupby('Cluster')[['OrderFrequency', 'TotalSpending']]
    .mean()
    .reset_index()
)

prof_cluster

---
# 7. Visualization:

Using a scatter plot, we visualize the data to see how the clusters are distributed.

In [None]:
import seaborn as sns

# Scatter the customers on the two principal components, coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_final, x='pca0', y='pca1', hue='Cluster', palette='tab10', ax=ax)
ax.set_title('Customer Segments')
plt.show()

---
# 8. Evaluation:

Clustering quality is assessed using the silhouette score and the within-cluster sum of squares (WCSS).

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
sil_score = silhouette_score(df_pca, df_final['Cluster'])
print(f'Silhouette Score: {sil_score}')

# km.transform returns, for every sample, its distance to EVERY cluster centre
# (one column per cluster). WCSS counts only the distance to the sample's own
# centre, which is the nearest one, so take the row-wise minimum. Summing
# across all columns would add the distances to the other centres as well and
# overstate the result. The total should agree with km.inertia_.
distances = km.transform(df_pca)
squared_distances = distances**2
wcss = squared_distances.min(axis=1)
total_wcss = wcss.sum()

print("Total within-cluster sum of squares (WCSS):", total_wcss)

---
## Extra: Remapping Clusters into proper named categories

We remap the clusters into a new column called `BuyerType`, so that each cluster carries a descriptive name instead of the raw labels 0, 1, 2.

In [None]:
# NOTE(review): these labels are tied to the numeric cluster ids of one
# particular K-means run; re-check them against prof_cluster whenever the
# data, the seed, or the library version changes.
cluster_mapping = {
    0: 'Low, Infrequent',
    1: 'High, Frequent',
    2: 'Moderate',
}

buyer_type = df_cluster['Cluster'].map(cluster_mapping)

# Replace the numeric Cluster column with the readable label, placed as the
# second column (right after the first column).
df_cluster = df_cluster.drop(columns='Cluster')
df_cluster.insert(1, 'BuyerType', buyer_type)

df_cluster

In [None]:
# Show only the 'Moderate' and 'High, Frequent' buyers.
df_cluster[df_cluster['BuyerType'].isin(['Moderate', 'High, Frequent'])]


---

## 9. Personalization and Recommendations

Tailor personalized marketing strategies and recommendations for each Buyer Type.

In [None]:
# Add a MarketingStrategy to each type of buyer.
def marketing_strategy(cluster):
    """Return the marketing strategy text for a buyer-type label.

    Parameters
    ----------
    cluster : str
        Buyer-type label: 'High, Frequent', 'Moderate', or 'Low, Infrequent'.

    Returns
    -------
    str or None
        The comma-separated strategy description for the label, or None when
        the label is not one of the three known buyer types.
    """
    strategies = {
        'High, Frequent': 'Loyalty programs, Exclusive deals',
        'Moderate': 'Loyalty programs, Seasonal Promotions',
        'Low, Infrequent': 'Bundles, Seasonal Promotions',
    }
    # .get makes the "unknown label -> None" fall-through explicit.
    return strategies.get(cluster)

# Look up the strategy text for every customer's buyer type.
df_cluster['MarketingStrategy'] = df_cluster['BuyerType'].map(marketing_strategy)

df_cluster

---

## 10. Interpretation and Insights

### High-Value Customer Engagement

The segmentation analysis revealed distinct customer groups based on purchasing behavior. High-frequency, high-value customers contribute significantly to revenue and should be engaged with loyalty programs and exclusive offers to maintain their interest.

### Addressing Low-Value Customers

Infrequent, low-value customers are at risk of churning. They can be motivated with special discounts and personalized recommendations to boost their purchase frequency and value.

### Optimizing Marketing and Inventory

Understanding these segments can help in targeted marketing, improved customer retention strategies, and optimized inventory management. This approach ensures popular products are well-stocked while reducing overstock of less popular items, ultimately enhancing customer satisfaction and profitability.