In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX
from datetime import datetime, timedelta
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [None]:
# Load the merged dataset from the path relative to the notebook's working directory.
data = pd.read_csv("./dataset/final_merged_df.csv")

# Clustering uses a single feature, weekly_sales.
# Double brackets keep it as a one-column DataFrame (2-D input for the scaler).
X = data[['weekly_sales']]

# Standardize the feature before clustering (K-Means is distance based).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: fit K-Means for k = 1..10 and record the inertia of each fit.
k_values = range(1, 11)
inertia = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" of this curve suggests a reasonable k.
fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker='o')
ax.set(
    title='Elbow Method for Optimal k',
    xlabel='Number of clusters',
    ylabel='Inertia',
)
plt.show()

In [None]:
# Parse the 'date' column and promote it to the DataFrame's index.
# The guard makes this cell safe to re-run: after the first execution 'date'
# is the index rather than a column, so an unguarded data['date'] lookup
# would raise KeyError on the second run.
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'])
    # In-place so `data` stays the same object for the cells that follow.
    data.set_index('date', inplace=True)
data.head()

In [None]:
# Fit the final K-Means model and tag every row with its cluster label.
# n_clusters=3 is hard-coded; it is meant to be chosen from the elbow plot.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
data['Cluster'] = cluster_labels

# Summary statistics of weekly_sales within each cluster.
cluster_summary = data.groupby('Cluster')['weekly_sales'].describe()
print(cluster_summary)

# Scatter of weekly_sales against the DataFrame index, coloured by cluster.
fig, ax = plt.subplots()
ax.scatter(
    data.index,
    data['weekly_sales'],
    c=data['Cluster'],
    cmap='viridis',
)
ax.set(
    title='K-Means Clustering of weekly_sales',
    xlabel='Date Index',
    ylabel='weekly_sales',
)
plt.xticks(rotation=45)
plt.show()