In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import warnings

In [35]:
# Silence every warning category so the cell outputs stay readable.
# NOTE(review): this blanket filter also hides pandas / scikit-learn warnings
# that may point at real problems — consider narrowing it while debugging.
warnings.simplefilter('ignore')

# Input dataset.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
DATA_PATH = r'E:\AIML Tasks\Nike_Sales_Uncleaned.csv'

# File names under which the generated figures are saved.
OUTPUT_IMAGE_ELBOW = 'cluster_elbow_curve.png'
OUTPUT_IMAGE_KMEANS = 'cluster_kmeans_pca.png'
OUTPUT_IMAGE_HIERARCHICAL = 'cluster_hierarchical.png'

In [36]:
def load_and_clean_data(filepath):
    """
    Load the raw sales CSV and return a cleaned copy ready for clustering.

    Steps:
    1. Normalise column names (strip, lower-case, spaces -> underscores).
    2. Strip '$', ',' and whitespace from 'revenue', 'profit' and 'mrp', and
       ',' from 'units_sold', then convert those columns to numbers.
       Values that still cannot be parsed become NaN.
    3. Drop rows where 'revenue', 'profit' or 'units_sold' is NaN.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original row labels are
        kept), so callers can add columns without chained-assignment warnings.

    Raises
    ------
    KeyError
        If 'revenue', 'profit' or 'units_sold' is missing once the column
        names have been normalised.
    """
    print("Loading and cleaning data...")
    df = pd.read_csv(filepath)

    # Standardise column names so the lookups below are predictable.
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    # Column -> regex of characters to strip before numeric conversion.
    strip_patterns = {
        'revenue': r'[$,\s]',
        'profit': r'[$,\s]',
        'mrp': r'[$,\s]',
        'units_sold': r'[,]',
    }
    required_cols = ['revenue', 'profit', 'units_sold']

    # Fail early with a readable message instead of an opaque dropna KeyError.
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Required column(s) missing after normalisation: {missing}")

    for col, pattern in strip_patterns.items():
        if col not in df.columns:
            continue  # optional column (e.g. 'mrp') is absent
        # Any non-numeric dtype may hold dirty text. Checking only for
        # ``object`` would skip columns stored with a dedicated string dtype,
        # and every formatted value would then be coerced to NaN.
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].replace(pattern, '', regex=True)
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # NOTE(review): negative values are kept as-is; the recorded cluster profile
    # shows negative mean revenue / units_sold — confirm whether those rows are
    # legitimate (e.g. returns) or dirty data that should be filtered.
    # ``.copy()`` detaches the result from ``df`` so later column assignments
    # by the caller do not write into a slice.
    clean_df = df.dropna(subset=required_cols).copy()

    print(f"Data cleaned. Rows remaining: {len(clean_df)}")
    return clean_df

In [37]:
def preprocess_features(df):
    """
    Build the standardised feature matrix used for clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'revenue', 'profit' and 'units_sold' columns.

    Returns
    -------
    tuple
        ``(X_scaled, features)``: the result of ``StandardScaler.fit_transform``
        on those three columns, and the list of column names in the order used.
    """
    feature_names = ['revenue', 'profit', 'units_sold']
    # Put every feature on a comparable scale before distance-based clustering.
    scaled_matrix = StandardScaler().fit_transform(df[feature_names])
    return scaled_matrix, feature_names

In [38]:
def determine_optimal_k(X_scaled, k_min=2, k_max=10):
    """
    Choose the number of clusters with the highest silhouette score.

    K-Means is fitted for every K in the search range. The inertia of each fit
    is plotted as an elbow curve and saved to ``OUTPUT_IMAGE_ELBOW``; the K
    whose labels give the best silhouette score is returned.

    Parameters
    ----------
    X_scaled : 2-D array-like
        Scaled feature matrix, one row per sample.
    k_min : int, default 2
        Smallest K to try (values below 2 are raised to 2).
    k_max : int, default 10
        Largest K to try. It is capped at ``n_samples - 1`` because the
        silhouette score is only defined for 2 <= n_labels <= n_samples - 1.

    Returns
    -------
    int
        The K with the highest silhouette score (the smallest such K on ties).

    Raises
    ------
    ValueError
        If the data has too few samples for any K in the requested range.
    """
    print("Determining optimal cluster count (K)...")

    n_samples = len(X_scaled)
    lower_k = max(k_min, 2)
    upper_k = min(k_max, n_samples - 1)
    if upper_k < lower_k:
        raise ValueError(
            f"Cannot search K in [{k_min}, {k_max}] with only {n_samples} sample(s); "
            "the silhouette score needs 2 <= K <= n_samples - 1."
        )
    k_values = range(lower_k, upper_k + 1)

    inertia = []
    silhouette_scores = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        inertia.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

    # Elbow curve on an explicit figure so no global pyplot state leaks out.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(list(k_values), inertia, marker='o', linestyle='--')
    ax.set_title('Elbow Method for Optimal K')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.grid(True)
    fig.savefig(OUTPUT_IMAGE_ELBOW)
    plt.close(fig)

    # argmax returns the first maximum, i.e. the smallest K among ties.
    best_k = k_values[int(np.argmax(silhouette_scores))]
    print(f"Optimal K based on Silhouette Score: {best_k}")
    return best_k

In [39]:
def run_kmeans(X_scaled, k):
    """
    Fit K-Means with ``k`` clusters and save a 2-D PCA scatter of the result.

    Parameters
    ----------
    X_scaled : 2-D array-like
        Scaled feature matrix, one row per sample.
    k : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row, in row order (from ``KMeans.fit_predict``).

    Side effects
    ------------
    Writes the scatter plot to ``OUTPUT_IMAGE_KMEANS``.
    """
    print(f"Training K-Means with K={k}...")
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)

    # Project onto the first two principal components purely for plotting;
    # the clustering itself used the full feature space.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(
        x=X_pca[:, 0],
        y=X_pca[:, 1],
        hue=clusters,
        palette='viridis',
        s=100,
        # Labels are numeric, so the default legend='auto' may fall back to a
        # "brief" legend of evenly spaced sample values. 'full' guarantees one
        # entry per cluster.
        legend='full',
        ax=ax,
    )
    ax.set_title(f'K-Means Clustering (PCA Projection, K={k})')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.legend(title='Cluster')
    fig.savefig(OUTPUT_IMAGE_KMEANS)
    plt.close(fig)

    return clusters

In [40]:
def run_hierarchical(X_scaled, k):
    """
    Fit Ward-linkage agglomerative clustering and save a 2-D PCA scatter.

    Parameters
    ----------
    X_scaled : 2-D array-like
        Scaled feature matrix, one row per sample.
    k : int
        Number of clusters to cut the hierarchy into.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row, in row order
        (from ``AgglomerativeClustering.fit_predict``).

    Side effects
    ------------
    Writes the scatter plot to ``OUTPUT_IMAGE_HIERARCHICAL``.
    """
    print("Training Hierarchical Clustering...")
    hc = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
    clusters = hc.fit_predict(X_scaled)

    # A fresh PCA is fitted here, but on the same scaled data as the K-Means
    # plot, so the two projections should be directly comparable.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(
        x=X_pca[:, 0],
        y=X_pca[:, 1],
        hue=clusters,
        palette='magma',
        s=100,
        # One legend entry per cluster; the default 'auto' may sample values
        # for a numeric hue.
        legend='full',
        ax=ax,
    )
    ax.set_title(f'Hierarchical Clustering (PCA Projection, K={k})')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    # Titled legend, matching the K-Means figure.
    ax.legend(title='Cluster')
    fig.savefig(OUTPUT_IMAGE_HIERARCHICAL)
    plt.close(fig)

    return clusters

In [41]:
# --- Execution Flow ---
# Runs the whole pipeline: load -> scale -> choose K -> cluster twice -> save -> profile.
if __name__ == "__main__":
    # 1. Load Data
    df = load_and_clean_data(DATA_PATH)

    # 2. Preprocess
    X_scaled, feature_names = preprocess_features(df)

    # 3. Find K
    optimal_k = determine_optimal_k(X_scaled)

    # 4. K-Means Clustering
    kmeans_labels = run_kmeans(X_scaled, optimal_k)

    # 5. Hierarchical Clustering (same K so the two results are comparable)
    hierarchical_labels = run_hierarchical(X_scaled, optimal_k)

    # Attach both label arrays with ``assign``: it returns a new frame, so
    # nothing is written in place into the filtered frame returned by the
    # loader. The labels are positional arrays in the same row order as ``df``.
    df = df.assign(
        kmeans_cluster=kmeans_labels,
        hierarchical_cluster=hierarchical_labels,
    )

    # 6. Save Results
    output_csv = 'Nike_Sales_Clustered.csv'
    df.to_csv(output_csv, index=False)
    print(f"Analysis complete. Clustered data saved to {output_csv}")

    # 7. Simple Analysis of Clusters
    # Profile exactly the features that were clustered on, rather than
    # repeating the column list by hand.
    print("\n--- Cluster Profiling (Mean Values) ---")
    print(df.groupby('kmeans_cluster')[feature_names].mean())

Loading and cleaning data...
Data cleaned. Rows remaining: 1265
Determining optimal cluster count (K)...
Optimal K based on Silhouette Score: 6
Training K-Means with K=6...
Training Hierarchical Clustering...
Analysis complete. Clustered data saved to Nike_Sales_Clustered.csv

--- Cluster Profiling (Mean Values) ---
                     revenue       profit  units_sold
kmeans_cluster                                       
0                 -40.787953  2642.567148    0.003356
1                  66.236986    36.144681    2.960993
2                 -35.448478   162.983913   -0.046584
3               21549.624545  1498.572727    3.363636
4                  37.300234  2649.511371    2.973244
5                8375.019623  1402.122453    2.415094
