In [32]:
import pandas as pd
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# File name
features_5 = "5_features.csv"

In [38]:
# read data
df = pd.read_csv(features_5).drop("Unnamed: 0", axis=1)
df.columns = ["SKU", "return_rate", "size", "dep_1", "dep_2", "dep_3", "most_sold_season","discounted"]
df.head()

Unnamed: 0,SKU,return_rate,size,dep_1,dep_2,dep_3,most_sold_season,discounted
0,69,0.0,M,0,1,0,Fall,-1
1,73,0.0,OTHER,0,1,0,Spring,1
2,78,0.259259,L,0,1,0,Summer,1
3,105,0.0,S,1,0,0,Summer,0
4,107,0.0,OTHER,1,0,0,Summer,0


In [39]:
# Combine one-hot encoded department columns
df['department'] = df[['dep_1', 'dep_2', 'dep_3']].idxmax(axis=1).str.extract(r'(\d+)').astype(int)
df = df.drop(['dep_1', 'dep_2', 'dep_3'], axis=1)

# Encoding categorical variables
categorical_cols = ['size', 'most_sold_season','discounted','department']
label_encoder = LabelEncoder()
df['size'] = label_encoder.fit_transform(df['size'])
df['most_sold_season'] = label_encoder.fit_transform(df['most_sold_season'])
df['discounted'] = label_encoder.fit_transform(df['discounted'])
df['department'] = label_encoder.fit_transform(df['department'])
# Convert categorical columns to string type
df[categorical_cols] = df[categorical_cols].astype(str)

# Standardize numerical columns
numerical_cols = ['return_rate']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

df.head()

Unnamed: 0,SKU,return_rate,size,most_sold_season,discounted,department
0,69,-0.933181,2,0,0,1
1,73,-0.933181,4,1,2,1
2,78,1.894038,1,2,2,1
3,105,-0.933181,5,2,1,0
4,107,-0.933181,4,2,1,0


In [53]:
# Specify the number of clusters
num_clusters = 3

# Fit K-Prototypes model
kproto = KPrototypes(n_clusters=num_clusters, init='Huang', n_init=10, verbose=1)
clusters = kproto.fit_predict(df[['return_rate','size', 'most_sold_season','discounted','department']], categorical=list(range(len(categorical_cols))))

# Add cluster labels to the original DataFrame
df['cluster'] = clusters

# Display the result
df

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 149993, ncost: 474107.89455757523
Run: 1, iteration: 2/100, moves: 117439, ncost: 447985.6730953131
Run: 1, iteration: 3/100, moves: 50128, ncost: 447668.62974253605
Run: 1, iteration: 4/100, moves: 0, ncost: 447668.62974253605
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 30810, ncost: 417903.2254827374
Run: 2, iteration: 2/100, moves: 0, ncost: 417903.2254827374
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 54968, ncost: 517369.16301997914
Run: 3, iteration: 2/100, moves: 32703, ncost: 517369.16301997914
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/100, moves: 63720, ncost: 519240.63443315425
Run: 4, iteration: 2/100, moves: 0, ncost: 519240.63443315425
Init: initializing centroids
Init

Unnamed: 0,SKU,return_rate,size,most_sold_season,discounted,department,cluster
0,69,-0.933181,2,0,0,1,0
1,73,-0.933181,4,1,2,1,0
2,78,1.894038,1,2,2,1,0
3,105,-0.933181,5,2,1,0,2
4,107,-0.933181,4,2,1,0,2
...,...,...,...,...,...,...,...
576488,9999967,-0.933181,0,2,1,1,0
576489,9999970,1.793066,4,0,2,1,0
576490,9999974,0.507100,1,2,2,1,0
576491,9999992,-0.933181,1,2,1,1,0


In [54]:
df.to_csv("clustering_result3.csv")

In [55]:
df["cluster"].value_counts()

cluster
0    305029
2    167012
1    104452
Name: count, dtype: int64

In [50]:
# Specify the number of clusters
num_clusters = 10

# Fit K-Prototypes model
kproto = KPrototypes(n_clusters=num_clusters, init='Huang', n_init=10, verbose=1)
clusters = kproto.fit_predict(df[['return_rate','size', 'most_sold_season','discounted','department']], categorical=list(range(len(categorical_cols))))

# Add cluster labels to the original DataFrame
df['cluster'] = clusters

# Display the result
df

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 124905, ncost: 331394.0388967364
Run: 1, iteration: 2/100, moves: 99125, ncost: 329019.0167170701
Run: 1, iteration: 3/100, moves: 43741, ncost: 329018.73162259604
Run: 1, iteration: 4/100, moves: 0, ncost: 329018.73162259604
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 249822, ncost: 332855.17375636427
Run: 2, iteration: 2/100, moves: 91123, ncost: 331038.69689455157
Run: 2, iteration: 3/100, moves: 2, ncost: 331038.69689455157
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 242580, ncost: 348734.2725239563
Run: 3, iteration: 2/100, moves: 39980, ncost: 347672.22911253496
Run: 3, iteration: 3/100, moves: 5442, ncost: 347672.22911253496
Init: initializing centroids
Init: initializing clusters
Init: initializing centroids
Init: initializing clus

Unnamed: 0,SKU,return_rate,size,most_sold_season,discounted,department,cluster
0,69,-0.933181,2,0,0,1,0
1,73,-0.933181,4,1,2,1,2
2,78,1.894038,1,2,2,1,0
3,105,-0.933181,5,2,1,0,3
4,107,-0.933181,4,2,1,0,3
...,...,...,...,...,...,...,...
576488,9999967,-0.933181,0,2,1,1,8
576489,9999970,1.793066,4,0,2,1,2
576490,9999974,0.507100,1,2,2,1,0
576491,9999992,-0.933181,1,2,1,1,8


In [51]:
df.to_csv("clustering_result10.csv")

In [52]:
df["cluster"].value_counts()

cluster
0    107922
1    104452
3    103108
2     56227
4     54396
5     46617
7     36591
9     27313
6     20101
8     19766
Name: count, dtype: int64