In [1]:
import pandas as pd
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Input file: name of the CSV holding the per-SKU feature table that is loaded
# into `df` by the read cell.
features_5 = "5_features.csv"

In [2]:
# Load the feature table. The "Unnamed: 0" column is dropped, then the
# remaining columns are renamed positionally to the names used in this notebook.
df = pd.read_csv(features_5).drop(columns=["Unnamed: 0"])
df.columns = [
    "SKU",
    "return_rate",
    "size",
    "dep_1",
    "dep_2",
    "dep_3",
    "most_sold_season",
    "discounted",
]
df.head()

Unnamed: 0,SKU,return_rate,size,dep_1,dep_2,dep_3,most_sold_season,discounted
0,69,0.0,M,0,1,0,Fall,-1
1,73,0.0,OTHER,0,1,0,Spring,1
2,78,0.259259,L,0,1,0,Summer,1
3,105,0.0,S,1,0,0,Summer,0
4,107,0.0,OTHER,1,0,0,Summer,0


In [3]:
# Combine one-hot encoded department columns into a single integer column.
# idxmax returns the name of the first column holding the row maximum, so a row
# with no flag set would silently be assigned department 1.
# NOTE(review): confirm every row has exactly one of dep_1/dep_2/dep_3 set.
department_flags = ['dep_1', 'dep_2', 'dep_3']
df['department'] = (
    df[department_flags]
    .idxmax(axis=1)
    .str.extract(r'(\d+)', expand=False)  # expand=False -> Series, not a 1-column DataFrame
    .astype(int)
)
df = df.drop(department_flags, axis=1)

# Encoding categorical variables as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# codes can be mapped back to the original labels later with
# label_encoders[col].inverse_transform(...). After the loop, `label_encoder`
# is the encoder fitted on the last column ('department').
categorical_cols = ['size', 'most_sold_season', 'discounted', 'department']
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Convert categorical columns to string type
# (presumably so downstream code treats the codes as labels, not numbers).
df[categorical_cols] = df[categorical_cols].astype(str)

# Standardize numerical columns (zero mean, unit variance); `scaler` keeps the
# fitted parameters so the scaling can be inverted later.
numerical_cols = ['return_rate']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

df.head()

Unnamed: 0,SKU,return_rate,size,most_sold_season,discounted,department
0,69,-0.933181,2,0,0,1
1,73,-0.933181,4,1,2,1
2,78,1.894038,1,2,2,1
3,105,-0.933181,5,2,1,0
4,107,-0.933181,4,2,1,0


In [10]:
# Specify the number of clusters
num_clusters = 3

# Fixed seed: K-Prototypes initialisation is randomised, so without a seed the
# cluster assignments (and the numbering of the labels) change between runs.
random_state = 42

# Model inputs: numerical columns first, then the categorical ones. The
# positions of the categorical columns are derived from that layout rather
# than hard-coded, so they stay correct if either list changes.
feature_cols = numerical_cols + categorical_cols
categorical_idx = list(range(len(numerical_cols), len(feature_cols)))

# Fit K-Prototypes model (10 initialisations; progress is logged because verbose=1)
kproto = KPrototypes(
    n_clusters=num_clusters,
    init='Huang',
    n_init=10,
    verbose=1,
    random_state=random_state,
)
clusters = kproto.fit_predict(df[feature_cols], categorical=categorical_idx)

# Add cluster labels to the original DataFrame
df['cluster'] = clusters

# Display the result
df

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 79568, ncost: 726101.215773256
Run: 1, iteration: 2/100, moves: 74658, ncost: 706910.6843584717
Run: 1, iteration: 3/100, moves: 38177, ncost: 702746.2178206494
Run: 1, iteration: 4/100, moves: 31253, ncost: 685674.9150009139
Run: 1, iteration: 5/100, moves: 48848, ncost: 683110.4490906268
Run: 1, iteration: 6/100, moves: 13151, ncost: 682473.0653183551
Run: 1, iteration: 7/100, moves: 6165, ncost: 682373.0881284798
Run: 1, iteration: 8/100, moves: 2369, ncost: 682357.9957736043
Run: 1, iteration: 9/100, moves: 980, ncost: 682355.8493571865
Run: 1, iteration: 10/100, moves: 119, ncost: 682355.8168817481
Run: 1, iteration: 11/100, moves: 4, ncost: 682355.8168450969
Run: 1, iteration: 12/100, moves: 0, ncost: 682355.8168450969
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 176871, ncost: 763592.1320352991
Run: 2

Run: 10, iteration: 9/100, moves: 118, ncost: 682355.8168816931
Run: 10, iteration: 10/100, moves: 4, ncost: 682355.8168450512
Run: 10, iteration: 11/100, moves: 0, ncost: 682355.8168450512
Best run was number 10


Unnamed: 0,SKU,return_rate,size,most_sold_season,discounted,department,cluster
0,69,-0.933181,2,0,0,1,2
1,73,-0.933181,4,1,2,1,2
2,78,1.894038,1,2,2,1,0
3,105,-0.933181,5,2,1,0,2
4,107,-0.933181,4,2,1,0,2
...,...,...,...,...,...,...,...
576488,9999967,-0.933181,0,2,1,1,2
576489,9999970,1.793066,4,0,2,1,0
576490,9999974,0.507100,1,2,2,1,1
576491,9999992,-0.933181,1,2,1,1,2


In [11]:
# Persist the clustered table. The DataFrame index is written as well (pandas
# default), so the file gains an unnamed leading column.
result_path = "clustering_result3.csv"
df.to_csv(result_path)

In [12]:
# Number of rows assigned to each cluster, largest cluster first.
cluster_sizes = df["cluster"].value_counts()
cluster_sizes

cluster
2    291111
1    255980
0     29402
Name: count, dtype: int64