## Packages

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans

In [2]:
from src.clustering import *

## Data loading

In [3]:
# Column name -> dtype string for the preprocessed table; this mapping is
# handed to load_preprocessed_data() in the next cell.
datatypes = dict(
    product_id='uint32',
    client_id='uint32',
    sales_net='float64',
    quantity='int32',
    order_channel='object',
    branch_id='uint16',
    unit_price='int8',
    stock_flow='int8',
    month_order='uint8',
    order_invoice_delta='float16',
)

In [4]:
# Number of rows this notebook expects in the preprocessed dataset.
expected_rows = 63319315

# load_preprocessed_data is brought in by the star import from src.clustering;
# it receives the column -> dtype mapping defined in the previous cell.
df = load_preprocessed_data(datatypes)

# Print the check result as before, then fail fast on a mismatch so the
# clustering cells below never run silently on a partial or changed dataset.
row_count_ok = df.shape[0] == expected_rows
print(row_count_ok)
assert row_count_ok, f"expected {expected_rows} rows, got {df.shape[0]}"

True


In [5]:
# Print a summary of the loaded frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63319315 entries, 0 to 63319314
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   product_id           uint32 
 1   client_id            uint32 
 2   sales_net            float64
 3   quantity             int32  
 4   order_channel        object 
 5   branch_id            uint16 
 6   unit_price           int8   
 7   stock_flow           int8   
 8   month_order          uint8  
 9   order_invoice_delta  float16
dtypes: float16(1), float64(1), int32(1), int8(2), object(1), uint16(1), uint32(2), uint8(1)
memory usage: 2.1+ GB


## Clustering

In [6]:
# categorical_encoding comes from the star import of src.clustering; its
# implementation is not visible in this notebook. Presumably it turns the
# non-numeric `order_channel` column into numeric features -- TODO confirm.
# Note: `df` is rebound to the result, so re-running this cell feeds the
# already-encoded frame back into the function.
df = categorical_encoding(df)

In [7]:
# Drop every row that has a missing value in any column, modifying `df` in
# place. The index is not reset, so row labels may have gaps afterwards.
df.dropna(inplace=True)

In [8]:
# Estimator settings, kept in their own dict separately from the cluster count.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=750,
    random_state=0,  # fixed seed
)

# NOTE(review): `df` is passed as-is, with no feature scaling in this notebook.
# The distances shown in the later preview are in the 1e5-1e6 range, which
# suggests large-magnitude columns dominate the metric -- TODO confirm this
# is intended.
kmeans = KMeans(n_clusters=4, **kmeans_kwargs)
# Fit on all columns of `df`; the fitted estimator is the cell's displayed value.
kmeans.fit(df)

KMeans(init='random', max_iter=750, n_clusters=4, random_state=0)

In [10]:
# Distance from every row of `df` to each fitted centroid: one column per cluster.
# NOTE(review): the new frame gets a fresh 0..n-1 index while `df` keeps its
# labels after dropna, so rows correspond to `df` by position only.
df_kmeans = pd.DataFrame(kmeans.transform(df))
# Name the distance columns "0".."k-1" from the fitted estimator's cluster
# count instead of a literal 4, so changing n_clusters where the model is
# built cannot cause a column-length mismatch here.
df_kmeans.columns = [str(i) for i in range(kmeans.n_clusters)]
# Index of the nearest centroid for each row.
df_kmeans['cluster'] = kmeans.predict(df)

In [11]:
# Preview the first rows: the per-cluster distance columns plus the assigned cluster.
df_kmeans.head()

Unnamed: 0,0,1,2,3,cluster
0,377263.7,2148597.0,1510704.0,1576281.0,0
1,287832.3,1882430.0,867601.4,1697803.0,0
2,1419991.0,1711950.0,298624.7,2206342.0,2
3,599882.1,2442448.0,1250739.0,2184108.0,0
4,2698694.0,752398.8,2262473.0,1654547.0,1


In [12]:
# Persist the distances and cluster labels. The output directory is created
# first because DataFrame.to_csv does not create missing parent directories.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)
df_kmeans.to_csv(os.path.join(output_dir, "kmeans_data.csv"))