# Clusters by Demand
by LMZintgraf

### Imports.

In [1]:
import pandas as pd
import numpy as np
import time
import csv

# Shared stopwatch state: tic() overwrites it, tac() reads it.
# It is seeded here so tac() works even before the first tic().
_start_time = time.time()


def tic():
    """Restart the stopwatch by storing the current wall-clock time."""
    global _start_time
    now = time.time()
    _start_time = now

def tac():
    """Print the time elapsed since the stopwatch was last (re)started.

    Reads the module-level ``_start_time``, rounds the difference to whole
    seconds and prints it as ``Time passed: <h>hour:<m>min:<s>sec``.
    """
    elapsed = round(time.time() - _start_time)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Time passed: {}hour:{}min:{}sec'.format(hours, minutes, seconds))
    
# utility function - display large dataframes in an html iframe
def df_display(df, lines=500):
    """Render the first `lines` rows of `df` inside a scrollable HTML iframe.

    Parameters:
        df: DataFrame to show; only ``df.head(lines)`` is rendered.
        lines: maximum number of rows to include (default 500).

    Returns:
        An ``IPython.display.HTML`` object wrapping a 1000x500 iframe whose
        ``srcdoc`` is the table markup produced by ``DataFrame.to_html()``.
    """
    # Imported locally: no cell in this notebook imports IPython, so the
    # previous `IPython.display.HTML` reference raised NameError.
    import html
    from IPython.display import HTML

    # The markup is embedded in a single-quoted srcdoc attribute. Escaping it
    # keeps a quote inside a cell value or column name from terminating the
    # attribute early; the browser decodes the entities again.
    table_markup = html.escape(df.head(lines).to_html(), quote=True)

    txt = ("<iframe " +
           "srcdoc='" + table_markup + "' " +
           "width=1000 height=500>" +
           "</iframe>")

    return HTML(txt)

In [2]:
# Read files:
# Explicit compact dtypes are passed for every listed column of both CSVs.
tic()

# dtype entries used by both read_csv calls
_shared_dtypes = {
    'Semana': 'int8',
    'Producto_ID': 'int32',
    'Cliente_ID': 'int32',
    'Agencia_ID': 'uint16',
    'Canal_ID': 'int8',
    'Ruta_SAK': 'int32',
}

# dtype entries passed for train.csv only
# NOTE(review): int8 only holds -128..127 - confirm the two *_uni_* unit
# counts fit (both columns are dropped again in the next cell).
_train_only_dtypes = {
    'Venta_hoy': 'float32',
    'Venta_uni_hoy': 'int8',
    'Dev_uni_proxima': 'int8',
    'Dev_proxima': 'float32',
    'Demanda_uni_equil': 'int32',
}

_train_dtypes = dict(_shared_dtypes)
_train_dtypes.update(_train_only_dtypes)

train = pd.read_csv('input-data/train.csv', dtype=_train_dtypes)
test = pd.read_csv('input-data/test.csv', dtype=_shared_dtypes)
tac()

Time passed: 0hour:1min:13sec


In [3]:
# Drop the sales/returns columns from the training frame; none of them is
# referenced again in the visible cells.
unused_train_columns = ['Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima']
train.drop(unused_train_columns, axis=1, inplace=True)

In [4]:
# Give both frames the same set of columns before stacking them:
# train receives an all-NaN 'id' column at position 0 and
# test receives an all-NaN 'Demanda_uni_equil' column at position 7.
train.insert(loc=0, column='id', value=np.nan)
test.insert(loc=7, column='Demanda_uni_equil', value=np.nan)

In [5]:
tic()
# Tag every row with the frame it came from, then stack both frames into one.
for _frame, _origin in ((train, 'train'), (test, 'test')):
    _frame['source'] = _origin
data = pd.concat([train, test], ignore_index=True)
tac()
print(train.shape, test.shape, data.shape)

Time passed: 0hour:0min:17sec
(74180464, 9) (6999251, 9) (81179715, 9)


In [6]:
# First, transform the target (Demanda_uni_equil) to log(1 + demand).
# We are trying to minimise RMSLE, and a mean taken in log space targets that
# metric, whereas a mean of the raw demand minimises RMSE.
# At the end of the modelling (for submission) the transform has to be undone
# by applying expm1(x) to the predictions.

data['log_target'] = np.log1p(data["Demanda_uni_equil"])

In [7]:
# Build the aggregate tables used further down: the mean log-target overall,
# per product, per client and per (product, client) pair.
tic()

global_mean = data['log_target'].mean()

# Each table is a one-column ('log_target') frame indexed by the grouping key(s).
prod_mean = data.groupby('Producto_ID')[['log_target']].mean()
client_mean = data.groupby('Cliente_ID')[['log_target']].mean()
prod_client_mean = data.groupby(['Producto_ID', 'Cliente_ID'])[['log_target']].mean()

tac()

Time passed: 0hour:1min:13sec


In [8]:
tic()
# DataFrame.to_dict() yields {'log_target': {group_key: mean}}, which is the
# nested layout gen_pairs_mean_feature indexes into.
prod_mean_dict = prod_mean.to_dict()
# gen_pairs_mean_feature reads client_mean_dict as its last fallback, but it
# was never built, so that branch raised NameError.
client_mean_dict = client_mean.to_dict()
prod_client_mean_dict = prod_client_mean.to_dict()
tac()

Time passed: 0hour:0min:29sec


In [None]:
def gen_pairs_mean_feature(key):
    """Look up the mean log-target for one (product, client) pair.

    `key` is any sequence whose first two items are the product ID and the
    client ID. The value is taken from the first lookup table that holds a
    non-NaN entry, in this order:

        1. prod_client_mean_dict  - mean for the exact (product, client) pair
        2. prod_mean_dict         - mean for the product
        3. client_mean_dict       - mean for the client (returned as is,
                                    even if it is NaN)
    """
    pair = tuple(key)
    product, client = pair[0], pair[1]

    pair_mean = prod_client_mean_dict['log_target'][(product, client)]
    if not np.isnan(pair_mean):
        return pair_mean

    product_mean = prod_mean_dict['log_target'][product]
    if not np.isnan(product_mean):
        return product_mean

    return client_mean_dict['log_target'][client]

In [None]:
tic()
# Row-wise lookup: each (Producto_ID, Cliente_ID) row is handed to
# gen_pairs_mean_feature, which returns the mean with its fallbacks.
# NOTE(review): apply(axis=1) calls a Python function once per row - expect
# this to be slow on the full frame.
data['pairs_mean'] = data[['Producto_ID', 'Cliente_ID']].apply(gen_pairs_mean_feature, axis=1)
tac()

In [None]:
# Show the first rows of the combined frame (now including 'pairs_mean').
data.head()

### Feature Extraction.

#### (1) Cluster features by demand

In [None]:
# --- HYPERPARAMETERS FOR FEATURE EXTRACTION ---

# Number of clusters used to group depot / route / client / product IDs.
# Each list may hold several values; the cells at the end of the notebook call
# cluster_and_save once per value.
# NOTE(review): the original note said "if 0 will not be added as feature", but
# the visible code passes every value straight on to the clustering - confirm
# whether 0 is actually supported.
num_clusters_agencia = [25]
num_clusters_ruta = [100]
num_clusters_cliente = [150]
num_clusters_producto = [150]

In [None]:
# Feature scaler instance.
# NOTE(review): the original comment read "to make data categorical", but
# MinMaxScaler rescales numeric features to a fixed range, and min_max_scaler
# is not referenced in any visible cell - confirm it is still needed.
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
# for clustering
from sklearn.cluster import KMeans

### Let's try to find the right number of clusters

In [None]:
from scipy.spatial.distance import cdist, pdist
from matplotlib import pyplot as plt

# Elbow analysis: fit KMeans for a range of k and plot the share of the total
# variance explained by the clustering for each k.
# NOTE(review): `hpc` (the sample matrix to cluster, one row per sample) is not
# defined in any visible cell - it has to be assigned before this cell runs.
points = np.asarray(hpc, dtype=float)

# Determine your k range
k_range = range(1,14)

# Fit the kmeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(points) for k in k_range]

# Pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

# Calculate the Euclidean distance from
# each point to each cluster center
k_euclid = [cdist(points, cent, 'euclidean') for cent in centroids]
# distance from each point to its nearest center
dist = [np.min(ke,axis=1) for ke in k_euclid]

# Total within-cluster sum of squares, one entry per k.
# Kept as an ndarray so the arithmetic below is explicit element-wise math.
wcss = np.array([np.sum(d**2) for d in dist])

# The total sum of squares.
# This equals sum(pdist(points)**2) / n, but is computed from the deviations
# from the mean so the O(n^2) vector of pairwise distances is never built.
tss = np.sum((points - points.mean(axis=0))**2)

# The between-cluster sum of squares
bss = tss - wcss

# elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k_range, bss/tss*100, 'b*-')
ax.set_ylim((0,100))
plt.grid(True)
plt.xlabel('n_clusters')
plt.ylabel('Percentage of variance explained')
plt.title('Variance Explained vs. k')

In [None]:
def _demand_stats_per_value(feats, tars):
    '''Return the sorted unique values of feats and, per value, [median, std] of the matching tars.'''
    feats = np.asarray(feats)
    tars = np.asarray(tars).ravel()

    feats_unique, inverse = np.unique(feats, return_inverse=True)
    inverse = inverse.ravel()

    # A stable sort on the group index puts all targets of one unique value
    # next to each other while keeping their original relative order. Every
    # group is then a plain slice, so the data is not re-scanned with a
    # boolean mask once per unique value.
    order = np.argsort(inverse, kind='mergesort')
    grouped_tars = tars[order]
    counts = np.bincount(inverse, minlength=len(feats_unique))
    ends = np.cumsum(counts)
    starts = ends - counts

    demand_info_unique = np.zeros((len(feats_unique), 2))
    for row, (lo, hi) in enumerate(zip(starts, ends)):
        all_orders = grouped_tars[lo:hi]
        demand_info_unique[row] = [np.median(all_orders), np.std(all_orders)]

    return feats_unique, demand_info_unique


def get_clusters_by_demand(feats, tars, num_clusters, feat_name):
    '''
    Cluster the unique values of a feature by the demand observed for them.

    Input:
        - feats: feature value of every row (array)
        - tars: target (demand) value of every row, aligned with feats
        - num_clusters: number of KMeans clusters
        - feat_name: feature name, only used in the file name of the saved plot

    Returns:
        - feats_unique: the sorted unique feature values
        - clusters: for each unique feature value, the cluster (ID) it belongs to

    Side effect: saves a median-vs-std scatter plot (one colour per cluster) to
    path_datadrive + 'clustByDem_<feat_name>_<num_clusters>'.
    NOTE(review): path_datadrive is not defined in the visible cells - it has
    to exist as a global before this function is called.
    '''
    # for each unique feature value: [median, std] of its demand
    feats_unique, demand_info_unique = _demand_stats_per_value(feats, tars)

    # groups whose statistics came out as NaN are treated as zero demand
    demand_info_unique[np.isnan(demand_info_unique)] = 0

    # we use the kmeans clustering algorithm
    # NOTE(review): the n_jobs argument is not accepted by recent scikit-learn
    # releases - confirm against the installed version.
    kmeans = KMeans(num_clusters, n_jobs=-1)
    clusters = kmeans.fit_predict(demand_info_unique)

    # plot demand/cluster
    plt.figure(figsize=(15, 3))
    for c in range(num_clusters):
        in_cluster = clusters == c
        plt.plot(demand_info_unique[in_cluster, 0], demand_info_unique[in_cluster, 1], '.')
    plt.xlabel('median')
    plt.ylabel('std')
    plt.savefig(path_datadrive+'clustByDem_{}_{}'.format(feat_name, num_clusters))

    # return the unique feature values and the cluster assigned to each of them
    return feats_unique, clusters

In [None]:
def cluster_and_save(feat_name, num_clusters):
    '''
    Cluster the values of one ID column by demand and save the mapping.

    Input:
        - feat_name: name of the column of the global `data` frame whose values
          are clustered (e.g. 'Agencia_ID')
        - num_clusters: the number of clusters we want to use to group the feature values

    Writes a CSV with the columns [feat_name, feat_name + '_clust_ID'] to
    path_datadrive + 'clustByDem_<feat_name>_<num_clusters>'. Returns nothing.
    '''
    # Only rows with Semana < 9 feed the clustering. (The earlier comment said
    # "week 3-9", but the filter excludes week 9.)
    early_weeks = data['Semana'] < 9

    # .values works on old and new pandas; Series.get_values() was removed in
    # pandas 1.0.
    feats_train = data[feat_name][early_weeks].values
    tars_train = data['pairs_mean'][early_weeks].values
    feats_unique_train, clusters_train = get_clusters_by_demand(feats_train, tars_train, num_clusters, feat_name)

    # create new dataframe to save the mapping from feature ID to cluster ID
    a = np.hstack((feats_unique_train[:,np.newaxis], clusters_train[:,np.newaxis]))
    feat_clust_map = pd.DataFrame(data = a, columns=[feat_name,feat_name+'_clust_ID'])

    # save the new feature
    feat_clust_map.to_csv(path_datadrive+"clustByDem_{}_{}".format(feat_name, num_clusters), index=False)

In [None]:
# cluster the Agencia_ID values once per configured cluster count
for n_clusters in num_clusters_agencia:
    print("agencia...", n_clusters)
    cluster_and_save('Agencia_ID', n_clusters)

In [None]:
# cluster the Ruta_SAK values once per configured cluster count
for n_clusters in num_clusters_ruta:
    print("ruta... ", n_clusters)
    cluster_and_save('Ruta_SAK', n_clusters)

In [None]:
# cluster the Cliente_ID values once per configured cluster count
for n_clusters in num_clusters_cliente:
    print("client...", n_clusters)
    cluster_and_save('Cliente_ID', n_clusters)

In [None]:
# cluster the Producto_ID values once per configured cluster count
for n_clusters in num_clusters_producto:
    print("prod...", n_clusters)
    cluster_and_save('Producto_ID', n_clusters)