In [324]:
import numpy as np
from scipy.spatial.distance import cdist

def product_recommendation(x, y, z, k = 100, n_products = 1, kwargs= {'metric' : 'seuclidean'}):
    """Recommend products based on nearest neighbours in vector space.

        For every row of ``x`` the ``k`` closest rows of ``y`` are located with
        ``scipy.spatial.distance.cdist``. The rows of ``z`` at those positions
        are summed per product, and the labels of the ``n_products`` products
        with the largest totals are returned.

        Parameters
        ----------
        x : dataframe of shape (m_samples, n_features)
            The input data for which recommendations are to be generated.

        y : dataframe of shape (n_samples, n_features)
            Historical samples of product holdings. It must contain every
            column of ``x``; it is re-ordered to ``x``'s column order before
            distances are computed.

        z : dataframe of shape (n_samples, k_products)
            Historical products sold/target, row-aligned with ``y``.

        k : int, default 100
            Neighbourhood size. Values larger than ``n_samples`` are clamped
            to ``n_samples`` (every historical row becomes a neighbour).

        n_products : int, default 1
            Number of products to recommend, 1 <= n_products <= k_products.

        kwargs : dict
            Keyword arguments forwarded to ``cdist``. The dict is only
            unpacked, never mutated, so the shared default is safe.

        Returns
        -------
        numpy.ndarray of shape (m_samples, n_products)
            Column labels of ``z``. Within a row the labels are not ordered
            by score.

        Raises
        ------
        ValueError
            If ``k < 1``, if ``y`` has no rows, or if ``n_products`` is
            outside ``[1, k_products]``.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    # Align the historical features with the query features column-by-column.
    y = y[x.columns]
    product_labels = np.array(z.columns)
    queries, history, targets = (np.array(frame) for frame in (x, y, z))

    n_history = history.shape[0]
    n_labels = targets.shape[1]
    if n_history == 0:
        raise ValueError("y must contain at least one historical sample")
    if not 1 <= n_products <= n_labels:
        # n_products == 0 would otherwise slice [:, -0:] and return every product.
        raise ValueError(
            "n_products must be between 1 and %d, got %r" % (n_labels, n_products)
        )

    # Neighbourhood
    # argpartition needs kth < n_history; clamp so k >= n_history selects all rows.
    k = min(k, n_history)
    kth = min(k, n_history - 1)
    distances = cdist(queries, history, **kwargs)
    nearest_ind = np.argpartition(distances, kth)[:, :k]
    nearest_neighbours = np.take(targets, nearest_ind, axis = 0)

    # Product frequency: sum over the neighbour axis -> (m_samples, k_products)
    product_totals = np.einsum('ijk->ik', nearest_neighbours)
    products_freq_ind = np.argpartition(product_totals, -n_products)[:, -n_products:]
    products_freq_rec = np.take(product_labels, products_freq_ind, axis = None)

    return products_freq_rec

# DASK implementation

In [1]:
import numpy as np
import dask.array as da
import dask_distance
from scipy.spatial.distance import cdist

def product_recommendation(x, y, z, k = 100, n_products = 1, kwargs= {'metric' : 'seuclidean'}):
    """Recommend products from the nearest historical neighbours (dask top-k variant).

        The distance matrix between ``x`` and ``y`` is computed with SciPy's
        ``cdist`` and wrapped in a dask array; dask's ``argtopk`` then selects
        the neighbours and, afterwards, the best-scoring products.

        Parameters
        ----------
        x : dataframe of shape (m_samples, n_features)
            Rows for which recommendations are generated.

        y : dataframe of shape (n_samples, n_features)
            Historical product holdings; re-ordered to the columns of ``x``.

        z : dataframe of shape (n_samples, k_products)
            Historical products sold/target.

        k : neighbourhood size

        n_products : number of products to recommend (<= k_products)

        kwargs : keyword arguments forwarded to ``cdist``

        Returns
        -------
        Array of ``z`` column labels, one row per row of ``x``.
    """
    history = y[x.columns]
    product_labels = np.array(z.columns)
    query_arr, history_arr, target_arr = np.array(x), np.array(history), np.array(z)

    # Neighbourhood: a negative k asks argtopk for the k *smallest* distances.
    distances = da.from_array(cdist(query_arr, history_arr, **kwargs), chunks='auto')
    neighbour_idx = distances.argtopk(-k)
    # np.take runs on a NumPy array, so the dask index array is presumably
    # materialised at this point.
    neighbour_targets = np.take(target_arr, neighbour_idx, axis = 0)

    # Product frequency: collapse the neighbour axis, then pick the top products.
    product_totals = da.from_array(np.einsum('ijk->ik', neighbour_targets))
    top_product_idx = product_totals.argtopk(n_products)

    # NOTE: a fully dask-native variant (dask_distance.cdist, da.take,
    # da.einsum) was previously sketched here in comments but never enabled.
    return np.take(product_labels, top_product_idx)

# Debugging Dask

In [None]:
# Shapes of the intermediates: distance matrix, neighbour indices, neighbour
# targets, per-product totals, top-product indices, recommended labels.
# NOTE(review): these names are locals of product_recommendation; this cell
# presumably only works when the function body has been run at top level.
print(
    Y.shape, nearest_ind.shape, nearest_neighbours.shape,
    Z.shape, products_freq_ind.shape, products_freq_rec.shape,
)
(260, 140) (260, 100) (260, 100, 10) (260, 10) (260, 1) (260, 1)

In [None]:
import random
import string

import pandas as pd

In [410]:
# Synthetic data for exercising product_recommendation: 260 query rows and
# 140 historical rows over 22 shared feature columns, plus 10 product columns.
SEED = 0  # fixed so that re-running the cell regenerates the same frames
random.seed(SEED)
np.random.seed(SEED)

alphabet = string.ascii_uppercase + string.digits

# Draw 3-character column names, rejecting repeats: duplicate labels would make
# the y[x.columns] alignment inside product_recommendation select extra columns.
x_col = []
while len(x_col) < 22:
    candidate = ''.join(random.choices(alphabet, k = 3))
    if candidate not in x_col:
        x_col.append(candidate)
x = pd.DataFrame(np.random.randint(1000, size=(260,22), dtype='int32'), columns = x_col)

y = pd.DataFrame(np.random.randint(1000, size=(140,22), dtype='int32'), columns = x_col)

z_col = []
while len(z_col) < 10:
    candidate = ''.join(random.choices(alphabet, k = 3))
    if candidate not in z_col:
        z_col.append(candidate)
z = pd.DataFrame(np.random.randint(1000, size=(140,10), dtype='int32'), columns = z_col)

In [434]:
# Recommend with the function defaults (k=100, n_products=1, seuclidean metric).
alp3 = product_recommendation(x=x, y=y, z=z)

(260, 100) (140, 10)
(260, 100, 10) <class 'numpy.ndarray'>


In [376]:
# Synthetic data for exercising product_recommendation: 260 query rows and
# 140 historical rows over 22 shared feature columns, plus 10 product columns.
SEED = 0  # fixed so that re-running the cell regenerates the same frames
random.seed(SEED)
np.random.seed(SEED)

alphabet = string.ascii_uppercase + string.digits

# Draw 3-character column names, rejecting repeats: duplicate labels would make
# the y[x.columns] alignment inside product_recommendation select extra columns.
x_col = []
while len(x_col) < 22:
    candidate = ''.join(random.choices(alphabet, k = 3))
    if candidate not in x_col:
        x_col.append(candidate)
x = pd.DataFrame(np.random.randint(1000, size=(260,22), dtype='int32'), columns = x_col)

y = pd.DataFrame(np.random.randint(1000, size=(140,22), dtype='int32'), columns = x_col)

z_col = []
while len(z_col) < 10:
    candidate = ''.join(random.choices(alphabet, k = 3))
    if candidate not in z_col:
        z_col.append(candidate)
z = pd.DataFrame(np.random.randint(1000, size=(140,10), dtype='int32'), columns = z_col)

In [298]:
# Preview the first five rows of the product/target frame.
z.head(n=5)

Unnamed: 0,OWE,R03,GSL,3SW,INQ,TJM,0O4,T1N,XA0,4LE
0,262,447,215,232,768,395,50,860,863,580
1,919,758,549,511,202,661,457,875,160,886
2,159,180,447,2,127,812,207,135,317,305
3,620,789,825,225,177,250,945,836,159,818
4,248,350,773,8,856,937,819,997,327,547


In [357]:
# Recommend with the function defaults (k=100, n_products=1, seuclidean metric).
alp = product_recommendation(x=x, y=y, z=z)

In [372]:
# Recommend with the function defaults (k=100, n_products=1, seuclidean metric).
alp2 = product_recommendation(x=x, y=y, z=z)