In [1]:
import numpy as np
from scipy.spatial.distance import cdist

def product_recommendation(x, y, z, k = 100, n_products = 1, kwargs= {'metric' : 'cos'}):
    """Recommend products based on nearest neighbours in vector space.

    For every row of ``x`` the ``k`` closest rows of ``y`` are located with
    ``scipy.spatial.distance.cdist``; the matching rows of ``z`` are summed
    per product and the ``n_products`` products with the largest totals are
    returned.

        Parameters
        ----------
        x : dataframe of shape (m_samples, n_features)
            The input data for which recommendations to be generated.
            Must expose ``.columns``; ``y`` is re-ordered to match it.

        y : dataframe of shape (n_samples, n_features)
            The input data having historical samples for product holdings.
            Must contain every column of ``x``.

        z : dataframe of shape (n_samples, k_products)
            The input data having historical products sold/target.
            Its column labels are the values that get recommended.

        k: neighbourhood size. When ``k`` is greater than or equal to the
            number of rows in ``y`` the whole history is used as the
            neighbourhood instead of failing.

        n_products: no. of products to recommend <=  unique count of k_products

        kwargs: arguments for cdist calculation (never modified here)

        Returns
        -------
        ndarray of shape (m_samples, n_products)
            Column labels of ``z``. Within a row the labels are NOT sorted
            by frequency (``np.argpartition`` only separates the top group).

        Raises
        ------
        ValueError
            Raised by ``np.argpartition`` when ``n_products`` exceeds the
            number of columns in ``z``.
    """
    # Align the historical features with the query features before the
    # frames are flattened to plain arrays (labels are lost afterwards).
    y = y[x.columns]
    x, y, z, products_target = tuple(map(np.array, [x, y, z, z.columns]))

    # Neighbourhood
    Y = cdist(x, y, **kwargs)
    n_samples = Y.shape[1]
    if k < n_samples:
        # Partial sort: the first k positions hold the k smallest distances.
        nearest_ind = np.argpartition(Y, k, axis=1)[:, :k]
    else:
        # argpartition rejects kth >= n_samples; every historical row is a
        # neighbour in that case, so index all of them for each query row.
        nearest_ind = np.broadcast_to(np.arange(n_samples), Y.shape)
    nearest_neighbours = np.take(z, nearest_ind, axis = 0)

    # Product frequency: collapse the neighbour axis -> (m_samples, k_products)
    Z = np.einsum('ijk->ik', nearest_neighbours)
    products_freq_ind = np.argpartition(Z, -n_products)[:, -n_products:]
    products_freq_rec = np.take(products_target, products_freq_ind, axis = None)

    return products_freq_rec

# DASK implementation

In [1]:
import numpy as np
import dask.array as da
import dask_distance
from scipy.spatial.distance import cdist

def product_recommendation(x, y, z, k = 100, n_products = 1, kwargs= {'metric' : 'seuclidean'}):
    """Work-in-progress dask variant of the nearest-neighbour recommender.

    The distance matrix is still computed eagerly with scipy's ``cdist`` and
    only then wrapped in a dask array; the neighbour search uses
    ``argtopk``. The final "pick the top products" step is not implemented
    yet, so ``n_products`` is currently unused.

        Parameters
        ----------
        x : dataframe of shape (m_samples, n_features)
            The input data for which recommendations to be generated.

        y : dataframe of shape (n_samples, n_features)
            The input data having historical samples for product holdings.

        z : dataframe of shape (n_samples, k_products)
            The input data having historical products sold/target.

        k: neighbourhood size

        n_products: no. of products to recommend (not used yet)

        kwargs: arguments for cdist calculation

        Returns
        -------
        dask array
            Per query row, the column-wise sum of the ``z`` rows belonging to
            its neighbours (not yet the recommended product labels).
    """
    # Bring the history into the query's column order, then drop to arrays.
    history = y[x.columns]
    queries = np.array(x)
    history = np.array(history)
    holdings = np.array(z)
    # Kept for the pending label lookup; not consumed anywhere below yet.
    product_labels = np.array(z.columns)

    # Neighbourhood
    distance_matrix = cdist(queries, history, **kwargs)
    lazy_distances = da.from_array(distance_matrix, chunks='auto')
    # A negative count asks argtopk for the smallest entries (see dask docs).
    neighbour_idx = lazy_distances.argtopk(-k)
    # presumably numpy materialises the lazy index array here - TODO confirm
    neighbour_rows = np.take(holdings, neighbour_idx, axis = 0)

    # Product frequency
    totals = np.einsum('ijk->ik', neighbour_rows)

    # TODO: finish the pipeline. Sketched but never enabled:
    #   * top products via Z.argtopk(n_products) + np.take(products_target, ...)
    #   * a fully lazy path built on dask_distance.cdist / da.take / da.einsum
    return da.from_array(totals)

# Debugging Dask

In [None]:
# Shape check for each intermediate of the recommender, in pipeline order.
# These names must already exist at notebook scope for the cell to run.
print(*(stage.shape for stage in (
    Y, nearest_ind, nearest_neighbours,
    Z, products_freq_ind, products_freq_rec,
)))
(260, 140) (260, 100) (260, 100, 10) (260, 10) (260, 1) (260, 1)

In [16]:
import string 
import random
import pandas as pd
import numpy as np
import sys

import numpy as np
import dask.array as da
import dask_distance
from scipy.spatial.distance import cdist

In [161]:
def product_recommendation0(x, y, z, k = 1000, n_products = 1, kwargs= {'metric' : 'cosine'}):
    """Instrumented NumPy baseline: neighbour-summed product totals.

    Runs the neighbourhood and frequency stages of the recommender and
    prints, for every intermediate array, its shape and
    ``sys.getsizeof(...) / 1e6``. The product-selection stage is not run,
    so ``n_products`` is unused.

        Parameters
        ----------
        x : dataframe of shape (m_samples, n_features)
            The input data for which recommendations to be generated.

        y : dataframe of shape (n_samples, n_features)
            The input data having historical samples for product holdings.
            Must contain every column of ``x``.

        z : dataframe of shape (n_samples, k_products)
            The input data having historical products sold/target.

        k: neighbourhood size. When ``k`` is greater than or equal to the
            number of rows in ``y`` the whole history is used as the
            neighbourhood instead of failing.

        n_products: no. of products to recommend (not used by this variant)

        kwargs: arguments for cdist calculation (never modified here)

        Returns
        -------
        ndarray of shape (m_samples, k_products)
            Per query row, the column-wise sum of the ``z`` rows of its
            neighbours.
    """
    def _report(label, arr):
        # One progress line per intermediate: label, shape, getsizeof / 1e6.
        print(label, arr.shape, sys.getsizeof(arr)/1e6)

    # Align historical features with the query features, then use arrays.
    y = y[x.columns]
    x, y, z, products_target = tuple(map(np.array, [x, y, z, z.columns]))
    _report('x --> ', x)
    _report('y --> ', y)
    _report('z --> ', z)

    # Neighbourhood
    Y = cdist(x, y, **kwargs)
    _report('Y --> ', Y)
    n_samples = Y.shape[1]
    if k < n_samples:
        # Partial sort: the first k positions hold the k smallest distances.
        nearest_ind = np.argpartition(Y, k, axis=1)[:, :k]
    else:
        # argpartition rejects kth >= n_samples; every historical row is a
        # neighbour in that case, so index all of them for each query row.
        nearest_ind = np.broadcast_to(np.arange(n_samples), Y.shape)
    _report('nearest_ind --> ', nearest_ind)
    nearest_neighbours = np.take(z, nearest_ind, axis = 0)
    _report('nearest_neighbours --> ', nearest_neighbours)

    # Product frequency: collapse the neighbour axis -> (m_samples, k_products)
    Z = np.einsum('ijk->ik', nearest_neighbours)
    _report('Z --> ', Z)

    return Z

In [169]:
def product_recommendation(x, y, z, k = 1000, n_products = 1, kwargs= {'metric' : 'cosine'}):
    """Instrumented dask variant: neighbour-summed product totals (lazy).

    Mirrors the NumPy pipeline but wraps the distance matrix and the
    gathered neighbour rows in dask arrays. After each stage it prints the
    stage's shape and ``sys.getsizeof(...) / 1e6``; the dask array reprs of
    the distance matrix and the neighbour indices are printed as well.
    The product-selection stage is not run, so ``n_products`` is unused.

        Parameters
        ----------
        x : dataframe of shape (m_samples, n_features)
            The input data for which recommendations to be generated.

        y : dataframe of shape (n_samples, n_features)
            The input data having historical samples for product holdings.

        z : dataframe of shape (n_samples, k_products)
            The input data having historical products sold/target.

        k: neighbourhood size

        n_products: no. of products to recommend (not used by this variant)

        kwargs: arguments for cdist calculation

        Returns
        -------
        dask array
            Per query row, the column-wise sum of the ``z`` rows of its
            neighbours; call ``.compute()`` on it to obtain values.
    """
    def _report(label, arr):
        # One progress line per stage: label, shape, getsizeof / 1e6.
        print(label, arr.shape, sys.getsizeof(arr)/1e6)

    # Put the history in the query's column order, then drop to arrays.
    history = y[x.columns]
    queries = np.array(x)
    history = np.array(history)
    holdings = np.array(z)
    products_target = np.array(z.columns)

    for label, arr in (('x --> ', queries),
                       ('y --> ', history),
                       ('z --> ', holdings),
                       ('products_target --> ', products_target)):
        _report(label, arr)

    # Neighbourhood: distances are computed eagerly by scipy, then wrapped.
    # (dask_distance.cdist was tried as a lazy alternative and left disabled.)
    dense_distances = cdist(queries, history, **kwargs)
    Y = da.from_array(dense_distances, chunks='auto')
    _report('Y --> ', Y)
    print(Y,'\n')
    # A negative count asks argtopk for the smallest entries (see dask docs).
    nearest_ind = Y.argtopk(-k)
    _report('nearest_ind --> ', nearest_ind)
    print(nearest_ind, '\n')
    gathered = np.take(holdings, nearest_ind, axis = 0)
    nearest_neighbours = da.from_array(gathered, chunks='auto')
    _report('nearest_neighbours --> ', nearest_neighbours)

    # Product frequency: collapse the neighbour axis lazily.
    Z = da.einsum('ijk->ik', nearest_neighbours)
    _report('Z --> ', Z)

    return Z

In [180]:
# Large synthetic benchmark inputs: x = queries, y = history, z = targets.
# Values are random integers in [0, 1000); no seed is set, so every run
# produces different data.
label_chars = string.ascii_uppercase + string.digits

# Random 3-character labels can repeat, and duplicate column names would
# break the y[x.columns] alignment done by the recommender. Keep drawing
# until the required number of distinct labels has been collected.
x_col = []
while len(x_col) < 40:
    candidate = ''.join(random.choices(label_chars, k = 3))
    if candidate not in x_col:
        x_col.append(candidate)
x = pd.DataFrame(np.random.randint(1000, size=(40000,40)), columns = x_col)

# y shares x's feature labels so it can be re-ordered to match x.
y = pd.DataFrame(np.random.randint(1000, size=(26000,40)), columns = x_col)

z_col = []
while len(z_col) < 45:
    candidate = ''.join(random.choices(label_chars, k = 3))
    if candidate not in z_col:
        z_col.append(candidate)
z = pd.DataFrame(np.random.randint(1000, size=(26000,45)), columns = z_col)

In [129]:
# Scratch values. NOTE: this rebinds `x`, the same name other cells of this
# notebook use for the query DataFrame - re-create that frame before calling
# the recommender if this cell has been run.
x = [12.3, 27.4, 276, 20.6]

In [168]:
# Time the NumPy baseline (product_recommendation0) on the current x, y, z;
# the returned totals are kept in alp3.
%time alp3 = product_recommendation0(x,y,z)

x -->  (30000, 40) 4.800112
y -->  (26000, 40) 4.160112
z -->  (26000, 35) 3.640112
Y -->  (30000, 26000) 6240.000112
nearest_ind -->  (30000, 1000) 120.000112
nearest_neighbours -->  (30000, 1000, 35) 1050.000128
Z -->  (30000, 35) 1.050112
CPU times: user 44.9 s, sys: 25.4 s, total: 1min 10s
Wall time: 1min 33s


In [None]:
# Time the NumPy baseline on the current x, y, z; result kept in alp3.
# NOTE(review): the recorded output stops after the x/y/z lines and the cell
# has no execution count, so this run apparently did not finish - confirm.
%time alp3 = product_recommendation0(x,y,z)

x -->  (40000, 40) 12.800112
y -->  (26000, 40) 8.320112
z -->  (26000, 45) 9.360112


In [None]:
# Time product_recommendation on the current x, y, z; result kept in alp3.
# No output is recorded for this cell.
%time alp3 = product_recommendation(x,y,z)

In [175]:
# Time product_recommendation on the current x, y, z; result kept in alp3.
# The recorded output shows dask array reprs, i.e. the dask variant was the
# definition in scope when this ran.
%time alp3 = product_recommendation(x,y,z)

x -->  (30000, 40) 4.800112
y -->  (26000, 40) 4.160112
z -->  (26000, 35) 3.640112
products_target -->  (35,) 0.000376
Y -->  (30000, 26000) 8.8e-05
dask.array<array, shape=(30000, 26000), dtype=float64, chunksize=(3750, 3250), chunktype=numpy.ndarray> 

nearest_ind -->  (30000, 1000) 8.8e-05
dask.array<argtopk_aggregate-aggregate, shape=(30000, 1000), dtype=int64, chunksize=(3750, 1000), chunktype=numpy.ndarray> 

nearest_neighbours -->  (30000, 1000, 35) 8.8e-05
Z -->  (30000, 35) 8.8e-05
CPU times: user 1min 21s, sys: 42.3 s, total: 2min 3s
Wall time: 1min 49s


In [173]:
# Evaluate alp3 and display it (presumably the lazy dask result of
# product_recommendation - verify which cell last assigned alp3).
# NOTE(review): every row in the recorded output is identical; check whether
# the neighbourhood covered the whole history (k >= rows of y) or whether
# the neighbour selection is wrong.
alp3.compute()

array([[11640, 12137, 13919, ..., 10202, 12426, 14716],
       [11640, 12137, 13919, ..., 10202, 12426, 14716],
       [11640, 12137, 13919, ..., 10202, 12426, 14716],
       ...,
       [11640, 12137, 13919, ..., 10202, 12426, 14716],
       [11640, 12137, 13919, ..., 10202, 12426, 14716],
       [11640, 12137, 13919, ..., 10202, 12426, 14716]])

In [376]:
# Small synthetic inputs for quick debugging: x = queries, y = history,
# z = targets. Values are random int32 in [0, 1000); no seed is set, so
# every run produces different data.
label_chars = string.ascii_uppercase + string.digits

# Random 3-character labels can repeat, and duplicate column names would
# break the y[x.columns] alignment done by the recommender. Keep drawing
# until the required number of distinct labels has been collected.
x_col = []
while len(x_col) < 22:
    candidate = ''.join(random.choices(label_chars, k = 3))
    if candidate not in x_col:
        x_col.append(candidate)
x = pd.DataFrame(np.random.randint(1000, size=(260,22), dtype='int32'), columns = x_col)

# y shares x's feature labels so it can be re-ordered to match x.
y = pd.DataFrame(np.random.randint(1000, size=(140,22) , dtype='int32'), columns = x_col)

z_col = []
while len(z_col) < 10:
    candidate = ''.join(random.choices(label_chars, k = 3))
    if candidate not in z_col:
        z_col.append(candidate)
z = pd.DataFrame(np.random.randint(1000, size=(140,10) , dtype='int32'), columns = z_col)

In [298]:
# Preview the first rows of the target frame z.
z.head()

Unnamed: 0,OWE,R03,GSL,3SW,INQ,TJM,0O4,T1N,XA0,4LE
0,262,447,215,232,768,395,50,860,863,580
1,919,758,549,511,202,661,457,875,160,886
2,159,180,447,2,127,812,207,135,317,305
3,620,789,825,225,177,250,945,836,159,818
4,248,350,773,8,856,937,819,997,327,547


In [357]:
# Run the recommender with its default k / n_products / metric settings.
alp = product_recommendation(x=x, y=y, z=z)

In [372]:
# Second run with default settings, kept separately in alp2.
alp2 = product_recommendation(x=x, y=y, z=z)