In [18]:
import random
import string
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [73]:
import numpy as np
import dask.array as da
from scipy.spatial.distance import cdist

def product_recommendation(x, y, z, k = 100, n_products = 1, weight = 1, kwargs = None, verbose = True):
    """Recommend products based on nearest neighbours in vector space.

    Every row of ``x`` is compared with every row of ``y`` via
    ``scipy.spatial.distance.cdist``. The rows of ``z`` belonging to the ``k``
    closest rows of ``y`` are summed per product column, and the labels of the
    ``n_products`` columns with the largest totals are returned.

    Parameters
    ----------
    x : DataFrame of shape (m_samples, n_features)
        The input data for which recommendations are to be generated.

    y : DataFrame of shape (n_samples, n_features)
        Historical samples of product holdings. It is re-indexed with
        ``x.columns``, so it must contain every column of ``x``.

    z : DataFrame of shape (n_samples, k_products)
        Historical products sold/target. Rows are matched to ``y`` by
        position, so both must have the same number of rows.

    k : int, default 100
        Neighbourhood size (number of closest rows of ``y`` to use).

    n_products : int, default 1
        Number of products to recommend; should not exceed the number of
        columns in ``z``.

    weight : default 1
        Currently unused; kept so existing callers keep working.

    kwargs : dict, optional
        Keyword arguments forwarded to ``cdist``. When omitted,
        ``{'metric': 'seuclidean'}`` is used.

    verbose : bool, default True
        When true, print the shapes of the intermediate arrays (the
        historical behaviour of this function).

    Returns
    -------
    array-like of shape (m_samples, n_products)
        Column labels of ``z`` for each row of ``x``. The concrete array type
        is whatever ``np.take`` yields for a dask index array.

    Raises
    ------
    ValueError
        If ``k`` or ``n_products`` is smaller than 1, or if ``y`` and ``z``
        have a different number of rows.
    """
    # A non-positive k would flip the sign handed to argtopk and silently pick
    # the *farthest* rows instead of the nearest ones.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    if n_products < 1:
        raise ValueError("n_products must be a positive integer, got %r" % (n_products,))
    # z is indexed with row positions computed on y, so the two must line up.
    if len(y) != len(z):
        raise ValueError(
            "y and z must have the same number of rows, got %d and %d" % (len(y), len(z))
        )
    # Build the default here rather than in the signature so that no dict
    # object is shared between calls.
    if kwargs is None:
        kwargs = {'metric': 'seuclidean'}

    # Put the historical features in the same column order as the query frame.
    y = y[x.columns]
    x, y, z, products_target = tuple(map(np.array, [x, y, z, z.columns]))

    # Neighbourhood: full (m_samples, n_samples) distance matrix, then the
    # positions of the k smallest distances in each row (negative argument).
    Y = da.from_array(cdist(x, y, **kwargs), chunks='auto')
    nearest_ind = Y.argtopk(-k)
    nearest_neighbours = np.take(z, nearest_ind, axis = 0)

    # Product frequency: collapse the neighbour axis, leaving one total per
    # (query row, product), then keep the n_products largest totals.
    Z = da.from_array(np.einsum('ijk->ik', nearest_neighbours))
    products_freq_ind = Z.argtopk(n_products)
    products_freq_rec = np.take(products_target, products_freq_ind)
    if verbose:
        print(Y.shape, nearest_ind.shape, nearest_neighbours.shape,
              Z.shape, products_freq_ind.shape, products_freq_rec.shape)

    return products_freq_rec

In [31]:
# Resolve the CSV relative to the current user's home directory so the notebook
# does not embed one machine's absolute path.
data = pd.read_csv(Path.home() / 'Downloads' / 'LS_2.0.csv')

In [36]:
# Dummy-encode the STATE, WINNER and PARTY columns; later cells rely on the
# resulting `PARTY_` column prefix.
data2 = pd.get_dummies(data.loc[:, ['STATE', 'WINNER', 'PARTY']])

In [37]:
# Dimensions of the encoded frame.
data2.shape

(2263, 170)

In [51]:
# Names of the dummy columns produced from PARTY.
filter_col = list(filter(lambda name: name.startswith('PARTY_'), data2.columns))

In [55]:
# Features are every column except the PARTY_ dummies; the PARTY_ dummies are the target.
X = data2.drop(columns=filter_col)
y = data2[filter_col]

In [58]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [60]:
# Show the shape of each partition on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1516, 37) (747, 37) (1516, 133) (747, 133)


In [72]:
# Recommend 10 parties per test row from the 1000 nearest training rows.
# NOTE(review): the ValueError recorded under this cell contradicts the identical
# cell that follows; presumably it stems from an earlier definition of
# product_recommendation - re-run or remove this cell.
out = product_recommendation(x=X_test, y=X_train, z=y_train, k=1000, n_products=10)

ValueError: operand has more dimensions than subscripts given in einstein sum, but no '...' ellipsis provided to broadcast the extra dimensions.

In [62]:
# Recommend 10 parties per test row from the 1000 nearest training rows.
out = product_recommendation(x=X_test, y=X_train, z=y_train, k=1000, n_products=10)

(747, 1516) (747, 1000) (747, 1000, 133) (747, 133) (747, 10) (747, 10)


In [66]:
# For each recommendation rank (one column of `out`), show how often each label appears.
recommendations = pd.DataFrame(out)
for _, ranked in recommendations.items():
    print('---  ---  ---')
    print(ranked.value_counts())

---  ---  ---
PARTY_INC    549
PARTY_BJP    198
Name: 0, dtype: int64
---  ---  ---
PARTY_NOTA    425
PARTY_INC     198
PARTY_BJP     124
Name: 1, dtype: int64
---  ---  ---
PARTY_BJP     425
PARTY_NOTA    322
Name: 2, dtype: int64
---  ---  ---
PARTY_IND    747
Name: 3, dtype: int64
---  ---  ---
PARTY_BSP    747
Name: 4, dtype: int64
---  ---  ---
PARTY_CPI(M)    747
Name: 5, dtype: int64
---  ---  ---
PARTY_VBA     724
PARTY_AITC     23
Name: 6, dtype: int64
---  ---  ---
PARTY_AITC    724
PARTY_VBA      23
Name: 7, dtype: int64
---  ---  ---
PARTY_SP    747
Name: 8, dtype: int64
---  ---  ---
PARTY_NTK    747
Name: 9, dtype: int64


In [8]:
# NOTE(review): Y, nearest_ind, nearest_neighbours, Z, products_freq_ind and
# products_freq_rec are assigned only inside product_recommendation, so at
# notebook scope this cell raises the NameError recorded below it. It looks like
# leftover debugging - confirm and remove.
print(Y.shape,nearest_ind.shape, nearest_neighbours.shape,\
  Z.shape,products_freq_ind.shape, products_freq_rec.shape)
(260, 140) (260, 100) (260, 100, 10) (260, 10) (260, 1) (260, 1)

NameError: name 'Y' is not defined

In [7]:
# Synthetic inputs for exercising product_recommendation: 260 query rows and
# 140 historical rows over 22 shared feature columns, plus 10 target columns.
# Dedicated seeded generators make the cell reproducible without altering the
# global `random` / `np.random` state.
SEED = 42
name_rng = random.Random(SEED)
value_rng = np.random.RandomState(SEED)
alphabet = string.ascii_uppercase + string.digits

# Random 3-character column labels shared by x and y.
x_col = [''.join(name_rng.choices(alphabet, k=3)) for _ in range(22)]
x = pd.DataFrame(value_rng.randint(1000, size=(260, 22), dtype='int32'), columns=x_col)

y = pd.DataFrame(value_rng.randint(1000, size=(140, 22), dtype='int32'), columns=x_col)

# Random 3-character labels for the target columns.
z_col = [''.join(name_rng.choices(alphabet, k=3)) for _ in range(10)]
z = pd.DataFrame(value_rng.randint(1000, size=(140, 10), dtype='int32'), columns=z_col)

In [11]:
# Run the recommender on the synthetic frames with its default settings.
alp3 = product_recommendation(x=x, y=y, z=z)

(260, 140) (260, 100) (260, 100, 10) (260, 10) (260, 1) (260, 1)


In [376]:
# Synthetic inputs for exercising product_recommendation: 260 query rows and
# 140 historical rows over 22 shared feature columns, plus 10 target columns.
# Dedicated seeded generators make the cell reproducible without altering the
# global `random` / `np.random` state.
SEED = 42
name_rng = random.Random(SEED)
value_rng = np.random.RandomState(SEED)
alphabet = string.ascii_uppercase + string.digits

# Random 3-character column labels shared by x and y.
x_col = [''.join(name_rng.choices(alphabet, k=3)) for _ in range(22)]
x = pd.DataFrame(value_rng.randint(1000, size=(260, 22), dtype='int32'), columns=x_col)

y = pd.DataFrame(value_rng.randint(1000, size=(140, 22), dtype='int32'), columns=x_col)

# Random 3-character labels for the target columns.
z_col = [''.join(name_rng.choices(alphabet, k=3)) for _ in range(10)]
z = pd.DataFrame(value_rng.randint(1000, size=(140, 10), dtype='int32'), columns=z_col)

In [298]:
# Preview the first rows of the target frame.
z.head()

Unnamed: 0,OWE,R03,GSL,3SW,INQ,TJM,0O4,T1N,XA0,4LE
0,262,447,215,232,768,395,50,860,863,580
1,919,758,549,511,202,661,457,875,160,886
2,159,180,447,2,127,812,207,135,317,305
3,620,789,825,225,177,250,945,836,159,818
4,248,350,773,8,856,937,819,997,327,547


In [357]:
# Run the recommender on the synthetic frames with its default settings.
alp = product_recommendation(x=x, y=y, z=z)

In [372]:
# Repeat the default-settings run, keeping the result under a separate name.
alp2 = product_recommendation(x=x, y=y, z=z)