In [1]:
!pip install pandas sklearn dask

Collecting pandas
  Downloading pandas-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[K     |████████████████████████████████| 11.7 MB 1.7 MB/s eta 0:00:01
[?25hCollecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting dask
  Downloading dask-2022.2.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 86.5 MB/s eta 0:00:01
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[K     |████████████████████████████████| 26.7 MB 110.2 MB/s eta 0:00:01
Collecting pyyaml>=5.3.1
  Downloading PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (701 kB)
[K     |████████████████████████████████| 701 kB 40.6 MB/s eta 0:00:01
[?25hCollecting fsspec>=0.6.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 83.5 MB/s eta 0:00:01
[?25hCollecting cloudp

In [32]:
!unzip transactions_train.csv.zip

Archive:  transactions_train.csv.zip
  inflating: transactions_train.csv  


In [33]:
!unzip articles.csv.zip

Archive:  articles.csv.zip
  inflating: articles.csv            


In [34]:
!unzip sample_submission.csv.zip

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   


In [35]:
!unzip customers.csv.zip

Archive:  customers.csv.zip
  inflating: customers.csv           


In [21]:
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from collections import Counter

In [22]:
import tensorflow as tf

In [23]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [24]:
!ls

articles.csv	   negative_sampling_pipeline.ipynb  sample_submission.csv
articles.csv.zip   nn-pipeline.ipynb		     sample_submission.csv.zip
customers.csv	   onstart.log			     transactions_train.csv
customers.csv.zip  onstart.sh			     transactions_train.csv.zip


In [49]:
transactions = dd.read_csv(
    'transactions_train.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': str,
#           't_dat': str,
#            'customer_id':str, 
#           'price':float,
#           'sales_channel_id': 'object'
          } 
)

In [139]:
customers = dd.read_csv('customers.csv', dtype={'customer_id':str, 
#                                                                                                       'FN':int, 
#                                                                                                         'Active':int, 
#                                                                                                       'club_member_status':int,
#                                                                                                         'fashion_news_frequency':str, 
#                                                                                                       'age':float, 
#                                                                                                         'postal_code':str
                                                                                                     })
articles = dd.read_csv('articles.csv', dtype={'article_id':str, 
#                                                                                                     'product_code':int, 'prod_name':str, 
#                                                                                                       'product_type_no':'object',
#                                                                        'product_type_name':str, 'product_group_name':'object', 'graphical_appearance_no':'object',
#                                                                        'graphical_appearance_name':str, 'colour_group_code':'object', 'colour_group_name':str,
#                                                                        'perceived_colour_value_id':str, 'perceived_colour_value_name':str,
#                                                                        'perceived_colour_master_id':str, 'perceived_colour_master_name':str,
#                                                                        'department_no':int, 'department_name':str, 'index_code':str, 'index_name':str,
#                                                                        'index_group_no':'object', 'index_group_name':str, 'section_no':int, 'section_name':str,
#                                                                        'garment_group_no':'object', 'garment_group_name':str, 'detail_desc':str
                                                                                                   })

In [51]:
customers = customers[['customer_id', 'age', 'postal_code']].compute()

In [52]:
transactions_train = transactions[(transactions['t_dat'] < '2020-09-15') & (transactions['t_dat'] > '2019-09-15')]
transactions_test = transactions[transactions['t_dat'] > '2020-09-15']

In [53]:
transactions_train_arts = dd.merge(transactions_train, articles[['article_id','prod_name']], 
                                   on='article_id', how='left')
transactions_test_arts = dd.merge(transactions_test, articles[['article_id','prod_name']],
                                  on='article_id', how='left')

In [54]:
transactions_train_arts = transactions_train_arts.compute()
transactions_test_arts = transactions_test_arts.compute()

In [55]:
transactions_train_arts['prod_name']

0                            Anika
1        Perrie Trash HW Denim TRS
2                      Katie Skirt
3                      Katie Skirt
4         James basic hip belt (1)
                   ...            
42210     Coachella padded softbra
42211            Madison skinny HW
42212         Mimosa SP Andes opt1
42213          Liza 3p thong micro
42214             Mercedez sweater
Name: prod_name, Length: 14864337, dtype: object

In [56]:
customers_c = Counter(transactions_train_arts.customer_id.values)
customers_top = list(set(c for c in customers_c if customers_c[c] > 5))
label2id_customers = {v:i for i,v in enumerate(customers_top)}

articles_c = Counter(transactions_train_arts.prod_name.values)
articles_top = list(set(c for c in articles_c if articles_c[c] > 5))
label2id_articles = {v:i for i,v in enumerate(articles_top)}

vocab_article_size = len(label2id_articles)
vocab_customer_size = len(label2id_customers)

In [57]:
vocab_article_size

25483

In [58]:
vocab_customer_size

590371

In [57]:
# transactions_test_arts[transactions_test_arts.customer_id.isin(customers_top)]

In [59]:
def preprocess_dataset(filename, df, label2id_customers, label2id_articles ):

    f = open(filename, 'w')
    df = df[df.customer_id.isin(label2id_customers) & df.prod_name.isin(label2id_articles)].sample(frac=1.)

    for customer, artice in df[['customer_id', 'prod_name']].values:
        f.write('\t'.join([str(label2id_customers[customer]), 
                           str(label2id_articles[artice]), 
                           str(1)]) + '\n')
        f.write('\t'.join([str(label2id_customers[customer]), 
                           str(np.random.randint(vocab_article_size)), 
                           str(0)]) + '\n')
        
        

    f.close()



In [64]:
# transactions_train_arts[transactions_train_arts.customer_id.isin(label2id_customers) & 
#                         transactions_train_arts.article_id.isin(label2id_articles)].sample(frac=1.)

In [195]:
preprocess_dataset('train_dataset.csv', 
                   transactions_train_arts, label2id_customers, 
                   label2id_articles)


In [180]:
preprocess_dataset('test_dataset.csv', 
                   transactions_test_arts, label2id_customers, 
                   label2id_articles)


In [181]:
def generate(filename, infer=False):
    for line in open(filename):
        c, a, label = line.strip('\n').split('\t') 
        
        
        
        if not infer:
            yield ((tf.constant([int(c)], dtype=tf.int32), 
                    tf.constant([int(a)], dtype=tf.int32),
                    ), 
                   
                   tf.constant([int(label)], dtype=tf.int32))
        else:
            yield ((tf.constant([int(c)], dtype=tf.int32), 
                    tf.constant([int(a)], dtype=tf.int32),
                    ))

In [182]:
# a = open('infer_df_valid_dataset.csv')

In [196]:
dataset = tf.data.Dataset.from_generator(generate, 
                                         output_signature=((tf.TensorSpec(shape=(1,), dtype=tf.int32),
                                                            tf.TensorSpec(shape=(1,), dtype=tf.int32)),
                                                            tf.TensorSpec(shape=(1,), dtype=tf.int32)
                                                           ),
                                         args=['train_dataset.csv']).batch(500).prefetch(5)

dataset_test = tf.data.Dataset.from_generator(generate, 
                                         output_signature=((tf.TensorSpec(shape=(1,), dtype=tf.int32),
                                                            tf.TensorSpec(shape=(1,), dtype=tf.int32)),
                                                            tf.TensorSpec(shape=(1,), dtype=tf.int32)
                                                           ),
                                         args=['test_dataset.csv']).batch(500)


# dataset_submission = tf.data.Dataset.from_generator(generate, 
#                                          output_signature=(tf.TensorSpec(shape=(1,), dtype=tf.float64),
#                                                             tf.TensorSpec(shape=(1,), dtype=tf.int32),
#                                                             tf.TensorSpec(shape=(1,), dtype=tf.string)),
#                                          args=['submission_dataset.csv', True]).batch(300)

In [65]:
g = generate('train_dataset.csv')

In [66]:
next(g)

((<tf.Tensor: shape=(1,), dtype=int32, numpy=array([457301], dtype=int32)>,
  <tf.Tensor: shape=(1,), dtype=int32, numpy=array([16604], dtype=int32)>),
 <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)

In [47]:
# import tensorflow_addons as tfa

In [246]:
inputs_customer = tf.keras.layers.Input(shape=(1,))
inputs_article = tf.keras.layers.Input(shape=(1,))


embeddings_customer = tf.keras.layers.Embedding(input_dim=vocab_customer_size, output_dim=300)(inputs_customer, )
embeddings_article = tf.keras.layers.Embedding(input_dim=vocab_article_size, output_dim=300)(inputs_article, )

customer = tf.keras.layers.Flatten()(embeddings_customer)
article = tf.keras.layers.Flatten()(embeddings_article)

# customer = tf.keras.layers.Lambda(
#       lambda x: tf.nn.l2_normalize(x, axis=1))(customer)
# article = tf.keras.layers.Lambda(
#       lambda x: tf.nn.l2_normalize(x, axis=1))(article)

# dot = tf.keras.layers.Dot(1)([customer, article])

concat = tf.keras.layers.concatenate([customer, article], axis=-1)
dense = tf.keras.layers.Dense(256, activation='relu')(concat)
dense_1 = tf.keras.layers.Dense(256, activation='relu')(dense)

# concat_dot = tf.keras.layers.concatenate([dense_1, con], axis=-1)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense_1)
# outputs = tf.keras.layers.Activation(activation='sigmoid')(concat_dot)

model = tf.keras.Model(inputs=[inputs_customer, inputs_article], 
                       outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [247]:
print(model.summary())

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_23 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_24 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 embedding_22 (Embedding)       (None, 1, 300)       177111300   ['input_23[0][0]']               
                                                                                                  
 embedding_23 (Embedding)       (None, 1, 300)       7644900     ['input_24[0][0]']               
                                                                                           

In [248]:
model.fit(dataset, validation_data=dataset_test, steps_per_epoch=1000, validation_steps=100,
          epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
 192/1000 [====>.........................] - ETA: 1:20 - loss: 0.3904 - accuracy: 0.8245

KeyboardInterrupt: 

In [81]:
model.layers

[<keras.engine.input_layer.InputLayer at 0x7f32272026a0>,
 <keras.engine.input_layer.InputLayer at 0x7f313799e310>,
 <keras.layers.embeddings.Embedding at 0x7f3227194d30>,
 <keras.layers.embeddings.Embedding at 0x7f3227194f70>,
 <keras.layers.core.flatten.Flatten at 0x7f32271c4550>,
 <keras.layers.core.flatten.Flatten at 0x7f3225162220>,
 <keras.layers.merge.Dot at 0x7f32251629a0>,
 <keras.layers.core.activation.Activation at 0x7f3225162df0>]

In [198]:
customer_emdeddings = model.layers[2].get_weights()[0]
article_emdeddings = model.layers[3].get_weights()[0]

In [162]:
# transactions_train = transactions_train.compute()

In [None]:
# transactions_test = transactions_test.compute()

In [199]:
valid = transactions_test_arts.groupby('customer_id')['article_id'].agg(' '.join).reset_index()

In [200]:
# articles = articles.compute()

In [201]:
prod2art = transactions_train_arts[['article_id', 'prod_name']].groupby('prod_name')['article_id'].agg(lambda x:
                                                                            Counter(x).most_common(1)[0][0]).to_dict()

In [202]:
# prod2art

In [203]:
# !pip install annoy

In [215]:
def normalize(v):
    norm=np.linalg.norm(v, ord=1)
    if norm==0:
        norm=np.finfo(v.dtype).eps
    return v/norm

In [219]:
import annoy

In [220]:
index_articles = annoy.AnnoyIndex(300, metric='angular')

In [221]:
for i, vec in enumerate(article_emdeddings):
    index_articles.add_item(i, normalize(vec))

In [222]:
index_articles.build(100)

True

In [223]:
top12 = ' '.join([prod2art[x] for x in transactions_train_arts[
    transactions_train_arts['t_dat'] > '2020-09-01'].prod_name.value_counts().index[:12]])

In [224]:
from sklearn.metrics.pairwise import cosine_distances
def get_similar_articles(customer_id):
    similar = [prod2art[articles_top[i]] for i in 
               index_articles.get_nns_by_vector(normalize(customer_emdeddings[label2id_customers[customer_id]]),
                                                12, search_k=2000)]
    
    return similar

recs = []
for c in valid['customer_id'].values:
    if c in label2id_customers:
        tops = ' '.join(get_similar_articles(c))
        recs.append(tops)
    else:
        recs.append(top12)

In [225]:
valid['recs'] = recs

In [226]:
valid

Unnamed: 0,customer_id,article_id,recs
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,0624486001,0901638001 0687704001 0733101002 0789147006 07...
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,0827487003,0774785002 0751598002 0828251002 0688430004 06...
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0757926001 0788575004 0640021019,0801384013 0859737002 0816563001 0736049001 05...
3,000525e3fe01600d717da8423643a8303390a055c578ed...,0874110016,0751471001 0706016001 0762846008 0685814001 07...
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0903762001 0879189005 0158340001 0867966009 09...,0739590032 0749400010 0794389001 0684209027 08...
...,...,...,...
68979,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,0874816003 0911870004,0816166005 0706016001 0733098018 0803757001 07...
68980,fffa7d7799eb390a76308454cbdd76e473d65b1497fbe4...,0861803014 0849886010,0822158001 0916468003 0806778001 0761406002 05...
68981,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0396135007 0817472007 0715624050 0817472003 08...,0779551002 0610776002 0706016001 0687704001 07...
68982,fffd870c6324ad3bda24e4d6aeae221c199479086bfdfd...,0750423010 0761269001,0751471001 0706016001 0762846008 0685814001 07...


In [227]:
mapk(valid['article_id'].str.split(), valid['recs'].str.split(), 12)

0.0016681786934897966

In [135]:
mapk(valid['prod_name'].str.split('##'), valid['recs'].str.split('##'), 12)

0.009926064083049102

In [174]:
mapk(valid['article_id'].str.split(), valid['recs'].str.split(), 12)

0.0034474813627328933

In [189]:
mapk(valid['article_id'].str.split(), valid['recs'].str.split(), 12)

0.004099793047530371

In [190]:
df_sub = pd.read_csv('sample_submission.csv')

In [191]:
df_sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [None]:
recs = []
for c in df_sub['customer_id'].values:
    if c in label2id_customers:
        tops = ' '.join(get_similar_articles(c))
        recs.append(tops)
    else:
        recs.append(top12)

In [None]:
df_sub['prediction'] = recs

In [None]:
df_sub

In [None]:
df_sub[['customer_id', 'prediction']].to_csv("submission.csv", index=False, header=True)