In [None]:
import gc
import os

import numpy as np
import pandas as pd
import polars as pl
from annoy import AnnoyIndex
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from pandarallel import pandarallel
from tqdm.autonotebook import tqdm

tqdm.pandas()
pandarallel.initialize(progress_bar=True, use_memory_fs=False)
# --- Configuration ----------------------------------------------------------
# Target setup, "local" or "kaggle": selects which parquet splits are loaded
# and is embedded in the names of the saved model / feature files.
GENERATE_FOR: str = "kaggle"
# True  -> train the Word2Vec model from scratch.
# False -> load previously saved word vectors instead.
TRAIN: bool = True
# Embedding width; used for both the Word2Vec model and the Annoy index.
VECTOR_SIZE: int = 32
# Candidate count that appears in the covisitation / output file names.
CANDIDATES: int = 100
# NOTE(review): not referenced in the visible cells; purpose to be confirmed.
DISK_PIECES: int = 4
# NOTE(review): not referenced in the visible cells; presumably a version tag.
VER: int = 6

In [None]:
# Load the interaction logs for the selected setup. Each branch concatenates
# two parquet splits into a single polars DataFrame named `train_df`.
if GENERATE_FOR == "local":
    train_df = pl.concat([
        pl.read_parquet("./splitted_raw_data/train.parquet"),
        pl.read_parquet("./splitted_raw_data/val.parquet"),
    ])
elif GENERATE_FOR == "kaggle":
    train_df = pl.concat([
        pl.read_parquet('./splitted_raw_data/all_train.parquet'),
        pl.read_parquet("./splitted_raw_data/test.parquet"),
    ])
else:
    # Fail here with a clear message instead of a NameError on `train_df`
    # in a later cell.
    raise ValueError(
        f"GENERATE_FOR must be 'local' or 'kaggle', got {GENERATE_FOR!r}"
    )

In [None]:
# Sanity check: (rows, columns) of the combined interaction frame.
train_df.shape

Let us now transform the data into a format that the `gensim` library can work with. Thanks to `polars` we can do so very efficiently and very quickly.

There are various ways we could feed our data to the model; however, passing it straight from RAM as Python lists is probably one of the fastest. As we have enough resources on Kaggle to do so, let us take this approach!

In [None]:
#word2vec train
if TRAIN:
    # Create the output folder up front so a missing directory cannot throw
    # away a finished training run at save time.
    os.makedirs("./models", exist_ok=True)

    # One "sentence" per session: the aids collected by the group-by.
    sentences_df = train_df.groupby('session').agg(pl.col('aid').alias('sentence'))
    sentences = sentences_df['sentence'].to_list()
    del sentences_df; gc.collect()

    print('Word2Vec training started...')
    # `workers` must be a positive thread count. gensim does NOT treat -1 as
    # "all cores": with workers=-1 no training threads are started and the
    # vectors stay at their random initialisation. Adjust the count by hand if
    # the notebook runs on a machine with a CPU quota (e.g. a Kaggle notebook).
    n_workers = os.cpu_count() or 1
    w2vec = Word2Vec(
        sentences=sentences,
        vector_size=VECTOR_SIZE,
        min_count=1,  # keep every aid, including those seen only once
        window=20,
        workers=n_workers,
    )
    print('Word2Vec training, done.')

    # Keep only the KeyedVectors and persist them for the TRAIN == False path.
    wv = w2vec.wv
    wv.save(f"./models/word2vec_{GENERATE_FOR}.wordvectors")

In [None]:
# Distinct aids present in the loaded interactions, as a plain Python list.
aids = (
    train_df
    .select(pl.col('aid').unique())
    .get_column('aid')
    .to_list()
)
len(aids)

In [None]:
# When training is skipped, load the word vectors saved by an earlier run.
# mmap='r' asks gensim to memory-map the stored arrays read-only.
if not TRAIN:
    wv = KeyedVectors.load(f"./models/word2vec_{GENERATE_FOR}.wordvectors", mmap='r')

In [None]:
#vector extraction
from tqdm import tqdm
vectors = []
for aid in tqdm(aids):
    vectors.append(wv[aid].tolist())

In [None]:
# Two-column frame (aid, vectors) written to parquet as a reusable feature.
aid_column = pd.Series(aids, name='aid')
vector_column = pd.Series(vectors, name='vectors')
aid_vectors = pd.concat([aid_column, vector_column], axis=1)
aid_vectors.to_parquet(f'./all_features/{GENERATE_FOR}_w2v_aid_vectors.pqt')

# Release the in-memory copies; the frame is read back from disk when needed.
del aid_column, vector_column
del aid_vectors, vectors
gc.collect()

### Approximate nearest-neighbour index (Annoy) for the similarity score

In [None]:
#annoy indexing
from annoy import AnnoyIndex

aid2idx = {aid: i for i, aid in enumerate(wv.index_to_key)}
annoy = AnnoyIndex(VECTOR_SIZE, 'euclidean')

for aid, idx in aid2idx.items():
    annoy.add_item(idx, wv.vectors[idx])
    
annoy.build(20)

In [None]:
# Read the per-aid embedding table back from the parquet feature file.
aid_vectors_path = f'./all_features/{GENERATE_FOR}_w2v_aid_vectors.pqt'
aid_vectors = pd.read_parquet(aid_vectors_path)

In [None]:
# Preview the aid/vector table as rich cell output.
aid_vectors

In [None]:
def annoy_get_distance(aid1,aid2,aid2idx,annoy):
    """Return the index distance between two aids.

    Args:
        aid1: First aid; must be a key of ``aid2idx``.
        aid2: Second aid; must be a key of ``aid2idx``.
        aid2idx: Mapping from aid to its item id in ``annoy``.
        annoy: Object exposing ``get_distance(i, j)``.

    Returns:
        Whatever ``annoy.get_distance`` returns for the two mapped item ids.

    Raises:
        KeyError: If either aid is missing from ``aid2idx``.
    """
    first_item = aid2idx[aid1]
    second_item = aid2idx[aid2]
    return annoy.get_distance(first_item, second_item)

In [None]:
from pathlib import Path

# For every covisitation type, score each (aid_x, aid_y) candidate pair with
# the Annoy distance and write one combined parquet file per type.
data_dir = Path(f"../raw_data/{GENERATE_FOR}_covisitation") #covisitation path
for type_str in tqdm(['clicks','carts','buy2buy']): # types
    file_pattern = f'{GENERATE_FOR}_top_{CANDIDATES}_{type_str}*'
    pqt_files = sorted(data_dir.glob(file_pattern))
    if not pqt_files:
        # Without this guard pd.concat([]) fails with an opaque
        # "No objects to concatenate" after the loop.
        raise FileNotFoundError(
            f"No covisitation files matching {file_pattern!r} in {data_dir}"
        )

    scored_parts = []
    for pqt_file in tqdm(pqt_files):
        print(pqt_file)
        temp_df = pd.read_parquet(pqt_file)
        display(temp_df.head())  # preview only; avoid dumping the full frame
        # Row-wise lookup, parallelised by pandarallel. Despite the column
        # name, the stored value is what annoy.get_distance returns.
        temp_df['similarity'] = temp_df.parallel_apply(lambda x: annoy_get_distance(x['aid_x'],x['aid_y'],aid2idx,annoy),axis=1)
        scored_parts.append(temp_df)
        del temp_df; gc.collect()

    whole_out_df = pd.concat(scored_parts, ignore_index=True)
    del scored_parts
    whole_out_df.to_parquet(f'./all_features/{GENERATE_FOR}_top_{CANDIDATES}_{type_str}_w2v_similarities.pqt')