In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [3]:
def main(raw_opt_path, preprocess_path, output_path, dims = 16):
    """Train Word2Vec on per-session `aid` sequences and save the vectors.

    Reads `train.parquet` and `test.parquet` from `raw_opt_path`, stacks them,
    keeps only the rows whose `type` equals 0, collects the `aid` values of
    each `session` into one "sentence", and fits a gensim `Word2Vec` model on
    those sentences. The learned vectors are written to
    `<output_path>w2v_output_<dims>dims.parquet` with one row per `aid` and
    the columns `aid`, `vec_0`, ..., `vec_<dims-1>`.

    Parameters
    ----------
    raw_opt_path : str
        Prefix that `train.parquet` / `test.parquet` are appended to. Paths
        are built by plain string concatenation, so a directory must end
        with a path separator.
    preprocess_path : str
        Not used by this function; kept so existing callers keep working.
    output_path : str
        Prefix the output file name is appended to (same concatenation rule
        as `raw_opt_path`).
    dims : int, default 16
        Size of the embedding vectors (`vector_size` of `Word2Vec`). It is
        also embedded in the output file name.

    Returns
    -------
    None
        The result is only written to disk.
    """
    train = pl.read_parquet(raw_opt_path + 'train.parquet')
    test = pl.read_parquet(raw_opt_path + 'test.parquet')
    merge = pl.concat([train, test])
    del train, test

    # Only rows with type == 0 take part in training.
    # NOTE(review): presumably 0 encodes click events - confirm against the
    # dataset's type encoding.
    merge = merge.filter(pl.col('type') == 0)

    # One sentence per session: the list of aids seen in that session.
    # Newer polars spells this `group_by`; older releases only have `groupby`.
    group_by = getattr(merge, 'group_by', None) or merge.groupby
    sentence_df = group_by('session').agg(pl.col('aid').alias('sentence'))
    sentences = sentence_df['sentence'].to_list()
    # The frames are no longer needed once the Python lists exist; release
    # them before training to lower peak memory.
    del merge, sentence_df

    # min_count=1 keeps every aid that occurs at least once in the vocabulary.
    w2vec = Word2Vec(sentences=sentences, vector_size=dims, window=5, min_count=1, workers=4)

    # Row i of `wv.vectors` is the embedding of `wv.index_to_key[i]`, so the
    # two frames line up on their default integer index.
    aid_df = pd.DataFrame(w2vec.wv.index_to_key, columns = ['aid'])
    vec_df = pd.DataFrame(w2vec.wv.vectors).add_prefix('vec_')
    w2v_df = pd.concat([aid_df, vec_df], axis = 1)

    # Derive the file name from `dims` so that sizes other than 16 and 64 are
    # not mislabelled (and do not overwrite the 64-dim output).
    w2v_df.to_parquet(output_path + f'w2v_output_{dims}dims.parquet')

In [None]:
# Locations handed to main(). main() builds file paths by plain string
# concatenation, so keep the trailing slash on each of them.
raw_opt_path = '../../input/train_test/'
preprocess_path = '../../input/train_valid/'
output_path = '../../input/preprocess/'

# Produce both embedding sizes: 16 dimensions first, then 64.
for embedding_dims in (16, 64):
    main(raw_opt_path, preprocess_path, output_path, dims = embedding_dims)