In [1]:
# Memory is tight: clear every user-defined name left in the interactive
# namespace before starting. `-f` performs the reset without asking for confirmation.
%reset -f

In [2]:
import os
import json
import gc
import pickle

from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import fasttext

In [3]:
def read_json_lines(path, n_lines=None):
    """Lazily read a JSON Lines file, yielding one parsed record at a time.

    This is a memory-efficient alternative to `pandas.read_json` for JSON Lines
    files: the file is streamed and only the current line is parsed.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON Lines file. The file is always decoded as UTF-8
        instead of the platform's default encoding.
    n_lines : int, optional
        Maximum number of records to yield. ``None`` (the default) reads the
        whole file; ``0`` yields nothing.

    Yields
    ------
    object
        The value `json.loads` decodes from each non-blank line (a dictionary
        when the line holds a JSON object).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        n_yielded = 0
        for line in f:
            if n_lines == n_yielded:
                break
            # Blank lines (typically a trailing empty line) carry no record and
            # would make `json.loads` fail, so they are skipped and not counted.
            if not line.strip():
                continue
            yield json.loads(line)
            n_yielded += 1

In [4]:
# Directory holding every file this notebook reads or writes.
output_dir = 'output_data'


def _in_output_dir(file_name):
    """Return the path of `file_name` inside `output_dir`."""
    return os.path.join(output_dir, file_name)


# JSON Lines inputs.
aggregated_search_data_path = _in_output_dir('aggregated_search_data.jsonl')
preprocessed_products_path = _in_output_dir('preprocessed_products.jsonl')
preprocessed_test_queries_path = _in_output_dir('preprocessed_test_queries.jsonl')

# Training data file.
train_dat_file_path = _in_output_dir('train.dat')

# NumPy arrays and the pickled product-id lookup.
random_projection_mat_path = _in_output_dir('random_projection_mat.npy')
product_features_path = _in_output_dir('product_features.npy')
queries_train_features_path = _in_output_dir('queries_train_features.npy')
queries_test_features_path = _in_output_dir('queries_test_features.npy')
products_id_to_idx_path = _in_output_dir('products_id_to_idx.pkl')

In [5]:
# How many training samples to load; None means "use every sample".
NUM_TRAIN_SAMPLES = 10_000
# Size of the TF-IDF vocabulary (number of tokens kept).
VOCAB_SIZE = 4096
# Target dimension when TF-IDF vectors are randomly projected to embeddings.
# NOTE(review): the cells visible here embed text with fastText instead of
# TF-IDF -- confirm VOCAB_SIZE and EMBEDDING_DIM are still used further down.
EMBEDDING_DIM = 256

In [6]:
# Training data: the aggregated search records, capped at NUM_TRAIN_SAMPLES
# rows (every row when NUM_TRAIN_SAMPLES is None).
aggregated_searches_df = pd.DataFrame(
    data=read_json_lines(aggregated_search_data_path, NUM_TRAIN_SAMPLES)
)

In [7]:
# Preprocessed product records; the whole file is read (no row limit).
products_data_df = pd.DataFrame(
    data=read_json_lines(preprocessed_products_path, n_lines=None)
)

In [8]:
# Preprocessed test queries; the whole file is read (no row limit).
test_offline_queries_df = pd.DataFrame(
    data=read_json_lines(preprocessed_test_queries_path, n_lines=None)
)

In [9]:
# Lookup table: product ID -> position of that product in `products_data_df['id']`.
# If an ID occurs more than once, the position of its last occurrence is kept.
products_id_to_idx = {
    product_id: position
    for position, product_id in enumerate(products_data_df['id'])
}

In [10]:
# Location of the pretrained fastText binary. Set the FASTTEXT_MODEL_PATH
# environment variable to point at a local copy of the model; the previously
# hard-coded location is kept as the fallback so existing setups keep working.
ft_model_path = os.environ.get(
    'FASTTEXT_MODEL_PATH', '/Users/madadi/Projects/models/cc.fa.300.bin'
)
ft_model = fasttext.load_model(ft_model_path)



In [11]:
def embed_texts(texts, model, desc=None):
    """Embed every text with a fastText model, one row per text.

    The result array is allocated once and filled row by row. Building a Python
    list of per-text vectors and converting it with `np.array` afterwards would
    keep all vectors in memory twice at the peak.

    Parameters
    ----------
    texts : sequence of str
        Texts to embed; must support `len()` and iteration.
    model : object
        Model exposing `get_dimension()` and `get_sentence_vector(text)`.
    desc : str, optional
        Label shown on the progress bar.

    Returns
    -------
    numpy.ndarray
        Array of shape `(len(texts), model.get_dimension())` and dtype float32.
    """
    # fastText returns single-precision vectors, so float32 loses nothing.
    vectors = np.empty((len(texts), model.get_dimension()), dtype=np.float32)
    for row, text in enumerate(tqdm(texts, desc=desc)):
        vectors[row] = model.get_sentence_vector(text)
    return vectors


# The `*_projected` names are kept unchanged because cells outside this view
# may refer to them.
queries_test_projected = embed_texts(
    test_offline_queries_df['raw_query_normalized'].values, ft_model, desc='test queries'
)
queries_train_projected = embed_texts(
    aggregated_searches_df['raw_query_normalized'].values, ft_model, desc='train queries'
)
products_projected = embed_texts(
    products_data_df['title_normalized'].values, ft_model, desc='products'
)

100%|██████████| 23140/23140 [00:01<00:00, 21830.56it/s]
100%|██████████| 10000/10000 [00:00<00:00, 42923.98it/s]
100%|██████████| 3612277/3612277 [02:10<00:00, 27633.05it/s]
