In [1]:
# Reset environment due to memory constraints.
%reset -f

In [2]:
import os
import json
import gc
import pickle

from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import fasttext

In [3]:
def read_json_lines(path, n_lines=None):
    """Creates a generator which reads and returns lines of
    a json lines file, one line at a time, each as a dictionary.
    
    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.
    """
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            yield json.loads(line)

In [4]:
class JSONLinesWriter:
    """
    Helper class to write list of dictionaries into a file in json lines
    format, i.e. one json record per line.
    """

    def __init__(self, file_path):
        self.fd = None
        self.file_path = file_path
        self.delimiter = "\n"

    def open(self):
        self.fd = open(self.file_path, "w")
        self.first_record_written = False
        return self

    def close(self):
        self.fd.close()
        self.fd = None

    def write_record(self, obj):
        if self.first_record_written:
            self.fd.write(self.delimiter)
        self.fd.write(json.dumps(obj))
        self.first_record_written = True

    def __enter__(self):
        return self.open()

    def __exit__(self, type, value, traceback):
        self.close()

In [5]:
output_dir = os.path.join('output_data')
volume_dir = ('/mnt/h/torob_data/')

aggregated_search_data_path = os.path.join(output_dir, 'aggregated_search_data.jsonl')
preprocessed_products_path = os.path.join(output_dir, 'preprocessed_products.jsonl')
preprocessed_test_queries_path = os.path.join(output_dir, 'preprocessed_test_queries.jsonl')

train_dat_file_path = os.path.join(volume_dir, 'train.dat')

random_projection_mat_path = os.path.join(output_dir, 'random_projection_mat.npy')
product_features_path = os.path.join(output_dir, 'product_features.npy')
queries_train_features_path = os.path.join(output_dir, 'queries_train_features.npy')
queries_test_features_path = os.path.join(output_dir, 'queries_test_features.npy')
products_id_to_idx_path = os.path.join(output_dir, 'products_id_to_idx.pkl')

In [6]:
# Number of tokens in the vocabulary of TF-IDF.
VOCAB_SIZE = 4096
# Embedding dimension used for random projection of TF-IDF vectors.
EMBEDDING_DIM = 256
# Number of training samples to use (set to None to use all samples).
NUM_TRAIN_SAMPLES = 10_000

In [7]:
# Load aggregated search data which will be used as training data.
aggregated_searches_df = pd.DataFrame(read_json_lines(aggregated_search_data_path, n_lines=NUM_TRAIN_SAMPLES))

In [8]:
# Load preprocessed product data.
products_data_df = pd.DataFrame(read_json_lines(preprocessed_products_path))

In [9]:
# Load preprocessed test queries.
test_offline_queries_df = pd.DataFrame(read_json_lines(preprocessed_test_queries_path))

In [10]:
# Create a mapping from ID of products to their integer index.
products_id_to_idx = dict((p_id, idx) for idx, p_id in enumerate(products_data_df['id']))

In [None]:
ft_model = fasttext.load_model('../models/cc.fa.300.bin')



ValueError: ../models/cc.fa.300.bin cannot be opened for loading!

In [None]:
# test = test_offline_queries_df['raw_query_normalized'].apply(ft_model.get_sentence_vector)
queries_test_projected = np.array([ft_model.get_sentence_vector(x) for x in tqdm(test_offline_queries_df['raw_query_normalized'].values)])
queries_train_projected = np.array([ft_model.get_sentence_vector(x) for x in tqdm(aggregated_searches_df['raw_query_normalized'].values)])
products_projected = np.array([ft_model.get_sentence_vector(x) for x in tqdm(products_data_df['title_normalized'].values)])

  0%|          | 0/23140 [00:00<?, ?it/s]

NameError: name 'ft_model' is not defined

In [None]:
del ft_model
gc.collect();

NameError: name 'ft_model' is not defined

In [None]:
def create_dat_file(
    dat_file_path,
    agg_searches_df,
    query_features,
    product_features,
    n_candidates=None,
):
    """
    Create a `dat` file which is the training data of LambdaMart model.

    The file format of the training and test files is the same as for SVMlight,
    with the exception that the lines in the input files have to be sorted by increasing qid.
    The first lines may contain comments and are ignored if they start with #.
    Each of the following lines represents one training example and is of the following format:

    <line> .=. <target> qid:<qid> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
    <target> .=. <float>
    <qid> .=. <positive integer>
    <feature> .=. <positive integer>
    <value> .=. <float>
    <info> .=. <string>

    The target value and each of the feature/value pairs are separated by a space character.
    Feature/value pairs MUST be ordered by increasing feature number.
    Features with value zero can be skipped.
    The target value defines the order of the examples for each query.
    Implicitly, the target values are used to generated pairwise preference constraints as described in [Joachims, 2002c].
    A preference constraint is included for all pairs of examples in the example_file, for which the target value differs.
    The special feature "qid" can be used to restrict the generation of constraints.
    Two examples are considered for a pairwise preference constraint only if the value of "qid" is the same.

    For example, given the example_file

    3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # 1A
    2 qid:1 1:0 2:0 3:1 4:0.1 5:1 # 1B
    1 qid:1 1:0 2:1 3:0 4:0.4 5:0 # 1C
    1 qid:1 1:0 2:0 3:1 4:0.3 5:0 # 1D
    1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # 2A
    2 qid:2 1:1 2:0 3:1 4:0.4 5:0 # 2B
    1 qid:2 1:0 2:0 3:1 4:0.1 5:0 # 2C
    1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # 2D
    2 qid:3 1:0 2:0 3:1 4:0.1 5:1 # 3A
    3 qid:3 1:1 2:1 3:0 4:0.3 5:0 # 3B
    4 qid:3 1:1 2:0 3:0 4:0.4 5:1 # 3C
    1 qid:3 1:0 2:1 3:1 4:0.5 5:0 # 3D

    the following set of pairwise constraints is generated (examples are referred to by the info-string after the # character):

    1A>1B, 1A>1C, 1A>1D, 1B>1C, 1B>1D, 2B>2A, 2B>2C, 2B>2D, 3C>3A, 3C>3B, 3C>3D, 3B>3A, 3B>3D, 3A>3D

    More information:
     - https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#embedding-additional-information-inside-libsvm-file
     - https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html
    """
    with open(dat_file_path, "w") as file:
        for qid, agg_search in tqdm(enumerate(agg_searches_df.itertuples(index=False))):
            if n_candidates is None:
                limit = len(agg_search.results)
            else:
                limit = min(n_candidates, len(agg_search.results))
            clicks = dict(zip(agg_search.clicks, agg_search.clicks_count))

            for candidate_product_id in agg_search.results[:limit]:
                if candidate_product_id is None:
                    continue
                candidate_score = clicks.get(candidate_product_id, 0)
                candidate_score = np.log2(candidate_score + 1)

                p_idx = products_id_to_idx[candidate_product_id]
                features = np.concatenate((product_features[p_idx], query_features[qid]))
                features = np.around(features, 3)

                file.write(
                    f"{candidate_score} qid:{qid} "
                    + " ".join([f"{i}:{s}" for i, s in enumerate(features)])
                    + "\n"
                )

In [None]:
create_dat_file(
    train_dat_file_path,
    aggregated_searches_df,
    queries_train_projected,
    products_projected,
    n_candidates=200,
)

NameError: name 'queries_train_projected' is not defined

In [11]:
def clicked_results(output_path) :
    record = {
        'id' : [],
        'clicks' : []
    }
    print('Writing clicked results into file...')
    with JSONLinesWriter(output_path) as clicked_result_file:
        for index, click_result in tqdm(enumerate(aggregated_searches_df.itertuples())):
            for click_id in click_result.clicks:
                if click_id in record['id'] :
                    record['clicks'][record['id'].index(click_id)] += click_result.clicks_count[click_result.clicks.index(click_id)]
                else :
                    record['id'].append(click_id)
                    record['clicks'].append(click_result.clicks_count[click_result.clicks.index(click_id)])
        clicked_result_file.write_record(record)


In [15]:
def view_results(output_path) :
    record = {
        'id' : [],
        'view' : []
    }
    print('Writing viewed results into file...')
    with JSONLinesWriter(output_path) as clicked_result_file:
        for index, results in tqdm(enumerate(aggregated_searches_df.itertuples())):
            for result_id in results.results:
                if result_id in record['id'] :
                    record['view'][record['id'].index(result_id)] += results.results_count[results.results.index(result_id)]
                else :
                    record['id'].append(result_id)
                    record['view'].append(results.results_count[results.results.index(result_id)])
        clicked_result_file.write_record(record)

In [16]:
clicked_results('output_data/clicked_results.jsonl')
view_results('output_data/viewed_results.jsonl')

Writing viewed results into file...


63it [00:08,  7.27it/s]


KeyboardInterrupt: 