In [1]:
import gc
import json
import os
import pickle
from typing import Dict, Tuple

import fasttext
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm

In [2]:
def read_json_lines(path, n_lines=None):
    """Lazily read a JSON-lines file, yielding one parsed object per line.

    This is a memory-efficient alternative to `pandas.read_json` for JSON-lines
    files: only one line is held in memory at a time.

    Args:
        path: Path of the JSON-lines file. The file is decoded as UTF-8.
        n_lines: Maximum number of records to yield. `None` (the default)
            reads the whole file. Blank lines are skipped and do not count
            towards this limit.

    Yields:
        The result of `json.loads` for each non-blank line (a dictionary when
        each line holds a JSON object).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        n_yielded = 0
        for line in f:
            # Equality (not >=) keeps the original contract: None or a
            # negative limit never matches, so the whole file is read.
            if n_lines == n_yielded:
                break
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            yield json.loads(line)
            n_yielded += 1

In [3]:
# Directory (relative to the working directory) that holds the notebook's inputs.
output_dir = 'output_data'

# JSON-lines input files read by the loading cells below.
(
    aggregated_search_data_path,
    preprocessed_products_path,
    preprocessed_test_queries_path,
) = (
    os.path.join(output_dir, file_name)
    for file_name in (
        'aggregated_search_data.jsonl',
        'preprocessed_products.jsonl',
        'preprocessed_test_queries.jsonl',
    )
)

# train_dat_file_path = os.path.join(output_dir, 'train.dat')

# random_projection_mat_path = os.path.join(output_dir, 'random_projection_mat.npy')
# product_features_path = os.path.join(output_dir, 'product_features.npy')
# queries_train_features_path = os.path.join(output_dir, 'queries_train_features.npy')
# queries_test_features_path = os.path.join(output_dir, 'queries_test_features.npy')
# products_id_to_idx_path = os.path.join(output_dir, 'products_id_to_idx.pkl')

In [4]:
# Size of the TF-IDF vocabulary, in tokens.
VOCAB_SIZE = 2 ** 12  # 4096
# Output dimension for the random projection of TF-IDF vectors.
EMBEDDING_DIM = 2 ** 8  # 256
# Upper bound on the number of training samples to load (None loads all of them).
NUM_TRAIN_SAMPLES = 10000

In [5]:
# Training data: aggregated searches, streamed line by line and capped at
# NUM_TRAIN_SAMPLES records.
aggregated_searches_df = pd.DataFrame(
    read_json_lines(
        path=aggregated_search_data_path,
        n_lines=NUM_TRAIN_SAMPLES,
    )
)

In [6]:
# Preprocessed product data, read in full (no record cap).
products_data_df = pd.DataFrame(
    read_json_lines(path=preprocessed_products_path)
)

In [7]:
# Preprocessed test queries, read in full (no record cap).
test_offline_queries_df = pd.DataFrame(
    read_json_lines(path=preprocessed_test_queries_path)
)

In [10]:
# Only the first `limit` results of each search are used as ranking candidates.
limit = 200

# One (qid, product_id, click_count) record per candidate. `qid` is the row
# position of the search inside `aggregated_searches_df`.
candidate_rows = []
for qid, agg_search in tqdm(
    enumerate(aggregated_searches_df.itertuples(index=False)),
    total=len(aggregated_searches_df),  # lets tqdm display percentage and ETA
):
    # Click count per clicked product for this search.
    clicks = dict(zip(agg_search.clicks, agg_search.clicks_count))
    candidate_rows.extend(
        (qid, product_id, clicks.get(product_id, 0))  # unclicked results count as 0
        for product_id in agg_search.results[:limit]
        if product_id is not None  # skip missing result entries
    )

# Build each array once, directly under its final name, instead of reusing the
# same name first for a list and then for an array.
qids = np.array([row[0] for row in candidate_rows])
products = np.array([row[1] for row in candidate_rows])
# Relevance label: log2(1 + clicks), applied once to the whole array rather
# than element by element inside the loop.
target = np.log2(np.array([row[2] for row in candidate_rows]) + 1)

10000it [00:01, 9028.72it/s]


In [11]:
# Long-format table: one row per (query, candidate product) pair with its label.
df = pd.DataFrame(dict(qid=qids, product=products, target=target))

In [22]:
# Hold out 30% of the queries for testing. Splitting on `qid` keeps all
# candidates of a query on the same side of the split.
gss = GroupShuffleSplit(test_size=.30, n_splits=1, random_state=7).split(df, groups=df['qid'])

X_train_inds, X_test_inds = next(gss)

# The ranker's `group` argument is a list of per-query row counts that must
# line up with the row order of X_train. Sorting by qid (stable, so the order
# of candidates within a query is preserved) makes that alignment explicit
# instead of relying on the order in which the split returns indices.
train_data = df.iloc[X_train_inds].sort_values('qid', kind='stable')
X_train = train_data.drop(columns=['qid', 'target'])
y_train = train_data[['target']]

# Rows per query, in ascending qid order (the same order as the rows above).
groups = train_data.groupby('qid').size().to_numpy()
assert groups.sum() == len(X_train), "group sizes must cover every training row"

test_data = df.iloc[X_test_inds]

# qid is kept in X_test because it is needed for later predictions.
X_test = test_data.drop(columns=['target'])
y_test = test_data[['target']]

In [None]:
# Gradient-boosted tree ranker optimised for NDCG.
#
# `eta` is XGBoost's alias for `learning_rate`. This cell previously passed
# both with different values (learning_rate=0.1 and eta=0.05), which left the
# effective step size ambiguous. Only `learning_rate` is set now.
# NOTE(review): confirm that 0.1 (rather than 0.05) is the intended value.
#
# NOTE(review): newer XGBoost releases deprecate tree_method='gpu_hist';
# check this against the installed version before running.
model = xgb.XGBRanker(
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42,
    learning_rate=0.1,
    colsample_bytree=0.9,
    max_depth=6,
    n_estimators=110,
    subsample=0.75,
)

# `groups` holds the number of rows per query, in the row order of X_train.
# NOTE(review): X_train contains only the 'product' column built from the
# candidate product ids; confirm this is a feature XGBoost can consume.
model.fit(X_train, y_train, group=groups, verbose=True)