## Instructions

1. Use a Krylov namespace to run the notebook. It has been verified with the following Krylov configuration:
```json
{
  "application": "jupyterlab",
  "description": "",
  "workspaceConfiguration": {
    "image": "ecr.vip.ebayc3.com/ppetrov/krylov-passion:latest",
    "hadoop": {
      "batchUser": "b_perso",
      "hadoopCluster": "apollo-rno"
    }
  }
}
```

2. Upload the [pretrainer.ipynb](https://github.corp.ebay.com/dbasin/my_stuff/blob/main/simplex/nbs/pretrainer.ipynb) notebook and the [pretrainer.py](https://github.corp.ebay.com/dbasin/my_stuff/blob/main/simplex/nbs/pretrainer.py) file to the same folder in your Krylov workspace.
3. Update constants:

    - `base_path` - should reference your model base path, e.g. `/apps/b_perso/vlp/simplark/pretrainer/RecommendedBrandOutletWithMLR` (note: no `viewfs` prefix here)
    - `base_out_path` - specify it if you use `Extender` to add features. It defines the output location for the numpy files generated by `Extender`. `Extender` will create a folder named with the run timestamp for each run.
    - `start_date`, `end_date` - specify the date range of the loaded training data (inclusive)
    - `num_workers` - number of spark executors used for fetching the data



## Installs

In [1]:
import os
# Point both HTTP and HTTPS traffic at the same proxy endpoint
# (presumably needed for the pip installs below - confirm).
for proxy_var in ('HTTP_PROXY', 'HTTPS_PROXY'):
    os.environ[proxy_var] = 'http://httpproxy.vip.ebay.com:80'

In [2]:
! pip3 install hyperopt

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.python.org/simple
[31mERROR: Could not find a version that satisfies the requirement hyperopt (from versions: none)[0m
[31mERROR: No matching distribution found for hyperopt[0m


In [3]:
! pip3 install xgboost

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.python.org/simple


## Imports

In [None]:
import sys
# Checkout of the pretrainer_utils repository. Its utils/ folder is put on sys.path and
# is also the source of the files handed to spark.sparkContext.addPyFile later on.
ptu_repo_path = '/data/ebay/notebooks/dbasin/Repositories/pretrainer_utils'
utils_path = f'{ptu_repo_path}/utils'
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if utils_path not in sys.path:
    sys.path.append(utils_path)

In [4]:



from pyspark.sql import functions as F
from fsspec.implementations import hdfs
from hdfs_torch_dataset import HDFS

from spark_utils import load_spark
from pretrainer import *
import pretrainer_utils
from pretrainer_utils import numpy_data_to_pdf, parse_category, leaf_cats_from_some_id, label_extract_processor, parse_category


from importlib import reload
from functools import partial

ModuleNotFoundError: No module named 'hdfs_torch_dataset'

In [5]:
import gzip
from tempfile import TemporaryDirectory
from hdfs_utils import HDFS

ModuleNotFoundError: No module named 'hdfs_utils'

In [6]:
from sklearn.metrics import ndcg_score

In [7]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

ModuleNotFoundError: No module named 'hyperopt'

In [8]:
import pickle
import pandas as pd

from pretrainer import *

from fsspec.implementations import hdfs

from pyspark.sql import Row

from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, LongType, ArrayType, DoubleType, StringType
import xgboost as xgb

from sklearn.model_selection import train_test_split

In [9]:
from spark_utils import load_spark

ModuleNotFoundError: No module named 'spark_utils'

In [10]:
from pretrainer import load_npy_path, GzipHdfsUploader
from pretrainer_utils import label_extract_processor, calc_active_features, numpy_data_to_pdf, extract_label

ModuleNotFoundError: No module named 'pretrainer_utils'

In [11]:
from xgb_utils import pretrainer_train_test_split, create_dmatrix, calc_feature_imp, RecordEval, load_bst_model
from xgb_utils import calc_rank, sale_rank_stats, calc_pred_score, calc_sale_rank, calc_comb_score, model_vs_prods_ranks

ModuleNotFoundError: No module named 'xgb_utils'

In [12]:
import pretrainer_utils, pretrainer, xgb_utils

ModuleNotFoundError: No module named 'pretrainer_utils'

In [13]:
from importlib import reload

In [14]:
import pretrainer

In [15]:
# Re-execute the already imported pretrainer module so that edits made to pretrainer.py
# are picked up without restarting the kernel.
reload(pretrainer)

<module 'pretrainer' from '/data/shpx/notebooks/olivyatan/Fetcher_multiple_variations_pa_snkrs/sneakers/pretrainer.py'>

## Spark setup

In [9]:
# load_spark is imported from spark_utils; how it configures the session is not visible
# in this notebook. The resulting `spark` object is used below for addPyFile and for
# Fetcher.fetch_pandas_df.
spark = load_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/01 08:57:32 WARN HiveConf: DEPRECATED: hive.metastore.ds.retry.* no longer has any effect.  Use hive.hmshandler.retry.* instead
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.metastore.local does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.enforce.sorting does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.server2.proxyuser.hue.groups does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.server2.proxyuser.hue.hosts does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.metastore.ds.retry.interval does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.enforce.bucketing does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.metastore.ds.retry.attempts does not exist
22/09/01 08:57:32 WARN HiveConf: HiveConf of name hive.server2.enable.impersonation

The executors' logs link:
https://apollo-rno-rm-2.vip.hadoop.ebay.com:50030/proxy/application_1660704390900_1279298


In [11]:
# Helper modules from utils_path that are registered with the Spark context.
shipped_modules = (
    f'{utils_path}/pretrainer.py',
    f'{utils_path}/hdfs_utils.py',
    f'{utils_path}/pretrainer_utils.py',
)
for module_path in shipped_modules:
    spark.sparkContext.addPyFile(module_path)

In [33]:
# spark.stop()

## Constants

In [12]:
# Date range of the training data to load (both ends inclusive).
start_date, end_date = '20220801', '20220819'

# One pretrainer base path per model, all under the same root.
root_path = '/apps/b_perso/hp/simplark/pretrainer'
models = [
    'PersonalizedTopicsV2WithMetaOrganicPRecall',
    'PersonalizedTopicsV2WithTopicMLR',
]
base_paths = [f'{root_path}/{model_name}' for model_name in models]

# Label name handed to label_extract_processor when fetching.
target_label = 'labelPurchase'

# HDFS locations, all derived from hdfs_out_path.
hdfs_out_path = '/user/b_selling_research/dbasin/vibes_piyi'
hdfs_rvi_path = f'{hdfs_out_path}/rvi'
hdfs_meids_path = f'{hdfs_out_path}/meids'
hdfs_url_guid_path = f'{hdfs_out_path}/url_guid'
hdfs_download_url_path = f'{hdfs_out_path}/download_url'

hdfs_images_path = f'{hdfs_out_path}/images'

hdfs_imps_meta_path = f'{hdfs_out_path}/imps_meta'
hdfs_image_embs_path = f'{hdfs_out_path}/image_embs'
hdfs_title_embs_path = f'{hdfs_out_path}/text_embs'

# Local parquet file read into cf_pdf in the merge section.
local_features_path = './clip_features.parquet'

In [139]:
# HDFS directory passed to hdfs_save_np as the upload destination for the final array.
hdfs_npy_dir = '/user/dbasin/vibes/piyi'

In [13]:
# Number of Spark executors used for fetching the data.
num_workers = 128

In [24]:
# Feature names selected for this run; assigned to feature_cols before the data is fetched.
piyi_original_features = [
    "BibowatchRelPosition",
    "RecallSourceBullseye",
    "RecallSourceTora",
    "TitleCosineSimilarityToShoppingcartCentroid",
    "FreqSameLeafCatIdInWatchBadge",
    "MaxViewedItemTitleJaccardBigrams",
    "NumSameRviInLastWeek",
    "AvgSameLeafRviPriceRatio",
    "ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile",
    "ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2",
    "MaxViewedItemTitleJaccard",
    "ItemTimeOnSiteV2",
    "ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2",
    "PriceDiffMedianRecall",
    "FreqSameItemInWatchBadge",
    "RecallSourceBestMatch",
    "ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm",
    "FreqWatchPriceBellowItemPrice",
    "MerchImpressionsDecayed",
    "PlImpressionsDecayed",
    "AvgSameLeafRviPriceDiff",
    "ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm",
    "BullseyeRelRVILeafCatMedianPriceDiffV2",
    "BullseyeAbsRVILeafCatMedianPriceDiffV2",
    "BullseyeRVILeafCatMedianPriceV2",
    "LeafCatRVICondition",
    "ItemConditionOrdinal",
    "ItemConditionNorm",
    "SameItemConditionInRvi"
]


## Fetching train data

In [None]:
# One Fetcher per model base path. The executor count comes from the num_workers
# constant instead of a repeated literal, so changing the constant actually takes effect.
fetchers = [
    Fetcher(base_path, start_date, end_date, hdfs.HadoopFileSystem(), num_workers=num_workers)
    for base_path in base_paths
]

In [25]:
# Load one fetched file and read the column layout off its dtype description.
np_sample = load_npy_path(fetchers[0].paths[0])
nested_fields = ('labels', 'features')
meta_cols = [name for name, _ in np_sample.dtype.descr if name not in nested_fields]
label_cols = [name for name, _ in np_sample['labels'].dtype.descr]
feature_cols = piyi_original_features

In [26]:
# Bind the column selection once, then run the same processor through every fetcher.
row_processor = partial(
    label_extract_processor,
    target_label=target_label,
    feature_cols=feature_cols,
    meta_cols=meta_cols,
    label_cols=label_cols,
)
pdfs = [fetcher.fetch_pandas_df(spark, row_processor) for fetcher in fetchers]

                                                                                

In [27]:
# Stack the per-model frames into one. ignore_index gives the result a fresh 0..n-1 row
# index instead of repeating each input frame's own row labels.
pdf = pd.concat(pdfs, ignore_index=True)

In [30]:
# meid is fetched as bytes; decode it to str. Values that are already str are passed
# through unchanged so that re-running this cell does not fail on the second pass.
pdf['meta', 'meid'] = pdf.meta.meid.map(lambda v: v.decode() if isinstance(v, bytes) else v)

## Merge with clip features

In [221]:
pdf.head(3)

Unnamed: 0_level_0,meta,meta,meta,meta,meta,meta,labels,labels,labels,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,itemId,meid,userId,siteId,rank,category,labelCombined,labelClick,labelPurchase,NormItemViewCount7DayDecayDomesticWebAndMobile,...,MaxSameLeafRvihPriceDiff,FreqWatchPriceBellowItemPrice,MaxTransactionPriceRatio,UserLowPricePrpnstyDiff,TitleCosineSimilarityCentroidRvisInLastDay,AvgWatchPriceBidRatioBadge,AvgTransactionPriceRatio,NumSameRviLeafCatInLastTwoDay,TimeSinceAddedWatch,MaxWatchPriceBinRatioBadge
0,234506805927,4120b73a3201499fb48e4186efb83e4a,2434096930,0,0,b'43961',1,1,1,0.003148,...,0.0,3e-06,-1.0,-1.0,0.0,2.6396,-1.0,0.0,0.007478,-1.0
1,195243638026,4120b73a3201499fb48e4186efb83e4a,2434096930,0,1,b'43961',0,0,0,0.002032,...,5.039997,3e-06,-1.0,-1.0,0.0,2.438,-1.0,0.0,0.007478,-1.0
2,194965666729,4120b73a3201499fb48e4186efb83e4a,2434096930,0,2,b'43961',0,0,0,0.004533,...,1.040001,3e-06,-1.0,-1.0,0.0,2.598,-1.0,0.0,0.007478,-1.0


In [32]:
# Read the clip features from the local parquet file configured in local_features_path.
cf_pdf = pd.read_parquet(local_features_path)

In [33]:
# Remove the listed meta columns from the clip-features frame before merging.
# The membership check keeps the cell re-runnable: a plain `del` raises KeyError
# once the columns have already been removed.
del_meta_cols = ['userId', 'siteId', 'rank', 'category']
for c in del_meta_cols:
    if ('meta', c) in cf_pdf.columns:
        del cf_pdf['meta', c]


In [34]:
cf_pdf[['meta','features']].head(2)

Unnamed: 0_level_0,meta,meta,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features,features
Unnamed: 0_level_1,itemId,meid,avgImgClipSim,maxImgClipSim,avgTitleClipSim,maxTitleClipSim,avgConcatClipSim,maxConcatClipSim,catAvgImgClipSim,catMaxImgClipSim,...,avgTitleClipSimNorm,maxTitleClipSimNorm,avgConcatClipSimNorm,maxConcatClipSimNorm,catAvgImgClipSimNorm,catMaxImgClipSimNorm,catAvgTitleClipSimNorm,catMaxTitleClipSimNorm,catAvgConcatClipSimNorm,catMaxConcatClipSimNorm
0,234506805927,4120b73a3201499fb48e4186efb83e4a,0.562857,1.008792,0.265739,1.004275,0.828596,2.013067,1.008792,1.008792,...,0.894088,1.0,0.91624,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,195243638026,4120b73a3201499fb48e4186efb83e4a,0.607126,0.769791,0.297218,0.569916,0.904345,1.339707,0.769791,0.769791,...,1.0,0.567489,1.0,0.665505,0.763082,0.763082,0.567489,0.567489,0.665505,0.665505


In [35]:
# Inner-join the fetched rows with the clip features on the (itemId, meid) meta columns.
join_keys = [('meta', 'itemId'), ('meta', 'meid')]
clip_cols = cf_pdf[['meta', 'features']]
data = pd.merge(pdf, clip_cols, how='inner', left_on=join_keys, right_on=join_keys)

In [36]:
data.shape

(717363, 98)

## Data to npy

In [122]:
# Load the first file of the first fetcher and take its dtype description as the
# starting point for the output schema (np_types is edited in the following cells).
np_data = load_npy_path(fetchers[0].paths[0])
np_types = np_data.dtype.descr

In [124]:
# Keep only those original feature fields that are also columns of the merged frame.
keep_cols = set(data.features.columns)
features_idx = np_data.dtype.names.index('features')
all_features_types = np_types[features_idx]
filt_feature_types = [
    (field[0], field[1])
    for field in all_features_types[1]
    if field[0] in keep_cols
]

In [125]:
# Swap the original features entry of the schema for the filtered field list.
np_types[np_data.dtype.names.index('features')] = ('features', filt_feature_types)

In [126]:
# Extend the features schema with one '<f4' field per clip feature column.
# Names that are already present are skipped so that re-running this cell cannot add
# duplicate field names, which numpy rejects when the dtype is built.
ftypes = np_types[np_data.dtype.names.index('features')]
known_fields = {field[0] for field in ftypes[1]}
for col in cf_pdf.features.columns:
    if col not in known_fields:
        ftypes[1].append((col, '<f4'))
        known_fields.add(col)


In [127]:
# np_types is the schema prepared in the preceding cells (meta, labels and features fields).
# Allocate a zero-filled structured array with one record per row of the merged frame.
new_data = np.zeros((data.shape[0],), dtype=np_types)

In [128]:
# Copy the top-level (meta) fields into the output array.
# They are read from the merged `data` frame: new_data has one record per row of `data`
# and its features are filled from `data` too, so taking meta from the pre-merge `pdf`
# would misalign rows or fail on a length mismatch whenever the inner join changes the rows.
meta_types = [(n,t) for n,t in new_data.dtype.descr if n not in ['labels', 'features']]
for m, t in meta_types:
    new_data[m] = data.meta[m].astype(t)

In [129]:
# Copy the label fields from the merged `data` frame (not the pre-merge `pdf`), so labels
# stay on the same rows as the features written into new_data.
for n,t in new_data['labels'].dtype.descr:
    new_data['labels'][n] = data.labels[n].astype(t)

In [130]:
# Fill every feature field from the merged frame, cast to the field's declared type.
features_view = new_data['features']
for feat_name, feat_type in features_view.dtype.descr:
    features_view[feat_name] = data.features[feat_name].astype(feat_type)

In [133]:
def hdfs_save_np(data, file_name, out_dir_path):
    """Write an array to a gzip-compressed .npy file and hand it to HDFS.put.

    Args:
        data: Object serialised with ``np.save``.
        file_name: Name of the file created inside a temporary local directory.
        out_dir_path: Destination directory; it is passed to ``HDFS.put`` with a
            trailing slash appended.
    """
    with TemporaryDirectory() as scratch_dir:
        staged_path = f'{scratch_dir}/{file_name}'
        with gzip.open(staged_path, 'wb') as gz_stream:
            np.save(gz_stream, data)

        # Upload while the temporary directory, and the staged file in it, still exist.
        destination = f'{out_dir_path}/'
        HDFS.put(staged_path, destination)

In [141]:
# Upload the assembled array to hdfs_npy_dir as a single gzip-compressed .npy part file.
hdfs_save_np(new_data, 'part-0-000.npy.gz', hdfs_npy_dir)

Upload command: ['/apache/hadoop/bin/hadoop', 'fs', '-put', '-f', '/tmp/tmpt2dcrrxa/part-0-000.npy.gz', '/user/dbasin/vibes/piyi/']
