In [3]:
    
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import GenericUnivariateSelect, chi2, f_classif
from sklearn.model_selection import train_test_split,  GroupShuffleSplit
import pathlib
from datetime import datetime
import os


In [4]:
# US_wb-#4_HP_ex0-1_snkrs_size_piyi_v7_2.2m_nov2217.1_23.ipynb

In [5]:
import os
import pickle
import socket
import sys
from datetime import date, timedelta
from glob import glob

import pandas as pd
import pyspark.sql.functions as func
import xgboost as xgb
from fsspec.implementations import hdfs
from pretrainer import *
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType,
)
from pyspark.sql.window import Window
from sklearn.feature_selection import GenericUnivariateSelect, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [6]:
pd.set_option('display.max_colwidth', None)

# Functions

cross-validation score we get with our current parameters:

run cross-validation on our training dataset 

In [7]:
import pandas as pd
def run_hyper_tuning(dtrain, grid, params, nrounds, output_dir, out_filename, nfold = 5):
    """Grid-search XGBoost hyper-parameters with k-fold CV scored by MAP.

    For every ``(max_depth, min_child_weight, subsample, eta)`` tuple in
    ``grid`` this runs ``xgb.cv`` with the ``map`` metric, ``seed=42`` and
    ``early_stopping_rounds=5``, records the highest mean test MAP and the
    1-based boosting round at which it occurred, and finally writes all rows
    to ``<output_dir>/hyper_df_<out_filename>.csv``.

    NOTE(review): an identical ``run_hyper_tuning`` is defined in the next
    notebook cell and shadows this one; keep only one of them.

    Parameters
    ----------
    dtrain : passed unchanged to ``xgb.cv`` as the training data.
    grid : iterable of 4-tuples ``(max_depth, min_child_weight, subsample, eta)``.
    params : dict of base XGBoost parameters. It is copied, so the caller's
        dict is not modified.
    nrounds : upper bound on boosting rounds (``num_boost_round``).
    output_dir : directory for the CSV; created (with parents) if missing.
    out_filename : suffix used in the CSV file name.
    nfold : number of CV folds.

    Returns
    -------
    pandas.DataFrame
        One row per grid tuple with columns ``max_depth``, ``min_child_weight``,
        ``subsample``, ``eta``, ``mean_map`` and ``boost_rounds``. The frame is
        empty (but has these columns) when ``grid`` is empty.
    """
    result_columns = ['max_depth', 'min_child_weight', 'subsample', 'eta', 'mean_map', 'boost_rounds']

    print(datetime.now().strftime("%H:%M:%S"))

    # Work on a copy so the caller's dict is not left holding the last grid point.
    cv_params = dict(params or {})

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole frame on every call.
    records = []
    max_map = 0
    best_params = None
    for i, (max_depth, min_child_weight, subsample, eta) in enumerate(grid, start=1):
        print(f"CV{i} with max_depth={max_depth}, min_child_weight={min_child_weight}, subsample={subsample}, eta={eta}")

        cv_params['max_depth'] = max_depth
        cv_params['min_child_weight'] = min_child_weight
        cv_params['subsample'] = subsample
        cv_params['eta'] = eta

        # Run CV
        cv_results = xgb.cv(
            cv_params,
            dtrain,
            num_boost_round=nrounds,
            seed=42,
            nfold=nfold,
            metrics={'map'},
            early_stopping_rounds=5
        )

        # Best mean test MAP over the rounds and the (1-based) round it occurred at.
        test_map = cv_results['test-map-mean']
        mean_map = test_map.max()
        boost_rounds = int(test_map.argmax()) + 1
        print(f"\tmap {round(mean_map, 4)} for {boost_rounds} rounds")
        if mean_map > max_map:
            max_map = mean_map
            best_params = (max_depth, min_child_weight, subsample, eta)

        records.append({'max_depth': max_depth,
                        'min_child_weight': min_child_weight,
                        'subsample': subsample,
                        'eta': eta,
                        'mean_map': mean_map,
                        'boost_rounds': boost_rounds})

    hyper_df = pd.DataFrame(records, columns=result_columns)

    # best_params stays None for an empty grid or when no combination beats MAP 0;
    # report that instead of failing on None[0].
    if best_params is None:
        print(f"Best params: none found, map: {max_map}")
    else:
        print(f"Best params: {best_params[0]}, {best_params[1]}, {best_params[2]}, {best_params[3]}, map: {max_map}")

    print(datetime.now().strftime("%H:%M:%S"))

    pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
    print("Saving hyper-paramter output")
    hyper_df.to_csv(os.path.join(output_dir, f"hyper_df_{out_filename}.csv"), index=False)

    return hyper_df

In [8]:
def run_hyper_tuning(dtrain, grid, params, nrounds, output_dir, out_filename, nfold = 5):
    """Grid-search XGBoost hyper-parameters with k-fold CV scored by MAP.

    For every ``(max_depth, min_child_weight, subsample, eta)`` tuple in
    ``grid`` this runs ``xgb.cv`` with the ``map`` metric, ``seed=42`` and
    ``early_stopping_rounds=5``, records the highest mean test MAP and the
    1-based boosting round at which it occurred, and finally writes all rows
    to ``<output_dir>/hyper_df_<out_filename>.csv``.

    NOTE(review): ``run_hyper_tuning`` is also defined in the previous notebook
    cell; this later definition is the one that stays bound. Keep only one.

    Parameters
    ----------
    dtrain : passed unchanged to ``xgb.cv`` as the training data.
    grid : iterable of 4-tuples ``(max_depth, min_child_weight, subsample, eta)``.
    params : dict of base XGBoost parameters. It is copied, so the caller's
        dict is not modified.
    nrounds : upper bound on boosting rounds (``num_boost_round``).
    output_dir : directory for the CSV; created (with parents) if missing.
    out_filename : suffix used in the CSV file name.
    nfold : number of CV folds.

    Returns
    -------
    pandas.DataFrame
        One row per grid tuple with columns ``max_depth``, ``min_child_weight``,
        ``subsample``, ``eta``, ``mean_map`` and ``boost_rounds``. The frame is
        empty (but has these columns) when ``grid`` is empty.
    """
    result_columns = ['max_depth', 'min_child_weight', 'subsample', 'eta', 'mean_map', 'boost_rounds']

    print(datetime.now().strftime("%H:%M:%S"))

    # Work on a copy so the caller's dict is not left holding the last grid point.
    cv_params = dict(params or {})

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole frame on every call.
    records = []
    max_map = 0
    best_params = None
    for i, (max_depth, min_child_weight, subsample, eta) in enumerate(grid, start=1):
        print(f"CV{i} with max_depth={max_depth}, min_child_weight={min_child_weight}, subsample={subsample}, eta={eta}")

        cv_params['max_depth'] = max_depth
        cv_params['min_child_weight'] = min_child_weight
        cv_params['subsample'] = subsample
        cv_params['eta'] = eta

        # Run CV
        cv_results = xgb.cv(
            cv_params,
            dtrain,
            num_boost_round=nrounds,
            seed=42,
            nfold=nfold,
            metrics={'map'},
            early_stopping_rounds=5
        )

        # Best mean test MAP over the rounds and the (1-based) round it occurred at.
        test_map = cv_results['test-map-mean']
        mean_map = test_map.max()
        boost_rounds = int(test_map.argmax()) + 1
        print(f"\tmap {round(mean_map, 4)} for {boost_rounds} rounds")
        if mean_map > max_map:
            max_map = mean_map
            best_params = (max_depth, min_child_weight, subsample, eta)

        records.append({'max_depth': max_depth,
                        'min_child_weight': min_child_weight,
                        'subsample': subsample,
                        'eta': eta,
                        'mean_map': mean_map,
                        'boost_rounds': boost_rounds})

    hyper_df = pd.DataFrame(records, columns=result_columns)

    # best_params stays None for an empty grid or when no combination beats MAP 0;
    # report that instead of failing on None[0].
    if best_params is None:
        print(f"Best params: none found, map: {max_map}")
    else:
        print(f"Best params: {best_params[0]}, {best_params[1]}, {best_params[2]}, {best_params[3]}, map: {max_map}")

    print(datetime.now().strftime("%H:%M:%S"))

    pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
    print("Saving hyper-paramter output")
    hyper_df.to_csv(os.path.join(output_dir, f"hyper_df_{out_filename}.csv"), index=False)

    return hyper_df

In [9]:
def calc_feature_imp(model, imp_type = 'gain'):
    """Return per-feature importance scores as a DataFrame, highest first.

    Reads ``model._Booster`` (presumably a fitted xgboost sklearn-style
    estimator; confirm against callers), asks it for ``get_score`` with the
    requested ``importance_type`` and lines the scores up with
    ``feature_names``. Features absent from the score dict get 0.0.

    Parameters
    ----------
    model : object exposing ``_Booster`` with ``get_score`` and ``feature_names``.
    imp_type : importance type forwarded to ``get_score``.

    Returns
    -------
    pandas.DataFrame with columns ``feature`` and ``score``, sorted by
    ``score`` in descending order (the index is not reset).
    """
    booster = model._Booster
    raw_scores = booster.get_score(importance_type=imp_type)

    # Dense score vector aligned with the booster's feature order.
    dense_scores = np.array([raw_scores.get(name, 0.) for name in booster.feature_names])

    # Ascending pre-order first, then the final descending sort on the frame.
    ascending_idx = np.argsort(dense_scores)
    table = pd.DataFrame({
        "feature": np.array(booster.feature_names)[ascending_idx],
        "score": dense_scores[ascending_idx],
    })
    return table.sort_values(by = "score", ascending=False)

In [10]:
def calc_sale_rank(df, rank_col, ascending, group_col = 'meid', label_col = 'labelPurchase'):
    """Position of the best-placed positive row inside each group.

    Rows are ordered by ``[group_col, rank_col]`` (``ascending`` applies to
    both keys), numbered 1..n within each group, and for every group that has
    at least one row with ``label_col == 1`` the smallest such position is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame containing ``group_col``, ``rank_col`` and ``label_col``.
        The input is copied and never modified.
    rank_col : column that defines the ordering inside a group.
    ascending : passed to ``sort_values``.
    group_col : grouping key column.
    label_col : column whose value 1 marks a positive row.

    Returns
    -------
    list
        One finite position per group with a positive row, in the order the
        groups appear after sorting.
    """
    position_col = f'{rank_col}_score'

    ranked = df.copy().sort_values([group_col, rank_col], ascending=ascending)
    # 1-based position of each row within its group, following the sorted order.
    ranked[position_col] = ranked.groupby(group_col, sort=False).cumcount() + 1

    positives = ranked[ranked[label_col] == 1]
    best_positions = positives.groupby(group_col, sort=False)[position_col].min().tolist()

    return [pos for pos in best_positions if pos is not None and np.isfinite(pos)]

# Load Data sneakers and filter for US:

In [17]:
output_dir="/data/shpx/data/olivyatan/piyi/vlps_pa"

In [9]:
#df_pa_sample=pd.read_parquet('/data/shpx/data/olivyatan/piyi/df_pa_sample.parquet')
#df_pa_sample=pd.read_parquet('/data/shpx/data/olivyatan/pa_data_15.10_21_22_sample.parquet')

### Run 1 month:

#### 1.longer Sneakers data, 2m:

In [11]:
df_sneakers_4m=pd.read_parquet('/data/shpx/data/olivyatan/sneakers_us_17.11.22_25.4.22_allfeat.parquet')  

In [12]:
df_sneakers_4m.siteId.unique()

array([0], dtype=int32)

In [13]:
df_sneakers_4m.dt.unique()

array(['2022-11-17', '2022-11-18', '2022-11-19', '2022-11-20',
       '2022-11-21', '2022-11-22', '2022-11-23', '2022-11-24',
       '2022-11-25', '2022-11-26', '2022-11-27', '2022-11-28',
       '2022-11-29', '2022-11-30', '2022-12-01', '2022-12-02',
       '2022-12-03', '2022-12-04', '2022-12-05', '2022-12-06',
       '2022-12-07', '2022-12-08', '2022-12-09', '2022-12-10',
       '2022-12-11', '2022-12-12', '2022-12-13', '2022-12-14',
       '2022-12-15', '2022-12-16', '2022-12-17', '2022-12-18',
       '2022-12-19', '2022-12-20', '2022-12-21', '2022-12-22',
       '2022-12-23', '2022-12-24', '2022-12-25', '2022-12-26',
       '2022-12-27', '2022-12-28', '2022-12-29', '2022-12-30',
       '2022-12-31', '2023-01-01', '2023-01-02', '2023-01-03',
       '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07',
       '2023-01-08', '2023-01-09', '2023-01-10', '2023-01-11',
       '2023-01-12', '2023-01-13', '2023-01-14', '2023-01-15',
       '2023-01-16', '2023-01-17', '2023-01-18', '2023-

In [13]:
df_sneakers_4m.shape

(94852, 1519)

In [27]:
ix=np.isin(df_sneakers_4m['category'],['15709','95672'])
ix

array([ True,  True,  True, ...,  True,  True,  True])

In [28]:
category_filter = np.isin(df_sneakers_4m['category'], ['15709', '95672'])
site_filter = np.isin(df_sneakers_4m['siteId'], [0])
result = category_filter & site_filter

In [29]:
result

array([ True,  True,  True, ...,  True,  True,  True])

In [31]:
df_sneakers_4m[result]

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.360565,-1.0,-1.0,...,,,,,,,,,,
1,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.212199,-1.0,-1.0,...,,,,,,,,,,
2,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.757790,-1.0,-1.0,...,,,,,,,,,,
3,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.082090,-1.0,-1.0,...,,,,,,,,,,
4,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.312545,-1.0,-1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.354786,-1.0,-1.0,...,,,,,,,,,,
6,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.164180,-1.0,-1.0,...,,,,,,,,,,
7,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.082090,-1.0,-1.0,...,,,,,,,,,,
8,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.164180,-1.0,-1.0,...,,,,,,,,,,


### Save data for conversion to NumPy files for simplex training:

In [57]:
pdf.shape

(239455, 1459)

In [25]:
#pdf.to_parquet('/data/shpx/data/olivyatan/snkrs_us_HP_171122_140123_WL4exclusions_withLeafcatfeature.parquet') 

In [63]:
pdf.to_parquet('/data/shpx/data/olivyatan/2_snkrs_us_HP7d_171122_110423_WL4exclusions.parquet')

In [64]:
pdf.shape

(274884, 1519)

In [65]:
pdf.labelPurchase.value_counts()

0    245951
1     28933
Name: labelPurchase, dtype: int64

In [109]:
pdf.labelPurchase.value_counts() #prev 

0    299226
1     35238
Name: labelPurchase, dtype: int64

In [37]:
df_sneakers_4m.shape

(94852, 1519)

In [14]:
import pandas as pd
def filter_df(df, start_date, end_date):
    """Return a copy of the rows of ``df`` whose ``dt`` is within the date window.

    Both bounds are inclusive. The comparison is done with ``>=`` / ``<=`` on
    the ``dt`` column as-is, so the bounds must be comparable with its values
    (the notebook passes 'YYYY-MM-DD' strings).

    Parameters
    ----------
    df : pandas.DataFrame with a ``dt`` column.
    start_date : lower bound (inclusive).
    end_date : upper bound (inclusive).

    Returns
    -------
    pandas.DataFrame
        An independent copy; modifying it does not affect ``df``.
    """
    in_window = (df['dt'] >= start_date) & (df['dt'] <= end_date)
    return df[in_window].copy()


In [15]:
df_us= filter_df( df_sneakers_4m, '2022-11-17', '2023-04-11')

In [16]:
df_hp_sub= filter_df( df_hp, '2023-04-05', '2023-04-11')

In [17]:
df_hp_sub.shape

(192343, 1519)

In [32]:
df_hp_sub.shape

(253045, 1519)

In [33]:
df_us.shape

(86793, 1519)

In [15]:
test_2w_sneakers=filter_df( df_sneakers_4m, '2023-04-12', '2023-04-25')

In [16]:
test_2w_sneakers.shape

(8059, 1519)

In [17]:
test_2w_sneakers.dt.nunique()

12

In [18]:
test_2w_sneakers.labelPurchase.value_counts()

0    7210
1     849
Name: labelPurchase, dtype: int64

In [20]:
# Write real Parquet so the file content matches its ".parquet" name; pickle
# bytes stored under this name cannot be read back with pd.read_parquet.
# NOTE(review): if any consumer loads this path with pd.read_pickle, switch it to read_parquet.
test_2w_sneakers.to_parquet('/data/shpx/data/olivyatan/test_2w_sneakers_Us.parquet')

In [26]:
test_2w_sneakers.siteId.unique()

array([0], dtype=int32)

### calc sale rank per date: 

In [309]:
grouping = dict(tuple(df_us.groupby(['dt'])))

In [311]:
grouping =df_us.groupby(['dt'])

In [332]:

grouping.get_group('2022-11-17').head(1)

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,labelBBEDefectType,labelSitePurchaseImpr,labelBBEImpr,labelBin,labelOffer,labelWatch,labelAdd2Cart,labelPurchaseImpr,labelCombined,dt
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.360565,-1.0,-1.0,...,-1,0,-1,0,0,0,0,0,0,2022-11-17


In [328]:
# Mean sale rank per date: for each `dt` group, average the per-group best
# purchase positions returned by calc_sale_rank (ordered by 'rank' ascending).
dt_saleRank = {}
for group_key, day_df in grouping:
    # Iterating a groupby already yields (key, frame), so no get_group lookup is needed.
    # Grouping by the list ['dt'] yields 1-tuples as keys on pandas >= 2 and bare
    # values on older versions; unwrap so the dict is always keyed by the date string.
    dt_value = group_key[0] if isinstance(group_key, tuple) else group_key
    dt_saleRank[str(dt_value)] = np.mean(calc_sale_rank(day_df, 'rank', ascending=True))

In [329]:
dt_saleRank


{'2022-11-17': 2.1159420289855073,
 '2022-11-18': 3.3424657534246576,
 '2022-11-19': 2.5789473684210527,
 '2022-11-20': 2.5308641975308643,
 '2022-11-21': 3.533333333333333,
 '2022-11-22': 2.953846153846154,
 '2022-11-23': 2.8088235294117645,
 '2022-11-24': 2.8194444444444446,
 '2022-11-25': 2.625,
 '2022-11-26': 2.740740740740741,
 '2022-11-27': 2.870967741935484,
 '2022-11-28': 2.4375,
 '2022-11-29': 3.1805555555555554,
 '2022-11-30': 3.064516129032258,
 '2022-12-01': 3.138888888888889,
 '2022-12-02': 2.8513513513513513,
 '2022-12-03': 3.015625,
 '2022-12-04': 3.287671232876712,
 '2022-12-05': 2.8313253012048194,
 '2022-12-06': 2.6666666666666665,
 '2022-12-07': 2.9069767441860463,
 '2022-12-08': 3.1864406779661016,
 '2022-12-09': 2.767123287671233,
 '2022-12-10': 2.835294117647059,
 '2022-12-11': 2.975,
 '2022-12-12': 2.935483870967742,
 '2022-12-13': 2.710144927536232,
 '2022-12-14': 2.719298245614035,
 '2022-12-15': 2.7708333333333335,
 '2022-12-16': 2.9,
 '2022-12-17': 2.0,
 '202

In [330]:
np.array(list(dt_saleRank.values())).mean()

2.8274916355735735

In [331]:
np.array(list(dt_saleRank.values())).std()

0.30391453077034697

# 2.Load data HP US:

In [1]:
import pandas as pd
#df_hp=pd.read_parquet('/data/shpx/data/olivyatan/hp_1w_080123_140123_allcat.parquet')  

In [9]:
import pandas as pd
df_hp=pd.read_parquet('/data/shpx/data/olivyatan/HP_us_2w_270323_110423_allfeat.parquet')

In [10]:
df_hp.siteId.unique()

array([0], dtype=int32)

In [11]:
df_hp.shape

(424269, 1519)

In [12]:
df_hp.columns

Index(['f_SlrImmatureDaysLastDefect', 'f_PriceV2',
       'f_IsAuthenticityGuaranteed',
       'f_ItemWatchCount7DayDecayDomesticWebAndMobile', 'f_CssL3MeanBbeCount',
       'f_IsVariantItem', 'f_HydraStdWOCat', 'f_KnnNlpUserItemsRecallSource',
       'f_ItemSoldCountWnD', 'f_RecoHasEpid',
       ...
       'f_MaxViewedItemTitleJaccardWithSameCate',
       'f_MeanViewedItemTitleJaccardWithSameCate',
       'f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile',
       'f_Seller90DayBBEOverTransactionsLogSmooth', 'f_ItemIsNew',
       'f_RecallSourceToraUb',
       'f_Seller90DayStockoutOverTransactionsLogSmooth',
       'f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile',
       'f_ItemSellerFixedPriceViewsOverImp',
       'f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2'],
      dtype='object', length=1519)

In [23]:
df_hp_sub.shape, df_hp_sub.meid.nunique(), df_hp_sub.labelPurchase.sum(), df_hp_sub.siteId.unique()

((192343, 1519), 19961, 20244, array([0], dtype=int32))

In [24]:
df_us.shape, df_us.meid.nunique(), df_us.labelPurchase.sum(), df_us.siteId.unique()

((86793, 1519), 9097, 9134, array([0], dtype=int32))

In [25]:
test.shape, test.meid.nunique(), test.labelPurchase.sum(), test.siteId.unique()

((8059, 1519), 847, 849, array([0], dtype=int32))

In [26]:
df_us.head(2)

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.360565,-1.0,-1.0,...,,,,,,,,,,
1,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.212199,-1.0,-1.0,...,,,,,,,,,,


In [27]:
df_hp.head(2)


Unnamed: 0,f_SlrImmatureDaysLastDefect,f_PriceV2,f_IsAuthenticityGuaranteed,f_ItemWatchCount7DayDecayDomesticWebAndMobile,f_CssL3MeanBbeCount,f_IsVariantItem,f_HydraStdWOCat,f_KnnNlpUserItemsRecallSource,f_ItemSoldCountWnD,f_RecoHasEpid,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,-1.0,0.0,0.0,9.65,-1.0,0.0,-3.064059,0.0,0.47352,0.0,...,,,,,,,,,,
1,-1.0,0.0,0.0,0.0,-1.0,0.0,-3.064059,0.0,0.386298,0.0,...,,,,,,,,,,


In [56]:
df_hp.columns

Index(['f_SlrImmatureDaysLastDefect', 'f_PriceV2',
       'f_IsAuthenticityGuaranteed',
       'f_ItemWatchCount7DayDecayDomesticWebAndMobile', 'f_CssL3MeanBbeCount',
       'f_IsVariantItem', 'f_HydraStdWOCat', 'f_KnnNlpUserItemsRecallSource',
       'f_ItemSoldCountWnD', 'f_RecoHasEpid',
       ...
       'f_MaxViewedItemTitleJaccardWithSameCate',
       'f_MeanViewedItemTitleJaccardWithSameCate',
       'f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile',
       'f_Seller90DayBBEOverTransactionsLogSmooth', 'f_ItemIsNew',
       'f_RecallSourceToraUb',
       'f_Seller90DayStockoutOverTransactionsLogSmooth',
       'f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile',
       'f_ItemSellerFixedPriceViewsOverImp',
       'f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2'],
      dtype='object', length=1519)

In [67]:
df_hp.l1Cat.unique()

array([b'0.619', b'0.11700', b'0.6000', b'0.11450', b'0.26395', b'0.220',
       b'0.1249', b'0.237', b'0.64482', b'0.1', b'0.11116', b'0.15032',
       b'0.12576', b'0.11233', b'0.58058', b'0.267', b'0.281', b'0.870',
       b'0.1281', b'0.260', b'0.45100', b'0.293', b'0.14339', b'0.888',
       b'0.625', b'0.2984', b'0.11232', b'0.20081', b'0.99', b'',
       b'0.3252', b'0.172008', b'0.550', b'0.316'], dtype=object)

In [60]:
df_us.l1Cat.unique()

array([b'0.11450'], dtype=object)

In [69]:
df_us.category

0     15709
1     15709
2     15709
3     15709
4     15709
      ...  
31    15709
32    15709
33    15709
34    15709
35    15709
Name: category, Length: 35134, dtype: object

## Check that the HP and sneakers frames have the same columns, then align the HP column order to the sneakers frame:

In [28]:
if set(df_hp_sub.columns) == set(df_us.columns):
    print("The data frames have the same columns")
else:
    print("The data frames do not have the same columns")

The data frames have the same columns


In [29]:
df_hp_sub.shape

(192343, 1519)

In [30]:
df_us.head(2)

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.360565,-1.0,-1.0,...,,,,,,,,,,
1,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.212199,-1.0,-1.0,...,,,,,,,,,,


In [31]:
df_hp_sub = df_hp_sub.reindex(columns=df_us.columns)

In [32]:
df_hp_sub.head(2)

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.190607,-1.0,-1.0,...,-1.0,-1.0,1.697867,-5.071487,1.0,0.0,-13.468362,0.0,0.001376,-3.532466
1,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.130109,-1.0,-1.0,...,-1.0,-1.0,0.0,-5.071487,1.0,0.0,-13.468362,0.0,0.001376,-3.184154


### Examination of hp and sneakers data intersection:

In [33]:
import pandas as pd

int_df = pd.merge(df_us, df_hp_sub, how ='inner', on =['itemId','meid'])
print(int_df['itemId'])

0       225301575590
1       204272259664
2       334810585208
3       334812223787
4       394370293571
            ...     
4218    325245933912
4219    155496715073
4220    203649348459
4221    134485753700
4222    325384031719
Name: itemId, Length: 4223, dtype: int64


In [95]:
### calc intersection of HP and sneakers us to remove it.

In [34]:
df_hp_sub[df_hp_sub.itemId.isin([225301575590 ]) ][['meid', 'dt', 'itemId','category', 'labelClick']]

Unnamed: 0,meid,dt,itemId,category,labelClick
20,1b118e03710b4b94870ef4bbcfdca8ad,2023-04-05,225301575590,b'95672',1


In [35]:
df_us[df_us.itemId.isin([225301575590]) ][['meid', 'dt', 'itemId','category','labelClick']]

Unnamed: 0,meid,dt,itemId,category,labelClick
1,2b8bba35dcdb421993e891d167363bb1,2022-12-28,225301575590,95672,0
0,1b118e03710b4b94870ef4bbcfdca8ad,2023-04-05,225301575590,95672,1


In [36]:
if all([set(df_us.columns) == set(df_hp_sub.columns)]):
    print('All have the same columns')
else:
    print('Some have different columns')

All have the same columns


In [None]:
# Per-site summary: row count, share of all rows (%), and the distinct values of
# f_SizeMedianDiffMaxMinTh1EvidenceTh4 seen on each site.
# NOTE(review): this cell used `pd` (the pandas module, which has no `groupby`)
# where a DataFrame was needed; `pdf` is assumed to be the intended frame — confirm.
s = pdf.groupby("siteId")['siteId'].count().to_frame()
s = s.rename(columns={"siteId": "num_values"})
s['%'] = s['num_values'] / s['num_values'].sum() * 100
s['unique_values'] = pdf.groupby("siteId")['f_SizeMedianDiffMaxMinTh1EvidenceTh4'].unique()
s

In [None]:
# NOTE(review): `pd` is the pandas module and cannot be indexed; `pdf` is assumed
# to be the intended DataFrame — confirm.
pdf['siteId'].unique()

In [None]:
# Distinct values of the size-evidence feature for siteId 3.
# NOTE(review): `pd` is the pandas module and cannot be indexed; `pdf` is assumed
# to be the intended DataFrame — confirm.
pdf.loc[pdf['siteId'] == 3, 'f_SizeMedianDiffMaxMinTh1EvidenceTh4'].unique()

### 3. Train on US sneakers only:

In [37]:
df_us.meid.nunique(), df_us.labelPurchase.sum() , df_us.shape


(9097, 9134, (86793, 1519))

In [38]:

df_hp_sub.meid.nunique(), df_hp_sub.labelPurchase.sum(),df_hp_sub.shape

(19961, 20244, (192343, 1519))

In [40]:
192343+ 86793- 4223

274913

# 4.Concat HP and sneakers US: 

In [39]:
# Stack the sneakers rows and the HP rows, then keep a single row per
# (meid, itemId) pair and renumber the index from 0.
# NOTE(review): for pairs present in both frames the df_us row presumably wins
# because it comes first in the concat — confirm this is the intended choice.
pdf = (
    pd.concat([df_us, df_hp_sub])
    .sort_values(['meid', 'itemId'])
    .drop_duplicates(['meid', 'itemId'])
    .reset_index(drop=True)
)

In [68]:
df_hp_sub.head(10)

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.08209,-1.0,-1.0,...,-1.0,-1.0,0.948168,-3.922363,1.0,0.0,-5.282352,0.477519,0.001958,-4.158981
1,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.354786,-1.0,-1.0,...,-1.0,-1.0,4.927349,-3.868783,1.0,0.0,-5.276208,16.359272,0.003094,-3.122868
2,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.335539,-1.0,-1.0,...,-1.0,-1.0,1.270526,-5.294393,1.0,0.0,-7.816494,1.684873,0.007407,-4.499829
3,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.272697,-1.0,-1.0,...,-1.0,-1.0,3.05506,-3.868783,1.0,0.0,-5.276208,14.233041,0.003094,-2.955543
4,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.272697,-1.0,-1.0,...,-1.0,-1.0,2.006639,-5.005934,1.0,0.0,-9.373088,6.951083,0.002555,-3.917584
5,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.328359,-1.0,-1.0,...,-1.0,-1.0,4.301292,-3.708095,1.0,0.0,-4.978708,7.471943,0.004089,-3.611195
6,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.130109,-1.0,-1.0,...,-1.0,-1.0,0.957409,-4.626245,1.0,0.0,-4.332437,0.0,0.002462,-4.539145
7,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.24627,-1.0,-1.0,...,-1.0,-1.0,0.571211,-3.416162,1.0,0.0,-5.713809,12.828444,0.00422,-4.564827
8,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.272697,-1.0,-1.0,...,-1.0,-1.0,1.392515,-4.501021,1.0,0.0,-4.644405,2.214185,0.002581,-3.838308
9,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,...,-1.0,-1.0,0.021509,-3.566935,1.0,0.0,-5.8632,2.214431,0.00342,-3.851002


In [41]:
pdf.shape

(274884, 1519)

In [42]:
pdf.meid.nunique(), pdf.labelPurchase.sum(),pdf.shape

(28616, 28933, (274884, 1519))

In [43]:
pdf.head(1)

Unnamed: 0,f_CoviewCountWnD,f_ConditionGranular,f_SlrRampupTrans8w,f_SearchModel103UserItemAffinity_PredPriceSd,f_SlrImmatureRatioSnad14d,f_ConditionFilterRecoMatchness,f_ColorDistance,f_ItemWatchCountInteractSellerLstgConvNorm,f_SizeDistance,f_UserHotClickCategory3Day,...,f_MaxViewedItemTitleJaccardWithSameCate,f_MeanViewedItemTitleJaccardWithSameCate,f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile,f_Seller90DayBBEOverTransactionsLogSmooth,f_ItemIsNew,f_RecallSourceToraUb,f_Seller90DayStockoutOverTransactionsLogSmooth,f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile,f_ItemSellerFixedPriceViewsOverImp,f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2
0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,...,,,,,,,,,,


### 5. Copy the df so it can be compared before and after the manual leakage fix

In [44]:
pdf_pre=pdf.copy()

In [45]:
pdf['meid'].nunique()

28616

In [46]:
pdf['siteId'].unique()

array([0], dtype=int32)

In [47]:
pdf['labelPurchase'].value_counts()

0    245951
1     28933
Name: labelPurchase, dtype: int64

In [48]:
pdf['dt'].nunique()

146

In [50]:
piyi_v7_features=[
                          "BibowatchRelPosition",
                          "RecallSourceBullseye",
                          "RecallSourceTora",
                          "TitleCosineSimilarityToShoppingcartCentroid",
                          "FreqSameLeafCatIdInWatchBadge",
                          "MaxViewedItemTitleJaccardBigrams",
                          "NumSameRviInLastWeek",
                          "AvgSameLeafRviPriceRatio",
                          "ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile",
                          "ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2",
                          "MaxViewedItemTitleJaccard",
                          "ItemTimeOnSiteV2",
                          "ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2",
                          "PriceDiffMedianRecall",
                          "FreqSameItemInWatchBadge",
                          "RecallSourceBestMatch",
                          "ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm",
                          "FreqWatchPriceBellowItemPrice",
                          "MerchImpressionsDecayed",
                          "PlImpressionsDecayed",
                          "AvgSameLeafRviPriceDiff",
                          "ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm",
                          "BullseyeRelRVILeafCatMedianPriceDiffV2",
                          "BullseyeAbsRVILeafCatMedianPriceDiffV2",
                          "BullseyeRVILeafCatMedianPriceV2",
                          "ItemTimeLeftSec",
                          "ItemTimeOnSiteNorm",
                          "ItemWatchOverImpLogSmoothAllNorm",
                          "NormItemViewCount7DayDecayDomesticWebAndMobile",
                          "ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm",
                          "EpidRelative",
                          "SellerFeedbackLogNormWnD",
                          "NSFWScore"
                      ]

In [51]:
len(piyi_v7_features)

33

# Leakage changes

## Watchlist & shopping-cart changes:

#### f_TitleCosineSimilarityToShoppingcartCentroid == 1 → set to -1 (only for rows with labelAdd2Cart == 1 and labelPurchase == 1).

In [None]:
import matplotlib.pyplot as plt

In [52]:
# Leakage fix: rows that were both added to cart and purchased, and whose title
# similarity to the shopping-cart centroid is exactly 1, get the value -1 instead.
cart_purchase_leak = (
    (pdf.labelAdd2Cart == 1)
    & (pdf.labelPurchase == 1)
    & (pdf.f_TitleCosineSimilarityToShoppingcartCentroid == 1)
)
pdf.loc[cart_purchase_leak, 'f_TitleCosineSimilarityToShoppingcartCentroid'] = -1

In [27]:
pdf[(pdf.labelAdd2Cart == 1)& (pdf.labelPurchase == 1)& (pdf.f_TitleCosineSimilarityToShoppingcartCentroid==1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

Series([], Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64)

In [110]:
pdf[(pdf.labelAdd2Cart == 1)& (pdf.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    2152
 0.000000     129
 0.707107      17
 0.866025       8
 0.774597       7
 0.577350       7
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [111]:
pdf_pre[(pdf_pre.labelAdd2Cart == 1)& (pdf_pre.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    1271
 1.000000     881
 0.000000     129
 0.707107      17
 0.866025       8
 0.816497       7
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [112]:
pdf_pre[(pdf_pre.labelAdd2Cart == 0)& (pdf_pre.labelPurchase == 0)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True).head(6)

-1.000000    0.591772
 0.000000    0.098452
 0.500000    0.000916
 0.333333    0.000869
 0.288675    0.000822
 0.160128    0.000794
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: float64

In [113]:
pdf_pre[(pdf_pre.labelAdd2Cart == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True).head(6)

-1.000000    0.308077
 1.000000    0.213248
 0.000000    0.032404
 0.707107    0.004051
 0.866025    0.001906
 0.774597    0.001668
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: float64

In [114]:
pdf_pre[(pdf_pre.labelAdd2Cart == 0)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True).head(6)

-1.000000    0.596371
 0.000000    0.098738
 1.000000    0.002147
 0.500000    0.000876
 0.333333    0.000833
 0.288675    0.000791
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: float64

In [115]:
pdf[(pdf.labelAdd2Cart == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    2174
 0.000000     136
 0.707107      17
 1.000000      14
 0.866025       8
 0.774597       7
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [116]:
pdf_pre[(pdf_pre.labelAdd2Cart == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    1293
 1.000000     895
 0.000000     136
 0.707107      17
 0.866025       8
 0.774597       7
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [117]:
pdf_pre[(pdf_pre.labelAdd2Cart == 1)& (pdf_pre.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    1271
 1.000000     881
 0.000000     129
 0.707107      17
 0.866025       8
 0.816497       7
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [118]:
pdf_pre[(pdf_pre.labelAdd2Cart == 0)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    140301
 0.000000     23229
 1.000000       505
 0.500000       206
 0.333333       196
 0.288675       186
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [119]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True).head(6)

-1.000000    0.590720
 0.000000    0.090541
 1.000000    0.048424
 0.707107    0.001071
 0.087039    0.000674
 0.083333    0.000635
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: float64

In [120]:
pdf_pre[(pdf_pre.labelPurchase ==0)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True).head(6)

-1.000000    0.591388
 0.000000    0.098404
 0.500000    0.000915
 0.333333    0.000868
 1.000000    0.000836
 0.288675    0.000822
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: float64

In [121]:
pdf_pre[(pdf_pre.labelPurchase ==1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    14895
 0.000000     2283
 1.000000     1221
 0.707107       27
 0.087039       17
 0.083333       16
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

In [122]:
pdf_pre[(pdf_pre.labelPurchase ==0)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=False).head(6)

-1.000000    126699
 0.000000     21082
 0.500000       196
 0.333333       186
 1.000000       179
 0.288675       176
Name: f_TitleCosineSimilarityToShoppingcartCentroid, dtype: int64

## Watch & relative-position leakage changes:

#### remove FreqSameLeafCatIdInWatchBadge

In [53]:
# Drop the leaf-category watch-badge feature from the model feature list.
# Guarded so the cell can be re-run safely: list.remove raises ValueError once
# the name is no longer in the list.
if "FreqSameLeafCatIdInWatchBadge" in piyi_v7_features:
    piyi_v7_features.remove("FreqSameLeafCatIdInWatchBadge")

In [54]:
# Data-frame feature columns carry an "f_" prefix in front of the raw feature name.
piyi_v7 = [f"f_{feature_name}" for feature_name in piyi_v7_features]

In [57]:
# Positional indices of every feature column (column name starts with "f_").
is_feature_col = pdf.columns.str.startswith("f_")
X_f_cols = list(np.flatnonzero(is_feature_col))
len(X_f_cols)

1470

In [58]:
len(piyi_v7)

32

In [46]:
pdf[(pdf.labelPurchase == 1) & (pdf.f_FreqSameItemInWatchBadge > 0)]['labelWatch'].value_counts()

0    4032
1    3144
Name: labelWatch, dtype: int64

In [131]:
pdf[(pdf.labelWatch == 1) ]['f_FreqSameItemInWatchBadge'].value_counts(normalize=False).head(5)

 1.0    3317
 0.0     553
-1.0      36
 2.0       8
 5.0       1
Name: f_FreqSameItemInWatchBadge, dtype: int64

In [132]:
pdf[(pdf.labelWatch == 1) ]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=False).head(5)

1.0    836
2.0    669
3.0    475
4.0    341
5.0    253
Name: f_FreqSameLeafCatIdInWatchBadge, dtype: int64

#### For labelWatch == 1: set f_FreqSameItemInWatchBadge to 0

In [59]:
# Leakage fix: for rows that were watched, overwrite f_FreqSameItemInWatchBadge with 0.
watched_rows = pdf["labelWatch"] == 1
pdf.loc[watched_rows, 'f_FreqSameItemInWatchBadge'] = 0

In [60]:
pdf[(pdf.labelWatch == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=False)

0.0    4455
Name: f_FreqSameItemInWatchBadge, dtype: int64

In [138]:
pdf_pre[(pdf_pre.labelPurchase == 0)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=True)

 0.0     0.827791
-1.0     0.132081
 1.0     0.039965
 2.0     0.000112
 3.0     0.000037
 8.0     0.000005
 4.0     0.000005
 18.0    0.000005
Name: f_FreqSameItemInWatchBadge, dtype: float64

In [140]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=False)

 0.0    14704
 1.0     7152
-1.0     3335
 2.0       18
 4.0        2
 3.0        2
 5.0        2
Name: f_FreqSameItemInWatchBadge, dtype: int64

###### 1.

In [50]:
pdf[(pdf.labelPurchase == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=True)

 0.0    0.709141
 1.0    0.159350
-1.0    0.130954
 2.0    0.000397
 4.0    0.000079
 3.0    0.000040
 5.0    0.000040
Name: f_FreqSameItemInWatchBadge, dtype: float64

In [51]:
pdf.loc[(pdf["labelWatch"] == 1) & (pdf['labelPurchase'] == 0)].f_FreqSameItemInWatchBadge.value_counts()

0.0    248
Name: f_FreqSameItemInWatchBadge, dtype: int64

In [143]:
pdf.loc[(pdf["labelWatch"] == 1) & (pdf['labelPurchase'] == 1)].f_FreqSameItemInWatchBadge.value_counts()

0.0    3668
Name: f_FreqSameItemInWatchBadge, dtype: int64

In [144]:
pdf[(pdf.labelWatch == 1) ]['f_FreqSameItemInWatchBadge'].value_counts(normalize=False).head(5)

0.0    3916
Name: f_FreqSameItemInWatchBadge, dtype: int64

In [145]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=False).head(5)

 0.0    14704
 1.0     7152
-1.0     3335
 2.0       18
 4.0        2
Name: f_FreqSameItemInWatchBadge, dtype: int64

In [146]:
pdf[(pdf.labelPurchase == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=False)

 0.0    17881
 1.0     4018
-1.0     3302
 2.0       10
 4.0        2
 3.0        1
 5.0        1
Name: f_FreqSameItemInWatchBadge, dtype: int64

#### 1. Keep the feature unchanged in the data; it is excluded from the feature list instead
##### Previous approach: set f_FreqSameLeafCatIdInWatchBadge to -1:

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=True)

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=False).head(5)

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 0)]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=True)

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 0)]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=False).head(5)

In [None]:
pdf_pre.loc[(pdf_pre["labelWatch"] == 1) & (pdf_pre['labelPurchase'] == 1)].f_FreqSameLeafCatIdInWatchBadge.value_counts().head(6)

In [None]:
#pdf.loc[pdf["labelWatch"] == 1, 'f_FreqSameLeafCatIdInWatchBadge'] = -1

In [None]:
pdf[(pdf.labelWatch == 1) ]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=False).head(5)

In [None]:
pdf[(pdf.labelPurchase == 1) ]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=False).head(5)

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1) ]['f_FreqSameLeafCatIdInWatchBadge'].value_counts(normalize=False).head(5)

#### For labelWatch == 1 & f_BibowatchRelPosition >= 0: set the value to -1:

In [61]:
# Leakage fix: watched rows that carry a non-negative relative watch position
# get f_BibowatchRelPosition overwritten with -1.
watched_with_position = (pdf["labelWatch"] == 1) & (pdf["f_BibowatchRelPosition"] >= 0)
pdf.loc[watched_with_position, 'f_BibowatchRelPosition'] = -1

In [148]:
pdf_pre[(pdf_pre.labelWatch == 1)]['f_BibowatchRelPosition'].value_counts(normalize=False).head(6)

 0.0    2704
-1.0     589
 1.0     422
 2.0     116
 3.0      42
 4.0      20
Name: f_BibowatchRelPosition, dtype: int64

In [62]:
pdf[(pdf.labelWatch == 1)]['f_BibowatchRelPosition'].value_counts(normalize=False).head(6)

-1.0    4455
Name: f_BibowatchRelPosition, dtype: int64

In [150]:
pdf_pre[(pdf_pre.labelWatch == 0)]['f_BibowatchRelPosition'].value_counts(normalize=False).head(6)

-1.0    223093
 0.0      1662
 1.0      1438
 2.0      1247
 3.0      1081
 4.0       891
Name: f_BibowatchRelPosition, dtype: int64

In [151]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_BibowatchRelPosition'].value_counts(normalize=True).head(6)

-1.0    0.715407
 0.0    0.135594
 1.0    0.034543
 2.0    0.020583
 3.0    0.014515
 4.0    0.010827
Name: f_BibowatchRelPosition, dtype: float64

In [152]:
pdf_pre[(pdf_pre.labelPurchase == 0)]['f_BibowatchRelPosition'].value_counts(normalize=True).head(6)

-1.0    0.959872
 1.0    0.004616
 0.0    0.004420
 2.0    0.003940
 3.0    0.003533
 4.0    0.002978
Name: f_BibowatchRelPosition, dtype: float64

In [153]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    18039
 0.0     3419
 1.0      871
 2.0      519
 3.0      366
Name: f_BibowatchRelPosition, dtype: int64

In [154]:
pdf_pre[(pdf_pre.labelPurchase == 0)]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    205643
 1.0       989
 0.0       947
 2.0       844
 3.0       757
Name: f_BibowatchRelPosition, dtype: int64

In [155]:
pdf_pre.loc[(pdf_pre["labelWatch"] == 1)].f_BibowatchRelPosition.value_counts()

 0.0     2704
-1.0      589
 1.0      422
 2.0      116
 3.0       42
 4.0       20
 5.0       12
 9.0        4
 6.0        3
 10.0       2
 21.0       1
 14.0       1
Name: f_BibowatchRelPosition, dtype: int64

In [156]:
pdf.loc[(pdf["labelWatch"] == 1) ].f_BibowatchRelPosition.value_counts()

-1.0    3916
Name: f_BibowatchRelPosition, dtype: int64

In [157]:
pdf[(pdf.labelWatch == 1) ]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    3916
Name: f_BibowatchRelPosition, dtype: int64

In [158]:
pdf[(pdf.labelWatch == 1) ]['f_BibowatchRelPosition'].value_counts(normalize=False).sum()

3916

In [159]:
pdf[(pdf.labelWatch == 1) ]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    3916
Name: f_BibowatchRelPosition, dtype: int64

In [160]:
pdf_pre[(pdf_pre.labelPurchase == 1) ]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    18039
 0.0     3419
 1.0      871
 2.0      519
 3.0      366
Name: f_BibowatchRelPosition, dtype: int64

In [161]:
pdf[(pdf.labelPurchase == 1) ]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    21183
 0.0      824
 1.0      488
 2.0      421
 3.0      335
Name: f_BibowatchRelPosition, dtype: int64

In [162]:
pdf[((pdf["labelWatch"] == 1 )&(pdf["f_BibowatchRelPosition"] <=0 )) ]['f_BibowatchRelPosition'].value_counts(normalize=False).head(5)

-1.0    3916
Name: f_BibowatchRelPosition, dtype: int64

In [None]:
pdf[['labelWatch','f_FreqSameItemInWatchBadge']] .value_counts()

#### Correlations:

In [None]:
from scipy import stats
res = stats.spearmanr(pdf_pre.f_FreqSameItemInWatchBadge, pdf_pre.f_BibowatchRelPosition)
res

In [None]:
np.corrcoef(pdf_pre.f_FreqSameItemInWatchBadge, pdf_pre.f_BibowatchRelPosition)

In [None]:
res = stats.spearmanr(pdf.f_FreqSameItemInWatchBadge, pdf.f_BibowatchRelPosition)
res

In [None]:
np.corrcoef(pdf.f_FreqSameItemInWatchBadge, pdf.f_BibowatchRelPosition)

In [None]:
res = stats.spearmanr(pdf_pre.f_FreqSameItemInWatchBadge, pdf_pre.f_FreqSameLeafCatIdInWatchBadge)
res

In [None]:
res = stats.spearmanr(pdf.f_FreqSameItemInWatchBadge, pdf.f_FreqSameLeafCatIdInWatchBadge)
res

# Features and validations

## Features: validate features columns in data

In [66]:
len(piyi_v7)

32

In [67]:
# Select the piyi_v7 feature names in the order in which they appear in pdf.columns.
# The column labels are filtered directly, which avoids materialising a copy of
# the selected columns of pdf only to read the labels back.
piyiv7_sneakers_f = [c for c in pdf.columns if c in piyi_v7]

len(piyiv7_sneakers_f)

32

In [68]:
cols = [x for x in pdf.columns if 'Size' in x] 

print(pdf[cols].columns) 

Index(['f_SizeDistance', 'f_RecallSizeLog', 'f_SizeDistanceCoaspectCSA',
       'f_AspectMatchMensUSShoeSize', 'f_IsSizeMatchSearchQuery',
       'f_AspectMatchUSShoeSize', 'f_IsSizeMatchHnG',
       'f_SizeDistanceCoaspectHnG', 'f_RecallSize', 'f_IsExactSizeMatch',
       'f_RecallSizeNorm', 'f_AspectMatchWomensUSShoeSize', 'f_UserSizeMedian',
       'f_UserSizeMean', 'f_SizeMedianDiff', 'f_SizeMeanDiff',
       'f_UserSizeMaxMinDiff', 'f_UserSizeStd', 'f_UserSizeNumberOfEvidence',
       'f_SizeMedianDiffMaxMinTh0.5', 'f_SizeMedianDiffMaxMinTh1',
       'f_SizeMedianDiffMaxMinTh1.5', 'f_SizeMeanDiffStdTh0.25',
       'f_SizeMeanDiffStdTh0.5', 'f_SizeMeanDiffStdTh0.75',
       'f_SizeMedianDiffMaxMinThn0.5EvidenceTh1',
       'f_SizeMedianDiffMaxMinThn0.5EvidenceTh4',
       'f_SizeMedianDiffMaxMinThn0.5EvidenceTh8',
       'f_SizeMedianDiffMaxMinTh1EvidenceTh1',
       'f_SizeMedianDiffMaxMinTh1EvidenceTh4',
       'f_SizeMedianDiffMaxMinTh1EvidenceTh8',
       'f_SizeMedianDiffMaxMi

Size features:
    3-feature set: SizeMedianDiff, UserSizeMaxMinDiff, UserSizeNumberOfEvidence
    1-feature set: SizeMedianDiffMaxMinTh1EvidenceTh4

In [69]:
size_3f=['f_IsMsku', 'f_SizeMedianDiff', 'f_UserSizeMaxMinDiff', 'f_UserSizeNumberOfEvidence']
size_1f=['f_IsMsku',  'f_SizeMedianDiffMaxMinTh1EvidenceTh4' ]
(pdf.columns.intersection(size_3f))

Index(['f_SizeMedianDiff', 'f_UserSizeMaxMinDiff',
       'f_UserSizeNumberOfEvidence', 'f_IsMsku'],
      dtype='object')

In [70]:
# Baseline piyi v7 features extended with the 3-feature and the 1-feature size sets.
size3f_piyi_features = [*piyi_v7, *size_3f]
size1f_piyi_features = [*piyi_v7, *size_1f]

In [71]:
pdf[[c for c in pdf.columns if c in size3f_piyi_features]].shape

(274884, 36)

In [72]:
cols = [x for x in pdf.columns if 'IsMsku' in x] 

print(pdf[cols].columns) 

Index(['f_IsMskuTraitNameMatch', 'f_IsMsku'], dtype='object')


#### Select the sneakers size feature columns, in the order they appear in pdf.columns.

In [170]:
# Names of the 1-size-feature set, in pdf column order. Filtering the labels
# directly avoids copying the selected columns of pdf just to list their names.
sneakers1f_f = [c for c in pdf.columns if c in size1f_piyi_features]
len(sneakers1f_f)

34

In [73]:
# Names of the 3-size-feature set, in pdf column order. Filtering the labels
# directly avoids copying the selected columns of pdf just to list their names.
sneakers3f_f = [c for c in pdf.columns if c in size3f_piyi_features]
sneakers3f_f

['f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm',
 'f_BullseyeRelRVILeafCatMedianPriceDiffV2',
 'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm',
 'f_MerchImpressionsDecayed',
 'f_SellerFeedbackLogNormWnD',
 'f_EpidRelative',
 'f_ItemWatchOverImpLogSmoothAllNorm',
 'f_ItemTimeLeftSec',
 'f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm',
 'f_ItemTimeOnSiteNorm',
 'f_PlImpressionsDecayed',
 'f_NormItemViewCount7DayDecayDomesticWebAndMobile',
 'f_NSFWScore',
 'f_MaxViewedItemTitleJaccard',
 'f_ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2',
 'f_ItemTimeOnSiteV2',
 'f_MaxViewedItemTitleJaccardBigrams',
 'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile',
 'f_RecallSourceBullseye',
 'f_RecallSourceTora',
 'f_ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2',
 'f_RecallSourceBestMatch',
 'f_PriceDiffMedianRecall',
 'f_SizeMedianDiff',
 'f_UserSizeMaxMinDiff'

In [74]:
len(sneakers3f_f)

36

In [173]:
pdf[[c for c in pdf.columns if c in size1f_piyi_features]].shape


(239455, 34)

In [75]:
pdf[[c for c in pdf.columns if c in size3f_piyi_features]].shape

(274884, 36)

In [76]:
pdf[[c for c in pdf.columns if c in piyiv7_sneakers_f]].shape

(274884, 32)

In [None]:
size4f_piyiv7=sneakers3f_f+['f_SizeMedianDiffMaxMinTh1EvidenceTh4']

In [None]:
# Names of the 4-size-feature set, in pdf column order (no intermediate frame copy).
size4f_piyiv7_f = [c for c in pdf.columns if c in size4f_piyiv7]

In [None]:
pdf.shape[0]

# Leakage examination:

In [176]:
# Snapshot of pdf that the "pdf_pre" comparison cells in this notebook read from.
# NOTE(review): the cells comparing pdf_pre with pdf appear to expect this copy to be
# taken BEFORE the leakage changes are applied to pdf. In top-to-bottom order this cell
# comes after those changes, so confirm the intended execution order.
pdf_pre=pdf.copy()

In [None]:
pdf_pre[pdf_pre.f_BibowatchRelPosition != -1].shape

In [None]:
pdf_pre[['labelWatch','f_FreqSameItemInWatchBadge']] .value_counts()

In [None]:
pdf_pre.f_FreqSameItemInWatchBadge.value_counts()

In [None]:
pdf_pre.f_FreqSameItemInWatchBadge.value_counts(normalize=True)

In [None]:
pdf_pre[(pdf_pre.f_FreqSameItemInWatchBadge == 0)]['f_SizeMedianDiff'].value_counts(normalize=True).head(5)

In [None]:
pdf_pre[(pdf_pre.f_FreqSameItemInWatchBadge == 1)]['f_SizeMedianDiff'].value_counts(normalize=True).head(5)

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['labelClick'].value_counts()

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['labelWatch'].value_counts()

In [None]:
pdf_pre.f_FreqSameItemInWatchBadge.value_counts()

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1) & (pdf_pre.f_FreqSameItemInWatchBadge > 0)]['labelWatch'].value_counts()

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1) ]['labelWatch'].value_counts()

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['labelWatch'].value_counts()

#### Leakage: 34% of purchases from watchlist: 

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=True)

### After change: 

In [None]:
pdf[(pdf.labelPurchase == 1)]['f_FreqSameItemInWatchBadge'].value_counts(normalize=True)

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True)
   # compare to add2crt 

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 0) ]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True).head(10)

In [None]:
a = pdf_pre[(pdf_pre.labelAdd2Cart == 1)]

In [None]:
a.f_TitleCosineSimilarityToShoppingcartCentroid.value_counts()

In [None]:
pdf_pre[pdf_pre.f_TitleCosineSimilarityToShoppingcartCentroid == 1].labelAdd2Cart.value_counts()

In [None]:
# Build the whole mask from pdf_pre, the frame being filtered, so the result
# does not depend on pdf's index or on edits made to pdf.
pdf_pre[(pdf_pre.labelAdd2Cart == 1) & (pdf_pre.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts()

In [None]:
# Build the whole mask from pdf_pre, the frame being filtered, so the result
# does not depend on pdf's index or on edits made to pdf.
pdf_pre[(pdf_pre.labelAdd2Cart == 1) & (pdf_pre.labelPurchase == 1)]['f_TitleCosineSimilarityToShoppingcartCentroid'].value_counts(normalize=True)

In [None]:
(2.5387 - 2.4408)/2.5387*100


In [None]:
( 2.4408-2.5387)/2.5387*100

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 0) & (pdf_pre.f_SizeMedianDiff != -1)]['f_SizeMedianDiff'].mean()

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1) & (pdf_pre.f_SizeMedianDiff != -1)]['f_SizeMedianDiff'].mean()

In [None]:
fc = pdf[size4f_piyiv7_f].count().sum()
fc

In [None]:
pdf_pre[pdf_pre.labelPurchase == 0]['f_RecallSourceBullseye'].value_counts()

In [None]:
2214/3690

In [None]:
pdf_pre[(pdf_pre.labelPurchase == 1) & ((pdf_pre.f_RecallSourceBullseye ==1) | (pdf_pre.f_FreqSameItemInWatchBadge > 0))].shape

In [None]:
pdf_pre[pdf_pre.labelPurchase == 1]['f_RecallSourceBullseye'].value_counts()

In [None]:
pdf_pre.f_RecallSourceBullseye.value_counts()

In [None]:
140510/(140510+1237151)

In [None]:
pdf_pre.labelPurchase.value_counts()

In [None]:
182413/(182413+1195248)

In [None]:
# Share of meid groups having each number of rows. Numerator and denominator
# are both taken from pdf_pre so the ratio is computed over a single frame.
(
    pdf_pre
    .groupby('meid')
    .size()
    .to_frame("meid_count")
    .groupby('meid_count')
    .size()
    / pdf_pre.meid.drop_duplicates().shape[0]
)

### Sale rank of the hp piyi v7 model on sneakers data

In [None]:
np.mean(calc_sale_rank(pdf, 'rank', ascending=True))

# Training: 

# Process data

In [80]:
# Order the rows by meid (ascending) so that the rows of one meid are contiguous.
pdf.sort_values(by='meid', inplace=True)

In [81]:
# Train / validation / test split by meid group: 60% / 20% / 20% of the groups.
# Splitting on groups keeps every row of one meid inside a single split.
train_inds, holdout_inds = next(
    GroupShuffleSplit(test_size=.40, n_splits=2, random_state=7)
    .split(pdf, groups=pdf['meid'])
)

# .copy() detaches each split from pdf, so later in-place edits (sorting,
# adding prediction columns) no longer raise SettingWithCopyWarning.
train = pdf.iloc[train_inds].copy()
holdout = pdf.iloc[holdout_inds]

# Halve the hold-out into validation and test, again by meid group.
# val_inds / test_inds are positions within the hold-out, not within pdf.
val_inds, test_inds = next(
    GroupShuffleSplit(test_size=.5, n_splits=2, random_state=7)
    .split(holdout, groups=holdout['meid'])
)

valid = holdout.iloc[val_inds].copy()
test = holdout.iloc[test_inds].copy()

# Sanity checks: the splits partition pdf and share no meid.
assert len(train) + len(valid) + len(test) == len(pdf)
assert set(train['meid']).isdisjoint(valid['meid'])
assert set(train['meid']).isdisjoint(test['meid'])
assert set(valid['meid']).isdisjoint(test['meid'])

In [82]:
train['meid'].nunique()

17169

In [83]:
# Re-bind instead of sorting in place: train is a slice of pdf, and an in-place
# sort on such a slice triggers pandas' SettingWithCopyWarning.
train = train.sort_values(by=['meid'], ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [84]:
# Re-bind instead of sorting in place (test / valid are slices of pdf, and an
# in-place sort on a slice triggers SettingWithCopyWarning). Each statement
# stands on its own line with no trailing comma.
test = test.sort_values(by=['meid'], ascending=True)
valid = valid.sort_values(by=['meid'], ascending=True)

In [85]:
len(train), len(valid), len(test)

(164863, 55025, 54996)

In [86]:
test.labelPurchase.sum()

5771

In [87]:
train.sort_values(by=['meid'], ascending=True)[[ 'meid','itemId','siteId','userId', 'l1Cat', 'rank', 'labelPurchase']].head(3)

Unnamed: 0,meid,itemId,siteId,userId,l1Cat,rank,labelPurchase
0,00033b890abc47b2b65d42547cc30913,155441804370,0,1078293796,b'0.237',9,0
1,00033b890abc47b2b65d42547cc30913,165999363134,0,1078293796,b'0.237',1,0
2,00033b890abc47b2b65d42547cc30913,175549638773,0,1078293796,b'0.237',3,0


In [88]:
train.labelPurchase.value_counts()

0    147500
1     17363
Name: labelPurchase, dtype: int64

In [89]:
test.labelPurchase.value_counts()

0    49225
1     5771
Name: labelPurchase, dtype: int64

In [90]:
valid.labelPurchase.value_counts()

0    49226
1     5799
Name: labelPurchase, dtype: int64

In [91]:
# Persist the three splits as parquet files (written in train, valid, test order).
split_outputs = {
    '/data/shpx/data/olivyatan/piyi/train_snkrs_4m_7dhp.parquet': train,
    '/data/shpx/data/olivyatan/piyi/valid_snkrs_4m_7dhp.parquet': valid,
    '/data/shpx/data/olivyatan/piyi/test_snkrs_4m_7dhp.parquet': test,
}
for out_path, split_df in split_outputs.items():
    split_df.to_parquet(out_path)

#### sale rank of hp model by rank for pa data

In [92]:
test.shape

(54996, 1519)

In [111]:
# Sale rank of the existing ordering (the 'rank' column, ascending) on each split.
valid_sr_prod, train_sr_prod, test_sr_prod = (
    calc_sale_rank(split_df, 'rank', ascending=True)
    for split_df in (valid, train, test)
)
np.mean(valid_sr_prod), np.mean(train_sr_prod), np.mean(test_sr_prod)

(2.826324099325391, 2.8243935116512753, 2.857758002009473)

In [93]:
# Sale rank of the existing ordering (the 'rank' column, ascending) on each split.
valid_sr_prod, train_sr_prod, test_sr_prod = (
    calc_sale_rank(split_df, 'rank', ascending=True)
    for split_df in (valid, train, test)
)
np.mean(valid_sr_prod), np.mean(train_sr_prod), np.mean(test_sr_prod)
#new 7d

(2.8229949327275903, 2.8276544935639816, 2.7833682739343115)

In [69]:
#same sale rank for purchases only in test and clicks

# Train with original features:
#### Train, validation & test sets - label: purchase

In [94]:
# Positional indices of every feature column (column name starts with "f_").
is_feature_col = pdf.columns.str.startswith("f_")
X_f_cols = list(np.flatnonzero(is_feature_col))
len(X_f_cols)

1470

In [95]:
pdf.columns

Index(['f_CoviewCountWnD', 'f_ConditionGranular', 'f_SlrRampupTrans8w',
       'f_SearchModel103UserItemAffinity_PredPriceSd',
       'f_SlrImmatureRatioSnad14d', 'f_ConditionFilterRecoMatchness',
       'f_ColorDistance', 'f_ItemWatchCountInteractSellerLstgConvNorm',
       'f_SizeDistance', 'f_UserHotClickCategory3Day',
       ...
       'f_MaxViewedItemTitleJaccardWithSameCate',
       'f_MeanViewedItemTitleJaccardWithSameCate',
       'f_ItemFastIMAWatchCount7DayDecayDomesticWebAndMobile',
       'f_Seller90DayBBEOverTransactionsLogSmooth', 'f_ItemIsNew',
       'f_RecallSourceToraUb',
       'f_Seller90DayStockoutOverTransactionsLogSmooth',
       'f_ItemFastIMASaleCount7DayDecayDomesticWebAndMobile',
       'f_ItemSellerFixedPriceViewsOverImp',
       'f_ItemFastIMAViewsOverImp7DayDecayLogSmoothAllWebAndMobileV2'],
      dtype='object', length=1519)

### 1. Train piyi v7 features only:

In [96]:
# Feature matrices (piyi v7 columns only) and purchase labels for each split.
X_train, y_train = train[piyiv7_sneakers_f], train['labelPurchase']
X_valid, y_valid = valid[piyiv7_sneakers_f], valid['labelPurchase']
X_test, y_test = test[piyiv7_sneakers_f], test['labelPurchase']

X_train.shape, X_valid.shape, X_test.shape

((164863, 32), (55025, 32), (54996, 32))

In [97]:
X_train.head(3)

Unnamed: 0,f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm,f_BullseyeRelRVILeafCatMedianPriceDiffV2,f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm,f_MerchImpressionsDecayed,f_SellerFeedbackLogNormWnD,f_EpidRelative,f_ItemWatchOverImpLogSmoothAllNorm,f_ItemTimeLeftSec,f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm,f_ItemTimeOnSiteNorm,...,f_PriceDiffMedianRecall,f_BullseyeAbsRVILeafCatMedianPriceDiffV2,f_BullseyeRVILeafCatMedianPriceV2,f_AvgSameLeafRviPriceRatio,f_FreqSameItemInWatchBadge,f_TitleCosineSimilarityToShoppingcartCentroid,f_AvgSameLeafRviPriceDiff,f_NumSameRviInLastWeek,f_BibowatchRelPosition,f_FreqWatchPriceBellowItemPrice
0,0.522002,-0.224435,0.497201,0.0,0.434398,0.679423,0.443638,2524605.0,0.515278,0.011363,...,-988.5,-586.0,2611.0,0.324669,0.0,-1.0,-42.121246,0.0,-1.0,2.31334e-09
1,0.535889,-0.234393,0.424974,0.0,0.512884,1.684803,0.442389,1592189.0,0.626618,0.004477,...,-318.5,-612.0,2611.0,0.3205,0.0,-1.0,-42.381248,0.0,-1.0,2.31334e-09
2,0.519483,0.531597,0.400974,187.360001,0.648639,2.038154,0.52617,1631550.0,0.706909,0.036633,...,1581.5,1388.0,2611.0,0.641161,0.0,-1.0,-22.381245,0.0,-1.0,2.31334e-09


In [98]:
# List any column of the baseline feature matrix whose name contains 'Size'.
cols = [name for name in X_train.columns if 'Size' in name]

print(X_train[cols].columns)

Index([], dtype='object')


In [99]:
y_test.value_counts()

0    49225
1     5771
Name: labelPurchase, dtype: int64

In [100]:
# Number of rows per meid for each split, ordered by meid (groupby sorts its keys).
group_train = train.groupby('meid').size().to_numpy()
group_valid = valid.groupby('meid').size().to_numpy()
group_test = test.groupby('meid').size().to_numpy()

In [101]:
# Training matrix for the hyper-parameter search, with the per-meid group sizes attached.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtrain.set_group(group_train)

In [102]:
# Hyper-parameter search for the ranker trained on dtrain (built from X_train above).
from datetime import datetime 
import pandas as pd
# Fixed XGBoost settings shared by every grid point. run_hyper_tuning writes the
# per-candidate max_depth / min_child_weight / subsample / eta into this same dict.
params = {
    'objective': "rank:pairwise",
    'nthread': -1
}

# Cartesian product of the candidate values; only subsample has more than one
# candidate here, so the grid has two points.
gridsearch_params = [
    (max_depth, min_child_weight, subsample, eta)
    for max_depth in [4]
    for min_child_weight in [5]
    for subsample in [0.7, 1]
    for eta in [0.3 ]
]

# Passed to run_hyper_tuning as nrounds.
N_ROUNDS = 100

# NOTE(review): the result is bound to size3f_hyper_df although dtrain holds the
# piyi v7 baseline features — confirm the variable name is intended.
# The next cell reads hyper_df_train_piyi7sneakers_orig2m.csv from output_dir,
# presumably written by this call — verify against run_hyper_tuning.
size3f_hyper_df = run_hyper_tuning(dtrain= dtrain,
                                    grid = gridsearch_params,
                                    params = params,
                                    output_dir="/data/shpx/data/olivyatan/piyi/snkrs",
                                    out_filename= "train_piyi7sneakers_orig2m",
                                    nrounds = N_ROUNDS)

08:51:11
CV1 with max_depth=4, min_child_weight=5, subsample=0.7, eta=0.3
	map 0.629 for 99 rounds
CV2 with max_depth=4, min_child_weight=5, subsample=1, eta=0.3
	map 0.628 for 100 rounds
Best params: 4, 5, 0.7, 0.3, map: 0.6289846000000001
09:17:11
Saving hyper-paramter output


In [103]:
orig_hyper_df = pd.read_csv("/data/shpx/data/olivyatan/piyi/snkrs/hyper_df_train_piyi7sneakers_orig2m.csv")

In [104]:
orig_hyper_df_sorted = orig_hyper_df.sort_values("mean_map", ascending=False)

In [105]:
orig_hyper_df_sorted.head()

Unnamed: 0,max_depth,min_child_weight,subsample,eta,mean_map,boost_rounds
0,4.0,5.0,0.7,0.3,0.628985,99.0
1,4.0,5.0,1.0,0.3,0.627987,100.0


#### Train with best params :

In [106]:
# Configure the ranker from the top row of the sorted hyper-parameter table
# (the row with the highest mean_map).
best_row = orig_hyper_df_sorted.iloc[0]
clf = xgb.XGBRanker(
    objective="rank:pairwise",
    tree_method="exact",
    n_jobs=8,
    max_depth=int(best_row.max_depth),
    min_child_weight=int(best_row.min_child_weight),
    n_estimators=int(best_row.boost_rounds),
    learning_rate=best_row.eta,
    subsample=best_row.subsample,
)

In [107]:
clf

XGBRanker(base_score=None, booster=None, colsample_bylevel=None,
          colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None,
          importance_type='gain', interaction_constraints=None,
          learning_rate=0.3, max_delta_step=None, max_depth=4,
          min_child_weight=5, missing=nan, monotone_constraints=None,
          n_estimators=99, n_jobs=8, num_parallel_tree=None, random_state=None,
          reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=0.7,
          tree_method='exact', validate_parameters=None, verbosity=None)

In [108]:
# Evaluate on train (validation_0) and validation (validation_1) after every round.
eval_set = [(X_train, y_train), (X_valid, y_valid)]
eval_group = [group_train, group_valid]
# The trailing "-" in 'ndcg@10-' is XGBoost's metric-name suffix that controls
# how lists without any positive sample are scored.
eval_metric = ['map', 'ndcg@10-']

# Training stops early once the monitored validation metric has not improved
# for 10 consecutive rounds.
model = clf.fit(
    X_train,
    y_train,
    group=group_train,
    eval_set=eval_set,
    eval_group=eval_group,
    eval_metric=eval_metric,
    early_stopping_rounds=10,
    verbose=True,
)

[0]	validation_0-map:0.49873	validation_0-ndcg@10-:0.61663	validation_1-map:0.49012	validation_1-ndcg@10-:0.61024
[1]	validation_0-map:0.51297	validation_0-ndcg@10-:0.62793	validation_1-map:0.50511	validation_1-ndcg@10-:0.62209
[2]	validation_0-map:0.52351	validation_0-ndcg@10-:0.63629	validation_1-map:0.51481	validation_1-ndcg@10-:0.62982
[3]	validation_0-map:0.53170	validation_0-ndcg@10-:0.64277	validation_1-map:0.52792	validation_1-ndcg@10-:0.63996
[4]	validation_0-map:0.53817	validation_0-ndcg@10-:0.64776	validation_1-map:0.53119	validation_1-ndcg@10-:0.64258
[5]	validation_0-map:0.54127	validation_0-ndcg@10-:0.65026	validation_1-map:0.53391	validation_1-ndcg@10-:0.64472
[6]	validation_0-map:0.54889	validation_0-ndcg@10-:0.65617	validation_1-map:0.54137	validation_1-ndcg@10-:0.65054
[7]	validation_0-map:0.55040	validation_0-ndcg@10-:0.65738	validation_1-map:0.54430	validation_1-ndcg@10-:0.65277
[8]	validation_0-map:0.55534	validation_0-ndcg@10-:0.66121	validation_1-map:0.54856	vali

In [109]:
xgb_imp_gain = calc_feature_imp(model, 'gain')
xgb_imp_weight = calc_feature_imp(model, 'weight')

In [110]:
xgb_imp_gain.sort_values(ascending=False, by='score').reset_index(drop=True)

Unnamed: 0,feature,score
0,f_RecallSourceBullseye,597.512309
1,f_FreqSameItemInWatchBadge,337.934172
2,f_NumSameRviInLastWeek,171.417766
3,f_BullseyeRelRVILeafCatMedianPriceDiffV2,162.184618
4,f_ItemTimeLeftSec,138.582585
5,f_MaxViewedItemTitleJaccardBigrams,122.962797
6,f_BullseyeAbsRVILeafCatMedianPriceDiffV2,103.236865
7,f_ItemTimeOnSiteV2,82.647496
8,f_TitleCosineSimilarityToShoppingcartCentroid,78.368051
9,f_RecallSourceTora,77.294489


In [111]:
# Score every split with the baseline piyi v7 model.
valid_preds = model.predict(X_valid)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# assign() returns a new frame instead of writing into an object that pandas
# tracks as a slice of pdf; the direct column write raised SettingWithCopyWarning.
valid = valid.assign(piyi_orig_pred=valid_preds)
train = train.assign(piyi_orig_pred=train_preds)
test = test.assign(piyi_orig_pred=test_preds)

# Sale rank when items are ordered by descending model score.
sr_rank_valid = calc_sale_rank(valid, 'piyi_orig_pred', ascending=False)
sr_rank_train = calc_sale_rank(train, 'piyi_orig_pred', ascending=False)
sr_rank_test = calc_sale_rank(test, 'piyi_orig_pred', ascending=False)

np.mean(sr_rank_valid), np.mean(sr_rank_train), np.mean(sr_rank_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(2.638651057137865, 2.417846118003378, 2.660377358490566)

In [131]:
# Score every split with the baseline piyi v7 model.
valid_preds = model.predict(X_valid)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# assign() returns a new frame instead of writing into an object that pandas
# tracks as a slice of pdf; the direct column write raised SettingWithCopyWarning.
valid = valid.assign(piyi_orig_pred=valid_preds)
train = train.assign(piyi_orig_pred=train_preds)
test = test.assign(piyi_orig_pred=test_preds)

# Sale rank when items are ordered by descending model score.
sr_rank_valid = calc_sale_rank(valid, 'piyi_orig_pred', ascending=False)
sr_rank_train = calc_sale_rank(train, 'piyi_orig_pred', ascending=False)
sr_rank_test = calc_sale_rank(test, 'piyi_orig_pred', ascending=False)

np.mean(sr_rank_valid), np.mean(sr_rank_train), np.mean(sr_rank_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(2.662408497201091, 2.456337623809752, 2.6503516578154156)

In [207]:
# Score every split with the baseline piyi v7 model.
valid_preds = model.predict(X_valid)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# assign() returns a new frame instead of writing into an object that pandas
# tracks as a slice of pdf; the direct column write raised SettingWithCopyWarning.
valid = valid.assign(piyi_orig_pred=valid_preds)
train = train.assign(piyi_orig_pred=train_preds)
test = test.assign(piyi_orig_pred=test_preds)

# Sale rank when items are ordered by descending model score.
sr_rank_valid = calc_sale_rank(valid, 'piyi_orig_pred', ascending=False)
sr_rank_train = calc_sale_rank(train, 'piyi_orig_pred', ascending=False)
sr_rank_test = calc_sale_rank(test, 'piyi_orig_pred', ascending=False)

np.mean(sr_rank_valid), np.mean(sr_rank_train), np.mean(sr_rank_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(2.6965074267362503, 2.427940586109996, 2.657635962271724)

In [112]:
import pickle

file_name = '/data/shpx/data/olivyatan/piyi/snkrs_hp7d_piyiv7_4m_April23.pkl'
# Persist the fitted model. The context manager closes (and flushes) the file
# handle, which the bare open(...) call left to the garbage collector.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

In [113]:
model.save_model('/data/shpx/data/olivyatan/piyi/snkrs_hp7d_piyiv7_4m_April23.model')  

### 2. Train Sneakers 3 features:

In [114]:
# Restrict every split to the `sneakers3f_f` feature columns. The label vectors
# (y_train / y_valid / y_test) are reused as they are and not rebuilt here.
X_train_3f, X_valid_3f, X_test_3f = (
    split[sneakers3f_f] for split in (train, valid, test)
)

# Sanity check: (rows, feature columns) of each split.
(X_train_3f.shape, X_valid_3f.shape, X_test_3f.shape)

((164863, 36), (55025, 36), (54996, 36))

In [115]:
pdf[sneakers3f_f].shape


(274884, 36)

In [116]:
# List the feature columns of the 3-feature training matrix whose name contains 'Size'.
cols = [name for name in X_train_3f.columns if 'Size' in name]
print(X_train_3f[cols].columns)

Index(['f_SizeMedianDiff', 'f_UserSizeMaxMinDiff',
       'f_UserSizeNumberOfEvidence'],
      dtype='object')


In [117]:
X_train_3f.columns

Index(['f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm',
       'f_BullseyeRelRVILeafCatMedianPriceDiffV2',
       'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm',
       'f_MerchImpressionsDecayed', 'f_SellerFeedbackLogNormWnD',
       'f_EpidRelative', 'f_ItemWatchOverImpLogSmoothAllNorm',
       'f_ItemTimeLeftSec',
       'f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm',
       'f_ItemTimeOnSiteNorm', 'f_PlImpressionsDecayed',
       'f_NormItemViewCount7DayDecayDomesticWebAndMobile', 'f_NSFWScore',
       'f_MaxViewedItemTitleJaccard',
       'f_ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2',
       'f_ItemTimeOnSiteV2', 'f_MaxViewedItemTitleJaccardBigrams',
       'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile',
       'f_RecallSourceBullseye', 'f_RecallSourceTora',
       'f_ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2',
       'f_Recall

In [118]:
y_test.value_counts()

0    49225
1     5771
Name: labelPurchase, dtype: int64

In [119]:
y_valid.value_counts()

0    49226
1     5799
Name: labelPurchase, dtype: int64

In [120]:
y_train.value_counts()

0    147500
1     17363
Name: labelPurchase, dtype: int64

In [121]:
# Build the training DMatrix from the 3-feature column set; it is the `dtrain`
# argument handed to run_hyper_tuning in the tuning cell that follows.
dtrain = xgb.DMatrix(X_train_3f, label=y_train) # rebuild dtrain for each feature-column set under examination
# Register the group information needed by the ranking objective
# (assumes group_train lists group sizes in row order — TODO confirm).
dtrain.set_group(group=group_train)

#### Hyper-parameter tuning

from datetime import datetime1.1

Initial hyper-parameters:

1.0

In [None]:
    #for max_depth in [4, 7]
  #  for min_child_weight in [5, 8]
  #  for subsample in [0.7, 1]
 #   for eta in [0.3 ]

In [139]:
from datetime import datetime

# Base booster settings; run_hyper_tuning writes the per-candidate
# max_depth / min_child_weight / subsample / eta values into this dict.
params = dict(objective="rank:pairwise", nthread=-1)

# Every (max_depth, min_child_weight, subsample, eta) combination to try;
# max_depth and eta are pinned to a single value in this run.
gridsearch_params = [
    (depth, child_weight, row_fraction, learning_rate)
    for depth in [4]
    for child_weight in [5, 8]
    for row_fraction in [0.7, 1]
    for learning_rate in [0.3]
]

N_ROUNDS = 100

size3f_hyper_df = run_hyper_tuning(
    dtrain=dtrain,
    grid=gridsearch_params,
    params=params,
    nrounds=N_ROUNDS,
    output_dir="/data/shpx/data/olivyatan/piyi/snkrs",
    out_filename="train_snkrs3f_orig2_2m",
)

10:33:04
CV1 with max_depth=4, min_child_weight=5, subsample=0.7, eta=0.3
	map 0.628 for 72 rounds
CV2 with max_depth=4, min_child_weight=5, subsample=1, eta=0.3
	map 0.6285 for 76 rounds
CV3 with max_depth=4, min_child_weight=8, subsample=0.7, eta=0.3
	map 0.624 for 65 rounds
CV4 with max_depth=4, min_child_weight=8, subsample=1, eta=0.3
	map 0.6315 for 99 rounds
Best params: 4, 8, 1, 0.3, map: 0.6314984
11:16:19
Saving hyper-paramter output


In [140]:
orig_hyper_df = pd.read_csv("/data/shpx/data/olivyatan/piyi/snkrs/hyper_df_train_snkrs3f_orig2_2m.csv")

In [141]:
orig_hyper_df.shape

(4, 6)

In [142]:
orig_hyper_df_sorted = orig_hyper_df.sort_values("mean_map", ascending=False)

In [143]:
orig_hyper_df_sorted.head()

Unnamed: 0,max_depth,min_child_weight,subsample,eta,mean_map,boost_rounds
3,4.0,8.0,1.0,0.3,0.631498,99.0
1,4.0,5.0,1.0,0.3,0.628546,76.0
0,4.0,5.0,0.7,0.3,0.628044,72.0
2,4.0,8.0,0.7,0.3,0.623986,65.0


#### Train with best params

In [144]:
# The table is sorted by mean_map in descending order, so position 0 holds the
# winning configuration.
clf = xgb.XGBRanker(
    objective="rank:pairwise",
    tree_method="exact",
    n_jobs=8,
    # Integer-valued settings are read back from the CSV as floats, hence int().
    max_depth=int(orig_hyper_df_sorted["max_depth"].iloc[0]),
    min_child_weight=int(orig_hyper_df_sorted["min_child_weight"].iloc[0]),
    n_estimators=int(orig_hyper_df_sorted["boost_rounds"].iloc[0]),
    learning_rate=orig_hyper_df_sorted["eta"].iloc[0],
    subsample=orig_hyper_df_sorted["subsample"].iloc[0],
)

#                     gamma=3,
#                     max_delta_step=0,
#                     colsample_bytree=0.86,
#                     colsample_bylevel=1,
#                     reg_lambda=0,
#                     reg_alpha=1,
#                     base_score=0.5,
#                     scale_pos_weight=6,

In [145]:
clf

XGBRanker(base_score=None, booster=None, colsample_bylevel=None,
          colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None,
          importance_type='gain', interaction_constraints=None,
          learning_rate=0.3, max_delta_step=None, max_depth=4,
          min_child_weight=8, missing=nan, monotone_constraints=None,
          n_estimators=99, n_jobs=8, num_parallel_tree=None, random_state=None,
          reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=1.0,
          tree_method='exact', validate_parameters=None, verbosity=None)

In [146]:
# Track MAP and NDCG@10 on both the training and the validation split.
# NOTE(review): per the XGBoost docs the trailing '-' in 'ndcg@10-' scores
# groups without any positive label as 0 instead of 1 — confirm for this version.
eval_metric = ['map', 'ndcg@10-']
eval_group = [group_train, group_valid]
eval_set = [(X_train_3f, y_train), (X_valid_3f, y_valid)]

# Fit on the 3-feature training matrix; training stops once 10 consecutive
# rounds pass without improvement.
model = clf.fit(
    X_train_3f,
    y_train,
    group=group_train,
    eval_set=eval_set,
    eval_group=eval_group,
    eval_metric=eval_metric,
    early_stopping_rounds=10,
    verbose=True,
)

[0]	validation_0-map:0.49700	validation_0-ndcg@10-:0.61519	validation_1-map:0.48885	validation_1-ndcg@10-:0.60906
[1]	validation_0-map:0.51413	validation_0-ndcg@10-:0.62893	validation_1-map:0.50652	validation_1-ndcg@10-:0.62326
[2]	validation_0-map:0.52007	validation_0-ndcg@10-:0.63352	validation_1-map:0.51483	validation_1-ndcg@10-:0.62965
[3]	validation_0-map:0.53169	validation_0-ndcg@10-:0.64269	validation_1-map:0.52534	validation_1-ndcg@10-:0.63792
[4]	validation_0-map:0.54302	validation_0-ndcg@10-:0.65156	validation_1-map:0.53831	validation_1-ndcg@10-:0.64798
[5]	validation_0-map:0.54517	validation_0-ndcg@10-:0.65320	validation_1-map:0.54044	validation_1-ndcg@10-:0.64967
[6]	validation_0-map:0.54865	validation_0-ndcg@10-:0.65604	validation_1-map:0.54097	validation_1-ndcg@10-:0.65025
[7]	validation_0-map:0.55302	validation_0-ndcg@10-:0.65940	validation_1-map:0.54581	validation_1-ndcg@10-:0.65398
[8]	validation_0-map:0.55796	validation_0-ndcg@10-:0.66339	validation_1-map:0.55266	vali

In [147]:
xgb_imp_gain = calc_feature_imp(model, 'gain')
xgb_imp_weight = calc_feature_imp(model, 'weight')

In [148]:
from scipy import stats

# Spearman rank correlation between the two watch-related features over all of `pdf`.
res = stats.spearmanr(
    pdf['f_FreqSameItemInWatchBadge'],
    pdf['f_BibowatchRelPosition'],
)
res

SpearmanrResult(correlation=0.6117288361067644, pvalue=0.0)

In [149]:
xgb_imp_gain.sort_values(ascending=False, by='score').reset_index(drop=True)

Unnamed: 0,feature,score
0,f_RecallSourceBullseye,709.317694
1,f_FreqSameItemInWatchBadge,458.111355
2,f_BullseyeAbsRVILeafCatMedianPriceDiffV2,217.72819
3,f_BullseyeRelRVILeafCatMedianPriceDiffV2,201.582995
4,f_NumSameRviInLastWeek,196.715227
5,f_ItemTimeLeftSec,162.594102
6,f_MaxViewedItemTitleJaccardBigrams,152.339836
7,f_SizeMedianDiff,147.293582
8,f_ItemTimeOnSiteV2,138.001157
9,f_TitleCosineSimilarityToShoppingcartCentroid,95.61598


#### Evaluation

Predict on test of purchases:

In [150]:
# Score each split with the 3-feature model currently bound to `model`.
valid_preds = model.predict(X_valid_3f)
train_preds = model.predict(X_train_3f)
test_preds = model.predict(X_test_3f)

# Attach the scores with `assign`, which builds a new frame. Writing the column
# in place (`valid['3f_orig_pred'] = ...`) triggers pandas'
# SettingWithCopyWarning because the split frames are slices of another frame.
# The column name starts with a digit, so it is passed through a dict.
valid = valid.assign(**{'3f_orig_pred': valid_preds})
train = train.assign(**{'3f_orig_pred': train_preds})
test = test.assign(**{'3f_orig_pred': test_preds})

# Sale rank of each split when rows are ordered by predicted score, highest first.
sr_rank_valid = calc_sale_rank(valid, '3f_orig_pred', ascending=False)
sr_rank_train = calc_sale_rank(train, '3f_orig_pred', ascending=False)
sr_rank_test = calc_sale_rank(test, '3f_orig_pred', ascending=False)

# Mean sale rank for (valid, train, test).
np.mean(sr_rank_valid), np.mean(sr_rank_train), np.mean(sr_rank_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(2.596715009610344, 2.392102044382317, 2.596785464709993)

In [None]:
#prev:

In [230]:
# Score each split with the 3-feature model currently bound to `model`.
valid_preds = model.predict(X_valid_3f)
train_preds = model.predict(X_train_3f)
test_preds = model.predict(X_test_3f)

# `assign` returns a new frame, so no column is written onto a slice of another
# DataFrame (the in-place form raised SettingWithCopyWarning). The column name
# starts with a digit, so it is passed through a dict.
valid = valid.assign(**{'3f_orig_pred': valid_preds})
train = train.assign(**{'3f_orig_pred': train_preds})
test = test.assign(**{'3f_orig_pred': test_preds})

# Sale rank of each split when rows are ordered by predicted score, highest first.
sr_rank_valid = calc_sale_rank(valid, '3f_orig_pred', ascending=False)
sr_rank_train = calc_sale_rank(train, '3f_orig_pred', ascending=False)
sr_rank_test = calc_sale_rank(test, '3f_orig_pred', ascending=False)

# Mean sale rank for (valid, train, test).
np.mean(sr_rank_valid), np.mean(sr_rank_train), np.mean(sr_rank_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(2.68667201926937, 2.4085374013113876, 2.637166365643187)

In [151]:
import pickle

file_name = '/data/shpx/data/olivyatan/piyi/2snkrs_hp7d_size_3f_4m_exApril23.pkl'
# Persist the fitted model. The context manager closes (and flushes) the file
# handle, which the bare open(...) call left to the garbage collector.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

In [152]:
#save model
model.save_model('/data/shpx/data/olivyatan/piyi/2snkrs_hp7d_size_3f_4m_exApril23.model')  

### 3. Examination of longer test results:

In [162]:
test_2w_sneakers.shape

(8059, 1519)

In [163]:
test_2w_sneakers.labelPurchase.sum()

849

In [164]:
test_2w_sneakers.category.unique()

array(['95672', '15709'], dtype=object)

In [165]:
test_2w_sneakers.dt.unique()

array(['2023-04-12', '2023-04-13', '2023-04-14', '2023-04-15',
       '2023-04-16', '2023-04-17', '2023-04-19', '2023-04-20',
       '2023-04-21', '2023-04-23', '2023-04-24', '2023-04-25'],
      dtype=object)

In [166]:
# Sale rank of the longer test set when rows are ordered by the existing `rank`
# column in ascending order (presumably the production ranking — confirm).
test_sr_prod = calc_sale_rank(test_2w_sneakers, 'rank', ascending=True)
np.mean(test_sr_prod)
# Result of a previous run, kept for comparison: 2.881

2.9114521841794567

In [258]:
## piyi v7 sale rank:

In [175]:
file_name = '/data/shpx/data/olivyatan/piyi/snkrs_hp7d_piyiv7_4m_April23.pkl'
# Only unpickle files written by this project: pickle.load can execute arbitrary code.
# The context manager closes the handle that the bare open(...) call leaked.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

In [176]:
piyi_v7_cols=X_test.columns.values.tolist()

In [177]:
piyi_v7_cols

['f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm',
 'f_BullseyeRelRVILeafCatMedianPriceDiffV2',
 'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm',
 'f_MerchImpressionsDecayed',
 'f_SellerFeedbackLogNormWnD',
 'f_EpidRelative',
 'f_ItemWatchOverImpLogSmoothAllNorm',
 'f_ItemTimeLeftSec',
 'f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm',
 'f_ItemTimeOnSiteNorm',
 'f_PlImpressionsDecayed',
 'f_NormItemViewCount7DayDecayDomesticWebAndMobile',
 'f_NSFWScore',
 'f_MaxViewedItemTitleJaccard',
 'f_ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2',
 'f_ItemTimeOnSiteV2',
 'f_MaxViewedItemTitleJaccardBigrams',
 'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile',
 'f_RecallSourceBullseye',
 'f_RecallSourceTora',
 'f_ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2',
 'f_RecallSourceBestMatch',
 'f_PriceDiffMedianRecall',
 'f_BullseyeAbsRVILeafCatMedianPriceDiffV2',

In [178]:
X_test_longer_v7 = test_2w_sneakers[piyiv7_sneakers_f]
y_test_longr_v7 = test_2w_sneakers.labelPurchase 

In [179]:
# Score the longer sneakers test set with the unpickled model.
test_preds = xgb_model_loaded.predict(X_test_longer_v7) 

# Store the scores on the frame, then compute the sale rank with the highest
# score ranked first and report its mean.
test_2w_sneakers['longer_test_v7'] = test_preds 
sr_rank_test = calc_sale_rank(test_2w_sneakers, 'longer_test_v7', ascending=False) 
np.mean(sr_rank_test)

2.343565525383707

In [242]:
pdf_longer_test.columns

Index(['f_SlrImmatureDaysLastDefect', 'f_PriceV2',
       'f_IsAuthenticityGuaranteed',
       'f_ItemWatchCount7DayDecayDomesticWebAndMobile', 'f_CssL3MeanBbeCount',
       'f_IsVariantItem', 'f_HydraStdWOCat', 'f_KnnNlpUserItemsRecallSource',
       'f_ItemSoldCountWnD', 'f_RecoHasEpid',
       ...
       'labelBBEDefectType', 'labelSitePurchaseImpr', 'labelBBEImpr',
       'labelBin', 'labelOffer', 'labelWatch', 'labelAdd2Cart',
       'labelPurchaseImpr', 'labelCombined', 'dt'],
      dtype='object', length=1467)

In [259]:
#piyi v7 3f sale rank:

In [167]:
piyi_v7_3f_cols=X_test_3f.columns.values.tolist()

In [168]:
piyi_v7_3f_cols

['f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm',
 'f_BullseyeRelRVILeafCatMedianPriceDiffV2',
 'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm',
 'f_MerchImpressionsDecayed',
 'f_SellerFeedbackLogNormWnD',
 'f_EpidRelative',
 'f_ItemWatchOverImpLogSmoothAllNorm',
 'f_ItemTimeLeftSec',
 'f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm',
 'f_ItemTimeOnSiteNorm',
 'f_PlImpressionsDecayed',
 'f_NormItemViewCount7DayDecayDomesticWebAndMobile',
 'f_NSFWScore',
 'f_MaxViewedItemTitleJaccard',
 'f_ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2',
 'f_ItemTimeOnSiteV2',
 'f_MaxViewedItemTitleJaccardBigrams',
 'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile',
 'f_RecallSourceBullseye',
 'f_RecallSourceTora',
 'f_ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2',
 'f_RecallSourceBestMatch',
 'f_PriceDiffMedianRecall',
 'f_SizeMedianDiff',
 'f_UserSizeMaxMinDiff'

In [170]:
X_test_longer = test_2w_sneakers[sneakers3f_f]
y_test_longr = test_2w_sneakers.labelPurchase 

In [171]:
test_preds = model.predict(X_test_longer)

In [172]:
X_test_longer.shape

(8059, 36)

In [173]:
y_test_longr.sum()

849

In [174]:
test_2w_sneakers['piyi_longer_pred'] = test_preds
sr_rank_test = calc_sale_rank(test_2w_sneakers, 'piyi_longer_pred', ascending=False)

np.mean(sr_rank_test)

2.2384887839433296

In [268]:
import pickle

file_name = '/data/shpx/data/olivyatan/piyi/snkrs_hp_size_3f_2.2m_exJan23.pkl'
# Only unpickle files written by this project: pickle.load can execute arbitrary code.
# The context manager closes the handle that the bare open(...) call leaked.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

In [270]:
# NOTE(review): the cell just above unpickles a model into `xgb_model_loaded`,
# but the prediction here is made with `model`. Confirm which model this sale
# rank is meant to describe.
test_preds = model.predict(X_test_longer) 

# Store the scores on the frame, then compute the sale rank with the highest
# score ranked first and report its mean.
pdf_longer_test['longer_test'] = test_preds 
sr_rank_test = calc_sale_rank(pdf_longer_test, 'longer_test', ascending=False) 
np.mean(sr_rank_test)

2.159682899207248

In [254]:
# Score the longer test set with whatever estimator is currently bound to `model`.
# NOTE(review): this cell duplicates another evaluation cell in this notebook
# and yields the same number; consider keeping only one copy.
test_preds = model.predict(X_test_longer) 

# Store the scores on the frame, then compute the sale rank with the highest
# score ranked first and report its mean.
pdf_longer_test['longer_test'] = test_preds 
sr_rank_test = calc_sale_rank(pdf_longer_test, 'longer_test', ascending=False) 
np.mean(sr_rank_test)

2.159682899207248

In [None]:
X_test_longer = pdf_longer_test[sneakers1f_f]
y_test_longr = pdf_longer_test.labelPurchase 

In [None]:
test_preds = xgb_model_loaded.predict(X_test_longer)

In [None]:
pdf_longer_test['piyi_1f_longer_pred'] = test_preds
sr_rank_test = calc_sale_rank(pdf_longer_test, 'piyi_1f_longer_pred', ascending=False)

np.mean(sr_rank_test)

## Train Sneakers 1 feature: 

In [None]:
# Restrict every split to the `sneakers1f_f` feature columns. The label vectors
# (y_train / y_valid / y_test) are reused as they are and not rebuilt here.
X_train_1f, X_valid_1f, X_test_1f = (
    split[sneakers1f_f] for split in (train, valid, test)
)

# Sanity check: (rows, feature columns) of each split.
(X_train_1f.shape, X_valid_1f.shape, X_test_1f.shape)

In [None]:
dtrain = xgb.DMatrix(X_train_1f, label=y_train)
dtrain.set_group(group=group_train)

In [None]:
from datetime import datetime

# Base booster settings; run_hyper_tuning writes the per-candidate
# max_depth / min_child_weight / subsample / eta values into this dict.
params = dict(objective="rank:pairwise", nthread=-1)

# Every (max_depth, min_child_weight, subsample, eta) combination to try;
# eta is pinned to a single value in this run.
gridsearch_params = [
    (depth, child_weight, row_fraction, learning_rate)
    for depth in [4, 7]
    for child_weight in [5, 8]
    for row_fraction in [0.7, 1]
    for learning_rate in [0.3]
]

N_ROUNDS = 100

size1f_hyper_df = run_hyper_tuning(
    dtrain=dtrain,
    grid=gridsearch_params,
    params=params,
    nrounds=N_ROUNDS,
    output_dir="/data/shpx/data/olivyatan/piyi/snkrs",
    out_filename="train_snkrs1f_orig3_2m",
)

In [None]:
orig_hyper_df = pd.read_csv("/data/shpx/data/olivyatan/piyi/snkrs/hyper_df_train_snkrs1f_orig3_2m.csv") 

In [None]:
orig_hyper_df.shape

In [None]:
orig_hyper_df_sorted = orig_hyper_df.sort_values("mean_map", ascending=False)

In [None]:
orig_hyper_df_sorted.head()

In [None]:
# The table is sorted by mean_map in descending order, so position 0 holds the
# winning configuration.
clf = xgb.XGBRanker(
    objective="rank:pairwise",
    tree_method="exact",
    n_jobs=8,
    # int() because these settings must be integers (the CSV round-trip
    # presumably yields floats — confirm).
    max_depth=int(orig_hyper_df_sorted["max_depth"].iloc[0]),
    min_child_weight=int(orig_hyper_df_sorted["min_child_weight"].iloc[0]),
    n_estimators=int(orig_hyper_df_sorted["boost_rounds"].iloc[0]),
    learning_rate=orig_hyper_df_sorted["eta"].iloc[0],
    subsample=orig_hyper_df_sorted["subsample"].iloc[0],
)

In [None]:
# Track MAP and NDCG@10 on both the training and the validation split.
# NOTE(review): per the XGBoost docs the trailing '-' in 'ndcg@10-' scores
# groups without any positive label as 0 instead of 1 — confirm for this version.
eval_metric = ['map', 'ndcg@10-']
eval_group = [group_train, group_valid]
eval_set = [(X_train_1f, y_train), (X_valid_1f, y_valid)]

# Fit on the 1-feature training matrix; training stops once 10 consecutive
# rounds pass without improvement.
model = clf.fit(
    X_train_1f,
    y_train,
    group=group_train,
    eval_set=eval_set,
    eval_group=eval_group,
    eval_metric=eval_metric,
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
xgb_imp_gain = calc_feature_imp(model, 'gain')
xgb_imp_weight = calc_feature_imp(model, 'weight')

In [None]:
xgb_imp_gain.sort_values(ascending=False, by='score').reset_index(drop=True)

#### Evaluation: 

In [None]:
# Score each split with the 1-feature model currently bound to `model`.
valid_preds = model.predict(X_valid_1f)
train_preds = model.predict(X_train_1f)
test_preds = model.predict(X_test_1f)

# Attach the scores with `assign`, which builds a new frame. In-place column
# writes on these split frames raised SettingWithCopyWarning in the earlier
# evaluation cells. The column name starts with a digit, so it is passed
# through a dict.
valid = valid.assign(**{'1f_orig_pred': valid_preds})
train = train.assign(**{'1f_orig_pred': train_preds})
test = test.assign(**{'1f_orig_pred': test_preds})

# Sale rank of each split when rows are ordered by predicted score, highest first.
sr_rank_valid = calc_sale_rank(valid, '1f_orig_pred', ascending=False)
sr_rank_train = calc_sale_rank(train, '1f_orig_pred', ascending=False)
sr_rank_test = calc_sale_rank(test, '1f_orig_pred', ascending=False)

# Mean sale rank for (valid, train, test).
np.mean(sr_rank_valid), np.mean(sr_rank_train), np.mean(sr_rank_test)

In [None]:
import pickle

file_name = '/data/shpx/data/olivyatan/piyi/snkrs_size_1f_2.2m_nov22_22123Run.pkl'
# Persist the fitted model. The context manager closes (and flushes) the file
# handle, which the bare open(...) call left to the garbage collector.
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
#save model
model.save_model('/data/shpx/data/olivyatan/piyi/snkrs_size_1f_2.2m_nov22_22123Run.model')  

### Test sale rank on new sneakers data (17/11–5/1/23):


In [None]:
loaded_model = xgb.Booster()
loaded_model.load_model('/data/shpx/data/olivyatan/piyi/snkrs_3f_2m_nov22.model') 



In [None]:
# load 
file_name = '/data/shpx/data/olivyatan/piyi/snkrs_3f_2m_nov22.pkl'
xgb_model_loaded = pickle.load(open(file_name, "rb"))
xgb_imp_gain = calc_feature_imp(xgb_model_loaded, 'gain')
xgb_imp_gain.sort_values(ascending=False, by='score').reset_index(drop=True)

In [None]:
## 1 month model:

In [None]:
loaded_model = xgb.Booster()
loaded_model.load_model('/data/shpx/data/olivyatan/piyi/snkrs_1f_1m_nov22.model')  

In [None]:
import pickle

file_name = '/data/shpx/data/olivyatan/piyi/snkrs_1f_1m_nov22.pkl'
# Only unpickle files written by this project: pickle.load can execute arbitrary code.
# The context manager closes the handle that the bare open(...) call leaked.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

In [247]:
X_test_longer.columns

Index(['f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm',
       'f_BullseyeRelRVILeafCatMedianPriceDiffV2',
       'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm',
       'f_MerchImpressionsDecayed', 'f_SellerFeedbackLogNormWnD',
       'f_EpidRelative', 'f_ItemWatchOverImpLogSmoothAllNorm',
       'f_ItemTimeLeftSec',
       'f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm',
       'f_ItemTimeOnSiteNorm', 'f_PlImpressionsDecayed',
       'f_NormItemViewCount7DayDecayDomesticWebAndMobile', 'f_NSFWScore',
       'f_MaxViewedItemTitleJaccard',
       'f_ItemWatchesOverImp7DayDecayLogSmoothDomesticWebAndMobileV2',
       'f_ItemTimeOnSiteV2', 'f_MaxViewedItemTitleJaccardBigrams',
       'f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobile',
       'f_RecallSourceBullseye', 'f_RecallSourceTora',
       'f_ItemVariantSalesOverImpressions7DayDecayLogSmoothDomesticWebAndMobileV2',
       'f_Recall

In [250]:
X_test.head(1)

Unnamed: 0,f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm,f_BullseyeRelRVILeafCatMedianPriceDiffV2,f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm,f_MerchImpressionsDecayed,f_SellerFeedbackLogNormWnD,f_EpidRelative,f_ItemWatchOverImpLogSmoothAllNorm,f_ItemTimeLeftSec,f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm,f_ItemTimeOnSiteNorm,...,f_PriceDiffMedianRecall,f_BullseyeAbsRVILeafCatMedianPriceDiffV2,f_BullseyeRVILeafCatMedianPriceV2,f_AvgSameLeafRviPriceRatio,f_FreqSameItemInWatchBadge,f_TitleCosineSimilarityToShoppingcartCentroid,f_AvgSameLeafRviPriceDiff,f_NumSameRviInLastWeek,f_BibowatchRelPosition,f_FreqWatchPriceBellowItemPrice
25,0.525253,-0.3088,0.361057,0.0,0.16636,0.349519,0.682847,550207.0,0.617837,0.00881,...,-486.0,-1544.0,5000.0,0.6912,1.0,-1.0,-15.439999,0.0,8.0,2.739718e-08


In [251]:
X_test_longer.head(1)

Unnamed: 0,f_ItemSalesOverImpPricePrior7DayDecayLogSmoothInternationalWebAndMobileNorm,f_BullseyeRelRVILeafCatMedianPriceDiffV2,f_ItemSalesOverImpPricePrior7DayDecayLogSmoothDomesticWebAndMobileNorm,f_MerchImpressionsDecayed,f_SellerFeedbackLogNormWnD,f_EpidRelative,f_ItemWatchOverImpLogSmoothAllNorm,f_ItemTimeLeftSec,f_ItemVariantWatchOverImpressions7DayDecayLogSmoothDomesticWebAndMobileNorm,f_ItemTimeOnSiteNorm,...,f_BullseyeAbsRVILeafCatMedianPriceDiffV2,f_BullseyeRVILeafCatMedianPriceV2,f_AvgSameLeafRviPriceRatio,f_FreqSameItemInWatchBadge,f_TitleCosineSimilarityToShoppingcartCentroid,f_AvgSameLeafRviPriceDiff,f_NumSameRviInLastWeek,f_BibowatchRelPosition,f_IsMsku,f_FreqWatchPriceBellowItemPrice
0,0.651617,0.174647,0.280768,0.0,0.350626,1.1314,0.696004,14614.0,0.625784,0.002414,...,1397.0,7999.0,1.168283,1.0,-1.0,13.534286,0.0,1.0,0.0,8.487007e-10


In [None]:
# Evaluate the current `model` on the de-duplicated PA data set.
df_pa_dedup_data=pd.read_parquet('/data/shpx/data/olivyatan/pa_data_april_mai_22_dedup.parquet')  
# PIYI V5 features.
# NOTE(review): `.iloc` selects by integer position, so X_f_cols must hold
# column positions rather than column names — confirm.
X_test_pa = df_pa_dedup_data.iloc[:, X_f_cols]
y_test_pa = df_pa_dedup_data.labelPurchase 

# Number of rows per `meid` group; not used again within this cell.
group_test_pa=df_pa_dedup_data.groupby('meid').size().to_frame('size')['size'].to_numpy()
# NOTE(review): `model` is whichever estimator was fitted last in the kernel —
# confirm it was trained on the same feature columns as X_test_pa.
test_preds_pa = model.predict(X_test_pa) 

# Store the scores on the frame, then compute the sale rank with the highest
# score ranked first and report its mean.
df_pa_dedup_data['pa_orig_pred'] = test_preds_pa 
sr_rank_test_pa = calc_sale_rank(df_pa_dedup_data, 'pa_orig_pred', ascending=False) 
np.mean(sr_rank_test_pa)

In [None]:
X_test_pa.columns

In [None]:
X_test_pa.head(2)

In [None]:
def calc_sale_rank(df, rank_col, ascending, group_col = 'meid', label_col = 'labelPurchase'):
    """Return, for each group with a purchase, the best position of a purchased row.

    Rows are ordered by ``rank_col`` inside every ``group_col`` group and
    numbered 1, 2, 3, ... in that order. For each group that contains at least
    one row with ``label_col == 1``, the smallest such position is reported.
    Groups without a purchased row contribute nothing to the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col``, ``rank_col`` and ``label_col``. It is not
        modified; the work is done on a copy.
    rank_col : str
        Column that defines the ordering inside each group.
    ascending : bool
        Forwarded to ``sort_values`` for both the group and the rank column.
        Use ``False`` when a larger ``rank_col`` value should come first
        (model scores) and ``True`` when a smaller value should come first.
    group_col : str, default 'meid'
        Column that identifies a group.
    label_col : str, default 'labelPurchase'
        Column whose value 1 marks a purchased row.

    Returns
    -------
    list
        One 1-based position per group that has a purchased row.

    NOTE(review): this function is called in earlier cells of the notebook, so
    another definition presumably exists above — keep the two in sync.
    """
    score_col = f'{rank_col}_score'

    # The sorted frame must be the one that gets numbered. Previously the
    # sorted result was bound to a misspelled, unused name, so positions were
    # taken from the unsorted frame and `rank_col` / `ascending` had no effect.
    df_c = df.copy().sort_values([group_col, rank_col], ascending=ascending)

    # 1-based position of every row inside its group, in sorted order.
    df_c[score_col] = df_c.groupby(group_col, sort=False).cumcount() + 1

    # Keep purchased rows only and take the best (smallest) position per group.
    df_score = df_c[df_c[label_col] == 1]
    score = df_score.groupby(group_col, sort=False)[score_col].min().tolist()

    # Defensive filter against missing or non-finite values.
    score = [x for x in score if x is not None and np.isfinite(x)]

    return score

In [None]:
test_preds

In [None]:
##madcdl vs pretrainer: 

In [None]:
pdf[(pdf['itemId']==155291701019) &( pdf['userId']==2007592935 )]

In [None]:
pdf[(pdf['itemId']==314010287171) &( pdf['userId']==1892621100 )]

In [None]:
pdf[(pdf['itemId']==325489293460) ]

In [None]:
train['userId']

In [None]:
analysis_cols = ['itemId','meid','META_CATEG_ID', 'labelPurchase', 'rank', 'pa_orig_pred', 
                 'f_RecallSourceBullseye', 'f_MaxViewedItemTitleJaccard']

In [None]:
train[train.userId == 1857193597][analysis_cols]\
.sort_values('pa_orig_pred', ascending=False).head(17)

In [None]:
train.siteId.value_counts(normalize=True)

In [None]:
a_1 = train[train.labelPurchase == 1]
a_0 = train[train.labelPurchase == 0]

In [None]:
a_0.f_FreqSameItemInWatchBadge.value_counts(normalize=True)

In [None]:
a_1.f_FreqSameItemInWatchBadge.value_counts(normalize=True)

In [None]:
valid.userId.iloc[1870]

In [None]:
# Inspect one validation group, best-scored rows first.
# The sort key must be one of `analysis_cols`; that list carries 'pa_orig_pred'
# (not 'pa_purchase_pred'), so sorting on the latter raised KeyError.
# NOTE(review): assumes `valid` has a 'pa_orig_pred' column — confirm.
valid[valid.meid == valid.meid.iloc[150]][analysis_cols]\
.sort_values('pa_orig_pred', ascending=False)

In [None]:
# Print a listing URL for every item shown to one validation user, best-scored first.
# Fixes: the URL was malformed ("https:ebay.com/itm/"), and the sort key
# 'pa_purchase_pred' is not among `analysis_cols`, so it raised KeyError.
# NOTE(review): assumes `valid` has a 'pa_orig_pred' column — confirm.
user_rows = valid[valid.userId == valid.userId.iloc[1870]][analysis_cols]
for item_id in user_rows.sort_values('pa_orig_pred', ascending=False)['itemId']:
    print("https://www.ebay.com/itm/" + str(item_id))