In [2]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
from functions.featuresAdder import ADD_covWgt, ADD_cfSim, Add_features, ADD_w2vSim, ADD_cfSim_lastN, ADD_w2vSim_lastN, ADD_covScore_lastN, Add_freq_features
import os
from annoy import AnnoyIndex

In [4]:
TOPN_candidate = 100
SETS = [2, 3]
predTypes = ['clicks', 'carts', 'orders']
SUBSETNUM = 2
input_note = 'covisit_20_20_20_newSuggester2'

## Add features to candidates

### user, item -based features

In [None]:
add_features = Add_features(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note)
add_features.process()

In [None]:
add_features = Add_freq_features(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note)
add_features.process()

### covisitation feature

In [None]:
add_features = ADD_covWgt(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note, True)
add_features.process()

In [None]:
for i in range(1, 2):
    add_features = ADD_covScore_lastN(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note, False)
    add_features.lastN = i
    add_features.output_note = f'{add_features.output_note}_{add_features.lastN}'
    add_features.process()

### CF features

In [None]:
add_features = ADD_cfSim(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note, True)
add_features.output_note = 'cfSim'
add_features.process()

In [None]:
for i in range(1, 4):
    add_features = ADD_cfSim_lastN(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note, False)
    add_features.lastN = i
    add_features.output_note = f'{add_features.output_note}_{add_features.lastN}'
    add_features.process()

### w2v features

In [None]:
add_features = ADD_w2vSim(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note, True)
add_features.output_note = 'w2vSim_3x'
add_features.process()

In [None]:
for i in range(1, 5):
    add_features = ADD_w2vSim_lastN(TOPN_candidate, SETS, predTypes, SUBSETNUM, input_note, False)
    add_features.lastN = i
    add_features.output_note = f'{add_features.output_note}_{add_features.lastN}'
    add_features.process()

## Merge candidate & features

### Candidates to data4xgb

In [3]:
def mergeFeatures(feature_notes, candidates_path, predType, s, data4xgb_path):
    for e, feature_note in enumerate(feature_notes):
        if e == 0:
            data4xgb = pd.read_parquet(f'{candidates_path}/{predType}_{s}.pqt').reset_index(drop=True).astype('int32')
        else:
            thisFeature =  pd.read_parquet(f'{candidates_path}_{feature_note}/{predType}_{s}.pqt').iloc[:, 2:].astype('float32').reset_index(drop=True)
            data4xgb = pd.concat([data4xgb, thisFeature], axis=1)
    data4xgb = data4xgb.fillna(0)
    print(data4xgb.shape)
    data4xgb.to_parquet(f'{data4xgb_path}/{predType}_{s}.pqt')

In [None]:
outputPath = '../output/newSplited/'
# feature_notes = ['', 'features_dropped', 'covWgt_t1', 'cfSim_last_1_t0', 'cfSim_last_2_t0', 'cfSim_last_3_t0', 'w2vSim_last_1_t0', 'w2vSim_last_2_t0', 'w2vSim_last_3_t0', 'w2vSim_last_4_t0', 'covScore_last_1_t0']
feature_notes = ['', 'features_dropped', 'covWgt_t1', 'covScore_last_1_t0'] #'cfSim_last_1_t0', 'cfSim_last_2_t0', 'cfSim_last_3_t0', 'w2vSim_last_1_t0', 'w2vSim_last_2_t0', 'w2vSim_last_3_t0', 'w2vSim_last_4_t0', ]
data4xgb_note = 'suggester_addLast'

for SET in SETS:
    candidates_path = outputPath + f'candidates/set{SET}_top_{TOPN_candidate}/{input_note}'
    data4xgb_path = outputPath + f'data4xgb/set{SET}_top_{TOPN_candidate}/{data4xgb_note}'
    try: 
        os.makedirs(data4xgb_path) 
    except OSError as error: 
        print(error)

    for predType in predTypes:
        for s in range(SUBSETNUM):
            print(SET, predType, s)
            mergeFeatures(feature_notes, candidates_path, predType, s, data4xgb_path)

### data4xgb to data4xgb

In [6]:
def addFeatures(feature_notes, input_path, predType, s, data4xgb_path, output_path):
    # if predType == 'orders':
        # toDrop = ['covScore_cartsOrders_2', 'covScore_buy2buy_2', 'covScore_cartsOrders_3', 'covScore_buy2buy_3', 'covScore_buy2buy_1', 'item_ts_min_valA', 'item_type_std_valA']
    data4xgb = pd.read_parquet(f'{data4xgb_path}/{predType}_{s}.pqt').reset_index(drop=True)#.drop(columns=toDrop)
    
    for feature_note in feature_notes:
        thisFeature =  pd.read_parquet(f'{input_path}_{feature_note}/{predType}_{s}.pqt').iloc[:, 2:].astype('float32').reset_index(drop=True)
        data4xgb = pd.concat([data4xgb, thisFeature], axis=1)
    data4xgb.fillna(0)
    print(data4xgb.shape)
    # data4xgb.to_parquet(f'{output_path}/{predType}_{s}.pqt')

    chunkSize = 33000000
    idx = 0
    n = len(data4xgb)
    while (idx * chunkSize < n):
        # dtest = xgb.DMatrix(data4xgb.iloc[i*chunkSize:(i+1)*chunkSize, 2:])
        sub = data4xgb[idx*chunkSize:(idx+1)*chunkSize].reset_index(drop=True)
        sub.to_parquet(f'{output_path}/{predType}_{s}_{idx}.pqt')
        idx += 1

In [None]:
outputPath = '../output/newSplited/'
# feature_notes = ['cfSim_3x_t1', 'cfSim_last_1_t0', 'cfSim_last_2_t0', 'cfSim_last_3_t0'] 
# feature_notes =  ['w2vSim_3x_t1', 'w2vSim_last_1_t0', 'w2vSim_last_2_t0', 'w2vSim_last_3_t0', 'w2vSim_last_4_t0']
feature_notes = ['features_norm_freq_t0']
data4xgb_note = 'covisit_20_20_20_newSuggester2_drop_12_add_last3CovScore'
output_note = 'covisit_20_20_20_newSuggester2_add_freq'
for SET in SETS:
    input_path = outputPath + f'candidates/set{SET}_top_{TOPN_candidate}/{input_note}'
    data4xgb_path = outputPath + f'data4xgb/set{SET}_top_{TOPN_candidate}/{data4xgb_note}'
    output_path = outputPath + f'data4xgb/set{SET}_top_{TOPN_candidate}/{output_note}'
    try: 
        os.makedirs(output_path) 
    except OSError as error: 
        print(error)

    for predType in predTypes:
        for s in range(SUBSETNUM):
            print(SET, predType, s)
            addFeatures(feature_notes, input_path, predType, s, data4xgb_path, output_path)

# Drop useless features

### Drop columns from data4xgb

In [3]:
input_note = 'suggester_addLast'
output_note = 'suggester_addLast'

for SET in [2, 3]:
    ! mkdir ../output/newSplited/data4xgb/set{SET}_top_100/{output_note}
    for predType in predTypes:
        if predType == 'clicks':
            toDrop_col = ['item_type_median', 'item_type_median_valA', 'item_clicked_cnt_val', 'item_carted_cnt_val', 'item_ordered_cnt_val', 'user_type_median']
        elif predType == 'carts':
            toDrop_col = ['item_type_median', 'item_type_median_valA', 'item_clicked_cnt_val', 'item_carted_cnt_val', 'item_ordered_cnt_val', 'user_type_median']
        elif predType == 'orders':
            # toDrop_col = ['item_type_median', 'item_type_median_valA', 'item_clicked_cnt_val', 'item_carted_cnt_val', 'item_ordered_cnt_val', 'user_type_median']
            # toDrop_col = ['item_type_median_valA', 'item_type_median', 'user_type_median', 'item_clicked_cnt_val', 'item_ordered_cnt_val']
            toDrop_col = ['covScore_buy2buy_1', 'wgt_buy2buy', 'user_ts_diff_std', 'item_carted_cnt_val', 'user_ts_min']

            
        for sub in range(SUBSETNUM):
            data4xgb = pd.read_parquet(f'../output/newSplited/data4xgb/set{SET}_top_100/{input_note}/{predType}_{sub}.pqt').drop(toDrop_col, axis=1)
            data4xgb.to_parquet(f'../output/newSplited/data4xgb/set{SET}_top_100/{output_note}/{predType}_{sub}.pqt')

mkdir: cannot create directory ‘../output/newSplited/data4xgb/set2_top_100/suggester_addLast’: File exists
mkdir: cannot create directory ‘../output/newSplited/data4xgb/set3_top_100/suggester_addLast’: File exists


### Drop columns from candidates

In [4]:
def load_toDrop_col(predType):
    if predType == 'clicks':
        toDrop_col = ['user_ordered_cnt', 'cnt_ordered', 'user_lastAid', 'user_clicked_cnt', 'user_carted_cnt', 'item_ts_min_valA', 'user_type_std', 'user_ts_min', 'item_buy_ratio_valA', 'item_type_std_valA', 'item_buy_ratio', 'item_type_std']
    elif predType == 'carts':
        toDrop_col = ['user_ordered_cnt', 'cnt_ordered', 'user_clicked_cnt', 'item_item_count', 'user_user_count', 'item_carted_cnt', 'user_type_std', 'item_user_count_valA', 'user_ts_min', 'item_ordered_cnt', 'item_ts_min_valA', 'user_lastAid']
    elif predType == 'orders':
        toDrop_col = ['user_ordered_cnt', 'item_user_count_valA', 'user_lastAid', 'item_item_count', 'user_ts_mean', 'user_clicked_cnt', 'item_clicked_cnt', 'item_item_count_valA', 'user_type_std', 'item_ts_min', 'user_user_count', 'item_ts_min_valA', 'item_type_std_valA']
    return toDrop_col

In [5]:
input_note = 'features_norm_addLast_t0'
output_note = 'features_dropped'

for SET in [2, 3]:
    ! mkdir ../output/newSplited/candidates/set{SET}_top_100/suggester_addLast_{output_note}
    for predType in predTypes:
        toDrop_col = load_toDrop_col(predType)
        
        for sub in range(SUBSETNUM):
            data4xgb = pd.read_parquet(f'../output/newSplited/candidates/set{SET}_top_100/suggester_addLast_{input_note}/{predType}_{sub}.pqt').drop(toDrop_col, axis=1)
            data4xgb.to_parquet(f'../output/newSplited/candidates/set{SET}_top_100/suggester_addLast_{output_note}/{predType}_{sub}.pqt')