In [1]:
import json
import pandas as pd
import numpy as np
from utils.data_composer import feature_engineering
import neptune.new as neptune
import torch

# 1. Load all data

In [2]:
with open("data.json", "r") as f:
    data = json.load(f)
    
# data is a list of 1, grab the core data inside
core_data = json.loads(data[0])

# Transform dataframe
df = pd.DataFrame(core_data)

Okay, so remember, at the previous stage of cleaning data (Part 1), we know that some data samples are invalid on some of their columns. When predicting genders, it's very important to tell the model to not rely on that invalid feature.
The way to do that (in our practice) is to fill data that by a "middle value", this could be the mean or the median of that particular column. In my practice, I use median because it is robust to outlier.

But first let's time them to null first

In [3]:
# Replace all invalid value with null
# Replace invalid coupon by null (< 0 and > 1)
def nullize_invalid_coupon(value):
    if value > 1 or value < 0:
        return np.nan
    else:
        # Else keep
        return value

# Replace 
def nullize_zero_revenue(value):
    if value <= 0:
        return np.nan
    else:
        return value

# Turn invalid coupon to np.nan
df.loc[:,"coupon_discount_applied"] = df.loc[:,"coupon_discount_applied"].apply(nullize_invalid_coupon)

# Turn invalid revenue to np.nan
df.loc[:,"revenue"] = df.loc[:,"revenue"].apply(nullize_zero_revenue)


In [4]:
# Run through pre-processor to get useful features
df = feature_engineering(df)

In [5]:
# Get engineered data
feature_df = df.iloc[:,33:]

# Also append column "devices" and "coupon_discount_applied" into
feature_df = pd.concat([feature_df, df.loc[:,["coupon_discount_applied","devices","customer_id"]]],axis=1)

# partial labels
with open("partial_labels.csv","r") as f:
    partial_labels_df = pd.read_csv(f)

In [6]:
partial_labels_df.rename(columns={"Unnamed: 0":"df_index"}, inplace=True)
partial_labels_df

Unnamed: 0,df_index,female_flag,customer_id
0,7,0,3.017372e+09
1,30,0,3.018459e+09
2,82,0,3.020737e+09
3,86,0,3.020872e+09
4,88,0,3.020897e+09
...,...,...,...
103805,191266,1,3.706458e+09
103806,191269,1,3.706571e+09
103807,191270,1,3.706577e+09
103808,191279,1,3.706952e+09


# Normalize

In [7]:
import joblib
scaler = joblib.load("robust_scaler.pkl")

In [8]:
# Apply scaler on first 33 features
scaled_feature_df = feature_df.copy()
scaled_feature_df.iloc[:,:33] = scaler.transform(feature_df.iloc[:,:33])

Here we'll handle missing values in a way that will confuse the model, to let it rely on other features to predict genders. So we fill this with median value

In [9]:
scaled_feature_df.columns

Index(['items_per_order', 'vouchers_per_order', 'male_items_per_order',
       'unisex_items_per_order', 'female_items_per_order', 'revenue_per_order',
       'msite_orders_rate', 'desktop_orders_rate', 'android_orders_rate',
       'ios_orders_rate', 'shipping_addresses_rate', 'home_orders_rate',
       'parcelpoint_orders_rate', 'work_orders_rate', 'items_per_day',
       'orders_per_day', 'returns_per_item', 'different_addresses_rate',
       'male_items_rate', 'female_items_rate', 'unisex_items_rate',
       'wapp_items_rate', 'wftw_items_rate', 'mapp_items_rate',
       'wacc_items_rate', 'macc_items_rate', 'mftw_items_rate',
       'cc_payments_rate', 'paypal_payments_rate', 'afterpay_payments_rate',
       'revenue_per_items', 'coupon_discount_applied', 'devices',
       'customer_id'],
      dtype='object')

# Clean data for prediction

In [10]:
# add label
full_df = pd.merge(scaled_feature_df, partial_labels_df, how="left", left_on=scaled_feature_df.index, right_on="df_index")

# drop duplicate columns
full_df.drop(['customer_id_y','df_index'], axis=1, inplace=True)
full_df.rename(columns={'customer_id_x':"customer_id"}, inplace=True)


# Rename to denote our current self-labeled
full_df = full_df.rename(columns={"female_flag":"pseudo_female_flag"})

# Replace nan with median value to confuse the model on the feature
coupon_median = full_df["coupon_discount_applied"].median()
full_df["coupon_discount_applied"] = full_df["coupon_discount_applied"].fillna(coupon_median)

median_revenue_per_order = full_df["revenue_per_order"].median()
full_df["revenue_per_order"] = full_df["revenue_per_order"].fillna(median_revenue_per_order)

median_revenue_per_item = full_df["revenue_per_items"].median()
full_df["revenue_per_items"] = full_df["revenue_per_items"].fillna(median_revenue_per_item)

# "2" denotes unlabeled class
full_df.pseudo_female_flag = full_df.pseudo_female_flag.fillna(2)

In [11]:
X = full_df.iloc[:,:33].to_numpy()
Y = full_df.loc[:,"pseudo_female_flag"].to_numpy()

# Visualized Embedding features

In [12]:
import optuna
import plotly 

from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from models.embeddingnet import EmbeddingNet
import torch
from collections import OrderedDict
from sklearn.metrics import v_measure_score

# Search for best dropout models

In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [14]:
def run_random_trial(trial, min_val, max_val):
    # Suggest dropout rate
    dropout_rate = round(trial.suggest_float("dropout_rate",min_val,max_val,step=0.05),2)
    embedding_model = EmbeddingNet(input_dim = 33, dropout=dropout_rate)

    # load embedder weight
    ckpt_path = f"outputs/weights_dropout_{dropout_rate}.ckpt"
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    state_dict = checkpoint["state_dict"]
    state_dict = OrderedDict([(k.replace("embeddingnet.",""), v) for k, v in state_dict.items()])
    embedding_model.load_state_dict(state_dict)
    embedding_model.eval()
    embedded_X = embedding_model.forward(torch.Tensor(X))
    embedded_X = embedded_X.detach().numpy()
    
    # Fit with GMM
    #pred = GaussianMixture(n_components=2, random_state=0).fit_predict(embedded_X.detach().numpy())
    pred = KMeans(n_clusters=2, random_state=0).fit_predict(embedded_X)
    
    # Constraint with V measure
    mask_pseudo_label = np.logical_or(Y == 1, Y == 0)
    return embedded_X, pred, mask_pseudo_label

def objective_v_measure(trial):
    embedded_X, pred, mask_pseudo_label = run_random_trial(trial, min_val=0.0, max_val=1.0)
    
    v_measure =  v_measure_score(Y[mask_pseudo_label],pred[mask_pseudo_label])
    return v_measure

def objective_silhouette(trial):
    embedded_X, pred, mask_pseudo_label = run_random_trial(trial, min_val=0.0, max_val=0.5)

    # because silhouette score too expensive to compute, scale quadratically with n
    # sample a small one to measure 
    embedded_X_sample, _, pred_sample, _ = train_test_split(embedded_X, pred,stratify=pred, train_size = N_SILHOUTTE_SAMPLES)
    score = silhouette_score(embedded_X_sample, pred_sample)

    return score

# Study V-measure as dropout choice

In [16]:
import neptune.new as neptune
import neptune.new.integrations.optuna as optuna_utils
from joblib import parallel_backend

N_JOBS = 12
# connect your script to Neptune
run = neptune.init(project='patricknewyen/gfg-challenge', 
                   api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI5NTkzZDViOC0xOTM3LTQzMzAtODNkNi0zMTg4MjYwZGJlYzQifQ==',
                   name = "search_dropout",
                   tags = ["optuna","EmbeddingNet","dropout","v_measure"])

neptune_callback = optuna_utils.NeptuneCallback(run) # skip chart because failed plotly import

study_v_measure = optuna.create_study(direction="maximize")

with parallel_backend('threading', n_jobs=N_JOBS):
    study_v_measure.optimize(objective_v_measure, n_trials=100,n_jobs=N_JOBS, callbacks=[neptune_callback])

[32m[I 2021-09-03 11:46:52,040][0m Trial 27 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:46:52,647][0m Trial 29 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:46:53,672][0m Trial 31 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:46:53,682][0m Trial 28 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:46:53,684][0m Trial 30 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:46:54,883][0m Trial 32 finished with value: 0.9990658885055281 and parameters: {'dropou

https://app.neptune.ai/patricknewyen/gfg-challenge/e/GFGCHAL-233
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


[32m[I 2021-09-03 11:47:03,309][0m A new study created in memory with name: no-name-41f40977-a5a6-4f8f-8846-b0d338afa359[0m

`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.

[32m[I 2021-09-03 11:47:07,361][0m Trial 34 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:47:07,444][0m Trial 36 finished with value: 0.9990658885055281 and parameters: {'dropout_rate': 0.0}. Best is trial 24 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:47:42,223][0m Trial 5 finished with value: 0.9961747858816115 and parameters: {'dropout_rate': 0.35000000000000003}. Best is trial 5 with value: 0.9961747858816115.[0m
[32m[I 2021-09-03 11:47:42,601][0m Trial 0 finished with value: 0.9961747858816115 and parameters: {'dropout_rate': 0.35000000000000003}. Best is trial 5 with value: 0.996174

[32m[I 2021-09-03 11:49:10,389][0m Trial 26 finished with value: 0.9935658217959709 and parameters: {'dropout_rate': 0.2}. Best is trial 19 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:49:12,728][0m Trial 28 finished with value: 0.9935658217959709 and parameters: {'dropout_rate': 0.2}. Best is trial 19 with value: 0.9990658885055281.[0m
ERROR:neptune.new.internal.operation_processors.async_operation_processor:Error occurred during asynchronous operation processing: X-coordinates (step) must be strictly increasing for series attribute: trials/values. Invalid point: 21.0
ERROR:neptune.new.internal.operation_processors.async_operation_processor:Error occurred during asynchronous operation processing: X-coordinates (step) must be strictly increasing for series attribute: trials/values. Invalid point: 18.0
[32m[I 2021-09-03 11:49:23,998][0m Trial 29 finished with value: 0.9959785676887155 and parameters: {'dropout_rate': 0.25}. Best is trial 19 with value: 0.9990658885055

ERROR:neptune.new.internal.operation_processors.async_operation_processor:Error occurred during asynchronous operation processing: X-coordinates (step) must be strictly increasing for series attribute: trials/values. Invalid point: 55.0
[32m[I 2021-09-03 11:52:17,736][0m Trial 65 finished with value: 0.9953950107439066 and parameters: {'dropout_rate': 0.1}. Best is trial 19 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:52:21,318][0m Trial 66 finished with value: 0.9953950107439066 and parameters: {'dropout_rate': 0.1}. Best is trial 19 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:52:52,228][0m Trial 67 finished with value: 0.9953950107439066 and parameters: {'dropout_rate': 0.1}. Best is trial 19 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:53:01,346][0m Trial 68 finished with value: 0.9979937506326728 and parameters: {'dropout_rate': 0.15000000000000002}. Best is trial 19 with value: 0.9990658885055281.[0m
[32m[I 2021-09-03 11:53:06,400][0

In [None]:
optuna.visualization.plot_slice(study_v_measure, target_name="V measure")

# Study Silhouette score as dropout choice

In [24]:
V_MEASURE_THRESH = 0.95

# because silhouette score too expensive to compute, sample then compute
N_SILHOUTTE_SAMPLES = 10000

In [25]:
import neptune.new as neptune
import neptune.new.integrations.optuna as optuna_utils
from joblib import parallel_backend

N_JOBS = 12
# connect your script to Neptune
run = neptune.init(project='patricknewyen/gfg-challenge', 
                   api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI5NTkzZDViOC0xOTM3LTQzMzAtODNkNi0zMTg4MjYwZGJlYzQifQ==',
                   name = "search_dropout",
                   tags = ["optuna", "SuperTiny","dropout","silhouette"])

neptune_callback = optuna_utils.NeptuneCallback(run) # skip chart because failed plotly import

study_silhouette = optuna.create_study(direction="maximize")

with parallel_backend('threading', n_jobs=N_JOBS):
    study_silhouette.optimize(objective_silhouette, n_trials=100,n_jobs=N_JOBS, callbacks=[neptune_callback])



https://app.neptune.ai/patricknewyen/gfg-challenge/e/GFGCHAL-230
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


[32m[I 2021-09-03 11:31:47,854][0m A new study created in memory with name: no-name-a45144c4-784c-4786-8464-2dcfe77b555a[0m

`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.

[32m[I 2021-09-03 11:32:18,145][0m Trial 4 finished with value: 0.8744240403175354 and parameters: {'dropout_rate': 0.05}. Best is trial 4 with value: 0.8744240403175354.[0m
[32m[I 2021-09-03 11:32:33,782][0m Trial 1 finished with value: 0.9009379148483276 and parameters: {'dropout_rate': 0.35000000000000003}. Best is trial 1 with value: 0.9009379148483276.[0m
[32m[I 2021-09-03 11:32:33,972][0m Trial 9 finished with value: 0.8691591620445251 and parameters: {'dropout_rate': 0.45}. Best is trial 1 with value: 0.9009379148483276.[0m
[32m[I 2021-09-03 11:32:34,124][0m Trial 0 finished with value: 0.9034616947174072 and parameters: {'dropout_rate': 0.35000000000000003}. Best is trial 0 with value: 0.90346169

[32m[I 2021-09-03 11:34:23,125][0m Trial 30 finished with value: 0.8498943448066711 and parameters: {'dropout_rate': 0.5}. Best is trial 25 with value: 0.904593288898468.[0m
ERROR:neptune.new.internal.operation_processors.async_operation_processor:Error occurred during asynchronous operation processing: X-coordinates (step) must be strictly increasing for series attribute: trials/values. Invalid point: 21.0
ERROR:neptune.new.internal.operation_processors.async_operation_processor:Error occurred during asynchronous operation processing: X-coordinates (step) must be strictly increasing for series attribute: trials/values. Invalid point: 20.0
[32m[I 2021-09-03 11:34:25,230][0m Trial 31 finished with value: 0.8484081029891968 and parameters: {'dropout_rate': 0.5}. Best is trial 25 with value: 0.904593288898468.[0m
[32m[I 2021-09-03 11:34:28,580][0m Trial 34 finished with value: 0.8694320321083069 and parameters: {'dropout_rate': 0.45}. Best is trial 25 with value: 0.904593288898468

[32m[I 2021-09-03 11:36:01,362][0m Trial 61 finished with value: 0.8552014827728271 and parameters: {'dropout_rate': 0.4}. Best is trial 25 with value: 0.904593288898468.[0m
[32m[I 2021-09-03 11:36:34,344][0m Trial 64 finished with value: 0.8520992398262024 and parameters: {'dropout_rate': 0.4}. Best is trial 25 with value: 0.904593288898468.[0m
[32m[I 2021-09-03 11:36:34,717][0m Trial 63 finished with value: 0.8523929715156555 and parameters: {'dropout_rate': 0.4}. Best is trial 25 with value: 0.904593288898468.[0m
[32m[I 2021-09-03 11:36:38,446][0m Trial 65 finished with value: 0.8565658330917358 and parameters: {'dropout_rate': 0.4}. Best is trial 25 with value: 0.904593288898468.[0m
[32m[I 2021-09-03 11:36:38,549][0m Trial 66 finished with value: 0.8575294017791748 and parameters: {'dropout_rate': 0.4}. Best is trial 25 with value: 0.904593288898468.[0m
[32m[I 2021-09-03 11:36:39,046][0m Trial 67 finished with value: 0.8571264743804932 and parameters: {'dropout_rat

[32m[I 2021-09-03 11:38:11,359][0m Trial 99 finished with value: 0.8938578367233276 and parameters: {'dropout_rate': 0.15000000000000002}. Best is trial 98 with value: 0.9251866340637207.[0m
