# 1. Import

In [None]:
#!pip install neptune-client

In [1]:
import sklearn
import pandas as pd
from sklearn.preprocessing import RobustScaler
import umap
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import joblib
from torch.utils.data import DataLoader
from sklearn.utils import resample

In [2]:
%load_ext autoreload
%autoreload 2

Last time we're able to "softly" label data and engineer some useful features for the modeling part. Now let's use the data for modelling!!

# 2. Load data

In [2]:
with open("modeling_data.csv","r") as f:
    df = pd.read_csv(f)

In [3]:
Y = df["female_flag"].to_numpy()
X = df.drop(["female_flag","customer_id"], axis=1).to_numpy()

# Stratified for unbalanced class
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y)
with_centering = True
scaler = RobustScaler(with_centering=with_centering).fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Over sample data
X_train_oversampled, y_train_oversampled = resample(X_train_scaled[y_train == 0],
                                                    y_train[y_train == 0],
                                                    replace=True,
                                                    n_samples=X_train_scaled[y_train == 1].shape[0])

X_train_scaled = np.vstack((X_train_scaled[y_train == 1], X_train_oversampled))
y_train = np.hstack((y_train[y_train == 1], y_train_oversampled))

In [4]:
df.drop(["female_flag","customer_id"], axis=1).columns

Index(['items_per_order', 'vouchers_per_order', 'male_items_per_order',
       'unisex_items_per_order', 'female_items_per_order', 'revenue_per_order',
       'msite_orders_rate', 'desktop_orders_rate', 'android_orders_rate',
       'ios_orders_rate', 'shipping_addresses_rate', 'home_orders_rate',
       'parcelpoint_orders_rate', 'work_orders_rate', 'items_per_day',
       'orders_per_day', 'returns_per_item', 'different_addresses_rate',
       'male_items_rate', 'female_items_rate', 'unisex_items_rate',
       'wapp_items_rate', 'wftw_items_rate', 'mapp_items_rate',
       'wacc_items_rate', 'macc_items_rate', 'mftw_items_rate',
       'cc_payments_rate', 'paypal_payments_rate', 'afterpay_payments_rate',
       'revenue_per_items', 'coupon_discount_applied', 'devices'],
      dtype='object')

In [5]:
X_train_scaled.shape, y_train.shape, X_test_scaled.shape, y_test.shape

((159368, 33), (159368,), (10381, 33), (10381,))

In [6]:
# Save scaler
if with_centering:
    joblib.dump(scaler,"robust_scaler.pkl")
elif not with_centering:
    joblib.dump(scaler,"robust_scaler_without_centering.pkl")
scaler_load = joblib.load("robust_scaler.pkl")

# 3. Simple visualization

In [10]:
reducer = umap.UMAP()
reducer.fit(X_train_scaled)

X_train_plot = reducer.transform(X_train_scaled)

# Only first 1k data to save resource
plt.scatter(X_train_plot [:1000, 0], X_train_plot[:1000, 1], c=y_train, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the GFG customers dataset', fontsize=18);



KeyboardInterrupt: 

Pretty neat, they are kinda linearly separable, but data in each class also has very large deviation from the mean. This raw feature might not be enough for a simple linear model to be robust to new dataset, hence we need another type of model.

# 4. Modeling

In [7]:
from models.tripletnet import TripletNet
from models.embeddingnet import EmbeddingNetSmall
from utils.dataset import TripletDataset
import pytorch_lightning as pl  
from pytorch_lightning.loggers import NeptuneLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch

In [8]:
train_dataset = TripletDataset(X_train_scaled, y_train)
test_dataset = TripletDataset(X_test_scaled, y_test)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32, num_workers=24)
test_dataloader = DataLoader(test_dataset,batch_size=32, num_workers=24)

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
dropouts = [0 + round(i*0.05,2) for i in range(21)] # [1->9 step 0.5]

OUTPUT_DIR = "outputs_net_small/"

for dropout in dropouts:
    # Neptune logger
    NEPTUNE_API_TOKEN = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI5NTkzZDViOC0xOTM3LTQzMzAtODNkNi0zMTg4MjYwZGJlYzQifQ=="
    neptune_logger = NeptuneLogger(
        api_key=NEPTUNE_API_TOKEN,
        project_name='patricknewyen/gfg-challenge',
        experiment_name=f"gfg_drop_{dropout}",  # Optional,
        tags=["gfg", "contrastive","EmbeddingNetSmall"],  # Optional,
    )

    # Model
    embedding_net = EmbeddingNetSmall(input_dim=X_train.shape[1],dropout=dropout)
    triplet_net = TripletNet(embedding_net, use_scheduler=False)

    # Callback
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=3, verbose=False, mode="min")

    # Trainer
    trainer = pl.Trainer(
        gpus=1, precision=32,  
        logger=[neptune_logger], 
        callbacks=[early_stop_callback], 
        max_epochs=1)
    trainer.validate(triplet_net, dataloaders=[test_dataloader])
    trainer.fit(triplet_net, train_dataloader, test_dataloader)
    
    # Save weights
    checkpoint_path = os.path.join(OUTPUT_DIR,f"weights_dropout_{dropout}.ckpt")
    trainer.save_checkpoint(checkpoint_path)
    
    # Plot and save
    X_test_emb = embedding_net(torch.Tensor(X_test_scaled)).detach().cpu().numpy()
    reducer = umap.UMAP(n_components=2)
    reducer.fit(X_test_emb)

    X_test_emb_plot = reducer.transform(X_test_emb)

    fig = plt.figure()
    plt.scatter(X_test_emb_plot [:, 0], X_test_emb_plot[:, 1], c=y_test, cmap='Spectral', s=5)
    plt.gca().set_aspect('equal', 'datalim')
    plt.colorbar(boundaries=np.arange(3)-0.5).set_ticks(np.arange(2))
    plt.title(f'UMAP of X_test deep embed (dropout={dropout})', fontsize=18);
    graph_path = os.path.join(OUTPUT_DIR,f"graph_dropout_{dropout}.png")
    fig.savefig(graph_path)
    plt.close()

NeptuneLogger will work in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]


Validating: 0it [00:00, ?it/s]

psutil is not installed. You will not be able to abort this experiment from the UI.
psutil is not installed. Hardware metrics will not be collected.


https://app.neptune.ai/patricknewyen/gfg-challenge/e/GFGCHAL-107


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_loss': 0.05470443144440651}
--------------------------------------------------------------------------------



  | Name         | Type              | Params
---------------------------------------------------
0 | embeddingnet | EmbeddingNetSmall | 5.3 K 
1 | criterion    | MarginRankingLoss | 0     
---------------------------------------------------
5.3 K     Trainable params
0         Non-trainable params
5.3 K     Total params
0.021     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

# Visualize deep embed

In [None]:
fig.savefig("hello.png")

In [None]:
X_test_emb = embedding_net(torch.Tensor(X_test_scaled)).detach().cpu().numpy()
reducer = umap.UMAP(n_components=2)
reducer.fit(X_test_emb)

X_test_emb_plot = reducer.transform(X_test_emb)

plt.scatter(X_test_emb_plot [:, 0], X_test_emb_plot[:, 1], c=y_test, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(2))
plt.title('UMAP projection of the GFG deep embed', fontsize=18);

In [None]:
reducer = umap.UMAP()
reducer.fit(X_test_scaled)

X_test_plot = reducer.transform(X_test_scaled)

plt.scatter(X_test_plot [:, 0], X_test_plot[:, 1], c=y_test, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the GFG raw embed', fontsize=18);

In [None]:
X_train_emb = embedding_net(torch.Tensor(X_train)).detach().cpu().numpy()
reducer = umap.UMAP()
reducer.fit(X_train_emb)

X_train_emb_plot = reducer.transform(X_test_emb)

plt.scatter(X_train_emb_plot [:, 0], X_train_emb_plot[:, 1], c=y_train, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the GFG deep embed (X train)', fontsize=18);