In [1]:
from torch import optim, nn, utils, Tensor
import torch.nn.functional as F
import pytorch_lightning as pl
import torch

from sklift.datasets import fetch_lenta

import pandas as pd 
import numpy as np

from sklift.models import ClassTransformation
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


### Data preparation

In [2]:
# Load the Lenta dataset through scikit-uplift. Later cells read the
# `.data`, `.treatment` and `.target` attributes of the returned object.
dataset = fetch_lenta()

In [3]:
# Cross-tabulate treatment group against target. normalize='index' scales each
# row to sum to 1, so a row shows the share of each target value in that group.
pd.crosstab(dataset.treatment, dataset.target, normalize='index')

response_att,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0.897421,0.102579
test,0.889874,0.110126


In [4]:
# Encode the treatment group labels as a binary flag (1 = test, 0 = control).
treat_dict = {
    'test': 1,
    'control': 0
}

# Values that have no entry in `treat_dict` are passed through unchanged, so
# re-running this cell on an already-encoded column is a no-op. A plain
# `.map(treat_dict)` would turn the existing 0/1 values into NaN on a re-run.
encoded_treatment = dataset.treatment.map(
    lambda label: treat_dict.get(label, label)
)

# Fail loudly (before touching `dataset`) if anything other than the two known
# groups is present, instead of silently carrying bad labels into the split.
unexpected_labels = set(encoded_treatment.unique()) - set(treat_dict.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected treatment labels: {sorted(unexpected_labels, key=str)}"
    )

dataset.treatment = encoded_treatment

In [17]:
# Build a boolean mask over the columns (True where the dtype is numeric),
# keep only those columns, and replace missing values with 0.
is_numeric = np.array(
    [pd.api.types.is_numeric_dtype(col_dtype) for col_dtype in dataset.data.dtypes],
    dtype=bool,
)
numeric_cols = dataset.data.columns[is_numeric]

numeric_features = dataset.data[numeric_cols]
X = numeric_features.fillna(0).to_numpy()

In [18]:
from sklearn.model_selection import train_test_split
# Stratify on treatment and target jointly.
# A plain binary classification setup stratifies the split on its single 0/1
# target column. Uplift modeling has two such columns (treatment and target),
# so both are handed to `stratify` together.
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)

(X_train, X_val,
 trmnt_train, trmnt_val,
 y_train, y_val) = train_test_split(
    X, dataset.treatment, dataset.target,
    test_size=0.3,
    random_state=42,
    stratify=stratify_cols,
)

print(f"Train shape: {X_train.shape}\nValidation shape: {X_val.shape}")

Train shape: (480920, 192)
Validation shape: (206109, 192)


In [19]:
def _class_transform(treatment, target):
    """Return ``treatment*target + (1-target)*(1-treatment)`` element-wise.

    For 0/1 inputs the result is 1 where treatment and target are equal
    and 0 where they differ.
    """
    return treatment * target + (1 - target) * (1 - treatment)


z_tr = _class_transform(trmnt_train, y_train)
z_ts = _class_transform(trmnt_val, y_val)

### Data loaders

In [30]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [42]:
class TreatmentDataset(Dataset):
    """Map-style dataset that yields ``(X[idx], y[idx], t[idx])`` tensor triples.

    All three inputs are converted to float32 tensors. The conversion goes
    through NumPy explicitly, so pandas objects are read positionally no matter
    what their index labels are (e.g. after a shuffled train/validation split).

    Parameters
    ----------
    X : array-like
        Features, indexed along the first axis.
    y : array-like
        Second element of every returned triple; same length as ``X``.
    t : array-like
        Third element of every returned triple (treatment); same length as ``X``.

    Raises
    ------
    ValueError
        If ``X``, ``y`` and ``t`` do not all have the same length.
    """

    def __init__(self, X, y, t):
        self.X = self._to_float_tensor(X)
        self.y = self._to_float_tensor(y)
        self.t = self._to_float_tensor(t)  # treatment

        # A length mismatch would otherwise only show up as an index error
        # somewhere in the middle of an epoch.
        n_rows = len(self.X)
        if len(self.y) != n_rows or len(self.t) != n_rows:
            raise ValueError(
                "X, y and t must have the same length, got "
                f"{n_rows}, {len(self.y)} and {len(self.t)}"
            )

    @staticmethod
    def _to_float_tensor(values):
        """Convert a tensor, ndarray, pandas object or sequence to a float32 tensor."""
        if isinstance(values, torch.Tensor):
            return values.to(torch.float32)
        # np.array always makes a fresh, writable float32 copy, which
        # torch.from_numpy can then wrap without another copy.
        return torch.from_numpy(np.array(values, dtype=np.float32))

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(X, y, t)`` entries at position ``idx``."""
        return self.X[idx], self.y[idx], self.t[idx]
        

In [43]:
# Training dataset: features, the original target, and the class-transformed
# label z_tr, which is passed in the dataset's `t` slot.
tr_set = TreatmentDataset(X=X_train, y=y_train, t=z_tr)

# Shuffled mini-batches of 32 samples.
tr_loader = DataLoader(dataset=tr_set, batch_size=32, shuffle=True)

In [44]:
# Smoke test: pull a single batch from the loader, print a marker, and stop.
for batch in tr_loader:
    print("hi")
    break

hi
