In [1]:
import os
import datetime
import joblib

from tqdm import tqdm

import numpy as np
import pandas as pd

import paddle
from paddle.io import Dataset, DataLoader

# Process training sets

In [2]:
# Folder whose files are loaded with joblib in the next cell.
# The path is relative to the notebook's working directory.
clean_train_folder = "../data/data205411/2023-cvr-contest-data/clean_train"

In [3]:
# os.listdir() returns entries in arbitrary order, so sort the listing to make
# the file picked by the [:1] slice below the same on every run and machine.
train_file_ls = sorted(os.listdir(clean_train_folder))
# Skip entries whose name starts with "train" (presumably the merged
# train_file*.pkl caches referenced by the commented-out cells -- TODO confirm).
train_file_ls = [_file for _file in train_file_ls if not _file.startswith("train")]

train_df_ls = []

# Only the first remaining file is loaded; widen the slice to load more.
# NOTE(review): joblib.load unpickles arbitrary objects -- only use it on trusted files.
for _file in tqdm(train_file_ls[:1]):
    curr_train = joblib.load(os.path.join(clean_train_folder, _file))
    train_df_ls.append(curr_train)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:02<00:00, 62.06s/it]


In [4]:
# Column layout applied to every loaded frame: the sample id, the t1-t3 columns,
# then 26 feature columns numbered consecutively across the
# user / scene / ad / session groups.
col_names = (
    ["sample_id", "t1", "t2", "t3"]
    + [f"user_feat_{n}" for n in range(1, 14)]
    + [f"scene_feat_{n}" for n in range(14, 17)]
    + [f"ad_feat_{n}" for n in range(17, 25)]
    + [f"session_feat_{n}" for n in range(25, 27)]
)

for _df in train_df_ls:
    # Drop the "index" column when it exists, then rename all columns in place.
    if "index" in _df.columns:
        _df.drop(columns="index", inplace=True)
    _df.columns = col_names

In [5]:
# Stack all loaded frames row-wise into one frame with a fresh 0..n-1 index.
train_df = pd.concat(train_df_ls, axis=0, ignore_index=True)

In [6]:
# joblib.dump(train_df, os.path.join(clean_train_folder, "train_file_first10.pkl"))

In [7]:
# train_df = joblib.load(os.path.join(clean_train_folder, "train_file.pkl"))

In [6]:
# Every column after the first four is treated as a feature column.
feature_cols = list(train_df.columns[4:])
# Maps column name -> set of distinct values seen in that column.
unique_elements = {}

def fill_unique_elements(row, feature_cols, unique_elements):
    """Merge one row's feature values into the per-column sets of distinct values.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column name;
            each ``row[col]`` must be an iterable of values.
        feature_cols: Column names to collect from ``row``.
        unique_elements: Dict of column name -> set, mutated in place. A column
            seen for the first time gets a new empty set.

    Returns:
        None. The result is the mutation of ``unique_elements``.
    """
    for _col in feature_cols:
        # setdefault creates the set on first sight of the column, then the
        # row's values for that column are added to it.
        unique_elements.setdefault(_col, set()).update(row[_col])

# Run the collector once per row purely for its side effect on unique_elements;
# the Series that apply() returns (and the notebook displays) holds only None.
train_df.apply(lambda x: fill_unique_elements(x, feature_cols, unique_elements), axis=1)    

0         None
1         None
2         None
3         None
4         None
          ... 
272186    None
272187    None
272188    None
272189    None
272190    None
Length: 272191, dtype: object

In [9]:
# Report the number of distinct values per feature column.
for k, v in unique_elements.items():
    print(f"{k}, \t length={len(v)}")

# Total of all per-column counts, offset by -1.
# NOTE(review): the offset makes the total one less than the summed lengths --
# confirm this is intended.
sparse_id_cnt = sum(len(_values) for _values in unique_elements.values()) - 1

user_feat_1, 	 length=21
user_feat_2, 	 length=9
user_feat_3, 	 length=269856
user_feat_4, 	 length=231628
user_feat_5, 	 length=12
user_feat_6, 	 length=9
user_feat_7, 	 length=6
user_feat_8, 	 length=18
user_feat_9, 	 length=6
user_feat_10, 	 length=726325
user_feat_11, 	 length=111
user_feat_12, 	 length=0
user_feat_13, 	 length=24
scene_feat_14, 	 length=6
scene_feat_15, 	 length=447
scene_feat_16, 	 length=1
ad_feat_17, 	 length=1287144
ad_feat_18, 	 length=446183
ad_feat_19, 	 length=50225
ad_feat_20, 	 length=17547
ad_feat_21, 	 length=11
ad_feat_22, 	 length=2386616
ad_feat_23, 	 length=47897
ad_feat_24, 	 length=372
session_feat_25, 	 length=452
session_feat_26, 	 length=1052


In [10]:
sparse_id_cnt

5465977

In [11]:
# Column-oriented record of how many values each ordered pair of distinct
# feature columns has in common.
intersection_dict = {"col1": [], "col2": [], "intersections": []}

for k1, v1 in unique_elements.items():
    for k2, v2 in unique_elements.items():
        # A column is never compared with itself.
        if k1 != k2:
            intersection = v1 & v2

            intersection_dict["col1"].append(k1)
            intersection_dict["col2"].append(k2)
            intersection_dict["intersections"].append(len(intersection))

In [12]:
# Tabulate the pairwise overlap counts and summarise them. A max of 0 in the
# summary means no value is shared between any two feature columns.
intersection_df = pd.DataFrame(intersection_dict)
intersection_df.describe()

Unnamed: 0,intersections
count,650.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [13]:
features = []

def construct(row, feature_cols, features):
    """Convert one row's feature columns to int64 id tensors and append them to ``features``.

    The tensors are built with Paddle, like the rest of this notebook. The
    previous version referenced ``torch``, which is never imported here.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column name;
            each ``row[col]`` is an iterable of values convertible with ``int``.
        feature_cols: Column names to convert, in output order.
        features: List mutated in place; one list of tensors (one tensor per
            column) is appended per call.

    Returns:
        None.
    """
    curr_features = []

    for _col in feature_cols:
        ids = [int(i) for i in row[_col]]
        # An empty field is encoded as the single id 0 so every column yields
        # a non-empty tensor.
        curr_features.append(paddle.to_tensor(ids if ids else [0], dtype="int64"))

    features.append(curr_features)

In [24]:
features = []

for idx, row in train_df.head(1000).iterrows():
    # Progress heartbeat: index and wall-clock time every 1000 rows.
    if idx % 1000 == 0:
        print(idx)
        print(datetime.datetime.now())

    # Slots 0-3: sample id and the three raw t1/t2/t3 values, each wrapped in a list.
    curr_features = [[row[_name]] for _name in ("sample_id", "t1", "t2", "t3")]

    # Slots 4+: one int64 tensor per feature column; an empty field becomes [0].
    for _col in feature_cols:
        tmp = [int(i) for i in row[_col]] or [0]
        curr_features.append(paddle.to_tensor(tmp, dtype="int64"))

    features.append(curr_features)

0
2023-06-22 10:35:53.156854


# Tensor dim experiments

## Sum Pooling

In [9]:
# Toy embedding table (10 ids, 5-dimensional vectors) used by the shape experiments below.
emb = paddle.nn.Embedding(num_embeddings=10, embedding_dim=5)

In [11]:
# Two toy samples with three fields each; the last field of the second sample
# holds several ids.
ls = [[[1], [2], [3]], [[4], [5], [6, 7, 4]]]

# Same nesting as ``ls``, with every innermost id list turned into its own tensor.
tensor_ls = [[paddle.to_tensor(e2) for e2 in e1] for e1 in ls]

In [12]:
tensor_ls

[[Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
         [1]),
  Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
         [2]),
  Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
         [3])],
 [Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
         [4]),
  Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
         [5]),
  Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
         [6, 7, 4])]]

In [16]:
def sum_pooling(in_list):
    """Sum ``in_list`` over axis 0, keeping that axis with size 1.

    Args:
        in_list: Tensor to reduce (despite the name, it is passed to
            ``paddle.sum`` directly).

    Returns:
        The reduced tensor.
    """
    pooled = paddle.sum(in_list, axis=0, keepdim=True)
    return pooled

In [17]:
def flatten_features(input_list):
    """Embed, sum-pool and concatenate the fields of one sample.

    Args:
        input_list: Iterable of id tensors, one per field.

    Returns:
        The per-field pooled embeddings (from the module-level ``emb`` and
        ``sum_pooling``) concatenated along axis 1.
    """
    # Embed every field first, then pool every field, matching the original
    # two-pass order.
    embedded_list = [emb(field_ids) for field_ids in input_list]
    summed_list = [sum_pooling(embedded) for embedded in embedded_list]

    return paddle.concat(summed_list, axis=1)

In [18]:
# Flatten each toy sample independently; the result is one tensor per sample.
flattened_list = list(map(flatten_features, tensor_ls))
flattened_list

[Tensor(shape=[1, 15], dtype=float32, place=Place(cpu), stop_gradient=False,
        [[-0.52190387,  0.07194221,  0.03799814, -0.49266735,  0.30468524,
          -0.55060506, -0.23874933,  0.12390065, -0.51309252, -0.06103414,
          -0.51643449, -0.14627051, -0.06078577, -0.17923266,  0.47464710]]),
 Tensor(shape=[1, 15], dtype=float32, place=Place(cpu), stop_gradient=False,
        [[ 0.51779944, -0.52479392, -0.50556695,  0.52239817, -0.14765447,
          -0.50262690, -0.21480486,  0.24494112, -0.34089866, -0.40321022,
           1.04577839, -1.59343457, -0.57175392,  0.36989284,  0.67018896]])]

In [19]:
paddle.concat(flattened_list)

Tensor(shape=[2, 15], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[-0.52190387,  0.07194221,  0.03799814, -0.49266735,  0.30468524,
         -0.55060506, -0.23874933,  0.12390065, -0.51309252, -0.06103414,
         -0.51643449, -0.14627051, -0.06078577, -0.17923266,  0.47464710],
        [ 0.51779944, -0.52479392, -0.50556695,  0.52239817, -0.14765447,
         -0.50262690, -0.21480486,  0.24494112, -0.34089866, -0.40321022,
          1.04577839, -1.59343457, -0.57175392,  0.36989284,  0.67018896]])

# Dataloader utils

In [61]:
class EarlyStopper:
    """Decide when to stop training based on a score that should increase.

    The best score seen so far is tracked. A call counts as an improvement only
    when the new score beats the best one by more than ``delta``. The
    ``num_trials``-th consecutive call without an improvement returns False.
    """

    def __init__(self, num_trials, delta):
        """Store the patience (``num_trials``) and the minimum improvement (``delta``)."""
        self.delta = delta
        self.num_trials = num_trials
        self.best_accuracy = -1
        self.trial_counter = 0

    def is_continuable(self, model, accuracy):
        """Record ``accuracy`` and report whether training should go on.

        Args:
            model: Currently unused (no checkpoint is written).
            accuracy: Latest score to compare against the best one.

        Returns:
            True to keep training, False once patience is exhausted.
        """
        improved = accuracy > self.best_accuracy + self.delta
        if improved:
            self.best_accuracy, self.trial_counter = accuracy, 0
            return True

        patience_left = self.trial_counter + 1 < self.num_trials
        if patience_left:
            self.trial_counter += 1
            return True

        return False

In [75]:
class CVRDataset(Dataset):
    """Dataset exposing three label tensors and the per-sample feature lists.

    Each input item is expected to be a sequence whose positions 1, 2 and 3 are
    one-element containers holding the raw t1/t2/t3 values and whose positions
    4+ are the feature tensors (the layout produced by the feature-building
    cell of this notebook).
    """

    @staticmethod
    def _parse_label(raw):
        """Convert one raw label cell to an int; the placeholder "-" becomes 0."""
        return 0 if raw == "-" else int(raw)

    def __init__(self, features):
        """Parse the labels of every item and keep its feature part.

        Args:
            features: Iterable of per-sample sequences as described on the class.
        """
        t1, t2, t3 = [], [], []
        self.features = []

        for feature in features:
            # Each label is stored as a one-element list so the tensors below
            # are built from nested [[...], ...] lists.
            t1.append([self._parse_label(feature[1][0])])
            t2.append([self._parse_label(feature[2][0])])
            t3.append([self._parse_label(feature[3][0])])

            self.features.append(feature[4:])

        self.t1 = paddle.to_tensor(t1, dtype="float32")
        self.t2 = paddle.to_tensor(t2, dtype="float32")
        self.t3 = paddle.to_tensor(t3, dtype="float32")

    def __len__(self):
        """Return the number of samples."""
        return len(self.t1)

    def __getitem__(self, idx):
        """Return ``(t1, t2, t3, features)`` for sample ``idx``.

        The labels are indexed with the list ``[idx]`` rather than the bare
        integer, presumably to keep the leading axis so they can be concatenated
        along axis 0 when batching -- TODO confirm the resulting shape.
        """
        return self.t1[[idx]], self.t2[[idx]], self.t3[[idx]], self.features[idx]

In [76]:
# Wrap the per-row feature lists built above in a Dataset.
cvr_dataset = CVRDataset(features)

In [77]:
def train(model, optimizer, data_loader, criterion, log_interval=100):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    The target of a batch is the row-wise maximum of its t1/t2/t3 labels.

    Args:
        model: Callable network; receives the batch's feature lists.
        optimizer: Optimizer exposing ``step()`` and ``clear_grad()``.
        data_loader: Iterable of ``(t1, t2, t3, fields)`` batches.
        criterion: Loss callable taking ``(prediction, target)``.
        log_interval: Number of batches between progress-bar loss updates.

    Returns:
        None.
    """
    model.train()

    progress = tqdm(data_loader, smoothing=0, mininterval=1.0)
    running_loss = 0

    for step, (t1, t2, t3, fields) in enumerate(progress, start=1):
        y = model(fields)

        # Put the three label columns side by side and take their row-wise max.
        stacked = paddle.concat([t1, t2, t3], axis=1)
        target = paddle.max(stacked, axis=1, keepdim=True)

        loss = criterion(y, target)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Show the mean loss of the last ``log_interval`` batches, then reset.
        if step % log_interval == 0:
            progress.set_postfix(loss=running_loss / log_interval)
            running_loss = 0

In [43]:
def test(model, data_loader):
    model.eval()
    targets, predicts = list(), list()

    with paddle.no_grad():
        for i, (t1, t2, t3, fields) in tqdm(data_loader, smoothing=0, mininterval=1.0):
            # t1, t2, t3 = t1.to(device), t2.to(device), t3.to(device)
            
            y = model(fields)
            cat = paddle.concat([t1, t2, t3], axis=1)
            target = paddle.max(cat, axis=1, keepdim=True)
            
            targets.extend(target.tolist())
            predicts.extend(y.tolist())

    return roc_auc_score(targets, predicts)

In [45]:
def collate_fn(data):
    """Merge a list of dataset items into one batch.

    Args:
        data: Sequence of items whose first four positions are
            ``(t1, t2, t3, features)``.

    Returns:
        A 4-tuple: the t1, t2 and t3 tensors each concatenated along axis 0,
        and the per-sample feature entries collected in a plain list (they are
        not stacked).
    """
    t1 = [item[0] for item in data]
    t2 = [item[1] for item in data]
    t3 = [item[2] for item in data]
    fields = [item[3] for item in data]

    batch_t1 = paddle.concat(t1, axis=0)
    batch_t2 = paddle.concat(t2, axis=0)
    batch_t3 = paddle.concat(t3, axis=0)

    return batch_t1, batch_t2, batch_t3, fields

In [78]:
def run(dataset, model, epoch, learning_rate, batch_size, weight_decay, num_trials, delta, collate_fn=None):
    """Train ``model`` on ``dataset`` with early stopping, then report test AUC.

    The dataset is randomly split 80% / 10% / remainder into training,
    validation and test parts. After every epoch the validation AUC is printed
    and passed to an ``EarlyStopper``; training ends when it says to stop or
    after ``epoch`` epochs.

    Args:
        dataset: Dataset to split.
        model: Network to optimise.
        epoch: Maximum number of training epochs.
        learning_rate: Adam learning rate.
        batch_size: Batch size used by all three loaders.
        weight_decay: Adam weight decay.
        num_trials: Early-stopping patience (see ``EarlyStopper``).
        delta: Minimum AUC gain that counts as an improvement.
        collate_fn: Optional batching function forwarded to every ``DataLoader``.

    Returns:
        None. Progress and final metrics are printed.
    """
    train_length = int(len(dataset) * 0.8)
    valid_length = int(len(dataset) * 0.1)
    # The test part takes whatever is left, so the three sizes always add up.
    test_length = len(dataset) - train_length - valid_length

    print(f"training set size: {train_length}, validation set size: {valid_length}, test set size: {test_length}")

    train_dataset, valid_dataset, test_dataset = paddle.io.random_split(dataset, (train_length, valid_length, test_length))

    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

    criterion = paddle.nn.BCELoss()
    optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=learning_rate, weight_decay=weight_decay)
    early_stopper = EarlyStopper(num_trials=num_trials, delta=delta)

    for epoch_i in range(epoch):
        train(model, optimizer, train_data_loader, criterion)

        auc = test(model, valid_data_loader)
        print(f"epoch: {epoch_i}, validation auc: {auc}")

        if not early_stopper.is_continuable(model, auc):
            print(f"validation best auc: {early_stopper.best_accuracy}")
            break

    # ``device`` was never defined in this notebook, so the message used to
    # raise NameError after training; ask Paddle for the active device instead.
    device = paddle.device.get_device()
    print(f"testing using {device}")
    auc = test(model, test_data_loader)
    print(f"test auc: {auc}")

# Networks

## Deep Crossing

In [52]:
class DeepCrossingResidualBlock(paddle.nn.Layer):
    """Residual unit: ``input_dim -> hidden_dim -> input_dim`` plus a skip connection."""

    def __init__(self, input_dim, hidden_dim):
        """Create the two linear layers and the shared ReLU.

        Args:
            input_dim: Width of the block's input and output.
            hidden_dim: Width of the intermediate layer.
        """
        super().__init__()

        # Expand to the hidden width, then project back to the input width so
        # the result can be added to the block's input.
        self.linear1 = paddle.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.linear2 = paddle.nn.Linear(in_features=hidden_dim, out_features=input_dim)
        self.relu = paddle.nn.ReLU()

    def forward(self, x):
        """Return ``relu(linear2(relu(linear1(x))) + x)``."""
        hidden = self.relu(self.linear1(x))
        return self.relu(self.linear2(hidden) + x)

In [79]:
class DeepCrossing(paddle.nn.Layer):
    """Shared-embedding network with stacked residual blocks and a sigmoid output.

    Every field of every sample is embedded with one shared table, sum-pooled
    to a single vector, and the per-field vectors are concatenated before the
    residual stack.
    """

    def __init__(self, sparse_feature_cnt, sparse_id_num, embedding_dim, hidden_layers):
        """Build the embedding table, the residual stack and the output layer.

        Args:
            sparse_feature_cnt: Number of fields per sample.
            sparse_id_num: Number of rows in the shared embedding table.
            embedding_dim: Width of each embedding vector.
            hidden_layers: Hidden width of each residual block, in order.
        """
        super().__init__()

        # Width of one flattened sample: one pooled embedding per field.
        flat_dim = sparse_feature_cnt * embedding_dim

        self.embedding = paddle.nn.Embedding(
            num_embeddings=sparse_id_num,
            embedding_dim=embedding_dim,
        )

        blocks = [DeepCrossingResidualBlock(flat_dim, hidden_dim) for hidden_dim in hidden_layers]
        self.residual_layers = paddle.nn.LayerList(blocks)

        self.lin = paddle.nn.Linear(flat_dim, 1)
        self.relu = paddle.nn.ReLU()
        self.sigmoid = paddle.nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for a batch.

        Args:
            x: Iterable of samples; each sample is an iterable of id tensors,
                one per field.
        """
        per_sample = [self.__flatten_features(sample) for sample in x]
        features = paddle.concat(per_sample)

        for residual_layer in self.residual_layers:
            features = residual_layer(features)

        return self.sigmoid(self.lin(features))

    def __sum_pooling(self, input_list):
        """Sum over axis 0, keeping that axis with size 1."""
        return paddle.sum(input_list, axis=0, keepdim=True)

    def __flatten_features(self, input_list):
        """Embed and sum-pool each field of one sample, then concatenate along axis 1."""
        # Embed all fields first, then pool them, matching the original two-pass order.
        embedded_list = [self.embedding(field_ids) for field_ids in input_list]
        summed_list = [self.__sum_pooling(embedded) for embedded in embedded_list]

        return paddle.concat(summed_list, axis=1)

In [54]:
# Select the CPU as Paddle's device for the cells below.
paddle.device.set_device("cpu")

Place(cpu)

In [80]:
# NOTE(review): sparse_feature_cnt=26 presumably matches len(feature_cols) -- confirm.
# NOTE(review): raw feature ids are fed to the embedding unchanged, so
# sparse_id_num has to exceed the largest id in the data; 88000000 is hard-coded
# here -- confirm it is a valid upper bound.
deepcrossing = DeepCrossing(
    sparse_feature_cnt=26, 
    sparse_id_num=88000000, 
    embedding_dim=5, 
    hidden_layers=[64, 32, 16], 
)

In [56]:
# Hyperparameters passed to run() below.
epoch = 1000  # upper bound on training epochs; early stopping may end sooner
learning_rate = 0.005  # Adam learning rate
batch_size = 16  # batch size for the train/validation/test loaders
weight_decay = 1e-6  # Adam weight decay
num_trials = 3  # early-stopping patience (consecutive non-improving epochs)
delta = 0.001  # minimum validation-AUC gain that counts as an improvement

In [None]:
# Train the DeepCrossing model on the dataset built above, using the custom
# collate function for batching.
run(
    dataset=cvr_dataset, 
    model=deepcrossing, 
    epoch=epoch, 
    learning_rate=learning_rate, 
    batch_size=batch_size, 
    weight_decay=weight_decay, 
    num_trials=num_trials, 
    delta=delta,
    collate_fn=collate_fn
)

training set size: 800, validation set size: 100, test set size: 100


  2%|███▉                                                                                                                                                                                                 | 1/50 [11:54<9:43:16, 714.22s/it]

In [None]:
# Load the test-set pickle and preview its first rows.
# NOTE(review): joblib.load unpickles arbitrary objects -- only use it on trusted files.
clean_test = joblib.load("../data/data204194/test_data/clean_test.pkl")

clean_test.head()

In [None]:
# Repeat the distinct-value collection on the test frame.
# NOTE(review): this rebinds feature_cols and unique_elements, so the
# training-set values computed earlier are no longer available after this cell.
feature_cols = list(clean_test.columns[4:])
unique_elements = {}


# Called for its side effect on unique_elements; the returned Series holds only None.
clean_test.apply(lambda x: fill_unique_elements(x, feature_cols, unique_elements), axis=1)    

In [None]:
# single_element_cols = [
#     'user_feat_1',
#     'user_feat_11',
#     'user_feat_12',
#     'user_feat_13',
#     'user_feat_2',
#     'user_feat_3',
#     'user_feat_4',
#     'user_feat_5',
#     'user_feat_6',
#     'user_feat_7',
#     'user_feat_8',
#     'user_feat_9',
#     'scene_feat_14',
#     'scene_feat_15',
#     'scene_feat_16',
#     'ad_feat_19',
#     'ad_feat_21',
#     'ad_feat_23',
#     'ad_feat_24'
# ]

# for _col in single_element_cols:
#     clean_test[_col] = clean_test[_col].map(lambda x: x[0] if len(x) != 0 else "-1")