In [25]:
import os
import datetime
import joblib

from tqdm import tqdm

import numpy as np
import pandas as pd

import torch

In [4]:
clean_train_folder = "../data/data205411/2023-cvr-contest-data/clean_train"

In [8]:
train_file_ls = os.listdir(clean_train_folder)
train_file_ls = [_file for _file in train_file_ls if not _file.startswith("train")]

train_df_ls = []

for _file in tqdm(train_file_ls[:1]):
    curr_train = joblib.load(os.path.join(clean_train_folder, _file))
    train_df_ls.append(curr_train)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:59<00:00, 59.93s/it]


In [9]:
col_names = [
    'sample_id', 't1', 't2', 't3', 'user_feat_1', 'user_feat_2',
    'user_feat_3', 'user_feat_4', 'user_feat_5', 'user_feat_6',
    'user_feat_7', 'user_feat_8', 'user_feat_9', 'user_feat_10',
    'user_feat_11', 'user_feat_12', 'user_feat_13', 'scene_feat_14',
    'scene_feat_15', 'scene_feat_16', 'ad_feat_17', 'ad_feat_18',
    'ad_feat_19', 'ad_feat_20', 'ad_feat_21', 'ad_feat_22', 'ad_feat_23',
    'ad_feat_24', 'session_feat_25', 'session_feat_26'
]

for _df in train_df_ls:
    _df.drop("index", axis=1, inplace=True, errors="ignore")
    _df.columns = col_names

In [10]:
train_df = pd.concat(train_df_ls, axis=0, ignore_index=True)

In [6]:
# joblib.dump(train_df, os.path.join(clean_train_folder, "train_file_first10.pkl"))

In [7]:
# train_df = joblib.load(os.path.join(clean_train_folder, "train_file.pkl"))

In [13]:
feature_cols = list(train_df.columns[4:])
unique_elements = {}

def fill_unique_elements(row, feature_cols, unique_elements):
    for _col in feature_cols:
        if _col not in unique_elements:
            unique_elements[_col] = set()
        
        unique_elements[_col].update(row[_col])

train_df.apply(lambda x: fill_unique_elements(x, feature_cols, unique_elements), axis=1)    

0         None
1         None
2         None
3         None
4         None
          ... 
272186    None
272187    None
272188    None
272189    None
272190    None
Length: 272191, dtype: object

In [17]:
sparse_id_cnt = 0

for k, v in unique_elements.items():
    print(f"{k}, \t length={len(v)}")
    sparse_id_cnt += len(v)

user_feat_1, 	 length=21
user_feat_2, 	 length=9
user_feat_3, 	 length=269856
user_feat_4, 	 length=231628
user_feat_5, 	 length=12
user_feat_6, 	 length=9
user_feat_7, 	 length=6
user_feat_8, 	 length=18
user_feat_9, 	 length=6
user_feat_10, 	 length=726325
user_feat_11, 	 length=111
user_feat_12, 	 length=0
user_feat_13, 	 length=24
scene_feat_14, 	 length=6
scene_feat_15, 	 length=447
scene_feat_16, 	 length=1
ad_feat_17, 	 length=1287144
ad_feat_18, 	 length=446183
ad_feat_19, 	 length=50225
ad_feat_20, 	 length=17547
ad_feat_21, 	 length=11
ad_feat_22, 	 length=2386616
ad_feat_23, 	 length=47897
ad_feat_24, 	 length=372
session_feat_25, 	 length=452
session_feat_26, 	 length=1052


In [15]:
intersection_dict = {"col1": [], "col2": [], "intersections": []}

for k1, v1 in unique_elements.items():
    for k2, v2 in unique_elements.items():
        if k1 == k2:
            continue
            
        intersection = v1.intersection(v2)
        
        intersection_dict["col1"].append(k1)
        intersection_dict["col2"].append(k2)
        intersection_dict["intersections"].append(len(intersection))

In [16]:
intersection_df = pd.DataFrame(intersection_dict)
intersection_df.describe()

Unnamed: 0,intersections
count,650.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [39]:
emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=5, dtype=torch.double)

In [45]:
ls = [torch.tensor([1]), torch.tensor([2]), torch.tensor([1,3,4])]

emb_ls = list(map(emb, ls))
emb_ls

[tensor([[-0.6060, -0.2022,  2.1819,  1.1488,  0.9632]], dtype=torch.float64,
        grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.5114,  0.5864, -0.6509,  0.7173,  0.0510]], dtype=torch.float64,
        grad_fn=<EmbeddingBackward0>),
 tensor([[-0.6060, -0.2022,  2.1819,  1.1488,  0.9632],
         [ 0.0649, -0.1210, -1.0362, -0.1746,  1.1854],
         [-0.9366, -0.8541, -0.9004,  0.6696, -1.4345]], dtype=torch.float64,
        grad_fn=<EmbeddingBackward0>)]

In [66]:
def sum_pooling(in_list):    
    return torch.sum(in_list, dim=0)

In [52]:
torch.cat(list(map(sum_pooling, emb_ls)), dim=0)

tensor([-0.6060, -0.2022,  2.1819,  1.1488,  0.9632,  0.5114,  0.5864, -0.6509,
         0.7173,  0.0510, -1.4777, -1.1773,  0.2453,  1.6437,  0.7141],
       dtype=torch.float64, grad_fn=<CatBackward0>)

In [50]:
class DeepCrossingResidualBlock(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, device):
        super().__init__()

        self.linear1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim, dtype=torch.double, device=device)
        self.linear2 = torch.nn.Linear(in_features=hidden_dim, out_features=input_dim, dtype=torch.double, device=device)

    def forward(self, x):
        out1 = torch.relu(self.linear1(x))
        out2 = self.linear2(out1) + x

        return torch.relu(out2)

In [None]:
class DeepCrossing(torch.nn.Module):
    def __init__(self, sparse_feature_cnt, sparse_feature_num, embedding_dim, hidden_layers, device):
        super().__init__()

        self.embedding_layers = torch.nn.Embedding(
            num_embeddings=sparse_feature_num, 
            embedding_dim=embedding_dim, 
            dtype=torch.double, 
            device=device
        )

        self.residual_layers = torch.nn.ModuleList(
            [DeepCrossingResidualBlock(sparse_feature_cnt * embedding_dim, layer, device) for layer in hidden_layers]
        )

        self.lin = torch.nn.Linear(sparse_feature_cnt * embedding_dim, 1, dtype=torch.double, device=device)
        self.relu = torch.nn.ReLU()

        
    def forward(self, x):
        
        
        
        sparse_embedding = [
            relu(self.embedding_layers[idx](x[:, pos[0]:pos[1]])) for idx, pos in enumerate(self.sparse_indices)
        ]

        sparse_embedding = torch.cat(sparse_embedding, dim=1)

        dense_input = [x[:, pos[0]:pos[1]] for pos in self.dense_indices]
        dense_input = torch.cat(dense_input, dim=1)

        concat = torch.cat([sparse_embedding, dense_input], dim=1)

        for residual_layer in self.residual_layers:
            concat = residual_layer(concat)

        output = self.lin(concat)
        return torch.sigmoid(output)
    
    
    def __sum_pooling(self, _input):
        return torch.sum(_input, dim=0)

In [19]:
features = []

def construct(row, feature_cols, features):
    curr_features = []
    
    for _col in feature_cols:
        if len(row[_col]) == 0:
            curr_features.append(torch.tensor([0], dtype=torch.long))
        else:
            tmp = []
            for i in row[_col]:
                tmp.append(int(i))
            
            curr_features.append(torch.tensor(tmp, dtype=torch.long))
        
    features.append(curr_features)

In [None]:
features = []

for idx, row in train_df.iterrows():
    if idx % 1000 == 0:
        print(idx)
        print(datetime.datetime.now())
    
    curr_features = []
    
    for _col in feature_cols:
        if len(row[_col]) == 0:
            curr_features.append(torch.tensor([0], dtype=torch.long))
        else:
            tmp = []
            for i in row[_col]:
                tmp.append(int(i))
            
            curr_features.append(torch.tensor(tmp, dtype=torch.long))
        
    features.append(curr_features)

0
2023-06-21 14:16:03.919021
1000
2023-06-21 14:16:04.637345
2000
2023-06-21 14:16:05.531209
3000
2023-06-21 14:16:07.805612
4000
2023-06-21 14:16:20.223100
5000
2023-06-21 14:16:20.880082
6000
2023-06-21 14:16:21.636150
7000
2023-06-21 14:16:22.715235
8000
2023-06-21 14:16:23.952421
9000
2023-06-21 14:16:25.752009
10000
2023-06-21 14:16:39.828557
11000
2023-06-21 14:16:58.081680
12000
2023-06-21 14:17:17.344098
13000
2023-06-21 14:17:33.314061
14000
2023-06-21 14:17:50.356067
15000
2023-06-21 14:18:09.069149
16000
2023-06-21 14:18:25.455331
17000
2023-06-21 14:18:47.345555
18000
2023-06-21 14:19:15.165556
19000
2023-06-21 14:19:37.740834
20000
2023-06-21 14:19:57.507229
21000
2023-06-21 14:20:13.923012
22000
2023-06-21 14:20:28.007484
23000
2023-06-21 14:20:44.724634
24000
2023-06-21 14:21:01.809572
25000
2023-06-21 14:21:18.107988
26000
2023-06-21 14:21:34.998896
27000
2023-06-21 14:21:55.659191
28000
2023-06-21 14:22:13.462655
29000
2023-06-21 14:22:32.067111
30000
2023-06-21 14:22:

In [74]:
train_df.head()

Unnamed: 0,sample_id,t1,t2,t3,user_feat_1,user_feat_2,user_feat_3,user_feat_4,user_feat_5,user_feat_6,...,ad_feat_17,ad_feat_18,ad_feat_19,ad_feat_20,ad_feat_21,ad_feat_22,ad_feat_23,ad_feat_24,session_feat_25,session_feat_26
0,2508928549,-,0,-,[647],[160],[53969151],[53969152],[163],[164],...,"[17008585, 53969153, 215231, 2701633, 215228, ...","[5443472, 48817, 53969156, 2357473, 22149274, ...",[25877104],"[3282, 39155, 676, 111764, 341173, 225674]",[224],"[53969157, 53969158, 1975821, 53969159, 486020...",[25877111],[15224],[],"[312, 60, 152, 442, 510, 185, 186, 155, 2699, ..."
1,64787212,-,0,-,[258],[259],[53969162],[53969163],[163],[164],...,"[30990708, 40077635, 30556615, 53969164, 50916...",[],[42168349],"[47819, 629, 401805]",[179],"[51405889, 53969166, 53969167, 53969168, 53969...",[42168350],[2696],[311],"[150, 151, 182, 183, 184, 185, 154, 155, 188, ..."
2,3102011900,0,-,-,[906],[975],[30111173],[5335],[317],[318],...,"[742548, 40461, 94199, 44785472, 539079, 19679...","[898426, 108899, 119236, 30111175, 89088, 2618...",[19349153],"[291, 961, 4845, 92, 30690, 91288]","[337, 1570]","[53969175, 25632633, 24487436, 53969176, 24487...",[19349171],[2043],[2185],"[700, 60, 61, 183, 443, 1032, 65, 2046, 4108, ..."
3,2227614234,-,0,-,[647],[160],[53969184],[53969185],[163],[164],...,"[858528, 516152, 891974, 29232660, 34520906, 3...",[],[732455],"[291, 2394, 48449, 91, 1661]",[298],"[24695866, 53969187, 24695864, 45471879, 53969...",[732466],[15343],[],"[59, 151, 182, 183, 184, 185, 154, 155, 188, 1..."
4,1825065264,-,-,0,[1],[2],[53969193],[51433330],[230],[102],...,"[23262256, 7336975, 34140545, 17933073, 118682...",[],[24899422],"[23775, 37500, 25700]",[46],"[53969197, 53969198, 53969199, 53969200, 53969...",[53969207],[394582],[5952],"[10164, 4105, 182, 343, 184, 185, 65, 257, 269..."


In [70]:
ls = [[[1],[2],[3]],[[4],[5],[6,7,4]]]

tensor_ls = []

for e1 in ls:
    curr = []
    
    for e2 in e1:
        curr.append(torch.tensor(e2))
        
    tensor_ls.append(curr)

In [71]:
tensor_ls

[[tensor([1]), tensor([2]), tensor([3])],
 [tensor([4]), tensor([5]), tensor([6, 7, 4])]]

In [72]:
emb_ls = list(map(emb, tensor_ls[1]))

emb_ls

[tensor([[-0.9366, -0.8541, -0.9004,  0.6696, -1.4345]], dtype=torch.float64,
        grad_fn=<EmbeddingBackward0>),
 tensor([[-1.8182, -0.2008,  0.1664, -0.0886,  0.2453]], dtype=torch.float64,
        grad_fn=<EmbeddingBackward0>),
 tensor([[-1.8054, -2.6050, -0.6814,  1.1065,  0.2614],
         [-1.5546, -1.0629,  0.9547,  0.9318,  0.5174],
         [-0.9366, -0.8541, -0.9004,  0.6696, -1.4345]], dtype=torch.float64,
        grad_fn=<EmbeddingBackward0>)]

In [73]:
list(map(sum_pooling, emb_ls))

[tensor([-0.9366, -0.8541, -0.9004,  0.6696, -1.4345], dtype=torch.float64,
        grad_fn=<SumBackward1>),
 tensor([-1.8182, -0.2008,  0.1664, -0.0886,  0.2453], dtype=torch.float64,
        grad_fn=<SumBackward1>),
 tensor([-4.2966, -4.5220, -0.6271,  2.7079, -0.6556], dtype=torch.float64,
        grad_fn=<SumBackward1>)]

In [44]:
train_df.head(3)

Unnamed: 0,sample_id,t1,t2,t3,user_feat_1,user_feat_2,user_feat_3,user_feat_4,user_feat_5,user_feat_6,...,ad_feat_17,ad_feat_18,ad_feat_19,ad_feat_20,ad_feat_21,ad_feat_22,ad_feat_23,ad_feat_24,session_feat_25,session_feat_26
0,2508928549,-,0,-,[647],[160],[53969151],[53969152],[163],[164],...,"[17008585, 53969153, 215231, 2701633, 215228, ...","[5443472, 48817, 53969156, 2357473, 22149274, ...",[25877104],"[3282, 39155, 676, 111764, 341173, 225674]",[224],"[53969157, 53969158, 1975821, 53969159, 486020...",[25877111],[15224],[],"[312, 60, 152, 442, 510, 185, 186, 155, 2699, ..."
1,64787212,-,0,-,[258],[259],[53969162],[53969163],[163],[164],...,"[30990708, 40077635, 30556615, 53969164, 50916...",[],[42168349],"[47819, 629, 401805]",[179],"[51405889, 53969166, 53969167, 53969168, 53969...",[42168350],[2696],[311],"[150, 151, 182, 183, 184, 185, 154, 155, 188, ..."
2,3102011900,0,-,-,[906],[975],[30111173],[5335],[317],[318],...,"[742548, 40461, 94199, 44785472, 539079, 19679...","[898426, 108899, 119236, 30111175, 89088, 2618...",[19349153],"[291, 961, 4845, 92, 30690, 91288]","[337, 1570]","[53969175, 25632633, 24487436, 53969176, 24487...",[19349171],[2043],[2185],"[700, 60, 61, 183, 443, 1032, 65, 2046, 4108, ..."


In [None]:
clean_test = joblib.load("../data/data204194/test_data/clean_test.pkl")

clean_test.head()

In [None]:
feature_cols = list(clean_test.columns[4:])
unique_elements = {}


clean_test.apply(lambda x: fill_unique_elements(x, feature_cols, unique_elements), axis=1)    

In [None]:
# single_element_cols = [
#     'user_feat_1',
#     'user_feat_11',
#     'user_feat_12',
#     'user_feat_13',
#     'user_feat_2',
#     'user_feat_3',
#     'user_feat_4',
#     'user_feat_5',
#     'user_feat_6',
#     'user_feat_7',
#     'user_feat_8',
#     'user_feat_9',
#     'scene_feat_14',
#     'scene_feat_15',
#     'scene_feat_16',
#     'ad_feat_19',
#     'ad_feat_21',
#     'ad_feat_23',
#     'ad_feat_24'
# ]

# for _col in single_element_cols:
#     clean_test[_col] = clean_test[_col].map(lambda x: x[0] if len(x) != 0 else "-1")