In [None]:
!pip install -q ../input/faiss-163/faiss_gpu-1.6.3-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import os
import gc
import cv2
import math
import copy
import time
import random

import cudf
import cupy
import pandas as pd
import numpy as np
import xgboost as xgb
import numpy as np
import pandas as pd
from tqdm import tqdm

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# For Transformer Models
import transformers
from transformers import AutoTokenizer, AutoModel, AdamW, AutoConfig

from cuml.neighbors import NearestNeighbors 
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.metrics import pairwise_distances

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold


In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    # Random seed used by set_seed().
    seed = 42
    # Number of nearest-neighbour candidates retrieved per row.
    n_candidates = 50
    # Number of cross-validation folds.
    n_splits = 4
    # Maximum token length passed to the tokenizer.
    max_length = 64
    # Torch device used by the models (GPU).
    device = torch.device('cuda')

    # Directories holding the pre-computed BERT embedding pickles.
    reuse_dir1 = "../input/bert_embed_1/"
    reuse_dir2 = "../input/bert_embed_2/"
    reuse_dir3 = "../input/bert_embed_3/"
    reuse_dir4 = "../input/bert_embed_4/"

    # ArcFace (metric loss) parameters.
    # Feature re-scale factor.
    s = 30.0
    # Angular margin.
    m = 0.5
    # Label-smoothing epsilon.
    ls_eps = 0.0
    # Whether to use the "easy margin" variant.
    easy_margin = False
    # Number of classes fed to the ArcFace head.
    n_classes = 739972

In [None]:
import pickle

def save_pickle(data, file_name):
    """Pickle ``data`` to ``<file_name>.pickle``.

    Args:
        data: Any picklable object.
        file_name: Target path without the ``.pickle`` extension.
    """
    target_path = f"{file_name}.pickle"
    with open(target_path, "wb") as handle:
        pickle.dump(data, handle)


def load_pickle(file_name):
    """Load and return the object stored in ``<file_name>.pickle``.

    Args:
        file_name: Source path without the ``.pickle`` extension.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    source_path = f"{file_name}.pickle"
    with open(source_path, "rb") as handle:
        return pickle.load(handle)

In [None]:

def set_seed(seed=42):
    '''
    Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode so repeated runs
    produce the same results.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # The stdlib generator was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. This only affects Python processes
    # started after this point, not the hashing of the current interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed all RNGs once at notebook start-up using the configured seed.
set_seed(CFG.seed)


In [None]:
# Arcface
class ArcMarginProduct(nn.Module):
    r"""ArcFace head: additive angular margin applied to cosine logits.

    For the ground-truth class the logit is ``s * cos(theta + m)``; for all
    other classes it is ``s * cos(theta)``, where ``theta`` is the angle
    between the L2-normalised input and the L2-normalised class weight.

    Args:
        in_features: size of each input sample
        out_features: number of classes (size of each output sample)
        s: scale applied to the final logits
        m: angular margin in radians
        easy_margin: if True, apply the margin only where ``cos(theta) > 0``
        ls_eps: label-smoothing epsilon applied to the one-hot targets
    """
    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features    # input dimension
        self.out_features = out_features  # output dimension (number of classes)
        self.s = s                        # re-scale factor
        self.m = m                        # margin
        self.ls_eps = ls_eps              # label smoothing
        # Class-centre weights, one row per class.
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.threshold = math.cos(math.pi - m)  # cos(pi - m) = -cos(m)
        self.mm = math.sin(math.pi - m) * m     # sin(pi - m) * m = sin(m) * m

    def forward(self, input, label):
        """Return scaled margin logits of shape ``(batch, out_features)``.

        Args:
            input: feature batch of shape ``(batch, in_features)``.
            label: integer class index per sample, ``batch`` elements.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))  # cos(theta)
        # sin(theta); clamp first because rounding can push cos^2 slightly above 1,
        # which would make sqrt return NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        phi = phi.float()
        cosine = cosine.float()
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            # Keep theta + m inside [0, pi]:
            # if cos(theta) > cos(pi - m) then theta + m < pi and phi = cos(theta + m);
            # otherwise fall back to the linear surrogate cos(theta) - m * sin(m).
            # https://github.com/ronghuaiyang/arcface-pytorch/issues/48
            phi = torch.where(cosine > self.threshold, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------
        # e.g. batch of 2 with 3 classes: labels [1, 2] -> [[0, 1, 0], [0, 0, 1]].
        # Allocated like `cosine` so it always lives on the same device as the input
        # instead of a globally configured one.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the target class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        # Re-scale.
        output *= self.s

        return output

class FourSquareDataset(Dataset):
    """Torch dataset yielding tokenised text plus location features per row.

    Args:
        df: frame exposing ``fulltext``, ``latitude``, ``longitude``,
            ``coord_x``, ``coord_y``, ``coord_z`` and ``point_of_interest``.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: pad/truncate length handed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Column values are cached as arrays for index-based access.
        self.fulltext = df['fulltext'].values
        self.latitudes = df['latitude'].values
        self.longitudes = df['longitude'].values
        self.coord_x = df['coord_x'].values
        self.coord_y = df['coord_y'].values
        self.coord_z = df['coord_z'].values
        self.labels = df['point_of_interest'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.fulltext)

    def __getitem__(self, index):
        """Return the model inputs for row ``index`` as a dict of tensors."""
        encoded = self.tokenizer(
            self.fulltext[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; take the single row.
        sample = {
            'ids': encoded['input_ids'][0],
            'mask': encoded['attention_mask'][0],
        }
        sample['latitude'] = torch.tensor(self.latitudes[index], dtype=torch.float)
        sample['longitude'] = torch.tensor(self.longitudes[index], dtype=torch.float)
        # Coordinates keep whatever dtype the source column carries.
        sample['coord_x'] = torch.tensor(self.coord_x[index])
        sample['coord_y'] = torch.tensor(self.coord_y[index])
        sample['coord_z'] = torch.tensor(self.coord_z[index])
        sample['label'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample
    

class FSMultiModalNet(nn.Module):
    """Text encoder fused with 3-D coordinates, topped by an ArcFace head.

    Args:
        model_name: Hugging Face model identifier for the encoder.
        fc_dim: dimension of the fused embedding.
        num_features: number of extra numeric features appended to the
            pooled text vector (the x/y/z coordinates).
    """

    def __init__(self, model_name, fc_dim, num_features=3):
        super().__init__()
        # Pretrained encoder and its configuration.
        self.config = AutoConfig.from_pretrained(model_name)
        self.bert_model = AutoModel.from_pretrained(model_name, config=self.config)

        # Projection of [pooled text | extra features] to fc_dim, then batch norm.
        fused_dim = self.bert_model.config.hidden_size + num_features
        self.fc = nn.Linear(fused_dim, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()

        # ArcFace classification head configured from CFG.
        self.margin = ArcMarginProduct(
            fc_dim,
            CFG.n_classes,
            s=CFG.s,
            m=CFG.m,
            easy_margin=CFG.easy_margin,
            ls_eps=CFG.ls_eps,
        )

    def _init_params(self):
        """Initialise the projection layer and the batch-norm affine terms."""
        nn.init.xavier_normal_(self.fc.weight)
        for tensor, value in ((self.fc.bias, 0), (self.bn.weight, 1), (self.bn.bias, 0)):
            nn.init.constant_(tensor, value)

    def forward(self, ids, mask, lat, lon, coord_x, coord_y, coord_z, labels):
        """Return ArcFace logits for a batch."""
        embedding = self.extract_feature(ids, mask, lat, lon, coord_x, coord_y, coord_z)
        return self.margin(embedding, labels)

    def extract_feature(self, input_ids, attention_mask, lat, lon, coord_x, coord_y, coord_z):
        """Return the fused embedding; ``lat`` and ``lon`` are accepted but not used."""
        encoder_out = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)

        # Mean-pool the token states over positions where attention_mask is set.
        token_mask = attention_mask.unsqueeze(-1)
        token_sum = torch.sum(encoder_out.last_hidden_state * token_mask, dim=1)
        token_count = attention_mask.sum(dim=1, keepdims=True)
        pooled = token_sum / token_count

        # Append the three coordinate features as extra columns.
        coord_columns = [c.view(-1, 1) for c in (coord_x, coord_y, coord_z)]
        fused = torch.cat([pooled, *coord_columns], axis=1)

        return self.bn(self.fc(fused))

In [None]:
df = cudf.read_csv("../input/trainfilled/train_filled.csv") # training set with missing values already imputed

for col in ["name", "address", "city", "state", "zip", "country", "url", "phone", "categories"]:
    df[col] = df[col].fillna("") # replace remaining nulls in text columns with ""

# fulltext = name, address, city, state, country and categories joined by spaces,
# with runs of whitespace collapsed to a single space
df["fulltext"] = (
    df["name"] + " " + df["address"] + " " + df["city"] + " " + df["state"] + " "  + df["country"] + " " + df["categories"]
).to_pandas().replace(r'\s+', ' ', regex=True)

# Standardization of coordinates.
# https://datascience.stackexchange.com/questions/13567/ways-to-deal-with-longitude-latitude-feature
# NOTE(review): latitude/longitude are passed to cos/sin as-is. The spatial
# candidate search applies np.deg2rad to these same columns, which suggests
# they are in degrees -- confirm whether a radian conversion is intended here
# (the pre-computed embeddings may have been built with this exact formula).
df["coord_x"] = cupy.cos(df["latitude"]) * cupy.cos(df["longitude"]) # x component
df["coord_y"] = cupy.cos(df["latitude"]) * cupy.sin(df["longitude"]) # y component
df["coord_z"] = cupy.sin(df["latitude"]) # z component

print(df.shape)
df.head()

In [None]:
encoder = LabelEncoder() # maps point_of_interest values to integer class ids
df['point_of_interest'] = encoder.fit_transform(df['point_of_interest'].to_array()) # overwrite the column with the encoded labels

# Embedding

In [None]:
# Load the four pre-computed embedding matrices (one pickle per reuse directory)
# and print each shape as a sanity check.
V_embed_bert1 = load_pickle(f"{CFG.reuse_dir1}/V_embed_bert") # embedding set 1
print(V_embed_bert1.shape)

V_embed_bert2 = load_pickle(f"{CFG.reuse_dir2}/V_embed_bert") # embedding set 2
print(V_embed_bert2.shape)

V_embed_bert3 = load_pickle(f"{CFG.reuse_dir3}/V_embed_bert") # embedding set 3
print(V_embed_bert3.shape)

V_embed_bert4 = load_pickle(f"{CFG.reuse_dir4}/V_embed_bert") # embedding set 4
print(V_embed_bert4.shape)

In [None]:
# Concatenate the four embedding matrices along axis 1 (per-row feature axis).
V_embed_concat = cupy.concatenate([
    V_embed_bert1,
    V_embed_bert2,
    V_embed_bert3,
    V_embed_bert4,
], axis=1)

# Drop the individual matrices and release cached memory before continuing.
del V_embed_bert1,V_embed_bert2,V_embed_bert3,V_embed_bert4
gc.collect()
torch.cuda.empty_cache()

#V_embed_concat.shape
V_embed_concat = V_embed_concat / cupy.linalg.norm(V_embed_concat, ord=2, axis=1, keepdims=True) # L2-normalise each row

print(V_embed_concat.shape)
#del dataset, loader, model_bert_multi

# Generate Candidates

In [None]:
# Create candidate index country by country
import faiss # 导入faiss

def gen_candidate_ranks(df, V_embed, n_candidates):
    """Find the ``n_candidates`` highest inner-product neighbours of every row.

    Args:
        df: accepted for interface compatibility; not read by this function.
        V_embed: embedding matrix that is both indexed and queried.
        n_candidates: number of neighbours to return per row.

    Returns:
        Tuple ``(D, I)``: inner products clipped to ``[0, 1]`` and the
        corresponding neighbour indices, as returned by the faiss search.
    """
    embed_dim = V_embed.shape[1]
    gpu_resources = faiss.StandardGpuResources()
    # Flat (exhaustive) inner-product index, moved to GPU 0.
    gpu_index = faiss.index_cpu_to_gpu(gpu_resources, 0, faiss.IndexFlatIP(embed_dim))
    gpu_index.add(V_embed)
    similarities, neighbor_ids = gpu_index.search(V_embed, n_candidates)
    return np.clip(similarities, 0, 1), neighbor_ids


# Find, for every row, the similarities and indices of its n nearest neighbours
D_concat, I_concat = gen_candidate_ranks(df, cupy.asnumpy(V_embed_concat), CFG.n_candidates) 
np.save("D_concat.npy", D_concat) # persist the similarity matrix
np.save("I_concat.npy", I_concat) # persist the neighbour-index matrix

# Sample output recorded by the original author:
# D_concat.shape == (1138812, 50) # first column is each row's similarity to itself
# array([[0.99999994, 0.83693177, 0.34076163, ..., 0.22953738, 0.22911589,
#         0.22874717],
#        [1.        , 0.80496347, 0.41403773, ..., 0.23809814, 0.2379353 ,
#         0.23778985],
#        [0.99999934, 0.45569178, 0.35417274, ..., 0.18390407, 0.18360165,
#         0.18332995],
#        ...,
#        [0.9999999 , 0.98086345, 0.9773909 , ..., 0.9356541 , 0.93557185,
#         0.9349697 ],
#        [0.99999917, 0.6578855 , 0.3543492 , ..., 0.23836766, 0.23802386,
#         0.23755664],
#        [1.        , 0.99317944, 0.47047997, ..., 0.22384529, 0.22381721,
#         0.22305818]], dtype=float32)

# I_concat.shape == (1138812, 50) # first column is each row's own index
# array([[      0,  972683,  386676, ...,  199642,  655271,  925360],
#        [      1, 1032766,  665938, ...,   94111,   70948,   68941],
#        [      2,  657609,   85624, ...,  379627,  980143,  832934],
#        ...,
#        [1138809,  269067,  177162, ...,  614297,  381139,  974775],
#        [1138810,   20470,  126901, ...,  357689,  135823,  794488],
#        [1138811,  119981,  670304, ...,  201677,  931374,  137607]])


# DBA/QE weighted

In [None]:
# https://www.kaggle.com/code/lyakaap/2nd-place-solution/notebook
def query_expansion(V, D, I, alpha=3, k=2, chunk_size=100_000):
    '''
    Weighted query expansion (DBA/QE) of an embedding matrix.

    Each row of ``V`` is replaced by the weighted sum of the embeddings of its
    top-``k`` neighbours, with weights ``D[:, :k] ** alpha``.

    The expanded rows are written to a scratch buffer first and copied back
    into ``V`` only at the end, so every row is built from the *original*
    neighbour embeddings regardless of chunk order.

    Args:
        V: embedding matrix of shape (n_rows, dim); updated in place.
        D: similarity matrix of shape (n_rows, >=k) (host array).
        I: neighbour-index matrix of shape (n_rows, >=k).
        alpha: exponent applied to the similarities to form the weights.
        k: number of top neighbours to combine.
        chunk_size: number of rows processed per step, to bound peak memory.

    Returns:
        ``V`` itself, now holding the expanded (un-normalised) embeddings.
    '''
    # Per-row weights for the top-k neighbours, shape (n_rows, k, 1).
    weights = cupy.array(np.expand_dims(D[:, :k] ** alpha, axis=-1).astype(np.float32))
    expanded = cupy.empty_like(V)
    # Iterate over the rows of V itself rather than an unrelated global frame.
    for start in range(0, len(V), chunk_size):
        stop = start + chunk_size
        # V[I[start:stop, :k]] has shape (chunk, k, dim); weight and sum over k.
        expanded[start:stop] = (V[I[start:stop, :k]] * weights[start:stop]).sum(axis=1)
    # Preserve the original in-place contract for callers holding a reference to V.
    V[...] = expanded
    return V


# Query expansion: replace each embedding by the similarity-weighted sum of its
# top neighbours' embeddings, then re-normalise every row to unit L2 norm.
V_embed_concat = query_expansion(V_embed_concat, D_concat, I_concat)
# Use the CuPy norm explicitly (as in the earlier normalisation) instead of
# relying on NumPy dispatching to CuPy for a GPU array.
V_embed_concat /= cupy.linalg.norm(V_embed_concat, ord=2, axis=1, keepdims=True)

del D_concat,I_concat # drop the pre-expansion neighbour matrices
gc.collect() # release host memory
torch.cuda.empty_cache() # release cached GPU memory held by torch

# Re-run the neighbour search on the expanded embeddings and persist the results.
D_concat, I_concat = gen_candidate_ranks(df, cupy.asnumpy(V_embed_concat), CFG.n_candidates)
np.save("D_concat_dbaqe.npy", D_concat)
np.save("I_concat_dbaqe.npy", I_concat)

# Spatial candidates

In [None]:
from sklearn.neighbors import BallTree
def gen_candidate_rank_spatial(df, n_candidates):
    """Per-country nearest neighbours by haversine distance on latitude/longitude.

    Adds ``latitude_rad`` and ``longitude_rad`` columns to ``df`` as a side effect.

    Args:
        df: frame with ``country``, ``latitude`` and ``longitude`` columns.
        n_candidates: number of neighbour slots per row.

    Returns:
        I (np.ndarray): ``I[s_index][r]`` is the row index of the r-th nearest
        point to ``s_index`` within the same country. Countries with fewer
        than ``n_candidates`` rows have the remaining slots filled with the
        index of that country's first row.
    """
    for column in ("latitude", "longitude"):
        df[f'{column}_rad'] = np.deg2rad(df[column].values)  # degrees -> radians

    I = np.full((len(df), n_candidates), -1, dtype=np.int32)
    per_country = df[["country", "latitude_rad", "longitude_rad"]].to_pandas().groupby("country")
    for country, country_df in tqdm(per_country):
        # A country cannot supply more neighbours than it has rows.
        clip_n_candidates = min(len(country_df), n_candidates)
        # After reset_index the "index" column holds the original row positions.
        country_df = country_df.reset_index()
        coords_rad = country_df[["latitude_rad", "longitude_rad"]].values
        tree = BallTree(coords_rad, metric='haversine')
        _, local_idx = tree.query(coords_rad, k=clip_n_candidates)

        # Pad with local index 0 up to n_candidates columns.
        padding = np.zeros((len(local_idx), n_candidates - clip_n_candidates), dtype=np.int32)
        local_idx = np.concatenate([local_idx, padding], axis=1)

        # Translate country-local positions back to global row indices in one shot.
        global_idx = country_df["index"].values
        I[global_idx] = global_idx[local_idx]

    return I


# Spatial candidates: 10 haversine-nearest rows within the same country, one row of indices per sample.
I_spatial = gen_candidate_rank_spatial(df, 10) # original note: shape == (1138812, 10)

# TF-IDF

In [None]:
tfidf = TfidfVectorizer(stop_words='english')  # TF-IDF vectoriser with English stop words removed
V_name = tfidf.fit_transform(df["name"])  # TF-IDF matrix of the name column
V_name.shape

In [None]:
tfidf = TfidfVectorizer(stop_words='english') # fresh vectoriser for the address text
df["full_address"] = df["address"] + ", " + df["city"] + ", " + df["state"] + ", "  + df["country"] # "address, city, state, country"
V_full_address = tfidf.fit_transform(df["full_address"]) # TF-IDF matrix of full_address
V_full_address.shape 

In [None]:
tfidf = TfidfVectorizer(stop_words='english') # fresh vectoriser for the category text
# NOTE(review): categories was already filled with "" when the frame was loaded,
# so this fillna("nocategory") presumably never fires -- confirm intent.
V_cat = tfidf.fit_transform(df["categories"].fillna("nocategory")) # TF-IDF matrix of categories
V_cat.shape

# Train & Test

In [None]:
R = 6371.0  # Earth radius in kilometres


def manhattan(lat1, long1, lat2, long2):
    '''
    Manhattan (L1) distance between two coordinate pairs.

    Works element-wise on scalars or NumPy arrays and returns
    ``|lat2 - lat1| + |long2 - long1|`` in the units of the inputs.
    '''
    return np.abs(lat2 - lat1) + np.abs(long2 - long1)


def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in kilometres between two points on Earth.

    https://www.kaggle.com/code/justfor/speedup-haversine/script

    Args:
        lon1, lat1: longitude/latitude of the first point(s), decimal degrees.
        lon2, lat2: longitude/latitude of the second point(s), decimal degrees.

    Returns:
        Distance(s) in km; scalars and NumPy arrays are handled element-wise.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1  # longitude difference
    dlat = lat2 - lat1  # latitude difference

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make arcsin return NaN; clip it back into range.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    km = R * c
    return km


def create_features(df, i, indices):
    '''
    Build pairwise features between every row and its rank-``i`` candidate.

    Adds the feature columns to ``df`` in place and returns it together with
    the list of feature column names.

    Args:
        df: frame that already contains the raw columns plus the
            ``hit_count_*`` columns read below.
        i: candidate rank (column of ``indices``) to featurise.
        indices: (n_rows, n_candidates) matrix of candidate row indices.

    Returns:
        Tuple ``(df, features)``.

    Reads the notebook globals ``V_name``, ``V_full_address``, ``V_cat`` and
    ``V_embed_concat``.
    '''
    prev_i = max(i-1, 0) # previous rank, clamped at 0
    next_i = min(i+1, indices.shape[1] - 1) # next rank, clamped at the last column
    
    prev_cand_index = indices[:, prev_i] # row indices of the rank i-1 candidates
    next_cand_index = indices[:, next_i] # row indices of the rank i+1 candidates
    cand_index = indices[:, i] # row indices of the rank i candidates
    
    lon1 = df["longitude"].to_pandas().to_numpy() # longitude of each row
    lat1 = df["latitude"].to_pandas().to_numpy() # latitude of each row
    lon2 = df["longitude"][cand_index].to_pandas().to_numpy() # longitude of each row's candidate
    lat2 = df["latitude"][cand_index].to_pandas().to_numpy() # latitude of each row's candidate
    #df["diff_lon"] = lon1 - lon2
    #df["diff_lat"] = lat1 - lat2
    df["diff_lon"] = np.abs(lon1 - lon2) # absolute longitude difference
    df["diff_lat"] = np.abs(lat1 - lat2) # absolute latitude difference
    
    df["lonlat_eucdist"] =  (df['diff_lon'] ** 2 + df['diff_lat'] ** 2) ** 0.5 # Euclidean distance in lon/lat space
    df["lonlat_manhattan"] = manhattan(lat1, lon1, lat2, lon2) # Manhattan distance in lon/lat space
    df["lonlat_haversine_dist"] = haversine_np(lon1, lat1, lon2, lat2) # great-circle distance
    
    # Row-wise dot products of TF-IDF rows (cosine similarity assuming the rows are L2-normalised).
    df["name_cossim"] = V_name.multiply(V_name[cand_index]).sum(axis=1).ravel() # name similarity
    df["full_address_cossim"] = V_full_address.multiply(V_full_address[cand_index]).sum(axis=1).ravel() # address similarity
    df["cat_cossim"] = V_cat.multiply(V_cat[cand_index]).sum(axis=1).ravel() # category similarity
    
    # hit_count_* columns of the candidate row (accumulated counts/sums of earlier predictions).
    df["cand_hit_count_02"] = df["hit_count_02"][cand_index].to_pandas().to_numpy()   # candidate's hit_count_02
    df["cand_hit_count_03"] = df["hit_count_03"][cand_index].to_pandas().to_numpy()   # candidate's hit_count_03
    df["cand_hit_count_04"] = df["hit_count_04"][cand_index].to_pandas().to_numpy()   # candidate's hit_count_04
    df["cand_hit_count_05"] = df["hit_count_05"][cand_index].to_pandas().to_numpy()   # candidate's hit_count_05
    df["cand_hit_count_sum"] = df["hit_count_sum"][cand_index].to_pandas().to_numpy() # candidate's hit_count_sum
    
    # Row-wise minimum of the row's own value and its candidate's value.
    df["hit_count_02_min"] = df[["hit_count_02", "cand_hit_count_02"]].min(axis=1)
    df["hit_count_03_min"] = df[["hit_count_03", "cand_hit_count_03"]].min(axis=1)
    df["hit_count_04_min"] = df[["hit_count_04", "cand_hit_count_04"]].min(axis=1)
    df["hit_count_05_min"] = df[["hit_count_05", "cand_hit_count_05"]].min(axis=1)
    df["hit_count_sum_min"] = df[["hit_count_sum", "cand_hit_count_sum"]].min(axis=1)
    
    # Row-wise maximum of the row's own value and its candidate's value.
    df["hit_count_02_max"] = df[["hit_count_02", "cand_hit_count_02"]].max(axis=1)
    df["hit_count_03_max"] = df[["hit_count_03", "cand_hit_count_03"]].max(axis=1)
    df["hit_count_04_max"] = df[["hit_count_04", "cand_hit_count_04"]].max(axis=1)
    df["hit_count_05_max"] = df[["hit_count_05", "cand_hit_count_05"]].max(axis=1)
    df["hit_count_sum_max"] = df[["hit_count_sum", "cand_hit_count_sum"]].max(axis=1)
    
    cossim = []
    eucdist = []
    
    eucdist1=[]
    eucdist2=[]
    eucdist3=[]
    eucdist4=[]
    eucdist5=[]
    
    chunk_size = 100_000 # rows processed per step
    # NOTE: this loop reuses the name `i`, shadowing the candidate-rank parameter;
    # the parameter is not read again after this point.
    for i in range(0, len(df), chunk_size):  # process in chunks to avoid running out of memory
        cossim.append(cupy.multiply(V_embed_concat[i:i+chunk_size], V_embed_concat[cand_index[i:i+chunk_size]]).sum(axis=1)) # dot product: row vs candidate
        eucdist.append(cupy.sqrt(((V_embed_concat[i:i+chunk_size] - V_embed_concat[cand_index[i:i+chunk_size]]) ** 2).sum(axis=1))) # Euclidean: row vs candidate
        
        eucdist1.append(cupy.sqrt(((V_embed_concat[prev_cand_index[i:i+chunk_size]] - V_embed_concat[cand_index[i:i+chunk_size]]) ** 2).sum(axis=1))) # Euclidean: previous candidate vs candidate
        eucdist2.append(cupy.sqrt(((V_embed_concat[next_cand_index[i:i+chunk_size]] - V_embed_concat[cand_index[i:i+chunk_size]]) ** 2).sum(axis=1))) # Euclidean: next candidate vs candidate
        eucdist3.append(cupy.sqrt(((V_embed_concat[i:i+chunk_size]                  - V_embed_concat[prev_cand_index[i:i+chunk_size]]) ** 2).sum(axis=1))) # Euclidean: row vs previous candidate
        eucdist4.append(cupy.sqrt(((V_embed_concat[i:i+chunk_size]                  - V_embed_concat[next_cand_index[i:i+chunk_size]]) ** 2).sum(axis=1))) # Euclidean: row vs next candidate
        eucdist5.append(cupy.sqrt(((V_embed_concat[prev_cand_index[i:i+chunk_size]] - V_embed_concat[next_cand_index[i:i+chunk_size]]) ** 2).sum(axis=1))) # Euclidean: previous vs next candidate
    
    # Stitch the per-chunk results together and store them as columns
    df["embed_cossim"] = cupy.concatenate(cossim) 
    df["embed_eucdist"] = cupy.concatenate(eucdist)
    df["embed_eucdist1"] = cupy.concatenate(eucdist1)
    df["embed_eucdist2"] = cupy.concatenate(eucdist2)
    df["embed_eucdist3"] = cupy.concatenate(eucdist3)
    df["embed_eucdist4"] = cupy.concatenate(eucdist4)
    df["embed_eucdist5"] = cupy.concatenate(eucdist5)
    
    # Differences between the row-vs-candidate distance and the other five distances
    df["d0_d1"] = df["embed_eucdist"] - df["embed_eucdist1"] 
    df["d0_d2"] = df["embed_eucdist"] - df["embed_eucdist2"]
    df["d0_d3"] = df["embed_eucdist"] - df["embed_eucdist3"]
    df["d0_d4"] = df["embed_eucdist"] - df["embed_eucdist4"]
    df["d0_d5"] = df["embed_eucdist"] - df["embed_eucdist5"]
    
    for col in ["id", "name", "address", "city", "state", "zip", "country", "url", "phone", "categories", "full_address"]:
        df[f"{col}_edit_dist"] = df[col].str.edit_distance(df[col][cand_index]) # edit (Levenshtein) distance between row and candidate strings
        # Normalise by the length of the row's own string
        df[f"norm_{col}_edit_dist"] = df[f"{col}_edit_dist"] / df[col].str.len() # edit distance / string length
        df[f"norm_{col}_edit_dist"] = df[f"norm_{col}_edit_dist"].replace([np.inf, -np.inf], 0) # +/-inf (division by zero length) becomes 0
    
    # Columns handed to the model; the edit-distance columns are appended below.
    features = [
        "diff_lon",
        "diff_lat",
        "lonlat_eucdist",
        "lonlat_manhattan",
        "lonlat_haversine_dist",
        "name_cossim",
        "full_address_cossim",
        "cat_cossim",
        "embed_cossim",
        "embed_eucdist",
        
        "embed_eucdist1",
        "embed_eucdist2",
        "embed_eucdist3",
        "embed_eucdist4",
        "embed_eucdist5",
        "d0_d1",
        "d0_d2",
        "d0_d3",
        "d0_d4",
        "d0_d5",

        "hit_count_02",
        "hit_count_03",
        "hit_count_04",
        "hit_count_05",
        "hit_count_sum",
        
        "hit_count_02_min",
        "hit_count_03_min",
        "hit_count_04_min",
        "hit_count_05_min",
        "hit_count_sum_min",
        "hit_count_02_max",
        "hit_count_03_max",
        "hit_count_04_max",
        "hit_count_05_max",
        "hit_count_sum_max"
    ]
    
    for col in ["id", "name", "address", "city", "state", "zip", "country", "url", "phone", "categories", "full_address"]:
        features.append(f"{col}_edit_dist") # raw edit-distance feature
        features.append(f"norm_{col}_edit_dist") # length-normalised edit-distance feature
    return df, features

In [None]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every ``id`` to its ``point_of_interest``."""
    record_ids = input_df['id']
    pois = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(record_ids, pois)}


def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` to the set of ``id`` values that share it."""
    ids_grouped_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_grouped_by_poi.apply(set)
    return id_sets.to_dict()

id2poi = get_id2poi(df.to_pandas()) # lookup: id -> point_of_interest
poi2ids = get_poi2ids(df.to_pandas()) # lookup: point_of_interest -> set of ids


def id2target_size(id_str: str):
    """Return how many ids share the point_of_interest of ``id_str``."""
    poi = id2poi[id_str]
    ids_with_same_poi = poi2ids[poi]
    return len(ids_with_same_poi)


def get_score(input_df: pd.DataFrame):
    """Mean intersection-over-union between predicted and true match sets.

    Args:
        input_df: frame with an ``id`` column and a ``matches`` column holding
            whitespace-separated predicted ids.

    Returns:
        Mean of the per-row IoU scores.
    """
    def _iou(id_str, matches):
        # True ids sharing this row's POI vs. the predicted ids.
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        return len(targets & preds) / len(targets | preds)

    id_values = input_df['id'].to_numpy()
    match_values = input_df['matches'].to_numpy()
    scores = np.array([_iou(id_str, matches) for id_str, matches in zip(id_values, match_values)])
    return scores.mean()

In [None]:
THRESH = 0.4 # probability threshold: out-of-fold pairs with pred >= THRESH are kept as matches

In [None]:
from collections import defaultdict

def train(I, n_candidates, params, cand_type):
    '''
    Train XGBoost pair classifiers, one per candidate rank and CV fold.

    For every candidate rank (column of ``I``) the function builds pair
    features, trains one booster per GroupKFold fold (grouped by
    ``point_of_interest``), saves each booster to
    ``fs_xgb_model_{cand_type}_candidate{rank}_fold{fold}.json`` and collects
    the out-of-fold predictions. It also maintains running ``hit_count_*``
    columns on the global ``df`` that feed the features of later ranks.

    Args:
        I: (n_rows, n_ranks) matrix of candidate row indices.
        n_candidates: unused; the number of ranks is taken from ``I.shape[1]``.
        params: XGBoost training parameters.
        cand_type: tag embedded in the saved model file names.

    Returns:
        Tuple ``(models, oof, link)``:
        ``models`` maps candidate rank -> booster of the last fold trained for
        that rank (every fold's booster is persisted to disk);
        ``oof`` is a pandas frame of out-of-fold pairs with ``pred >= THRESH``;
        ``link`` maps id -> match_id -> smallest ``1 - pred`` seen for the pair.

    Reads and mutates the notebook-global ``df``.
    '''
    # Missing pairs default to a huge sentinel distance.
    link = defaultdict(lambda:defaultdict(lambda: 10**10))

    models = {}
    kf = GroupKFold(n_splits=CFG.n_splits)
    for fold_id, (trn_idx, val_idx) in enumerate(kf.split(df.to_pandas(), df["point_of_interest"].to_pandas(), df["point_of_interest"].to_pandas())):
        df.loc[val_idx, "fold"] = fold_id

    folds = df["fold"].to_pandas().to_numpy()

    # Reset the running prediction statistics.
    df["hit_count_02"] = 0
    df["hit_count_03"] = 0
    df["hit_count_04"] = 0
    df["hit_count_05"] = 0
    df["hit_count_sum"] = 0

    df["cand_hit_count_02"] = 0
    df["cand_hit_count_03"] = 0
    df["cand_hit_count_04"] = 0
    df["cand_hit_count_05"] = 0
    df["cand_hit_count_sum"] = 0

    # Size of each row's own POI group; identical for every candidate rank,
    # so compute it once instead of once per rank.
    id_target_size = df["id"].to_pandas().apply(id2target_size)

    oof = []
    # training
    for i in range(I.shape[1]):
        tmp_df = df.copy()
        # Positive pair when the row and its rank-i candidate share a POI.
        tmp_df["target"] = tmp_df["point_of_interest"] == tmp_df["point_of_interest"].to_pandas().values[I[:, i]]

        tmp_df["match_id"] = tmp_df["id"].to_pandas().values[I[:, i]]  # id of the rank-i candidate
        tmp_df["id_target_size"] = id_target_size
        tmp_df["match_id_target_size"] = tmp_df.match_id.to_pandas().apply(id2target_size)  # POI group size of the candidate
        # Pairs involving small POI groups receive a larger weight.
        tmp_df["sample_weight"] = (1 / tmp_df.id_target_size) + (1 / tmp_df.match_id_target_size)

        tmp_df, features = create_features(tmp_df, i, I)

        print(f"Candidate rank {i} target mean = {tmp_df['target'].mean()}")

        for fold in range(CFG.n_splits):
            print(f"== fold {fold} ==")
            train_idx = folds != fold
            test_idx = folds == fold

            X_train, y_train, train_weights = (
                tmp_df.loc[train_idx, features],
                tmp_df.loc[train_idx, "target"].astype(int),
                tmp_df.loc[train_idx, "sample_weight"],
            )
            X_test, y_test = (
                tmp_df.loc[test_idx, features],
                tmp_df.loc[test_idx, "target"].astype(int),
            )
            X_all, y_all = (
                tmp_df[features],
                tmp_df["target"].astype(int)
            )

            _oof = tmp_df.loc[test_idx, ["id", "point_of_interest", "match_id"]].to_pandas()

            dtrain = xgb.DMatrix(X_train, y_train, weight=train_weights)
            dtest = xgb.DMatrix(X_test, y_test)
            dall = xgb.DMatrix(X_all, y_all)

            xgb_model = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=5_000,
                evals=[(dtrain, "train"), (dtest, "test")],
                early_stopping_rounds=100,
                verbose_eval=500,  # log every 500 rounds
            )
            xgb_model.save_model(f"fs_xgb_model_{cand_type}_candidate{i}_fold{fold}.json")
            models[i] = xgb_model

            # best_iteration is 0-based, so the tree count is best_iteration + 1.
            # Passing best_iteration itself drops the best tree, and a value of 0
            # would be interpreted as "use all trees".
            best_ntree = xgb_model.best_iteration + 1
            _oof["pred"] = xgb_model.predict(dtest, ntree_limit=best_ntree)

            oof.append(_oof.query("pred >= @THRESH"))  # keep confident out-of-fold pairs
            all_pred = xgb_model.predict(dall, ntree_limit=best_ntree)  # predictions for every row

            ids = _oof.id.values
            match_ids = _oof.match_id.values
            pred_lis = _oof.pred.values
            # A separate index name keeps the candidate rank `i` intact for the
            # remaining folds (file names and `models` keys depend on it).
            for j in range(len(_oof)):
                dist = 1 - pred_lis[j]  # distance = 1 - predicted probability

                if dist > 0.9:  # ignore very unlikely pairs
                    continue
                current_min = min([link[ids[j]][match_ids[j]], link[match_ids[j]][ids[j]], dist])
                # Store the smallest distance symmetrically.
                link[match_ids[j]][ids[j]] = current_min
                link[ids[j]][match_ids[j]] = current_min

        # Update the running statistics with the full-data predictions of the
        # last fold's booster for this rank.
        df["hit_count_02"] += (all_pred > 0.2)
        df["hit_count_03"] += (all_pred > 0.3)
        df["hit_count_04"] += (all_pred > 0.4)
        df["hit_count_05"] += (all_pred > 0.5)
        df["hit_count_sum"] += all_pred

        gc.collect()
        torch.cuda.empty_cache()

    oof = pd.concat(oof, axis=0).reset_index(drop=True)
    print("Done training.")

    return models, oof, link


In [None]:
gc.collect() # release host memory
torch.cuda.empty_cache() # release cached GPU memory held by torch

# Train on the embedding-based candidates.
models_embed, oof_embed, link_embed = train(I_concat, CFG.n_candidates, params={
    "objective": "binary:logistic", # binary classification
    "tree_method": "gpu_hist", # GPU histogram tree construction
    "verbosity": 0, # silence XGBoost logging
    "learning_rate": 0.05, # learning rate
    "max_depth": 8, # maximum tree depth
    "min_child_weight": 0, # minimum sum of instance weight in a child
    "lambda": 1.0, # L2 regularisation
    "alpha": 0.1, # L1 regularisation
}, cand_type="embed")

# Train on the spatial candidates.
# NOTE(review): I_spatial was built with 10 candidates while CFG.n_candidates is
# passed here; train() takes the rank count from I.shape[1], so the argument
# has no effect -- confirm this is intended.
models_spatial, oof_spatial, link_spatial = train(I_spatial, CFG.n_candidates, params={
    "objective": "binary:logistic", # binary classification
    "tree_method": "gpu_hist", # GPU histogram tree construction
    "verbosity": 0, # silence XGBoost logging
    "learning_rate": 0.05, # learning rate
    "max_depth": 8, # maximum tree depth
    "min_child_weight": 0, # minimum sum of instance weight in a child
    "lambda": 1.0,  # L2 regularisation
    "alpha": 0.1,   # L1 regularisation
}, cand_type="spatial")

oof = pd.concat([oof_embed, oof_spatial], axis=0) # combine the out-of-fold matches of both models

# del df
# gc.collect()

In [None]:
# Score the out-of-fold matches of the embedding model, the spatial model and
# their union, in that order. Scores noted by the original author:
# embedding 0.997, spatial 0.877, combined 0.996.
for oof_part in (oof_embed, oof_spatial, oof):
    # One row per id with the list of matched ids, then a space-joined string of unique ids.
    tmp = oof_part.groupby("id")["match_id"].apply(list).reset_index()
    tmp["matches"] = tmp["match_id"].apply(lambda x: " ".join(set(x)))
    print(f"Score {get_score(tmp)}, THRESH {THRESH}")