In [None]:
import os
import gc
import cv2
import math
import copy
import time
import random

import cudf
import cupy
import pandas as pd
import numpy as np
import xgboost as xgb
import numpy as np
import pandas as pd
from tqdm import tqdm

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# For Transformer Models
import transformers
from transformers import AutoTokenizer, AutoModel, AdamW, AutoConfig

from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.metrics import pairwise_distances

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

from unidecode import unidecode


class CFG:
    """Notebook-wide settings shared by training and embedding extraction."""
    seed = 42 # random seed passed to set_seed
    device = torch.device('cuda') # all models and batches are moved to this device (GPU)

    epochs = 30 # number of training epochs
    lr = 5e-5 # learning rate given to the optimizer
    max_length = 64 # tokenizer max_length (texts are truncated / padded to this many tokens)

    # Arcface
    s = 30.0 # ArcFace scale applied to the logits
    m = 0.5  # ArcFace additive angular margin
    ls_eps = 0.0 # ArcFace label-smoothing epsilon (values <= 0 disable smoothing)
    easy_margin = False # whether the ArcFace head uses its easy-margin variant
    n_classes = 739972 # number of ArcFace output classes; presumably the count of distinct point_of_interest labels - TODO confirm
    
    
# Four model configurations (one per text encoder)
class CFG_Model1:
    """Paths and tokenizer for the xlm-roberta-large encoder."""
    # Directory of the pretrained backbone handed to from_pretrained.
    model_name = "../input/huggingface-roberta-variants/xlm-roberta-large/xlm-roberta-large" 
    # Fine-tuned state dict loaded when embeddings are extracted.
    weight = "../input/model1/xlm-roberta-large_epoch30.bin"
    # NOTE: the tokenizer is loaded as soon as this class body executes.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

class CFG_Model2:
    """Paths and tokenizer for the LaBSE encoder."""
    # Directory of the pretrained backbone handed to from_pretrained.
    model_name = "../input/sentence-transformers/LaBSE/0_Transformer"
    # Fine-tuned state dict loaded when embeddings are extracted.
    weight = "../input/model2/0_Transformer_epoch30.bin"
    # NOTE: the tokenizer is loaded as soon as this class body executes.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    
class CFG_Model3:
    """Paths and tokenizer for the paraphrase-multilingual-mpnet-base-v2 encoder."""
    # Directory of the pretrained backbone handed to from_pretrained.
    model_name = "../input/paraphrasemultilingualmpnetbasev2/paraphrase-multilingual-mpnet-base-v2"
    # Fine-tuned state dict loaded when embeddings are extracted.
    weight = "../input/model3/paraphrase-multilingual-mpnet-base-v2_epoch30.bin"
    # NOTE: the tokenizer is loaded as soon as this class body executes.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    
class CFG_Model4:
    """Paths and tokenizer for the RemBERT encoder."""
    # Directory of the pretrained backbone handed to from_pretrained.
    model_name = "../input/rembert-pt"
    # Fine-tuned state dict loaded when embeddings are extracted.
    weight = "../input/model4/rembert-pt_epoch30.bin"
    # NOTE: the tokenizer is loaded as soon as this class body executes.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the training table (train_filled.csv) into a cuDF frame with explicit per-column dtypes.
df_train = cudf.read_csv("../input/trainfilled/train_filled.csv", dtype={
    "id": str, "name": str, "latitude": float, "longitude": float, "address": str, 
    "city": str, "state": str, "zip": str, "country": str, "url": str, "phone": str, "categories": str, "point_of_interest": str
})


for col in ["name", "address", "city", "state", "zip", "country", "url", "phone", "categories"]:
    df_train[col] = df_train[col].fillna("") # replace missing text values with the empty string

# fulltext = name, address, city, state, country and categories joined with single spaces
df_train["fulltext"] = (
    df_train["name"] + " " + df_train["address"] + " " + df_train["city"] + " " + df_train["state"] + " "  + df_train["country"] + " " + df_train["categories"]
).to_pandas().replace(r'\s+', ' ', regex=True) # collapse every whitespace run into one space (done on the pandas copy)

# Standardization of coordinates.
# https://datascience.stackexchange.com/questions/13567/ways-to-deal-with-longitude-latitude-feature
# NOTE(review): cupy.cos / cupy.sin interpret their input as radians. latitude / longitude are
# presumably stored in degrees - TODO confirm. If so, these are not true unit-sphere coordinates.
# Do not change this in isolation: any checkpoint trained on these features expects the same transform.
df_train["coord_x"] = cupy.cos(df_train["latitude"]) * cupy.cos(df_train["longitude"]) # x component derived from latitude / longitude
df_train["coord_y"] = cupy.cos(df_train["latitude"]) * cupy.sin(df_train["longitude"]) # y component derived from latitude / longitude
df_train["coord_z"] = cupy.sin(df_train["latitude"]) # z component derived from latitude


encoder = LabelEncoder() # scikit-learn label encoder
# Replace the point_of_interest strings with integer class ids used as ArcFace targets.
# NOTE(review): Series.to_array() appears to be unavailable in newer cuDF releases - confirm the installed version.
df_train['point_of_interest'] = encoder.fit_transform(df_train['point_of_interest'].to_array())
                       
print(df_train.shape)
df_train.head()

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    switches cuDNN to deterministic mode and exports PYTHONHASHSEED so that
    repeated runs produce the same results.

    Args:
        seed: integer seed applied to all generators.
    '''
    # `random` is imported by the notebook, so it is seeded alongside NumPy and torch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every visible GPU; ignored without CUDA
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed (only affects Python processes started afterwards)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the notebook-wide seed from CFG before any model or data loader is created.
set_seed(CFG.seed)

## Load ArcFace model

In [None]:
class FourSquareDataset(Dataset):
    """Map-style dataset yielding one tokenized place record per index.

    Args:
        df: frame providing the columns fulltext, latitude, longitude,
            coord_x, coord_y, coord_z and point_of_interest.
        tokenizer: callable applied to the record's full text.
        max_length: length every text is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Keep each column as a plain array so indexing in __getitem__ is cheap.
        self.fulltext = df['fulltext'].values
        self.latitudes = df['latitude'].values
        self.longitudes = df['longitude'].values
        self.coord_x = df['coord_x'].values
        self.coord_y = df['coord_y'].values
        self.coord_z = df['coord_z'].values
        self.labels = df['point_of_interest'].values

    def __len__(self):
        """Number of records."""
        return len(self.fulltext)

    def __getitem__(self, index):
        """Return the token ids, attention mask, coordinates and label of one record."""
        encoded = self.tokenizer(
            self.fulltext[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors="pt"
        )

        # The tokenizer returns a batch of one; take its single row.
        sample = {
            'ids': encoded['input_ids'][0],
            'mask': encoded['attention_mask'][0],
        }
        # Raw latitude / longitude are stored as float tensors ...
        for key, column in (('latitude', self.latitudes), ('longitude', self.longitudes)):
            sample[key] = torch.tensor(column[index], dtype=torch.float)
        # ... while the derived coordinates keep whatever dtype torch infers.
        for key, column in (('coord_x', self.coord_x), ('coord_y', self.coord_y), ('coord_z', self.coord_z)):
            sample[key] = torch.tensor(column[index])
        sample['label'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

In [None]:
# ArcFace
class ArcMarginProduct(nn.Module):
    r"""Additive angular margin (ArcFace) classification head.

    Produces scaled cosine logits in which the target class uses
    cos(theta + m) instead of cos(theta).

    Args:
        in_features: size of each input sample
        out_features: size of each output sample (number of classes)
        s: scale applied to the logits
        m: additive angular margin
        easy_margin: if True, apply the margin only where cos(theta) > 0
        ls_eps: label-smoothing epsilon (values <= 0 disable smoothing)
    """
    def __init__(self, in_features, out_features, s=30.0, 
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features  # input dimension
        self.out_features = out_features # number of classes
        self.s = s # re-scale
        self.m = m # margin
        self.ls_eps = ls_eps  # label smoothing
        # One weight row (class centre) per class
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin # easy-margin mode
        self.cos_m = math.cos(m) # cos(m)
        self.sin_m = math.sin(m) # sin(m)
        self.threshold = math.cos(math.pi - m) # cos(pi - m) = -cos(m)
        self.mm = math.sin(math.pi - m) * m # sin(pi - m) * m = sin(m) * m

    def forward(self, input, label):
        """Return the scaled margin logits.

        Args:
            input: feature batch of shape (batch, in_features).
            label: integer class ids, one per row of ``input``, on the same device.

        Returns:
            Tensor of shape (batch, out_features).
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight)) # cos(theta) for every class
        # Rounding can push cosine^2 marginally above 1; clamp so sqrt never sees a
        # negative value (which would produce NaN logits).
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0.0, 1.0)) # sin(theta)
        phi = cosine * self.cos_m - sine * self.sin_m # cos(theta)cos(m) - sin(theta)sin(m) = cos(theta + m)
        phi = phi.float() # phi to float
        cosine = cosine.float() # cosine to float
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            # Keep theta + m inside [0, pi]:
            # if cos(theta) > cos(pi - m) then theta + m < pi, so phi = cos(theta + m);
            # otherwise use the linear fallback cos(theta) - m * sin(pi - m).
            phi = torch.where(cosine > self.threshold, phi, cosine - self.mm) # https://github.com/ronghuaiyang/arcface-pytorch/issues/48
        # --------------------------- convert label to one-hot ---------------------
        # e.g. batch of 2 with 3 classes: labels [1, 2] -> [[0, 1, 0], [0, 0, 1]].
        # Allocate next to the logits instead of on a global device so the head
        # works wherever the module lives (CPU or any GPU).
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Target column takes phi, every other column keeps cosine
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        # re-scale
        output *= self.s

        return output

class FSMultiModalNet(nn.Module):
    """Text encoder fused with coordinate features and an ArcFace head.

    Args:
        model_name: name or path given to AutoConfig / AutoModel.
        fc_dim: size of the embedding produced by ``extract_feature``.
        num_features: number of extra scalar features appended to the pooled
            text vector (coord_x, coord_y, coord_z by default).
    """

    def __init__(self, model_name, fc_dim, num_features=3):
        super().__init__()
        # Pretrained transformer backbone
        self.config = AutoConfig.from_pretrained(model_name)
        self.bert_model = AutoModel.from_pretrained(model_name, config=self.config)

        # Projection of [pooled text | extra features] to the embedding size
        fused_dim = self.bert_model.config.hidden_size + num_features
        self.fc = nn.Linear(fused_dim, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()

        # ArcFace head configured from the notebook-wide CFG
        self.margin = ArcMarginProduct(
            fc_dim,
            CFG.n_classes,
            s=CFG.s,
            m=CFG.m,
            easy_margin=CFG.easy_margin,
            ls_eps=CFG.ls_eps,
        )

    def _init_params(self):
        """Xavier-normal fc weights, zero fc bias, identity batch-norm affine."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, ids, mask, lat, lon, coord_x, coord_y, coord_z, labels):
        """Return ArcFace logits for a batch; ``labels`` select the margin column."""
        embedding = self.extract_feature(ids, mask, lat, lon, coord_x, coord_y, coord_z)
        return self.margin(embedding, labels)

    def extract_feature(self, input_ids, attention_mask, lat, lon, coord_x, coord_y, coord_z):
        """Return the batch-normalised embedding; ``lat`` and ``lon`` are accepted but not used."""
        encoded = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)

        # Mean of the token states over the positions kept by the attention mask
        token_weights = attention_mask.unsqueeze(-1)
        summed = torch.sum(encoded.last_hidden_state * token_weights, dim=1)
        pooled = summed / attention_mask.sum(dim=1, keepdim=True)

        # Append the three coordinate features as extra columns
        coord_columns = [c.view(-1, 1) for c in (coord_x, coord_y, coord_z)]
        fused = torch.cat([pooled, *coord_columns], dim=1)

        return self.bn(self.fc(fused))

In [None]:
# Train the third configuration; the checkpoint prefix is the last path component.
tokenizer = CFG_Model3.tokenizer
model_name = CFG_Model3.model_name
model_arch = model_name.rsplit("/", 1)[-1]

# 320-dimensional embedding model, moved to the configured device
model = FSMultiModalNet(model_name, 320).to(CFG.device)

# Adam over every parameter (backbone, projection and ArcFace head) with cross-entropy on the logits
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=CFG.lr)

In [None]:
# Training dataset built from the pandas copy of df_train
dataset = FourSquareDataset(df_train.to_pandas(), tokenizer=tokenizer, max_length=CFG.max_length) # Dataset
# drop_last=True: with batch_size=2 an odd-length dataset would end in a single-sample batch,
# and BatchNorm1d raises in training mode when it receives only one value per channel.
dataloader = DataLoader(dataset, batch_size=2, num_workers=2, shuffle=True, pin_memory=True, drop_last=True) # DataLoader
def train_fn(model, dataloader):
    """Run one training epoch and return the mean batch loss.

    Uses the notebook-level ``optimizer``, ``criterion`` and ``CFG.device``.

    Args:
        model: network called as model(ids, mask, lat, lon, x, y, z, labels).
        dataloader: iterable of batches shaped like FourSquareDataset items.
    """
    model.train()
    progress = tqdm(dataloader)
    history = []
    float_keys = ('latitude', 'longitude', 'coord_x', 'coord_y', 'coord_z')
    for batch in progress:
        # Move the batch to the training device with the dtypes the model expects
        ids = batch['ids'].to(CFG.device, dtype=torch.long)
        mask = batch['mask'].to(CFG.device, dtype=torch.long)
        latitude, longitude, coord_x, coord_y, coord_z = (
            batch[key].to(CFG.device, dtype=torch.float) for key in float_keys
        )
        labels = batch['label'].to(CFG.device, dtype=torch.long)

        # Standard step: reset gradients, forward, loss, backward, update
        optimizer.zero_grad()
        y_preds = model(ids, mask, latitude, longitude, coord_x, coord_y, coord_z, labels)
        loss = criterion(y_preds, labels)
        loss.backward()
        optimizer.step()

        # Track the loss and show a 30-step moving average on the progress bar
        history.append(loss.item())
        smooth_loss = np.mean(history[-30:])
        progress.set_description(f'loss: {loss.item():.5f}, smth: {smooth_loss:.5f}, lr: {optimizer.param_groups[0]["lr"]:.7f}')
    return np.mean(history)

In [None]:
# Epochs are numbered from 1 so the final checkpoint is `<model_arch>_epoch{CFG.epochs}.bin`.
# With the previous 0-based numbering the last file was `_epoch29.bin`, while the
# CFG_Model* weight paths all refer to `_epoch30.bin`.
for epoch in range(1, CFG.epochs + 1):
    loss_train = train_fn(model, dataloader)
    # One checkpoint per epoch, written to the working directory
    torch.save(model.state_dict(), f'{model_arch}_epoch{epoch}.bin')
    # Release cached GPU memory and Python garbage between epochs
    torch.cuda.empty_cache()
    gc.collect()
    print(f"{epoch} Training loss    : {loss_train:.4f}")
torch.cuda.empty_cache()
gc.collect()

# Get Embedding

In [None]:
# Reload the same table for embedding extraction; this cell repeats the training-data
# preprocessing so that `df` carries the columns FourSquareDataset reads.
df = cudf.read_csv("../input/trainfilled/train_filled.csv", dtype={
    "id": str, "name": str, "latitude": float, "longitude": float, "address": str, 
    "city": str, "state": str, "zip": str, "country": str, "url": str, "phone": str, "categories": str, "point_of_interest": str
})


for col in ["name", "address", "city", "state", "zip", "country", "url", "phone", "categories"]:
    df[col] = df[col].fillna("") # replace missing text values with the empty string

# fulltext = name, address, city, state, country and categories joined with single spaces
df["fulltext"] = (
    df["name"] + " " + df["address"] + " " + df["city"] + " " + df["state"] + " "  + df["country"] + " " + df["categories"]
).to_pandas().replace(r'\s+', ' ', regex=True) # collapse every whitespace run into one space (done on the pandas copy)

# Standardization of coordinates.
# https://datascience.stackexchange.com/questions/13567/ways-to-deal-with-longitude-latitude-feature
# NOTE(review): cupy.cos / cupy.sin interpret their input as radians. latitude / longitude are
# presumably stored in degrees - TODO confirm. Keep this identical to the transform used when
# the loaded checkpoints were trained.
df["coord_x"] = cupy.cos(df["latitude"]) * cupy.cos(df["longitude"]) # x component derived from latitude / longitude
df["coord_y"] = cupy.cos(df["latitude"]) * cupy.sin(df["longitude"]) # y component derived from latitude / longitude
df["coord_z"] = cupy.sin(df["latitude"]) # z component derived from latitude


encoder = LabelEncoder() # scikit-learn label encoder (rebinds the name `encoder`)
# Replace the point_of_interest strings with integer class ids.
# NOTE(review): Series.to_array() appears to be unavailable in newer cuDF releases - confirm the installed version.
df['point_of_interest'] = encoder.fit_transform(df['point_of_interest'].to_array())
                       
print(df.shape)
df.head()

In [None]:
def get_embed(model,tokenizer):
    """Embed every row of the notebook-level ``df`` and L2-normalise the result.

    The model is switched to evaluation mode for the pass so that dropout is
    disabled and batch normalisation uses its running statistics; without
    this the embeddings depend on random masks and on batch composition.
    The model's previous training flag is restored afterwards.

    Args:
        model: network exposing ``extract_feature``.
        tokenizer: tokenizer matching ``model``'s backbone.

    Returns:
        cupy array with one unit-norm embedding row per row of ``df``, in order.
    """
    # NN embeddings
    dataset = FourSquareDataset(df.to_pandas(), tokenizer=tokenizer, max_length=CFG.max_length) # Dataset
    loader = DataLoader(dataset, batch_size=256, num_workers=5, shuffle=False, pin_memory=True) # DataLoader

    was_training = model.training
    model.eval()
    embeds = []
    try:
        with torch.no_grad():
            for data in tqdm(loader):
                ids = data['ids'].to(CFG.device, dtype=torch.long) # input_ids
                mask = data['mask'].to(CFG.device, dtype=torch.long) # attention_mask

                latitude = data['latitude'].to(CFG.device, dtype=torch.float) # latitude
                longitude = data['longitude'].to(CFG.device, dtype=torch.float) # longitude
                coord_x = data['coord_x'].to(CFG.device, dtype=torch.float) # coord_x
                coord_y = data['coord_y'].to(CFG.device, dtype=torch.float) # coord_y
                coord_z = data['coord_z'].to(CFG.device, dtype=torch.float) # coord_z

                emb = model.extract_feature(ids, mask, latitude, longitude, coord_x, coord_y, coord_z) # embeddings
                embeds.append(emb.detach().cpu().numpy()) # collect on the host to free GPU memory
    finally:
        # Leave the caller's model in the mode it arrived in
        model.train(was_training)

    V_embed_bert = cupy.array(np.concatenate(embeds)) # stack the batches into one GPU array
    V_embed_bert = V_embed_bert / cupy.linalg.norm(V_embed_bert, ord=2, axis=1, keepdims=True) # l2 normalization
    return V_embed_bert

def get_embed_model(model_name,fcdim, state_dict,tokenizer):
    """Build a model, load its checkpoint, embed ``df`` and free the model.

    Args:
        model_name: backbone name or path for FSMultiModalNet.
        fcdim: embedding size of the network.
        state_dict: path of the saved state dict to load.
        tokenizer: tokenizer matching the backbone.

    Returns:
        The embeddings returned by ``get_embed``.
    """
    net = FSMultiModalNet(model_name, fcdim).to(CFG.device)
    # Loaded inline so the checkpoint tensors are released right after copying
    net.load_state_dict(torch.load(state_dict))
    vectors = get_embed(net, tokenizer)

    # Drop the network and hand its memory back before the next model is built
    del net
    gc.collect()
    torch.cuda.empty_cache()
    return vectors

In [None]:
import pickle 


def _export_embedding(cfg, out_dir, fc_dim=320):
    """Compute the embeddings of one model configuration and pickle them.

    Args:
        cfg: configuration class providing model_name, weight and tokenizer.
        out_dir: directory that receives the file ``V_embed_bert``; it is
            created when missing (a plain open() fails if it does not exist).
        fc_dim: embedding size the checkpoint was trained with.

    Returns:
        The embedding array that was written.
    """
    # Create the target directory first so a path problem surfaces before the expensive pass
    os.makedirs(out_dir, exist_ok=True)
    embed = get_embed_model(cfg.model_name, fc_dim, cfg.weight, cfg.tokenizer)
    with open(os.path.join(out_dir, 'V_embed_bert'), 'wb') as f:
        pickle.dump(embed, f)
    gc.collect()
    torch.cuda.empty_cache()
    return embed


# Embedding1
embed1 = _export_embedding(CFG_Model1, './bert_embed_1')

# Embedding2
embed2 = _export_embedding(CFG_Model2, './bert_embed_2')

# Embedding3
embed3 = _export_embedding(CFG_Model3, './bert_embed_3')

# Embedding4
embed4 = _export_embedding(CFG_Model4, './bert_embed_4')