In [145]:
import pandas as pd 

from sklearn.metrics.pairwise import haversine_distances as dist
import numpy as np 
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

def edgify(l):
    return [(min([u,v]),max([u,v])) for u,v in l]
def overlap(A,B):
    return len(A.intersection(B))/min([len(A),len(B)])

In [77]:
# -*- coding: utf-8 -*-
"""
Created on Thu May 12 11:29:55 2022

@author: remit
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import haversine_distances
from sklearn.neighbors import NearestNeighbors
import torch.nn.functional as F
import torch.nn as nn
import torch
import time
from transformers import  BertModel,  BertTokenizerFast
import sys
from torch.utils.data import DataLoader, Dataset
# from cuml.neighbors import NearestNeighbors

class TokenizedDataset(Dataset):
    
    def __init__(self,df,max_len):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
        
    def __getitem__(self, index):
        line=self.df.iloc[index]
        
        inputs = self.tokenizer.encode_plus(line.text,
                                            padding="max_length",
                                            max_length=self.max_len,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            truncation=True)
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])
        return ids,mask
    
    def __len__(self):
        return self.df.shape[0]
    
class Cat2VecModel(nn.Module):
    def __init__(self):
        super(Cat2VecModel, self).__init__()
        self.model = BertModel.from_pretrained("setu4993/LaBSE")
        
    def forward(self, ids, mask):
        x = self.model(ids, mask)[0]
        x = F.normalize((x[:, 1:, :]*mask[:, 1:, None]).mean(axis=1))
        return x
    


def inference(ds,model):
    BS = 256
    NW = 0
    loader = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                        pin_memory=False, drop_last=False)
    tbar = tqdm(loader, file=sys.stdout)
    
    vs = []
    with torch.no_grad():
        for idx, (ids, masks) in enumerate(tbar):
            v = model(ids, masks)
            vs.append(v)
    return np.concatenate(vs)      

def embed():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MAX_LEN = 64
    train = pd.read_csv("./data/train.csv")
    
    test = train.sample(n=50000)
    Ids = list(test['id'])

    test['text'] = test[['name', 'categories']].fillna('').agg(' '.join, axis=1)#test['name'].map(str)+' ' +test['categories'].map(str)
    test['text'].drop_duplicates()
    tk=TokenizedDataset(test, MAX_LEN)
    

    cat2vec_model = Cat2VecModel().to(device)
    
    V = inference(tk,cat2vec_model)
    

    return Ids,V


In [None]:
kneighbors_graph

In [66]:
import pickle

ID_file = open('./data/ID_50k.pkl',"wb")
pickle.dump(Ids,ID_file)

Emb_file = open('./data/Embeddings_50k.pkl',"wb")
pickle.dump(X,Emb_file)

#with open('./data/Embeddings.pkl','rb') as file:
#    X = pickle.load(file)

#with open('./data/ID.pkl','rb') as file:
#    Ids = pickle.load(file)

In [None]:
with open('./data/Embeddings.pkl','rb') as file:
    X = pickle.load(file)

with open('./data/ID.pkl','rb') as file:
    Ids = pickle.load(file)

In [58]:
DF  =  pd.read_csv("./data/train.csv")
df = DF[DF['id'].isin(Ids)]
df = df.reset_index(drop=True)


In [None]:
from sklearn.neighbors import kneighbors_graph
k_emb = 5
k_spat =100 

for k_emb in [5,10,15,20]:
    A_emb = kneighbors_graph(X,n_neighbors = k_emb,metric = 'minkowski')
    for k_spat in [10,20,50,100]:
        A_spat = kneighbors_graph(np.array(df[["latitude","longitude"]]),n_neighbors = k_spat,metric = 'haversine')
        T1_spat,T2_spat = np.where(A_spat.toarray()>0)
        T1_emb,T2_emb = np.where(A_spat.toarray()>0)
        groups = df.groupby(["point_of_interest"]).groups
        repeated = {p : groups[p] for p in groups if len(groups[p])>1}
        l = list(df["point_of_interest"])
        indexes = {l.index(p) : list(set(repeated[p])-{l.index(p)}) for p in repeated}
        edges_true = [(u,v)  for u in  indexes for v in indexes[u]]
        edges_spat = [(u,v) for u,v in zip(T1_spat,T2_spat)]
        edges_emb = [(u,v) for u,v in zip(T1_spat,T2_spat)]
        edges_knn = list(set(edges_spat).union(set(edges_emb)))

        A = set(edgify(edges_knn))
        B = set(edgify(edges_true))
        print(k_emb,k_spat,overlap(A,B))

5 10 0.7272727272727273
5 20 0.758893280632411
5 50 0.8181818181818182
5 100 0.857707509881423
10 10 0.7272727272727273
10 20 0.758893280632411
10 50 0.8181818181818182
10 100 0.857707509881423
15 10 0.7272727272727273
15 20 0.758893280632411
15 50 0.8181818181818182
15 100 0.857707509881423
20 10 0.7272727272727273
20 20 0.758893280632411
20 50 0.8181818181818182
