In [308]:
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from torch_geometric.loader import NeighborSampler as RawNeighborSampler
from torch_geometric.nn import SAGEConv

import numpy as np
import pandas as pd
import osmnx as ox


In [309]:
street_nodes_df = pd.read_csv("./outputs/akl_street_nodes.csv")
street_nodes_df = street_nodes_df[street_nodes_df.columns[4:]]

street_nodes_df_copy = street_nodes_df.copy()
street_nodes_df_copy.drop(["street_length", "Average_POI_Distance", "x", "y"], axis=1, inplace=True)
print(street_nodes_df_copy.head())

street_nodes_features_tensor = torch.tensor(street_nodes_df_copy.values.tolist())
number_of_nodes = len(street_nodes_features_tensor)
number_of_node_features = len(street_nodes_features_tensor[0])
print(street_nodes_features_tensor)
print(number_of_nodes)
print(number_of_node_features)

   restaurant  amenity  school  shop  healthcare  clothes
0         0.0      0.0     0.0   0.0         0.0      0.0
1         0.0      0.0     0.0   0.0         0.0      0.0
2         0.0      0.0     0.0   0.0         0.0      0.0
3         0.0      0.0     0.0   0.0         0.0      0.0
4         0.0      0.0     0.0   0.0         0.0      0.0
tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])
458252
6


In [310]:
street_edges_df = pd.read_csv("./outputs/akl_street_edges.csv")
source_street_index, targe_street_index, street_distance_weight = street_edges_df["source_street"], street_edges_df[
    "target_street"], street_edges_df["distance"]
street_edges_source_index_tensor = torch.tensor([source_street_index.values.tolist()])
street_edges_target_index_tensor = torch.tensor([targe_street_index.values.tolist()])
street_edges_index_tensor = torch.cat((street_edges_source_index_tensor, street_edges_target_index_tensor), 0)
street_edges_weight_tensor = torch.tensor(street_distance_weight.values.tolist())
print(street_edges_index_tensor)
print(street_edges_weight_tensor)

tensor([[     0,      2,      0,  ..., 458250, 458250, 458251],
        [     2,      0,      1,  ..., 458249, 458251, 458250]])
tensor([ 96.8730,  96.8730, 100.7870,  ..., 328.7770, 493.5280, 493.5280])


In [311]:
def custom_pos_sampling(
        edge_weight: Tensor,
        batch: Tensor,
) -> Union[Tensor, Tuple[Tensor, Tensor]]:
    pos_node_seq = []
    neg_node_seq = []
    for start_node_id in batch:
        current_node_seq = [start_node_id.item()]
        total_distance = 0
        current_node_id = start_node_id
        # 在edge文件里 对应的id 要 -1 比如neighbour是0， 在文件里index是1
        neighbours_edge_index = (street_edges_index_tensor == current_node_id).nonzero(as_tuple=True)[1]

        # 选出edge对应的weight
        neighbour_weights = torch.index_select(edge_weight, 0, neighbours_edge_index)
        norm_neighbour_weights = [i / sum(neighbour_weights.numpy()) for i in neighbour_weights.numpy()]
        #根据概率随机选一个
        #print(neighbours_edge_index,len(neighbour_weights))
        if len(neighbour_weights) == 0:
            current_node_seq.append(current_node_id)
            pos_node_seq.append(current_node_seq)
            #neg_node_seq.append(current_node_seq)
            continue
        neighbour_weights_index = np.random.choice(len(neighbour_weights), p=norm_neighbour_weights)

        # print("current Node id \n", current_node_id)
        #print("neighbour weights \n", neighbour_weights)
        #print("neighbour weights index  \n", neighbour_weights_index)
        #print(neighbour_weights.min(),neighbour_weights.argmin())

        # 取最近的边
        # TODO：加入别的策略，poi信息等
        next_edge_index = neighbours_edge_index[neighbour_weights_index]
        next_edge_df = street_edges_df.iloc[[next_edge_index]]
        #print("next edge \n", next_edge_df)
        next_edge = next_edge_df.values[0]
        total_distance += next_edge[2]
        # next_edge[0] = source street
        # next_edge[1] = target_street
        # next_edge[2] = distance
        if next_edge[0] != current_node_id:
            current_node_id = next_edge[0]
        else:
            current_node_id = next_edge[1]
        current_node_seq.append(current_node_id)
        pos_node_seq.append(current_node_seq)
    #if len(neg_node_seq) >0 :
    #print("Isolated node: {number} {node_list}".format(number = len(neg_node_seq),node_list = neg_node_seq))
    return torch.from_numpy(np.asarray(pos_node_seq, dtype=np.int32))

In [312]:
def custom_sampling_with_POI(
        edge_weight: Tensor,
        batch: Tensor,
) :
    pos_node_seq = []
    neg_node_seq = []
    poi_nodes = set()
    no_poi_nodes=set()
    for start_node_id in batch:
        current_node_seq = [start_node_id.item()]
        current_node_id = current_node_seq[-1]
        # 找距离？
        # current_x,current_y = street_nodes_df.iloc[[start_node_id]]["x"],street_nodes_df.iloc[[start_node_id]]["y"]
        # print(start_node_id)
        # print(street_nodes_df.iloc[[start_node_id]])
        neighbours_edge_index = (street_edges_index_tensor == current_node_id).nonzero(as_tuple=True)[1]

        neighbour_id_list = []
        neighbour_id_index = []
        for edge_index in neighbours_edge_index:
            neighbour_edge_df = street_edges_df.iloc[[edge_index]]
            neighbour_edge = neighbour_edge_df.values[0]
            if neighbour_edge[0] != current_node_id:
                neighbour_id = neighbour_edge[0]
            else:
                neighbour_id = neighbour_edge[1]
            # neighbour_id_list.append([neighbour_id])
            neighbour_id_list.append(neighbour_id)
            neighbour_id_index.append(0)

        # steps 自动-1 比如想要3步的话 就传2
        # neighbour_id_list = find_neighbours(3, neighbour_id_list, neighbour_id_index)#,current_x,current_y,500)
        neighbour_id_weights = []
        # for neighbour_ids in neighbour_id_list:
        poi_weight = 0
        for neighbour_id in neighbour_id_list:
            neighbour_features = torch.index_select(street_nodes_features_tensor, 0,
                                                    torch.tensor(int(neighbour_id), dtype=torch.int32))
            poi_weight += torch.sum(neighbour_features)
        neighbour_id_weights.append(poi_weight)

        # print(f"neighbour_id_weights = {neighbour_id_weights}")
        neighbour_id_weights = np.array(neighbour_id_weights)
        normalized_neighbour_weights = [i / sum(neighbour_id_weights) for i in neighbour_id_weights]

        neighbour_weights_index = 0

        if np.isnan(normalized_neighbour_weights).all():
            no_poi_nodes.add(current_node_id)
            if len(neighbours_edge_index) == 0:
                current_node_seq.append(current_node_id)
                pos_node_seq.append(current_node_seq)
                if len(poi_nodes) == 0:
                    init_neg_node = street_nodes_df.sample()
                    # print(init_neg_node.index.values[0])
                    neg_node_seq.append(init_neg_node.index.values[0])
                else:
                    neg_node_seq.append(np.random.choice(list(poi_nodes)))
                continue
            else:
                neighbour_weights_index = np.random.choice(len(neighbours_edge_index))
        else:
            poi_nodes.add(current_node_id)
            neighbour_weights_index = np.random.choice(len(normalized_neighbour_weights),
                                                       p=normalized_neighbour_weights)

        next_edge_index = neighbours_edge_index[neighbour_weights_index]
        next_edge_df = street_edges_df.iloc[[next_edge_index]]
        next_edge = next_edge_df.values[0]
        if next_edge[0] != current_node_id:
            current_node_id = next_edge[0]
        else:
            current_node_id = next_edge[1]
        current_node_seq.append(current_node_id)
        pos_node_seq.append(current_node_seq)
        if len(no_poi_nodes) == 0:
            init_neg_node = street_nodes_df.sample()
            # print(init_neg_node.index.values[0])
            neg_node_seq.append(init_neg_node.index.values[0])
        else:
            neg_node_seq.append(np.random.choice(list(no_poi_nodes)))
    return torch.from_numpy(np.asarray(pos_node_seq, dtype=np.int32))[:, 1],torch.from_numpy(np.asarray(neg_node_seq,dtype=np.int32))


# bfs like
def find_neighbours(steps, neighbour_id_list, neighbour_id_index_list):#,origin_x,origin_y,max_dist):
    if steps <= 0:
        return neighbour_id_list

    current = neighbour_id_list
    neighbour_id_index = []
    for i, neighbour_list in enumerate(current):
        short_neighbour_list = neighbour_list[neighbour_id_index_list[i]:]
        neighbour_id_index.append(len(neighbour_list) + 1)
        for neigh in short_neighbour_list:
            neighbours_edge_index = (street_edges_index_tensor == neigh).nonzero(as_tuple=True)[1]

            for edge_index in neighbours_edge_index:
                neighbour_edge_df = street_edges_df.iloc[[edge_index]]
                neighbour_edge = neighbour_edge_df.values[0]
                if neighbour_edge[0] != neigh:
                    neighbour_id = neighbour_edge[0]
                else:
                    neighbour_id = neighbour_edge[1]

                # 找距离？
                # loc_x,loc_y = street_nodes_df.iloc[[neighbour_id]]["x"],street_nodes_df.iloc[[neighbour_id]]["y"]
                # distance_to_origin = ox.distance.euclidean_dist_vec(loc_y,loc_x,
                #                                                  origin_y,origin_x)
                # if distance_to_origin > max_dist:
                #     continue

                neighbour_id_list[i].append(neighbour_id)
    return find_neighbours(steps - 1, neighbour_id_list, neighbour_id_index) #,origin_x,origin_y,max_dist)


In [313]:
"""
RawNeighborSampler This module iteratively samples neighbors (at each layer) and constructs bipartite graphs that simulate the actual computation flow of GNNs.

sizes: denotes how much neighbors we want to sample for each node in each layer.

NeighborSampler holds the current :obj:batch_size, the IDs :obj:n_id of all nodes involved in the computation, and a list of bipartite graph objects via the tuple :obj:(edge_index, e_id, size), where :obj:edge_index represents the bipartite edges between source and target nodes, :obj:e_id denotes the IDs of original edges in the full graph, and :obj:size holds the shape of the bipartite graph.

The actual computation graphs are then returned in reverse-mode, meaning that we pass messages from a larger set of nodes to a smaller one, until we reach the nodes for which we originally wanted to compute embeddings.
https://www.arangodb.com/2021/08/a-comprehensive-case-study-of-graphsage-using-pytorchgeometric/
"""


class NeighborSampler(RawNeighborSampler):
    def sample(self, batch):
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()

        pos_batch,neg_batch = custom_sampling_with_POI(street_edges_weight_tensor, batch)
        #neg_batch = custom_neg_sampling(street_edges_weight_tensor, batch, self.adj_t.size(1))
        # neg_batch = torch.randint(0, self.adj_t.size(1), (batch.numel(),), dtype=torch.long)
        #print("Custom nodes seq,", pos_batch)
        #print("negative batch \n ", neg_batch)
        batch = torch.cat([batch, pos_batch, neg_batch], dim=0)
        sampled = super().sample(batch)
        return sampled


train_loader = NeighborSampler(street_edges_index_tensor, sizes=[8, 8], batch_size=256,
                               shuffle=True, num_nodes=number_of_nodes)

In [314]:
class SAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(SAGEConv(in_channels, hidden_channels))

    def forward(self, x, adjs):
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x = self.convs[i]((x, x_target), edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    def full_forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SAGE(number_of_node_features, hidden_channels=128, num_layers=5)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
x, edge_index = street_nodes_features_tensor.to(device), street_edges_index_tensor.to(device)

In [315]:
def train():
    model.train()

    total_loss = 0
    i = 0
    for batch_size, n_id, adjs in train_loader:
        i += 1
        #print(i)
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        optimizer.zero_grad()

        out = model(x[n_id], adjs)
        out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)

        pos_loss = F.logsigmoid((out * pos_out).sum(-1)).mean()
        neg_loss = F.logsigmoid(-(out * neg_out).sum(-1)).mean()
        loss = -pos_loss - neg_loss
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * out.size(0)
    # print(i)
    return total_loss / number_of_nodes


@torch.no_grad()
def get_model_embedding():
    model.eval()
    embedding = model.full_forward(x, edge_index).cpu()
    return embedding


for epoch in range(1, 15):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, ')

output_embedding = get_model_embedding()

  normalized_neighbour_weights = [i / sum(neighbour_id_weights) for i in neighbour_id_weights]
  normalized_neighbour_weights = [i / sum(neighbour_id_weights) for i in neighbour_id_weights]


Epoch: 001, Loss: 1.3919, 
Epoch: 002, Loss: 1.3872, 
Epoch: 003, Loss: 1.3870, 
Epoch: 004, Loss: 1.3916, 


KeyboardInterrupt: 

In [None]:
print(output_embedding)

In [None]:
output_np = output_embedding.numpy()  #convert to Numpy array
output_df = pd.DataFrame(output_np)  #convert to a dataframe
output_df.to_csv("./outputs/akl_embedding.csv", index=False)  #save to file