In [1]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch.utils.data import Dataset
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import radius_graph  as RadiusGraph
import numpy as np
from scipy.spatial import KDTree
import torch
from torch_geometric.data import Data



In [2]:
num_classes = 11255

In [3]:
# Haversine formula for distance calculation between two points on the Earth
def haversine_dist(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

# Convert positions to Cartesian coordinates for KD-Tree (optional step for spherical distance)
def latlon_to_cartesian(lat, lon):
    R = 6371  # Earth radius in kilometers
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    x = R * np.cos(lat_rad) * np.cos(lon_rad)
    y = R * np.cos(lat_rad) * np.sin(lon_rad)
    z = R * np.sin(lat_rad)
    return x, y, z

In [4]:
# create dataset fuction

def create_graph_dataset(embeddings, sll, idx, radius_km=250):

    # create train and val embeddings
    embeddings = embeddings[idx]

    # create train and val metadata
    sll = sll.iloc[idx]

    positions = []
    for i in range(len(sll)):
        positions.append([sll.iloc[i][1], sll.iloc[i][2]])
    positions = torch.tensor(positions)
    
    cartesian_positions = np.array([latlon_to_cartesian(lat, lon) for lat, lon in positions])

    # Create KD-Tree
    kdtree = KDTree(cartesian_positions)

    # Use KD-Tree to find all neighbors within the specified radius
    print('Finding edges...')
    edges = []
    for i in range(len(positions)):
        indices = kdtree.query_ball_point(cartesian_positions[i], radius_km / 6371)  # Normalized radius for KD-Tree
        for j in indices:
            if i != j:  # Skip self-loops
                edges.append([i, j])

    # Convert edges to PyTorch Geometric format
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

    # Calculate edge attributes (inverted distances in km) for the edges
    print('Calculating edge attributes...')
    edge_attr = torch.tensor([1/(haversine_dist(positions[i][0], positions[i][1], positions[j][0], positions[j][1]) + 1) for i, j in edges], dtype=torch.float)


    # Create the PyTorch Geometric Data object
    print('Creating Data object...')
    data = Data(x=embeddings, edge_index=edge_index, edge_attr=edge_attr, pos=torch.tensor(positions))
    print('Data object created.')
    return data

In [5]:
# Define a function to get labels for a batch (fetching from label_dict)
def get_labels(survey_ids):
    labels = torch.zeros(len(survey_ids), num_classes)
    for i, survey_id in enumerate(survey_ids):
        species_ids = label_dict[int(survey_id)]
        for species_id in species_ids:
            labels[i, species_id] = 1
    return labels

In [6]:
# Load your embeddings and train_metadata
embeddings = torch.load('../eda/embeddings.pt')
metadata = pd.read_csv('../data/l1/GLC24_PA_metadata_train.csv')
sll = metadata.drop_duplicates(subset=['surveyId'])
sll = sll[['surveyId', 'lat', 'lon']]

print(embeddings.shape, sll.shape)

torch.Size([88987, 2768]) (88987, 3)


In [7]:
# Create labels dictionary
metadata = metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
metadata['speciesId'] = metadata['speciesId'].astype(int)
label_dict = metadata.groupby('surveyId')['speciesId'].apply(list).to_dict()

# print number of keys in label_dict
print(len(label_dict.keys()))


88987


In [8]:
def calculate_f1_score_from_tensors(y_true, y_pred, threshold=0.5):
    
    y_pred = (y_pred >= threshold)
    y_true = y_true.cpu().bool()
    y_pred = y_pred.cpu().bool()
    
    TP = (y_true & y_pred).sum(axis=1)  # True Positives per sample
    FP = (y_true & ~y_pred).sum(axis=1)  # False Positives per sample
    FN = (~y_true & y_pred).sum(axis=1)  # False Negatives per sample

    # compute f1 score for each sample
    pre = TP/(TP+FP)
    rec = TP/(TP+FN)
    f1 = 2 * pre * rec / (pre + rec)

    # Handle division by zero
    f1 = np.nan_to_num(f1)

    # compute micro-average f1 score
    micro_f1 = np.mean(f1)
    # Return mean F1 score across all samples
    return micro_f1

In [9]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Create GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.sage1 = SAGEConv(dim_in, dim_h)
        self.sage2 = SAGEConv(dim_h, dim_out)

    def forward(self, x, edge_index):
        h = self.sage1(x.float(), edge_index)  # Ensure float32
        h = torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.sage2(h, edge_index)
        return h  # Returning logits instead of applying log_softmax

    def fit(self, train_loader, epochs):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        self.train()
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')
            for batch in train_loader:
                optimizer.zero_grad()

                out = self(batch.x.float().to(device), batch.edge_index.to(device))
                
                indices = batch.input_id.to(device)
                survey_ids = [list(label_dict.keys())[index] for index in indices]
                labels = get_labels(survey_ids).to(device)

                loss = criterion(out[indices], labels)
                loss.backward()
                optimizer.step()

    @torch.no_grad()
    def test(self, val_loader):
        self.eval()
        f1s = []

        for batch in val_loader:
            
            out = self(batch.x.float().to(device), batch.edge_index.to(device))

            indices = batch.input_id.to(device)
            survey_ids = [list(label_dict.keys())[index] for index in indices]
            labels = get_labels(survey_ids).to(device)

            f1 = calculate_f1_score_from_tensors(labels, out[indices])
            f1s.append(f1)

        return np.mean(f1s)
                


In [10]:
f1s_graph = []
n=5

batch_size = 16
num_neighbors = [2, 2]  # Number of neighbors for each layer

train_ratio = 0.8

n_train = int(train_ratio * embeddings.size()[0])

device = 'mps'

# create array from 0 to len(embeddings)

idx = np.arange(embeddings.size()[0])

for i in range(n):

    print('Iteration: ', i+1)

    # shuffle the array
    np.random.shuffle(idx)

    # split the array into train and val

    train_idx = idx[:n_train]
    val_idx = idx[n_train:]
 
    # create dataset
    train_dataset = create_graph_dataset(embeddings, sll, train_idx)
    val_dataset = create_graph_dataset(embeddings, sll, val_idx)

    # create loader
    train_loader = NeighborLoader(train_dataset, batch_size=batch_size, num_neighbors=num_neighbors, shuffle=True)
    val_loader = NeighborLoader(val_dataset, batch_size=batch_size, num_neighbors=num_neighbors, shuffle=False)

    print('Data completed. Proceeding with training...')

    model = GraphSAGE(dim_in=embeddings.size()[1], dim_h=128, dim_out=num_classes)
    model.to(device)

    model.fit(train_loader, epochs=3)

    print('Training complete. Proceeding with validation...')

    f1 = model.test(val_loader)
    f1s_graph.append(f1)

    print('Validation completed.')


Iteration:  1


  positions.append([sll.iloc[i][1], sll.iloc[i][2]])


Finding edges...
Calculating edge attributes...
Creating Data object...
Data object created.


  data = Data(x=embeddings, edge_index=edge_index, edge_attr=edge_attr, pos=torch.tensor(positions))


Finding edges...
Calculating edge attributes...
Creating Data object...
Data object created.
Data completed. Proceeding with training...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Training complete. Proceeding with validation...
Validation completed.
Iteration:  2
Finding edges...
Calculating edge attributes...
Creating Data object...
Data object created.
Finding edges...
Calculating edge attributes...
Creating Data object...
Data object created.
Data completed. Proceeding with training...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Training complete. Proceeding with validation...
Validation completed.
Iteration:  3
Finding edges...
Calculating edge attributes...
Creating Data object...
Data object created.
Finding edges...
Calculating edge attributes...
Creating Data object...
Data object created.
Data completed. Proceeding with training...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Training complete. Proceeding with validation...
Validation completed.
Iteration:  4
Finding edges...
Calculating edge attributes...
Creating D

In [11]:
# print the f1 score

print('F1 scores: ', f1s_graph)

F1 scores:  [0.0034528377, 0.003853952, 0.00068660255, 0.0029886272, 0.000913459]


In [12]:
import datetime

now = datetime.datetime.now()
timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')

results = pd.DataFrame(f1s_graph, columns=['fme_f1'])
results.to_csv(f'{timestamp}_graph_benchmark_results_{n}.csv', index=False)