In [9]:
#!pip install huggingface-hub
#!pip install s2sphere
#!pip install pymilvus
#!pip install datasets
#!pip install transformers 
#!pip install torch 
#!pip install earthengine-api 
#!pip install pillow

In [None]:
import ee
import torch
from datasets import load_dataset
from pymilvus import MilvusClient, DataType
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os
import s2sphere

In [11]:
ee.Authenticate()
ee.Initialize(project=os.getenv('GEE_PROJECT_ID'))

Initialize DB (only need to do if not created yet)

In [None]:
from pymilvus import MilvusClient, DataType

client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
if "geoguessr" not in client.list_databases():
    client.create_database("geoguessr")

client = MilvusClient(uri="http://localhost:19530", token="root:Milvus", db_name="geoguessr")

schema = client.create_schema(auto_id=True, enable_dynamic_field=True)

#field names
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="streetclip_vec", datatype=DataType.FLOAT_VECTOR, dim=768)
schema.add_field(field_name="alphaearth_vec", datatype=DataType.FLOAT_VECTOR, dim=64)
schema.add_field(field_name="gps", datatype=DataType.JSON)
schema.add_field(field_name="s2sphere_boundary", datatype=DataType.VARCHAR, max_length=512)

index_params = client.prepare_index_params()

index_params.add_index(
    field_name="streetclip_vec",
    metric_type="COSINE",
    index_type="HNSW",
    params={"M": 16, "efConstruction": 500}
)

index_params.add_index(
    field_name="alphaearth_vec",
    metric_type="L2",
    index_type="HNSW",
    params={"M": 8, "efConstruction": 200}
)

Create Milvus Collection

In [14]:
client.create_collection(
    collection_name="world_locations",
    schema=schema,
    index_params=index_params
)

Data Extraction

In [None]:
def visual_features(image, model, processor):
    inputs = processor(images=image, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.cpu().numpy()[0].tolist()

def alphaearth_features(lat, lon):
    point = ee.Geometry.Point([lon, lat])

    ae_collection = ee.FeatureCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")
    ae_image = ae_collection.sort('system:time_start', False).first()

    sample = ae_image.sample(point, 10).first().toDictionary().getInfo()
    return [sample[f'A{i:02d}'] for i in range(64)]



S2sphere token generation (segmenting locations into respective boundaries)

In [17]:
dataset = load_dataset(
    'osv5m/osv5m', 
    split='train', 
    streaming=True, 
    trust_remote_code=True
)
first_sample = next(iter(dataset))


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'osv5m/osv5m' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


RuntimeError: Dataset scripts are no longer supported, but found osv5m.py

In [None]:
def s2sphere_boundary(lat, lon):
    p = s2sphere.LatLng.from_degrees(lat, lon)
    cell = s2sphere.CellId.from_lat_lng(p)
    return cell.to_token()

image_dict = dict()

for sample in stream_ds.take(1000):
    lat, lon = sample['lat'], sample['lon']
    token = s2sphere_boundary(lat, lon)
    if token not in image_dict:
       alphaearth_vec = alphaearth_features(lat, lon)
       image_dict[token] = {
        "vector": alphaearth_vec,
        "lat": lat,
        "lon": lon,
        "count": 1
    }
    else:
        image_dict[token]["count"] += 1

indexed_anchors = []
for token, data in image_dict.items():
    indexed_anchors.append({
        "vector": data['vector'],
        "id": int(token, 16), # Convert hex token to int for Milvus ID
        "lat": data['lat'],
        "lon": data['lon']
    })


client.insert(collection_name="world_locations", data=indexed_anchors)

     


SyntaxError: invalid syntax (181222735.py, line 19)