In [1]:
import pandas as pd
import rasterio
from rasterio.warp import transform
import pprint

from oaklib import get_adapter

In [17]:
# Input and output locations, given relative to the notebook's working directory
tsv_file = "myrold_attributes.tsv"  # tab-separated sample attribute table (input)
nlcd_geotiff = "Annual_NLCD_LndCov_2023_CU_C1V0.tif"  # NLCD land-cover GeoTIFF (input)
output_file = "myrold_attributes_with_land_cover.tsv"  # annotated table (output)

In [3]:
# Selector string later handed to oaklib's get_adapter() to open ENVO
envo_adapter_string = "sqlite:obo:envo"

In [4]:
# Names of the coordinate columns, as they appear once the TSV headers
# have been lower-cased and stripped
lat_col, lon_col = "latitude", "longitude"

In [5]:
# Look up the raster value underneath a single latitude/longitude point
def get_land_cover(lat, lon, dataset):
    """Return the raster value at a geographic point, or None if it cannot be read.

    Args:
        lat: Latitude of the point, interpreted as EPSG:4326.
        lon: Longitude of the point, interpreted as EPSG:4326.
        dataset: Open raster exposing ``.crs`` and ``.sample(coords)``
            (here: the handle returned by ``rasterio.open``).

    Returns:
        ``int`` of the first element of the first sampled value
        (presumably band 1 — confirm for multi-band rasters), or ``None``
        when sampling yields nothing or any exception is raised.
    """
    try:
        # The raster is not necessarily stored in lat/lon, so reproject first.
        raster_crs = dataset.crs
        xs, ys = transform('EPSG:4326', raster_crs, [lon], [lat])

        # Only one point is requested, so only the first sample matters.
        point = [(xs[0], ys[0])]
        first_sample = next(iter(dataset.sample(point)), None)
        if first_sample is not None:
            return int(first_sample[0])
    except Exception as e:
        # Best effort: report the failing point and keep going.
        print(f"Error processing point ({lat}, {lon}): {e}")
        return None
    return None

In [7]:
# Instantiate the ontology adapter described by envo_adapter_string.
# NOTE(review): presumably this resolves to a local SQLite build of ENVO and may
# download it on first use — confirm against the oaklib documentation.
envo_adapter = get_adapter(envo_adapter_string)

In [8]:
# Get the CURIEs of every non-obsolete entity in the ontology.
# They are materialised into a list because entities() may hand back a one-shot
# iterator (NOTE(review): it appears to for oaklib's SQL adapter — confirm).
# A consumed iterator would make any later loop over envo_curies silently see
# nothing when that cell is re-run on its own.
envo_curies = list(envo_adapter.entities(filter_obsoletes=True))

In [9]:
# Build a lookup from NLCD class code (string, e.g. "41") to the ENVO term
# that cites it as a database cross-reference.
nlcd_to_envo = {}

for envo_curie in envo_curies:
    term_metadata = envo_adapter.entity_metadata_map(envo_curie)
    db_xrefs = term_metadata.get('oio:hasDbXref', []) if term_metadata else []

    # Keep only the numeric part of xrefs shaped like "NLCD:<code>".
    nlcd_codes = [xref.split(":")[1] for xref in db_xrefs if xref.startswith("NLCD:")]
    if not nlcd_codes:
        continue

    # Fetch the label only for terms that actually carry an NLCD xref;
    # asking for it on every entity costs one extra lookup per ontology term.
    term_label = envo_adapter.label(envo_curie)
    for nlcd_numeric in nlcd_codes:
        # If several terms cite the same code, the last one iterated wins.
        nlcd_to_envo[nlcd_numeric] = {
            "envo_curie": envo_curie,
            "envo_label": term_label,
        }


In [10]:
# Debug aid: uncomment to pretty-print the nlcd_to_envo mapping.
# pprint.pprint(nlcd_to_envo)

{'11': {'envo_curie': 'ENVO:01000666', 'envo_label': 'area of open water'},
 '12': {'envo_curie': 'ENVO:01000746',
        'envo_label': 'area of perennial ice or snow'},
 '21': {'envo_curie': 'ENVO:01000883',
        'envo_label': 'area of developed open space'},
 '22': {'envo_curie': 'ENVO:01000884',
        'envo_label': 'area of developed space with low usage intensity'},
 '23': {'envo_curie': 'ENVO:01000885',
        'envo_label': 'area of developed space with medium usage intensity'},
 '24': {'envo_curie': 'ENVO:01000886',
        'envo_label': 'area of developed space with high usage intensity'},
 '31': {'envo_curie': 'ENVO:01000752', 'envo_label': 'area of barren land'},
 '41': {'envo_curie': 'ENVO:01000816',
        'envo_label': 'area of deciduous forest'},
 '42': {'envo_curie': 'ENVO:01000843',
        'envo_label': 'area of evergreen forest'},
 '43': {'envo_curie': 'ENVO:01000855', 'envo_label': 'area of mixed forest'},
 '51': {'envo_curie': 'ENVO:01000861', 'envo_label': '

In [11]:
# Read the tab-separated sample attribute table into a DataFrame
# (read_table uses a tab as its default separator)
df = pd.read_table(tsv_file)

In [12]:
# Normalise the headers: lower-case each name, then trim surrounding whitespace,
# so that lat_col / lon_col match regardless of how the TSV spells them
df.columns = df.columns.map(lambda name: name.lower().strip())

In [13]:
# Open the NLCD GeoTIFF file
with rasterio.open(nlcd_geotiff) as src:
    # Sample the raster once per row, using the configured coordinate columns.
    land_cover_raw = df.apply(
        lambda row: get_land_cover(row[lat_col], row[lon_col], src), axis=1
    )

# get_land_cover returns None for points it cannot read. A plain integer column
# cannot hold missing values, so pandas would upcast the codes to float
# (43 -> 43.0), which corrupts the TSV and breaks string-keyed lookups.
# The nullable Int64 dtype keeps the codes integral and stores failures as <NA>.
df['land_cover'] = land_cover_raw.astype('Int64')

In [15]:
# Map ENVO curies and labels from the dictionary
def _envo_field(nlcd_value, field):
    """Return one field ('envo_curie' or 'envo_label') for an NLCD code.

    Args:
        nlcd_value: Land-cover code as stored in df['land_cover']; may be an
            int, a float such as 43.0 (when the column was upcast because of
            missing values), or a missing value (None / NaN / <NA>).
        field: Key to read from the matching nlcd_to_envo entry.

    Returns:
        The requested value, or None when the code is missing or unmapped.
    """
    if pd.isna(nlcd_value):
        return None
    # nlcd_to_envo is keyed by integer strings ("43"); str(43.0) would give
    # "43.0" and never match, so go through int() first.
    nlcd_key = str(int(nlcd_value))
    return nlcd_to_envo.get(nlcd_key, {}).get(field)


df['envo_curie'] = df['land_cover'].map(lambda code: _envo_field(code, 'envo_curie'))
df['envo_label'] = df['land_cover'].map(lambda code: _envo_field(code, 'envo_label'))


In [16]:
# Display the annotated table to eyeball the new land_cover / envo_* columns
df

Unnamed: 0,accession,ena first public,ena last update,ena-checklist,ena-first-public,ena-last-update,external id,insdc center alias,insdc center name,insdc first public,...,scientific_name,soil type,store_cond,study_sample_number,tier,water content,water_content_soil_meth,land_cover,envo_curie,envo_label
0,SAMEA7724195,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724195,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,1,,1.2109,gravimetric,43,ENVO:01000855,area of mixed forest
1,SAMEA7724196,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724196,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,2,,0.9845,gravimetric,43,ENVO:01000855,area of mixed forest
2,SAMEA7724197,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724197,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,3,,0.7961,gravimetric,43,ENVO:01000855,area of mixed forest
3,SAMEA7724198,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724198,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,4,,2.0964,gravimetric,43,ENVO:01000855,area of mixed forest
4,SAMEA7724199,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724199,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,5,,1.704,gravimetric,43,ENVO:01000855,area of mixed forest
5,SAMEA7724200,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724200,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,6,,1.5223,gravimetric,43,ENVO:01000855,area of mixed forest
6,SAMEA7724201,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724201,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,7,,0.5692,gravimetric,42,ENVO:01000843,area of evergreen forest
7,SAMEA7724202,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724202,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,8,,0.7353,gravimetric,42,ENVO:01000843,area of evergreen forest
8,SAMEA7724203,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724203,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,9,,0.6717,gravimetric,42,ENVO:01000843,area of evergreen forest
9,SAMEA7724204,2020-12-17,2020-12-16,ERC000011,,,SAMEA7724204,UCSDMI,University of California San Diego Microbiome ...,2020-12-17T04:08:06Z,...,,,-80,10,,0.6048,gravimetric,42,ENVO:01000843,area of evergreen forest


In [18]:
# Write the annotated table as TSV; index=False leaves the row index out of the file
df.to_csv(output_file, sep="\t", index=False)