## Generate lon/lat for ICOS 2023 sites from the SITEINFO files
This is a step that may be different for every eddy covariance data source. Before running the pipeline,
it's important that each set of sites is accompanied by a `site_meta.csv` file listing the sites, their IGBP classifications, longitude / latitude coordinates, etc.

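This particular notebook generates the site metadata files for icos-2023 (from each site's SITEINFO file) and icos-ww (from `misc/icos_stations.csv`). Note that the code below writes these files as `site_data.csv`, whereas the paragraph above refers to `site_meta.csv`; confirm which name the pipeline expects.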

In [2]:
import pandas as pd
import os

In [19]:
# Build the icos-2023 site table (site ID, latitude, longitude, elevation,
# IGBP class) from each site's SITEINFO file and write it as a CSV.
# NOTE(review): the notebook introduction calls this file `site_meta.csv`,
# but the path written below is `site_data.csv` -- confirm which name the
# pipeline expects.
DATA_DIR = os.path.join('data', 'raw', 'icos-2023', 'unzipped')

# SITEINFO variables to extract, in the same order as the output columns.
SITEINFO_VARIABLES = ['LOCATION_LAT', 'LOCATION_LONG', 'LOCATION_ELEV', 'IGBP']


def _first_siteinfo_value(siteinfo, variable):
    """Return the first DATAVALUE whose VARIABLE equals `variable`.

    Returns None when the table holds no row for that variable, so the
    caller can flag the site instead of crashing on an empty lookup.
    """
    matches = siteinfo.loc[siteinfo['VARIABLE'] == variable, 'DATAVALUE']
    if matches.empty:
        return None
    return matches.iloc[0]


# Only directories are treated as sites: a stray file in DATA_DIR would make
# the per-site os.listdir below raise. Sorting keeps the CSV row order
# reproducible, because os.listdir returns entries in arbitrary order.
icos_2023_sites = sorted(
    entry for entry in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, entry))
)
icos_2023_data = []
icos_2023_problem_sites = []  # sites for which no complete metadata was found
for site in icos_2023_sites:
    site_dir = os.path.join(DATA_DIR, site)
    # Candidate files have 'SITEINFO' in their name; names that also contain
    # 'Variables' are excluded. Sorted so the file picked is deterministic.
    fluxnet_compatible_files = sorted(
        f for f in os.listdir(site_dir)
        if 'SITEINFO' in f and 'Variables' not in f
    )
    if not fluxnet_compatible_files:
        icos_2023_problem_sites.append(site)
        continue

    filepath = os.path.join(site_dir, fluxnet_compatible_files[0])
    # Malformed lines are skipped rather than aborting the whole site.
    df = pd.read_csv(filepath, on_bad_lines='skip')
    lat, lon, elev, igbp = (
        _first_siteinfo_value(df, variable) for variable in SITEINFO_VARIABLES
    )
    if any(value is None for value in (lat, lon, elev, igbp)):
        # A SITEINFO file lacking any of the required variables is reported
        # as a problem site instead of raising a KeyError mid-run.
        icos_2023_problem_sites.append(site)
        continue

    icos_2023_data.append((site, lat, lon, elev, igbp))

icos_2023_df = pd.DataFrame(
    data=icos_2023_data,
    columns=['SITE_ID', 'LOCATION_LAT', 'LOCATION_LON', 'LOCATION_ELEV', 'IGBP'],
)
icos_2023_df.to_csv(os.path.join('data', 'raw', 'icos-2023', 'site_data.csv'), index=False)

# Display the sites that were left out so they are not dropped silently.
icos_2023_problem_sites

In [10]:
# Load the ICOS station listing and keep only the rows that have a
# 'Location' value, since the coordinates are parsed from that column.
raw_stations = pd.read_csv('misc/icos_stations.csv')
df = raw_stations.loc[raw_stations['Location'].notna()]

data = []
station_fields = zip(df['Id'], df['Location'], df['Elevation above sea'], df['Site type'])
for sid, location, elev, sitetype in station_fields:
    # Drop surrounding parentheses, split on whitespace and strip commas from
    # each token; the first number is taken as latitude, the second as
    # longitude.
    tokens = location.strip('()').split()
    lat, lon = (float(token.strip(',')) for token in tokens)
    data.append((sid, lat, lon, elev, sitetype))

icos_df = pd.DataFrame(
    data=data,
    columns=['SITE_ID', 'LOCATION_LAT', 'LOCATION_LON', 'LOCATION_ELEV', 'SITE_TYPE_RAW'],
)

In [9]:
# Lookup from a lowercase site-type label to an IGBP code.
# Labels are grouped by the code they map to; labels with no IGBP
# equivalent map to the empty string.
_labels_by_igbp = {
    'ENF': ['evergreen needleleaf forests'],
    'EBF': ['evergreen broadleaf forests'],
    'DNF': ['deciduous needleleaf forests'],
    'DBF': ['deciduous broadleaf forests'],
    'MF': ['mixed forests', 'forest'],
    'CSH': ['closed shrublands'],
    'OSH': ['open shrublands'],
    'SAV': ['savannas'],
    'GRA': ['grasslands', 'grassland'],
    'WET': ['permanent wetlands', 'wetland'],
    'CRO': ['croplands', 'cropland'],
    'URB': ['urban', 'urban and built-up lands'],
    'WAT': ['water bodies', 'buoy, open sea'],
}

_labels_without_igbp = [
    'tall tower',
    'fos buoy',
    'ground',
    'mountain',
    'coastal',
    'mft',
    'profiling station',
    'soop',
    'fen',
    'fos',
    '(tall) tower',
    'tower',
    'fixed station',
    'marine remote',
    'coastal/continental',
    'continental',
    'baltic sea',
    'non-forested island on sea',
    'surface, land',
    'remote arctic',
]

type_map = {label: '' for label in _labels_without_igbp}
for _code, _labels in _labels_by_igbp.items():
    for _label in _labels:
        type_map[_label] = _code

In [16]:
# Translate each station's raw site-type label into an IGBP code.
# Every key in `type_map` is lowercase, so the labels are normalised (case
# and surrounding whitespace) before the lookup; without this a label such
# as 'Grasslands' would miss the table and come out as NaN. Missing labels
# become the string 'nan' after astype(str), which is not a key, so they
# still map to NaN as before.
normalized_site_type = icos_df['SITE_TYPE_RAW'].astype(str).str.strip().str.lower()
icos_df['IGBP'] = normalized_site_type.map(type_map)
# The raw label is only needed for the lookup above.
icos_df.drop(columns=['SITE_TYPE_RAW'], inplace=True)

In [17]:
# Write the icos-ww site table: the rows of `icos_df` whose SITE_ID matches
# an entry in the icos-ww unzipped directory.
icos_ww_dir = os.path.join('data', 'raw', 'icos-ww', 'unzipped')
icos_ww_sites = os.listdir(icos_ww_dir)

has_ww_data = icos_df['SITE_ID'].isin(icos_ww_sites)
icos_ww_df = icos_df.loc[has_ww_data].reset_index(drop=True)
icos_ww_df.to_csv(os.path.join('data', 'raw', 'icos-ww', 'site_data.csv'), index=False)

# Display the directory entries that found no matching row in `icos_df`
# (and are therefore absent from the CSV just written).
matched_site_ids = set(icos_ww_df['SITE_ID'])
[site for site in icos_ww_sites if site not in matched_site_ids]

['DE-Akm',
 'CH-Aws',
 'SE-Ros',
 'CH-Cha',
 'CZ-KrP',
 'CH-Oe2',
 'ES-Agu',
 'IL-Yat',
 'BE-Lcr',
 'RU-Fyo',
 'ES-LJu',
 'IT-Lav',
 'CH-Lae',
 'CH-Fru',
 'DK-Gds',
 'ES-Abr',
 'ES-Cnd',
 'FI-Qvd',
 'RU-Fy2',
 'DE-Obe',
 'CZ-Stn',
 'ES-LM2',
 'ES-LM1',
 'CZ-RAJ']