In [91]:
import pandas as pd
import xarray as xr
import numpy as np
import os, cdsapi


from carbonpipeline.cli import _merge_unzipped
from carbonpipeline.constants import *
from carbonpipeline.processing_utils import *


# Widen pandas' rendering: 200-character display width, no cap on the number
# of columns shown, and no wrapping of wide frames across several lines.
_DISPLAY_OPTIONS = {
    "display.width": 200,
    "display.max_columns": None,
    "display.expand_frame_repr": False,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [92]:
# Load data.csv into `df`.
# NOTE(review): `df` is not referenced by any later cell visible in this
# notebook — confirm whether this load is still needed.
df = pd.read_csv("data.csv")

# Request code for CO2

In [None]:
# Dataset identifier and request payload handed to the CDS API client.
dataset = "satellite-carbon-dioxide"
request = dict(
    processing_level=["level_3"],
    variable="xco2",
    sensor_and_algorithm="merged_obs4mips",
    version=["4_5"],
)

# Submit the request and download the result (no target path is given here).
client = cdsapi.Client()
client.retrieve(dataset, request).download()

2025-06-06 13:41:38,327 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-06-06 13:41:38,732 INFO Request ID is ccd88653-edec-4bf0-ade8-7f7f9634a392
2025-06-06 13:41:38,884 INFO status has been updated to accepted


### Opening and displaying the corresponding NetCDF file

In [None]:
# Open the XCO2 NetCDF file and flatten it into a DataFrame, keeping only the
# first entry along the `bnds` and `pressure` dimensions.
co2_nc_path = "unzip/200301_202212-C3S-L3_XCO2-GHG_PRODUCTS-MERGED-MERGED-OBS4MIPS-MERGED-v4.5.nc"
ds_co2 = xr.open_dataset(co2_nc_path, engine="netcdf4")
df_co2 = ds_co2.isel({"bnds": 0, "pressure": 0}).to_dataframe()

In [None]:
# Every entry of the first index level for the cell whose second and third
# index levels are both 47.5 (presumably lat/lon — confirm the level order).
df_co2.loc[pd.IndexSlice[:, 47.5, 47.5]]

# Merging ERA5 NetCDF files

In [None]:
# os.walk yields the root first and then every nested directory; all of those
# directory paths are handed to the project helper (presumably it merges the
# unzipped ERA5 files it finds there into one dataset — see carbonpipeline.cli).
unzip_dirs = [dirpath for dirpath, _subdirs, _files in os.walk("./datasets/unzip")]
ds_era5 = _merge_unzipped(unzip_dirs)
df_era5 = ds_era5.to_dataframe()

In [None]:
# Show the ERA5 frame produced by the merge above.
df_era5

# Adding CO2 column to ERA5 dataframe

#### Work with df_era5 index without modifying it

In [None]:

# Turn the ERA5 index levels into ordinary columns of a separate frame
# (index=False gives that frame a fresh 0..n-1 index); df_era5 is left as is.
era5_index = (
    df_era5.index
    .to_frame(index=False)
    .copy(deep=True)
)

In [None]:
# Show the index levels of df_era5 as plain columns.
era5_index

#### Extract the month (YYYY-MM) from "valid_time"

In [None]:
# Monthly period of each timestamp (e.g. 2003-03); used later as a merge key.
valid_time_accessor = era5_index["valid_time"].dt
era5_index["year_month"] = valid_time_accessor.to_period(freq="M")

In [None]:
# Show era5_index again, now including the year_month column.
era5_index

#### Concatenate the data with the index to recreate a flat DF

In [None]:
# Drop the index from the ERA5 data, then put the index columns (era5_index)
# to the left of the data columns. Both frames carry a default 0..n-1 index
# derived from the same df_era5, so rows pair up one-to-one.
df_era5_flat = df_era5.reset_index(drop=True)
df_era5_temp = pd.concat(objs=[era5_index, df_era5_flat], axis="columns")

In [None]:
# Show the flat ERA5 frame (index columns followed by data columns).
df_era5_temp

#### Prepare df_co2

In [None]:
# Flatten the CO2 frame (index levels become columns) and add the monthly
# period of its `time` column as `year_month`.
df_co2_index_reset = df_co2.reset_index().assign(
    year_month=lambda frame: pd.to_datetime(frame["time"]).dt.to_period("M")
)

In [None]:
# Show the flattened CO2 frame with its year_month column.
df_co2_index_reset

In [None]:
# Keep the coordinates (time/lat/lon) and the xco2 values as plain columns.
# Later cells read df_co2["lat"], df_co2["lon"] and df_co2["time"], so the
# coordinates have to be columns; selecting only the "xco2" column left them
# in the index, which makes those lookups raise KeyError on a fresh kernel.
# The isinstance guard keeps this cell re-runnable once df_co2 is already flat.
co2_flat = df_co2.reset_index() if isinstance(df_co2.index, pd.MultiIndex) else df_co2

# Build a new frame rather than assigning into a slice (this avoids pandas'
# SettingWithCopyWarning) and replace the 1e20 placeholder — presumably the
# file's fill value, confirm against the NetCDF attributes — with NaN.
df_co2 = co2_flat[["time", "lat", "lon", "xco2"]].assign(
    xco2=lambda frame: frame["xco2"].replace(np.float32(1e20), np.nan)
)

#### Merge on month + lat/lon

In [None]:
# Recreate df_era5_flat / df_era5_temp from the current era5_index
# (the same two statements as the earlier concatenation cell).
df_era5_flat = df_era5.reset_index(drop=True)
df_era5_temp = pd.concat((era5_index, df_era5_flat), axis=1, join="outer")

##### Obtain available lat/lon in df_co2

In [None]:
# Distinct grid coordinates present in the CO2 frame, in order of first
# appearance (Series.unique does not sort).
b_lats, b_lons = (df_co2[axis_name].unique() for axis_name in ("lat", "lon"))

def match_to_closest(values, reference_points):
    """Snap each value to the nearest entry of ``reference_points``.

    Parameters
    ----------
    values : array-like, 1-D
        Numbers to snap (e.g. fine-grid coordinates).
    reference_points : array-like, 1-D
        Candidate numbers to snap onto (e.g. coarse-grid coordinates).

    Returns
    -------
    numpy.ndarray
        One entry per element of ``values``, each taken from
        ``reference_points``. When a value is equally close to several
        reference points, the one appearing first in ``reference_points``
        wins. An empty ``values`` gives an empty array.

    Raises
    ------
    ValueError
        If ``values`` is non-empty but ``reference_points`` is empty.

    Notes
    -----
    The nearest-point search runs once per *distinct* value, so temporary
    memory is proportional to ``n_distinct_values * len(reference_points)``.
    """
    values = np.asarray(values).reshape(-1)
    reference_points = np.asarray(reference_points).reshape(-1)

    if values.size == 0:
        return np.array([])
    if reference_points.size == 0:
        raise ValueError("reference_points must contain at least one value")

    # Inputs in this notebook are gridded coordinates that repeat across
    # millions of rows; searching once per distinct value and mapping the
    # result back replaces a Python-level loop over every row.
    distinct_values, inverse = np.unique(values, return_inverse=True)
    distances = np.abs(distinct_values[:, None] - reference_points[None, :])

    # argmin returns the first minimum along the axis, so ties resolve to the
    # earliest reference point — the same rule as a per-element argmin.
    nearest = reference_points[distances.argmin(axis=1)]
    return nearest[inverse.reshape(-1)]

##### Apply to era5_index

In [None]:
# For each ERA5 coordinate column, store the nearest CO2 grid coordinate in a
# companion "<name>_rounded" column.
for _coord_name, _co2_grid in (("latitude", b_lats), ("longitude", b_lons)):
    era5_index[f"{_coord_name}_rounded"] = match_to_closest(
        era5_index[_coord_name].values, _co2_grid
    )

In [None]:
# Show era5_index with the latitude_rounded / longitude_rounded columns added.
era5_index

Unnamed: 0,valid_time,latitude,longitude,year_month,latitude_rounded,longitude_rounded
0,2003-03-01 00:00:00,90.0,-180.00,2003-03,87.5,-177.5
1,2003-03-01 00:00:00,90.0,-179.75,2003-03,87.5,-177.5
2,2003-03-01 00:00:00,90.0,-179.50,2003-03,87.5,-177.5
3,2003-03-01 00:00:00,90.0,-179.25,2003-03,87.5,-177.5
4,2003-03-01 00:00:00,90.0,-179.00,2003-03,87.5,-177.5
...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03,-87.5,177.5
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03,-87.5,177.5
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03,-87.5,177.5
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03,-87.5,177.5


##### Add month + nearest-grid coordinates as merge keys

In [None]:
# ERA5 side: monthly period plus the nearest-CO2-grid coordinates, copied from
# era5_index (both frames share the same 0..n-1 index).
df_era5_temp["year_month"] = era5_index["valid_time"].dt.to_period("M")
for _rounded_column in ("latitude_rounded", "longitude_rounded"):
    df_era5_temp[_rounded_column] = era5_index[_rounded_column]

# CO2 side: monthly period derived from its `time` column.
co2_timestamps = pd.to_datetime(df_co2["time"])
df_co2["year_month"] = co2_timestamps.dt.to_period("M")

In [None]:
# Show df_era5_temp with its merge-key columns.
# NOTE(review): the saved output below already lists an `xco2` column although
# the merge cell comes later — this looks like state left over from an earlier
# execution; re-check after Restart & Run All.
df_era5_temp

Unnamed: 0,valid_time,latitude,longitude,year_month,latitude_rounded,longitude_rounded,sp,t2m,xco2
0,2003-03-01 00:00:00,90.0,-180.00,2003-03,87.5,-177.5,101938.593750,241.047729,
1,2003-03-01 00:00:00,90.0,-179.75,2003-03,87.5,-177.5,101938.593750,241.047729,
2,2003-03-01 00:00:00,90.0,-179.50,2003-03,87.5,-177.5,101938.593750,241.047729,
3,2003-03-01 00:00:00,90.0,-179.25,2003-03,87.5,-177.5,101938.593750,241.047729,
4,2003-03-01 00:00:00,90.0,-179.00,2003-03,87.5,-177.5,101938.593750,241.047729,
...,...,...,...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03,-87.5,177.5,69759.601562,238.342026,
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03,-87.5,177.5,69759.601562,238.342026,
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03,-87.5,177.5,69759.601562,238.342026,
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03,-87.5,177.5,69759.601562,238.342026,


##### Merge

In [None]:
# Left-join the CO2 values onto the ERA5 rows, keyed on the month and the
# nearest-grid coordinates. how="left" keeps every ERA5 row (rows without a
# match get NaN). validate="many_to_one" makes pandas raise MergeError if the
# CO2 side holds duplicate (year_month, lat, lon) keys; duplicates would
# otherwise multiply ERA5 rows and break the positional copy back into
# df_era5 in the next cell.
merged_nearest = df_era5_temp.merge(
    df_co2,
    left_on=["year_month", "latitude_rounded", "longitude_rounded"],
    right_on=["year_month", "lat", "lon"],
    how="left",
    validate="many_to_one",
)

In [None]:
# merge() only appends the "_y" suffix when the left frame already had an
# "xco2" column, i.e. when df_era5 was enriched by an earlier run of this
# cell. On a fresh kernel the CO2 column is simply "xco2", so pick whichever
# name is actually present instead of hard-coding "xco2_y".
xco2_source = "xco2_y" if "xco2_y" in merged_nearest.columns else "xco2"

# The values are copied back positionally, so the row counts must agree.
if len(merged_nearest) != len(df_era5):
    raise ValueError(
        f"merged frame has {len(merged_nearest)} rows but df_era5 has {len(df_era5)}"
    )
df_era5["xco2"] = merged_nearest[xco2_source].to_numpy()

# Evaluate and compare

In [None]:
# All timestamps for the ERA5 point at latitude 47.5, longitude 47.5.
df_era5.loc[pd.IndexSlice[:, 47.5, 47.5]]

Unnamed: 0_level_0,sp,t2m,xco2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-03-01 00:00:00,103843.59375,261.188354,0.000378
2003-03-01 01:00:00,103870.40625,260.427399,0.000378
2003-03-01 02:00:00,103846.648438,260.079407,0.000378
2003-03-01 03:00:00,103898.492188,260.526367,0.000378
2003-03-01 04:00:00,103854.53125,261.259521,0.000378
2003-03-01 05:00:00,103901.578125,264.013062,0.000378
2003-03-01 06:00:00,103910.898438,263.359314,0.000378
2003-03-01 07:00:00,103914.601562,263.777588,0.000378


In [None]:
# All timestamps for the ERA5 point at latitude 49.5, longitude 46.5 — a
# different ERA5 point, shown for comparison with the (47.5, 47.5) lookup.
df_era5.loc[pd.IndexSlice[:, 49.5, 46.5]]

Unnamed: 0_level_0,sp,t2m,xco2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-03-01 00:00:00,103457.59375,258.496948,0.000378
2003-03-01 01:00:00,103460.40625,258.802399,0.000378
2003-03-01 02:00:00,103412.648438,258.823547,0.000378
2003-03-01 03:00:00,103456.492188,258.866211,0.000378
2003-03-01 04:00:00,103383.53125,258.74585,0.000378
2003-03-01 05:00:00,103416.578125,261.679077,0.000378
2003-03-01 06:00:00,103417.898438,261.140564,0.000378
2003-03-01 07:00:00,103446.601562,260.525635,0.000378
