In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import os, cdsapi


from carbonpipeline.cli import _merge_unzipped
from carbonpipeline.constants import *
from carbonpipeline.processing_utils import *


pd.set_option('display.width', 200)          
pd.set_option('display.max_columns', None)    
pd.set_option('display.expand_frame_repr', False) 

In [2]:
df = pd.read_csv("data.csv")

# Request code for CO2

In [3]:
dataset = "satellite-carbon-dioxide"
request = {
    "processing_level": ["level_3"],
    "variable": "xco2",
    "sensor_and_algorithm": "merged_obs4mips",
    "version": ["4_5"]
}

client = cdsapi.Client()
client.retrieve(dataset, request).download()

2025-06-06 14:53:04,808 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-06-06 14:53:06,531 INFO Request ID is 6a9979ac-6706-4782-a8f4-53f631ce798a
2025-06-06 14:53:06,699 INFO status has been updated to accepted
2025-06-06 14:53:20,875 INFO status has been updated to running
2025-06-06 14:53:57,605 INFO status has been updated to successful
                                                                                         

'9ef3ecd10dfa260ba8acdda4e58d6c6.zip'

### Opening and displaying the corresponding NetCDF file

In [4]:
ds_co2 = xr.open_dataset("unzip/200301_202212-C3S-L3_XCO2-GHG_PRODUCTS-MERGED-MERGED-OBS4MIPS-MERGED-v4.5.nc", engine="netcdf4")
df_co2 = ds_co2.isel(bnds=0, pressure=0).to_dataframe()

In [5]:
df_co2.loc[(slice(None), 47.5, 47.5)]

Unnamed: 0_level_0,time_bnds,lat_bnds,lon_bnds,pre,pre_bnds,land_fraction,xco2,xco2_nobs,xco2_stderr,xco2_stddev,column_averaging_kernel,vmr_profile_co2_apriori
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2003-01-16 12:00:00,2003-01-01,45.0,45.0,0.95,1.0,1.0,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20
2003-02-15 00:00:00,2003-02-01,45.0,45.0,0.95,1.0,1.0,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20
2003-03-16 12:00:00,2003-03-01,45.0,45.0,0.95,1.0,1.0,3.783403e-04,10.0,6.714576e-07,1.015373e-06,1.054930e+00,3.847092e-04
2003-04-16 00:00:00,2003-04-01,45.0,45.0,0.95,1.0,1.0,3.790484e-04,59.0,4.976073e-07,2.709809e-06,1.046514e+00,3.845929e-04
2003-05-16 12:00:00,2003-05-01,45.0,45.0,0.95,1.0,1.0,3.754970e-04,61.0,4.969837e-07,1.752463e-06,1.034632e+00,3.770574e-04
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-16 12:00:00,2022-08-01,45.0,45.0,0.95,1.0,1.0,4.142080e-04,16.0,1.251270e-06,3.081072e-06,1.301275e+00,4.107524e-04
2022-09-16 00:00:00,2022-09-01,45.0,45.0,0.95,1.0,1.0,4.123670e-04,24.0,9.914198e-07,1.128311e-06,9.928718e-01,4.136769e-04
2022-10-16 12:00:00,2022-10-01,45.0,45.0,0.95,1.0,1.0,4.147704e-04,12.0,2.336644e-06,1.589073e-06,9.728573e-01,4.206153e-04
2022-11-16 00:00:00,2022-11-01,45.0,45.0,0.95,1.0,1.0,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20


# Merging ERA5 NetCDF files

In [6]:
ds_era5 = _merge_unzipped([x[0] for x in os.walk("./datasets/unzip")])
df_era5 = ds_era5.to_dataframe()

In [7]:
df_era5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sp,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2003-03-01 00:00:00,90.0,-180.00,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.75,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.50,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.25,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.00,101938.593750,241.047729
...,...,...,...,...
2003-03-01 07:00:00,-90.0,178.75,69759.601562,238.342026
2003-03-01 07:00:00,-90.0,179.00,69759.601562,238.342026
2003-03-01 07:00:00,-90.0,179.25,69759.601562,238.342026
2003-03-01 07:00:00,-90.0,179.50,69759.601562,238.342026


# Adding CO2 column to ERA5 dataframe

#### Work with df_era5 index without modifying it

In [8]:

era5_index = df_era5.index.to_frame(index=False).copy()

In [9]:
era5_index

Unnamed: 0,valid_time,latitude,longitude
0,2003-03-01 00:00:00,90.0,-180.00
1,2003-03-01 00:00:00,90.0,-179.75
2,2003-03-01 00:00:00,90.0,-179.50
3,2003-03-01 00:00:00,90.0,-179.25
4,2003-03-01 00:00:00,90.0,-179.00
...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75
8305916,2003-03-01 07:00:00,-90.0,179.00
8305917,2003-03-01 07:00:00,-90.0,179.25
8305918,2003-03-01 07:00:00,-90.0,179.50


#### Extract the month (YYYY-MM) from "valide_time"

In [10]:
era5_index["year_month"] = era5_index["valid_time"].dt.to_period("M")

In [11]:
era5_index

Unnamed: 0,valid_time,latitude,longitude,year_month
0,2003-03-01 00:00:00,90.0,-180.00,2003-03
1,2003-03-01 00:00:00,90.0,-179.75,2003-03
2,2003-03-01 00:00:00,90.0,-179.50,2003-03
3,2003-03-01 00:00:00,90.0,-179.25,2003-03
4,2003-03-01 00:00:00,90.0,-179.00,2003-03
...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03


#### Concatenate the data with the index to recreate a flat DF

In [12]:
df_era5_flat = df_era5.reset_index(drop=True)
df_era5_temp = pd.concat([era5_index, df_era5_flat], axis=1)

In [13]:
df_era5_temp

Unnamed: 0,valid_time,latitude,longitude,year_month,sp,t2m
0,2003-03-01 00:00:00,90.0,-180.00,2003-03,101938.593750,241.047729
1,2003-03-01 00:00:00,90.0,-179.75,2003-03,101938.593750,241.047729
2,2003-03-01 00:00:00,90.0,-179.50,2003-03,101938.593750,241.047729
3,2003-03-01 00:00:00,90.0,-179.25,2003-03,101938.593750,241.047729
4,2003-03-01 00:00:00,90.0,-179.00,2003-03,101938.593750,241.047729
...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03,69759.601562,238.342026
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03,69759.601562,238.342026
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03,69759.601562,238.342026
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03,69759.601562,238.342026


#### Prepare df_co2

In [14]:
df_co2_index_reset               = df_co2.reset_index()
df_co2_index_reset["year_month"] = pd.to_datetime(df_co2_index_reset["time"]).dt.to_period("M")

In [15]:
df_co2_index_reset

Unnamed: 0,time,lat,lon,time_bnds,lat_bnds,lon_bnds,pre,pre_bnds,land_fraction,xco2,xco2_nobs,xco2_stderr,xco2_stddev,column_averaging_kernel,vmr_profile_co2_apriori,year_month
0,2003-01-16 12:00:00,-87.5,-177.5,2003-01-01,-90.0,-180.0,0.95,1.0,0.998163,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2003-01
1,2003-01-16 12:00:00,-87.5,-172.5,2003-01-01,-90.0,-175.0,0.95,1.0,0.999833,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2003-01
2,2003-01-16 12:00:00,-87.5,-167.5,2003-01-01,-90.0,-170.0,0.95,1.0,0.999833,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2003-01
3,2003-01-16 12:00:00,-87.5,-162.5,2003-01-01,-90.0,-165.0,0.95,1.0,0.919236,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2003-01
4,2003-01-16 12:00:00,-87.5,-157.5,2003-01-01,-90.0,-160.0,0.95,1.0,0.839939,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2003-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622075,2022-12-16 12:00:00,87.5,157.5,2022-12-01,85.0,155.0,0.95,1.0,0.000000,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2022-12
622076,2022-12-16 12:00:00,87.5,162.5,2022-12-01,85.0,160.0,0.95,1.0,0.000000,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2022-12
622077,2022-12-16 12:00:00,87.5,167.5,2022-12-01,85.0,165.0,0.95,1.0,0.000000,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2022-12
622078,2022-12-16 12:00:00,87.5,172.5,2022-12-01,85.0,170.0,0.95,1.0,0.000000,1.000000e+20,,1.000000e+20,1.000000e+20,1.000000e+20,1.000000e+20,2022-12


In [16]:
df_co2         = df_co2[df_co2.columns.intersection(["xco2"])]
df_co2["xco2"] = df_co2["xco2"].replace(np.float32(1e20), np.nan)

In [17]:
df_co2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,xco2
time,lat,lon,Unnamed: 3_level_1
2003-01-16 12:00:00,-87.5,-177.5,
2003-01-16 12:00:00,-87.5,-172.5,
2003-01-16 12:00:00,-87.5,-167.5,
2003-01-16 12:00:00,-87.5,-162.5,
2003-01-16 12:00:00,-87.5,-157.5,
...,...,...,...
2022-12-16 12:00:00,87.5,157.5,
2022-12-16 12:00:00,87.5,162.5,
2022-12-16 12:00:00,87.5,167.5,
2022-12-16 12:00:00,87.5,172.5,


#### Proper fusion on month + lat/lon

In [18]:
df_era5_flat = df_era5.reset_index(drop=True)
df_era5_temp = pd.concat([era5_index, df_era5_flat], axis=1)

##### Obtain available lat/lon in df_co2

In [19]:
b_lats = df_co2_index_reset["lat"].unique()
b_lons = df_co2_index_reset["lon"].unique()

def match_to_closest(values, reference_points):
    return np.array([reference_points[np.abs(reference_points - v).argmin()] for v in values])

##### Apply to df_era5_index

In [20]:
era5_index["latitude_rounded"]  = match_to_closest(era5_index["latitude"].values, b_lats)
era5_index["longitude_rounded"] = match_to_closest(era5_index["longitude"].values, b_lons)

In [21]:
era5_index

Unnamed: 0,valid_time,latitude,longitude,year_month,latitude_rounded,longitude_rounded
0,2003-03-01 00:00:00,90.0,-180.00,2003-03,87.5,-177.5
1,2003-03-01 00:00:00,90.0,-179.75,2003-03,87.5,-177.5
2,2003-03-01 00:00:00,90.0,-179.50,2003-03,87.5,-177.5
3,2003-03-01 00:00:00,90.0,-179.25,2003-03,87.5,-177.5
4,2003-03-01 00:00:00,90.0,-179.00,2003-03,87.5,-177.5
...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03,-87.5,177.5
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03,-87.5,177.5
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03,-87.5,177.5
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03,-87.5,177.5


##### Fuse month + approximated coordinates

In [22]:
df_era5_temp["year_month"] = era5_index["valid_time"].dt.to_period("M")
df_era5_temp["latitude_rounded"]  = era5_index["latitude_rounded"]
df_era5_temp["longitude_rounded"] = era5_index["longitude_rounded"]

df_co2["year_month"] = pd.to_datetime(df_co2_index_reset["time"]).dt.to_period("M")

In [23]:
df_era5_temp

Unnamed: 0,valid_time,latitude,longitude,year_month,sp,t2m,latitude_rounded,longitude_rounded
0,2003-03-01 00:00:00,90.0,-180.00,2003-03,101938.593750,241.047729,87.5,-177.5
1,2003-03-01 00:00:00,90.0,-179.75,2003-03,101938.593750,241.047729,87.5,-177.5
2,2003-03-01 00:00:00,90.0,-179.50,2003-03,101938.593750,241.047729,87.5,-177.5
3,2003-03-01 00:00:00,90.0,-179.25,2003-03,101938.593750,241.047729,87.5,-177.5
4,2003-03-01 00:00:00,90.0,-179.00,2003-03,101938.593750,241.047729,87.5,-177.5
...,...,...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03,69759.601562,238.342026,-87.5,177.5
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03,69759.601562,238.342026,-87.5,177.5
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03,69759.601562,238.342026,-87.5,177.5
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03,69759.601562,238.342026,-87.5,177.5


##### Fuse

In [24]:
merged_nearest = df_era5_temp.merge(
    df_co2,
    left_on=["year_month", "latitude_rounded", "longitude_rounded"],
    right_on=["year_month", "lat", "lon"],
    how="left"
)

In [25]:
merged_nearest

Unnamed: 0,valid_time,latitude,longitude,year_month,sp,t2m,latitude_rounded,longitude_rounded,xco2
0,2003-03-01 00:00:00,90.0,-180.00,2003-03,101938.593750,241.047729,87.5,-177.5,
1,2003-03-01 00:00:00,90.0,-179.75,2003-03,101938.593750,241.047729,87.5,-177.5,
2,2003-03-01 00:00:00,90.0,-179.50,2003-03,101938.593750,241.047729,87.5,-177.5,
3,2003-03-01 00:00:00,90.0,-179.25,2003-03,101938.593750,241.047729,87.5,-177.5,
4,2003-03-01 00:00:00,90.0,-179.00,2003-03,101938.593750,241.047729,87.5,-177.5,
...,...,...,...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,2003-03,69759.601562,238.342026,-87.5,177.5,
8305916,2003-03-01 07:00:00,-90.0,179.00,2003-03,69759.601562,238.342026,-87.5,177.5,
8305917,2003-03-01 07:00:00,-90.0,179.25,2003-03,69759.601562,238.342026,-87.5,177.5,
8305918,2003-03-01 07:00:00,-90.0,179.50,2003-03,69759.601562,238.342026,-87.5,177.5,


In [27]:
df_era5["xco2"] = merged_nearest["xco2"].values

# Evaluate and compare

In [28]:
df_era5.loc[(slice(None), 47.5, 47.5)]

Unnamed: 0_level_0,sp,t2m,xco2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-03-01 00:00:00,103843.59375,261.188354,
2003-03-01 01:00:00,103870.40625,260.427399,
2003-03-01 02:00:00,103846.648438,260.079407,
2003-03-01 03:00:00,103898.492188,260.526367,
2003-03-01 04:00:00,103854.53125,261.259521,
2003-03-01 05:00:00,103901.578125,264.013062,
2003-03-01 06:00:00,103910.898438,263.359314,
2003-03-01 07:00:00,103914.601562,263.777588,


In [29]:
df_era5.loc[(slice(None), 49.5, 46.5)]

Unnamed: 0_level_0,sp,t2m,xco2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-03-01 00:00:00,103457.59375,258.496948,
2003-03-01 01:00:00,103460.40625,258.802399,
2003-03-01 02:00:00,103412.648438,258.823547,
2003-03-01 03:00:00,103456.492188,258.866211,
2003-03-01 04:00:00,103383.53125,258.74585,
2003-03-01 05:00:00,103416.578125,261.679077,
2003-03-01 06:00:00,103417.898438,261.140564,
2003-03-01 07:00:00,103446.601562,260.525635,
