In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import os, cdsapi


from carbonpipeline.cli import _merge_unzipped
from carbonpipeline.constants import *
from carbonpipeline.processing_utils import *


pd.set_option('display.width', 200)          
pd.set_option('display.max_columns', None)    
pd.set_option('display.expand_frame_repr', False) 

In [2]:
df = pd.read_csv("data.csv")

# Request code for CO2

In [3]:
dataset = "satellite-carbon-dioxide"
request = {
    "processing_level": ["level_3"],
    "variable": "xco2",
    "sensor_and_algorithm": "merged_obs4mips",
    "version": ["4_5"]
}

client = cdsapi.Client()
client.retrieve(dataset, request).download()

2025-06-06 14:53:04,808 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-06-06 14:53:06,531 INFO Request ID is 6a9979ac-6706-4782-a8f4-53f631ce798a
2025-06-06 14:53:06,699 INFO status has been updated to accepted
2025-06-06 14:53:20,875 INFO status has been updated to running
2025-06-06 14:53:57,605 INFO status has been updated to successful
                                                                                         

'9ef3ecd10dfa260ba8acdda4e58d6c6.zip'

### Opening and displaying the corresponding NetCDF file

In [84]:
ds_co2 = xr.open_dataset("datasets/unzip/CO2_2003-2022/200301_202212-C3S-L3_XCO2-GHG_PRODUCTS-MERGED-MERGED-OBS4MIPS-MERGED-v4.5.nc", engine="netcdf4")
df_co2 = ds_co2.isel(bnds=0, pressure=0).to_dataframe()

In [87]:
df_co2         = df_co2[["xco2"]]
df_co2["xco2"] = df_co2["xco2"].replace(np.float32(1e20), np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_co2["xco2"] = df_co2["xco2"].replace(np.float32(1e20), np.nan)


In [88]:
df_co2.loc[(slice(None), 47.5, 47.5)]

Unnamed: 0_level_0,xco2
time,Unnamed: 1_level_1
2003-01-16 12:00:00,
2003-02-15 00:00:00,
2003-03-16 12:00:00,0.000378
2003-04-16 00:00:00,0.000379
2003-05-16 12:00:00,0.000375
...,...
2022-08-16 12:00:00,0.000414
2022-09-16 00:00:00,0.000412
2022-10-16 12:00:00,0.000415
2022-11-16 00:00:00,


# Merging ERA5 NetCDF files

In [33]:
ds_era5 = _merge_unzipped([x[0] for x in os.walk("./datasets/unzip")])
df_era5 = ds_era5.to_dataframe()

In [34]:
df_era5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sp,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2003-03-01 00:00:00,90.0,-180.00,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.75,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.50,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.25,101938.593750,241.047729
2003-03-01 00:00:00,90.0,-179.00,101938.593750,241.047729
...,...,...,...,...
2003-03-01 07:00:00,-90.0,178.75,69759.601562,238.342026
2003-03-01 07:00:00,-90.0,179.00,69759.601562,238.342026
2003-03-01 07:00:00,-90.0,179.25,69759.601562,238.342026
2003-03-01 07:00:00,-90.0,179.50,69759.601562,238.342026


# Adding CO2 column to ERA5 dataframe

#### Work with df_era5 index without modifying it

In [43]:
df_era5_index_reseted = df_era5.copy().reset_index()

In [44]:
df_era5_index_reseted

Unnamed: 0,valid_time,latitude,longitude,sp,t2m
0,2003-03-01 00:00:00,90.0,-180.00,101938.593750,241.047729
1,2003-03-01 00:00:00,90.0,-179.75,101938.593750,241.047729
2,2003-03-01 00:00:00,90.0,-179.50,101938.593750,241.047729
3,2003-03-01 00:00:00,90.0,-179.25,101938.593750,241.047729
4,2003-03-01 00:00:00,90.0,-179.00,101938.593750,241.047729
...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,69759.601562,238.342026
8305916,2003-03-01 07:00:00,-90.0,179.00,69759.601562,238.342026
8305917,2003-03-01 07:00:00,-90.0,179.25,69759.601562,238.342026
8305918,2003-03-01 07:00:00,-90.0,179.50,69759.601562,238.342026


#### Extract the month (YYYY-MM) from "valide_time"

In [45]:
df_era5_index_reseted["year_month"] = df_era5_index_reseted["valid_time"].dt.to_period("M")

In [46]:
df_era5_index_reseted

Unnamed: 0,valid_time,latitude,longitude,sp,t2m,year_month
0,2003-03-01 00:00:00,90.0,-180.00,101938.593750,241.047729,2003-03
1,2003-03-01 00:00:00,90.0,-179.75,101938.593750,241.047729,2003-03
2,2003-03-01 00:00:00,90.0,-179.50,101938.593750,241.047729,2003-03
3,2003-03-01 00:00:00,90.0,-179.25,101938.593750,241.047729,2003-03
4,2003-03-01 00:00:00,90.0,-179.00,101938.593750,241.047729,2003-03
...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,69759.601562,238.342026,2003-03
8305916,2003-03-01 07:00:00,-90.0,179.00,69759.601562,238.342026,2003-03
8305917,2003-03-01 07:00:00,-90.0,179.25,69759.601562,238.342026,2003-03
8305918,2003-03-01 07:00:00,-90.0,179.50,69759.601562,238.342026,2003-03


#### Prepare df_co2

In [None]:
df_co2_index_reseted               = df_co2.copy().reset_index()
df_co2_index_reseted["year_month"] = pd.to_datetime(df_co2_index_reseted["time"]).dt.to_period("M")

In [48]:
df_co2_index_reseted

Unnamed: 0,time,lat,lon,xco2,year_month
0,2003-01-16 12:00:00,-87.5,-177.5,,2003-01
1,2003-01-16 12:00:00,-87.5,-172.5,,2003-01
2,2003-01-16 12:00:00,-87.5,-167.5,,2003-01
3,2003-01-16 12:00:00,-87.5,-162.5,,2003-01
4,2003-01-16 12:00:00,-87.5,-157.5,,2003-01
...,...,...,...,...,...
622075,2022-12-16 12:00:00,87.5,157.5,,2022-12
622076,2022-12-16 12:00:00,87.5,162.5,,2022-12
622077,2022-12-16 12:00:00,87.5,167.5,,2022-12
622078,2022-12-16 12:00:00,87.5,172.5,,2022-12


##### Obtain available lat/lon in df_co2

In [None]:
b_lats = df_co2_index_reseted["lat"].unique()
b_lons = df_co2_index_reseted["lon"].unique()

def match_to_closest(values, reference_points):
    return np.array([reference_points[np.abs(reference_points - v).argmin()] for v in values])

##### Apply to df_era5_index

In [51]:
df_era5_index_reseted["latitude_rounded"]  = match_to_closest(df_era5_index_reseted["latitude"].values, b_lats)
df_era5_index_reseted["longitude_rounded"] = match_to_closest(df_era5_index_reseted["longitude"].values, b_lons)

In [52]:
df_era5_index_reseted

Unnamed: 0,valid_time,latitude,longitude,sp,t2m,year_month,latitude_rounded,longitude_rounded
0,2003-03-01 00:00:00,90.0,-180.00,101938.593750,241.047729,2003-03,87.5,-177.5
1,2003-03-01 00:00:00,90.0,-179.75,101938.593750,241.047729,2003-03,87.5,-177.5
2,2003-03-01 00:00:00,90.0,-179.50,101938.593750,241.047729,2003-03,87.5,-177.5
3,2003-03-01 00:00:00,90.0,-179.25,101938.593750,241.047729,2003-03,87.5,-177.5
4,2003-03-01 00:00:00,90.0,-179.00,101938.593750,241.047729,2003-03,87.5,-177.5
...,...,...,...,...,...,...,...,...
8305915,2003-03-01 07:00:00,-90.0,178.75,69759.601562,238.342026,2003-03,-87.5,177.5
8305916,2003-03-01 07:00:00,-90.0,179.00,69759.601562,238.342026,2003-03,-87.5,177.5
8305917,2003-03-01 07:00:00,-90.0,179.25,69759.601562,238.342026,2003-03,-87.5,177.5
8305918,2003-03-01 07:00:00,-90.0,179.50,69759.601562,238.342026,2003-03,-87.5,177.5


##### Fuse

In [69]:
merged_nearest = df_era5_index_reseted.merge(
    df_co2_index_reseted,
    left_on=["year_month", "latitude_rounded", "longitude_rounded"],
    right_on=["year_month", "lat", "lon"],
    how="left"
)

In [89]:
df_era5["xco2"] = merged_nearest["xco2"].values

# Evaluate and compare

In [74]:
df_era5.loc[(slice(None), 47.5, 47.5)]

Unnamed: 0_level_0,sp,t2m,xco2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-03-01 00:00:00,103843.59375,261.188354,0.000378
2003-03-01 01:00:00,103870.40625,260.427399,0.000378
2003-03-01 02:00:00,103846.648438,260.079407,0.000378
2003-03-01 03:00:00,103898.492188,260.526367,0.000378
2003-03-01 04:00:00,103854.53125,261.259521,0.000378
2003-03-01 05:00:00,103901.578125,264.013062,0.000378
2003-03-01 06:00:00,103910.898438,263.359314,0.000378
2003-03-01 07:00:00,103914.601562,263.777588,0.000378


In [75]:
df_era5.loc[(slice(None), 49.5, 46.5)]

Unnamed: 0_level_0,sp,t2m,xco2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-03-01 00:00:00,103457.59375,258.496948,0.000378
2003-03-01 01:00:00,103460.40625,258.802399,0.000378
2003-03-01 02:00:00,103412.648438,258.823547,0.000378
2003-03-01 03:00:00,103456.492188,258.866211,0.000378
2003-03-01 04:00:00,103383.53125,258.74585,0.000378
2003-03-01 05:00:00,103416.578125,261.679077,0.000378
2003-03-01 06:00:00,103417.898438,261.140564,0.000378
2003-03-01 07:00:00,103446.601562,260.525635,0.000378
