# 将模式数据的 $S, T, U, V, W$ 截取到对应的研究区域（$\eta$ 之前已截取过）

In [6]:
import numpy as np
# from PyEMD import EMD, Visualisation, EEMD
import matplotlib.pyplot as plt
import matplotlib as mpt
import matplotlib.colors as mcolors
from matplotlib import ticker  
from matplotlib.gridspec import GridSpec
from matplotlib.ticker import MultipleLocator
from numcodecs import Blosc
import os
import pandas as pd 
import xarray as xr
import xeofs as xe
import cartopy.mpl.ticker
from cartopy import crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.ticker import LongitudeFormatter,LatitudeFormatter
from scipy.io import loadmat, savemat
from scipy.fft import fft, fftfreq, ifft
from scipy.signal import detrend, hilbert, welch, find_peaks, correlate
from scipy.ndimage import gaussian_filter
from scipy.stats import chi2, pearsonr
# from pykalman import KalmanFilter
import zarr
plt.rcParams['font.family'] = 'Times New Roman'   
mpt.rcParams['axes.unicode_minus']=False
np.set_printoptions(suppress=True)
# font={'family':'SimHei',
#     'weight':'bold'}
# mpt.rc('font',**font)

# %matplotlib inline
%matplotlib qt
%pwd



'c:\\Users\\eddy\\data_preprocess'

## 法1(折中)：
- 将每年的数据各生成一个nc文件，这样每运行`concat_variable`函数一次，只需要加载12个月的数据，不会爆内存。
- 总共生成20年的数据，由于nc文件比旧版的mat文件省空间得多，因此可以全加载进来保存为一个nc文件

In [None]:
# Sample monthly-mean file path (not referenced by the code visible in this cell).
model_filename0 = r'D:\monthly_mean\s_mm_2008_09.mat'

# Grid metrics of the model, read once and shared by the export function below.
model_parameter = loadmat(r"D:\metrics_io.mat")

# 1-D axes taken from the 2-D 'xc'/'yc' arrays: column 1 of 'xc', last row of 'yc'.
# NOTE(review): presumably a regular lon/lat grid, so any column/row is
# equivalent -- confirm.
model_lon, model_lat = model_parameter['xc'][:, 1], model_parameter['yc'][-1, :]

# Vertical coordinate vectors with singleton dimensions squeezed out.
rc, rf = (model_parameter[key].squeeze() for key in ('rc', 'rf'))




def concat_variable(variable, year):
    """Crop one year of monthly fields to the study region and write one netCDF file.

    Reads the twelve ``{variable}_mm_{year}_{MM}.mat`` files from the
    monthly-mean folder, keeps indices 144:421 on the first axis and 317:471
    on the second, stacks the months along a new ``time`` dimension and writes
    the result to ``cutted_{variable}/{variable}_{year}.nc`` (the output
    folder is created if needed).

    Relies on the module-level ``model_lon``, ``model_lat``, ``rc`` and ``rf``.

    Parameters
    ----------
    variable : str
        Field prefix used in the file names; the array is read from the key
        ``variable + '0'`` of each .mat file.
    year : int
        Year to process.

    Returns
    -------
    None
        The function only writes the netCDF file.
    """
    variable_folder = r'D:\monthly_mean'
    output_dir = r'D:\EDDY_data\cutted_{}'.format(variable)
    os.makedirs(output_dir, exist_ok=True)
    output_filename = os.path.join(output_dir, r'{}_{}.nc'.format(variable, year))

    # Everything below is the same for all twelve months, so build it once.
    variable_name = variable + '0'
    lon_slice, lat_slice = slice(144, 421), slice(317, 471)
    nc_lon = model_lon[lon_slice]
    nc_lat = model_lat[lat_slice]

    # 's' and 't' are labelled with rc, every other variable with rf.
    if variable in ('s', 't'):
        depth_name, depth = 'rc_depth', rc
    else:
        depth_name, depth = 'rf_depth', rf

    # Name the dimensions explicitly instead of relying on the key order of
    # the coords mapping.
    dims = ('lon', 'lat', depth_name)
    coords = {'lon': nc_lon, 'lat': nc_lat, depth_name: depth}

    nc_list = []
    for month in range(1, 13):
        variable_path = os.path.join(
            variable_folder,
            r'{}_mm_{}_{:02d}.mat'.format(variable, year, month),
        )
        data = loadmat(variable_path)[variable_name][lon_slice, lat_slice]
        # One-element time axis holding the first day of the month.
        time = [pd.to_datetime(str(year) + '-' + str(month), format='%Y-%m')]
        nc_list.append(
            xr.DataArray(data, dims=dims, coords=coords).expand_dims(time=time)
        )

    nc_variable = xr.concat(nc_list, dim='time')
    nc_variable.to_netcdf(output_filename)


# Run the yearly export for every variable; the variable name is printed as
# a progress marker.
variable_list = ['s', 't', 'u', 'v', 'w']
for variable in variable_list:
    print(variable)
    for year in range(2001, 2021):  # 2001 through 2020 inclusive
        concat_variable(variable, year)

# def concat_variable(variable):
#     variable_folder = r'D:\monthly_mean'
#     output_path = r'D:\EDDY_data\cutted_{}.nc'.format(variable)
    
#     # 初始化坐标（一次性生成）
#     nc_lon = model_lon[144:421]
#     nc_lat = model_lat[317:471]
#     coords = {'lon': nc_lon, 'lat': nc_lat}
    
#     # 初始化NetCDF文件（首次写入时创建）
#     first_file = True
    
#     for year in range(2001, 2002):
#         print(year)
#         for month in range(1, 13):
#             variable_path = os.path.join(
#                 variable_folder,
#                 r'{}_mm_{}_{:02d}.mat'.format(variable, year, month)
#             )
#             data = loadmat(variable_path)[variable + '0'][144:421, 317:471, :]
#             time = [pd.to_datetime(f'{year}-{month}', format='%Y-%m')]
            
#             # 创建单月DataArray
#             if variable in ['s', 't']:
#                 ds = xr.Dataset(
#                     {variable: (['time', 'lon', 'lat', 'rc_depth'], data[None, ...])},  # 增加时间维度
                    
#                     coords={'time': time, **coords, 'rc_depth': rc}
#                 )
#             else:
#                 ds = xr.Dataset(
#                     {variable: (['time', 'lon', 'lat', 'rc_depth'], data[None, ...])},  # 增加时间维度
                    
#                     coords={'time': time, **coords, 'rc_depth': rf}
#                 )


#             # 增量写入NetCDF
#             if first_file:
#                 ds.to_netcdf(output_path, mode='w', unlimited_dims=['time'])
#                 first_file = False
#             else:
#                 ds.to_netcdf(output_path, mode='a')
    


                



s


### 将每个变量的所有年份拼接起来

In [7]:
def concat_year(variable):
    """Merge the yearly netCDF files of one variable (2001-2020) into one file.

    Reads ``{variable}_{year}.nc`` for every year from the ``cutted_{variable}``
    folder, concatenates them along ``time`` and writes ``{variable}.nc`` into
    the same folder.

    Parameters
    ----------
    variable : str
        Variable prefix used in the folder and file names.

    Returns
    -------
    None
        The function only writes the merged netCDF file.
    """
    input_file_path = r'D:\EDDY_data\cutted_{}'.format(variable)
    output_filename = os.path.join(input_file_path, f'{variable}.nc')

    # load_dataarray reads the values into memory and closes the file before
    # returning, so no array in the list depends on an already-closed handle
    # when it is concatenated and written out.
    nc_list = [
        xr.load_dataarray(os.path.join(input_file_path, f'{variable}_{year}.nc'))
        for year in range(2001, 2021)
    ]

    nc_data = xr.concat(nc_list, dim='time')
    nc_data.to_netcdf(output_filename)

# Merge the yearly files of each variable; the variable name is printed as
# a progress marker.
for variable in ['s', 't', 'u', 'v', 'w']:
    print(variable)
    concat_year(variable)


s
t
u
v
w


## 法2：利用zarr数组支持增量写入的特性，将数据保存为zarr格式(也可以转成nc格式)

In [8]:
# Sample monthly-mean file path (not referenced by the code visible in this cell).
model_filename0 = r'D:\monthly_mean\s_mm_2008_09.mat'

# Grid metrics of the model, read once and shared by the export function below.
model_parameter = loadmat(r"D:\metrics_io.mat")

# 1-D axes taken from the 2-D 'xc'/'yc' arrays: column 1 of 'xc', last row of 'yc'.
# NOTE(review): presumably a regular lon/lat grid, so any column/row is
# equivalent -- confirm.
model_lon, model_lat = model_parameter['xc'][:, 1], model_parameter['yc'][-1, :]

# Vertical coordinate vectors with singleton dimensions squeezed out.
rc, rf = (model_parameter[key].squeeze() for key in ('rc', 'rf'))

# Variables to export.
variable_list = ['s', 't', 'u', 'v', 'w']


def concat_variable(variable):
    """Crop all monthly fields of one variable (2001-2020) into a zarr store.

    For every month the file ``{variable}_mm_{year}_{MM}.mat`` is read, cropped
    to indices 144:421 / 317:471 on the first two axes, wrapped in a
    single-time-step dataset and written to ``zarr_cutted_{variable}.zarr``.
    The first month creates (or overwrites) the store; every later month is
    appended along ``time``, so only one month is handled per write.

    Relies on the module-level ``model_lon``, ``model_lat``, ``rc`` and ``rf``.
    No compressor is passed in the encoding, so the store is written with
    whatever xarray/zarr use when none is given.

    Parameters
    ----------
    variable : str
        Field prefix used in the file names; the array is read from the key
        ``variable + '0'`` of each .mat file and stored under ``variable``.

    Returns
    -------
    None
        The function only writes the zarr store (and prints each year).
    """
    variable_folder = r'D:\monthly_mean'
    output_path = r'D:\EDDY_data\zarr_cutted_{}.zarr'.format(variable)

    # Coordinates are identical for every month, so build them once.
    nc_lon = model_lon[144:421]
    nc_lat = model_lat[317:471]

    # 's' and 't' use rc as vertical coordinate, all other variables use rf.
    # NOTE(review): the dimension is named 'rc_depth' in both cases, whereas
    # the per-year netCDF export names it 'rf_depth' for the rf variables.
    # Kept unchanged so existing stores stay readable -- confirm which name
    # is intended.
    depth = rc if variable in ('s', 't') else rf
    dims = ['time', 'lon', 'lat', 'rc_depth']

    # One chunk per time step covering the whole cropped volume; the depth
    # length comes from the coordinate actually used for this variable.
    encoding = {
        variable: {'chunks': (1, len(nc_lon), len(nc_lat), len(depth))}
    }

    # The store is created on the first write and appended to afterwards.
    first_file = True
    for year in range(2001, 2021):
        print(year)
        for month in range(1, 13):
            variable_path = os.path.join(
                variable_folder,
                r'{}_mm_{}_{:02d}.mat'.format(variable, year, month)
            )
            data = loadmat(variable_path)[variable + '0'][144:421, 317:471, :]
            time = [pd.to_datetime(f'{year}-{month}', format='%Y-%m')]

            # Single-month dataset; data[None, ...] adds the leading time axis.
            ds = xr.Dataset(
                {variable: (dims, data[None, ...])},
                coords={'time': time, 'lon': nc_lon, 'lat': nc_lat,
                        'rc_depth': depth},
            )

            # Incremental write to the zarr store.
            if first_file:
                ds.to_zarr(output_path, mode='w',
                           encoding=encoding,  # chunk layout, first write only
                           consolidated=True  # consolidate the metadata
                           )
                first_file = False
            else:
                ds.to_zarr(output_path, mode='a',
                           append_dim='time',
                           consolidated=True
                           )
                
# Build one zarr store per variable; concat_variable prints each year as it goes.
for variable in variable_list:
    concat_variable(variable)

2001




2002




2003




2004




2005




2006




2007




2008




2009




2010




2011




2012




2013




2014




2015




2016




2017




2018




2019




2020




2001




2002




2003




2004




2005




2006




2007




2008




2009




2010




2011




2012




2013




2014




2015




2016




2017




2018




2019




2020




2001




2002




2003




2004




2005




2006




2007




2008




2009




2010




2011




2012




2013




2014




2015




2016




2017




2018




2019




2020




2001




2002




2003




2004




2005




2006




2007




2008




2009




2010




2011




2012




2013




2014




2015




2016




2017




2018




2019




2020




2001




2002




2003




2004




2005




2006




2007




2008




2009




2010




2011




2012




2013




2014




2015




2016




2017




2018




2019




2020




In [14]:
# Open the 's' store to inspect its shape and chunking in the cell output.
xr.open_zarr(r'D:\EDDY_data\zarr_cutted_s.zarr')

Unnamed: 0,Array,Chunk
Bytes,195.27 MiB,16.27 MiB
Shape,"(12, 277, 154, 50)","(1, 277, 154, 50)"
Dask graph,12 chunks in 2 graph layers,12 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 195.27 MiB 16.27 MiB Shape (12, 277, 154, 50) (1, 277, 154, 50) Dask graph 12 chunks in 2 graph layers Data type float64 numpy.ndarray",12  1  50  154  277,

Unnamed: 0,Array,Chunk
Bytes,195.27 MiB,16.27 MiB
Shape,"(12, 277, 154, 50)","(1, 277, 154, 50)"
Dask graph,12 chunks in 2 graph layers,12 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [7]:
# Print the installed xarray and zarr versions, then convert one netCDF file
# to a zarr store (mode='w' overwrites an existing store at that path).
print(xr.__version__, '     ', zarr.__version__)
xr.open_dataarray(r'D:\AVISO_SLA_copernicusmarine\SLA_2001.nc'
                  ).to_zarr(r'D:\AVISO_SLA_copernicusmarine\SLA_2001111.zarr', mode='w')


2025.4.0       3.0.7




<xarray.backends.zarr.ZarrStore at 0x1a9a2c78ee0>