### **第一步**
在运行环境中安装对应的库、数据集，并解压到对应的目录
执行命令即可

In [1]:
# !pip install xarray[complete]
# !apt update&&apt install axel

Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
Collecting xarray[complete]
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/ce/78/7a78d5197e409371c4fd9734ad9ab41ed6f9147b3ac23256c4e6c81295f2/xarray-2024.6.0-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting hypothesis (from xarray[complete])
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/4d/a7/8ec00d12fc4a7a9deb6000edf74342e831e5b4b707646dbd2cd8704bbad8/hypothesis-6.108.4-py3-none-any.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.2/465.2 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mypy (from xarray[complete])
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/40/93/2d36405a6a0c512cd167200f483af3bd14d15717a33ba60bf4dd5ce4b4bc/mypy-1.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2

In [3]:
# !axel -n 5 -o weather.round1.train.gt.2019-2021.zip 'https://tianchi-race-prod-sh.oss-cn-shanghai.aliyuncs.com/file/race/documents/532234/bigFile/weather.round1.train.gt.2019-2021.zip?Expires=1721908298&OSSAccessKeyId=LTAI5t7fj2oKqzKgLGz6kGQc&Signature=PHY8yu0xUzWjVqBfpVNPAVr5MCM%3D&response-content-disposition=attachment%3B%20'

# !axel -n 5 -o weather.round1.test.zip 'https://tianchi-race-prod-sh.oss-cn-shanghai.aliyuncs.com/file/race/documents/532234/bigFile/weather.round1.test.zip?Expires=1721908392&OSSAccessKeyId=LTAI5t7fj2oKqzKgLGz6kGQc&Signature=wOhZYyc2meJj5UrIvKyYRiW3oYs%3D&response-content-disposition=attachment%3B%20'

Initializing download: https://tianchi-race-prod-sh.oss-cn-shanghai.aliyuncs.com/file/race/documents/532234/bigFile/weather.round1.train.gt.2019-2021.zip?Expires=1721852797&OSSAccessKeyId=LTAI5t7fj2oKqzKgLGz6kGQc&Signature=%2B4JoTbsP%2FQuqmG%2FZbWWH2oYZPRM%3D&response-content-disposition=attachment%3B%20
File size: 139.237 Megabyte(s) (146000256 bytes)
Opening output file weather.round1.train.gt.2019-2021.zip
Starting download

Connection 0 finished1     ..........2....3    ........4  ] [ 148.7MB/s] [00:00][2K
Connection 2 finished1  0  ...............3    ........4  ] [ 148.7MB/s] [00:00][2K
Connection 4 finished1  0  ...............3  2 ...........] [ 157.2MB/s] [00:00][2K
Connection 3 finished..1.04..................3 ...........] [ 136.9MB/s] [00:00][2K
Connection 0 finished...1..4.................23...........] [ 140.0MB/s] [00:00][2K
Connection 1 finished...1..4.................23...........] [ 141.2MB/s] [00:00][2K
Connection 2 finished......4..................31..........

In [4]:
!unzip -q -n weather.round1.train.gt.2019-2021.zip -d groundtruth
!unzip -q -n weather.round1.test.zip -d test

### **第二步**
导入运行所需要的库函数 

In [2]:
import os
import pandas as pd
import xarray as xr
from torch.utils.data import Dataset, DataLoader

### **第三步**
数据集路径配置设置
- 比赛的数据部分分为**数据特征**和**数据真值**两部分，数据特征是模型训练的**输入**，数据真值是模型训练的**标签**
- 其中数据特征部分 输入的路径目录下包含年份文件夹 
 - 例如示例给出的 "输入路径/2021/..." 各年份文件夹下包含从官网下载的压缩包(e.g. weather.round1.train.ft.2021.1.zip) 解压后文件夹下有不同时段的数据文件夹(e.g. 20210101-00), 内部包含6个nc文件, 是从伏羲大模型中获取的从第6小时到第72小时的数据

- 数据真值部分 输入的路径目录下包含3个年份的.nc数据, 其中选择哪些年份的特征数据作为输入, 就在years中添加哪些年份
- fcst_steps指预测的时间步长, 从第1小时到第72小时, 间隔为1小时



In [5]:
# path config
feature_path = 'feature' #自定义路径并修改为自己的路径
gt_path = 'groundtruth' #自定义路径并修改为自己的路径
years = ['2021']
fcst_steps = list(range(1, 73, 1))

### **第四步**
Feature类和GroundTruth类是数据集的定义
方便后续自定义数据集和数据加载类, 方便我们训练时取数据

In [6]:
# Feature部分
class Feature:
    def __init__(self):
        self.path = feature_path
        self.years = years
        self.fcst_steps = fcst_steps
        self.features_paths_dict = self.get_features_paths()

    def get_features_paths(self):
        init_time_path_dict = {}
        for year in self.years:
            init_time_dir_year = os.listdir(os.path.join(self.path, year))
            for init_time in sorted(init_time_dir_year):
                init_time_path_dict[pd.to_datetime(init_time)] = os.path.join(self.path, year, init_time)
        return init_time_path_dict

    def get_fts(self, init_time):
        return xr.open_mfdataset(self.features_paths_dict.get(init_time) + '/*').sel(lead_time=self.fcst_steps).isel(
            time=0)
    
# GroundTruth部分
class GT:
    def __init__(self):
        self.path = gt_path
        self.years = years
        self.fcst_steps = fcst_steps
        self.gt_paths = [os.path.join(self.path, f'{year}.nc') for year in self.years]
        self.gts = xr.open_mfdataset(self.gt_paths)

    def parser_gt_timestamps(self, init_time):
        return [init_time + pd.Timedelta(f'{fcst_step}h') for fcst_step in self.fcst_steps]

    def get_gts(self, init_time):

        return self.gts.sel(time=self.parser_gt_timestamps(init_time))

### **第五步**
mydataset类的定义, 整合了加载特征和特征对应真值的功能, 方便后续训练时取数据

In [7]:
# 构建Dataset部分
class mydataset(Dataset):
    def __init__(self):
        self.ft = Feature()
        self.gt = GT()
        self.features_paths_dict = self.ft.features_paths_dict
        self.init_times = list(self.features_paths_dict.keys())

    def __getitem__(self, index):
        init_time = self.init_times[index]
        ft_item = self.ft.get_fts(init_time).to_array().isel(variable=0).values
        print(type(ft_item))
        gt_item = self.gt.get_gts(init_time).to_array().isel(variable=0).values
        print(type(gt_item))
        return ft_item, gt_item

    def __len__(self):
        return len(list(self.init_times))

### **第六步**
前五步已经完成了数据预处理加载的相关类和函数的准备, 这里我们可以通过实例化mydataset类来查看数据数量
同时完成数据集的构建后, 我们可以通过DataLoader来查看数据集的数据

In [8]:
# 可以查看一下已经构建的dataset
# define dataset
my_data = mydataset()
print('sample num:', mydataset().__len__())
train_loader = DataLoader(my_data, batch_size=1, shuffle=True)

sample num: 4


### **第七步**
- 完成了数据的准备工作, 接下来就是构建模型的部分
- Model这个类, 对我们的模型进行定义, 方便后续训练时调用
- 这里我们以一个简单的只有一个卷积层的网络为例
- 在本次比赛中, 我们的输入数据维度是(1, 24, 72, W, H), 输出数据维度是(1, 72, W, H) 可以在赛题中查看

In [9]:
# 模型构建部分
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, num_in_ch, num_out_ch):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(num_in_ch, num_out_ch, 3, 1, 1)

    def forward(self, x):
        B, S, C, W, H = tuple(x.shape)
        x = x.reshape(B, -1, W, H)
        out = self.conv1(x)
        out = out.reshape(B, S, W, H)
        return out

# define model
in_varibales = 24
in_times = len(fcst_steps)
out_varibales = 1
out_times = len(fcst_steps)
input_size = in_times * in_varibales
output_size = out_times * out_varibales
model = Model(input_size, output_size).cuda()

### **第八步**
定义模型的损失函数部分， 用于模型训练做反向传播

In [10]:
# define loss
loss_func = nn.MSELoss()

### **第九步**
模型训练部分

In [11]:
import numpy as np
import torch
# from tqdm import tqdm
# Train the model
num_epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# for epoch in tqdm(range(num_epochs)):
for epoch in range(num_epochs):
    for index, (ft_item, gt_item) in enumerate(train_loader):
        ft_item = ft_item.cuda().float()
        gt_item = gt_item.cuda().float()
        print(type(ft_item))
        print(type(gt_item))
        
        # Forward pass
        output_item = model(ft_item)
        loss = loss_func(output_item, gt_item)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Print the loss for every 10 steps
        if (index+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{index+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

# Save the model weights
torch.save(model.state_dict(), 'model_weights.pth')

  from .autonotebook import tqdm as notebook_tqdm


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


### **第十步**
- 模型推理部分, 通过加载模型使用测试数据作为输入, 得到预测结果
- 其中test_data_path需要给出从下载测试数据解压后的目录路径

In [12]:
# Inference
# Load the model weights
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()
import os

test_data_path = "test/weather.round1.test"
os.makedirs("./output", exist_ok=True)
for index, test_data_file in enumerate(os.listdir(test_data_path)):
    test_data = torch.load(os.path.join(test_data_path, test_data_file))
    test_data = test_data.cuda().float()
    
    # Forward pass
    output_item = model(test_data)
    
    # Print the output shape
    print(f"Output shape for sample {test_data_file.split('.')[0]}: {output_item.shape}")
    
    # Save the output
    output_path = f"output/{test_data_file}"
    torch.save(output_item.cpu(), output_path)
    # Load the model weights
    model.load_state_dict(torch.load("model_weights.pth"))

Output shape for sample 250: torch.Size([1, 72, 57, 81])
Output shape for sample 220: torch.Size([1, 72, 57, 81])
Output shape for sample 075: torch.Size([1, 72, 57, 81])
Output shape for sample 136: torch.Size([1, 72, 57, 81])
Output shape for sample 242: torch.Size([1, 72, 57, 81])
Output shape for sample 074: torch.Size([1, 72, 57, 81])
Output shape for sample 251: torch.Size([1, 72, 57, 81])
Output shape for sample 048: torch.Size([1, 72, 57, 81])
Output shape for sample 006: torch.Size([1, 72, 57, 81])
Output shape for sample 170: torch.Size([1, 72, 57, 81])
Output shape for sample 190: torch.Size([1, 72, 57, 81])
Output shape for sample 072: torch.Size([1, 72, 57, 81])
Output shape for sample 240: torch.Size([1, 72, 57, 81])
Output shape for sample 028: torch.Size([1, 72, 57, 81])
Output shape for sample 162: torch.Size([1, 72, 57, 81])
Output shape for sample 029: torch.Size([1, 72, 57, 81])
Output shape for sample 217: torch.Size([1, 72, 57, 81])
Output shape for sample 284: to

In [15]:
!zip -r output.zip output

  adding: output/ (stored 0%)
  adding: output/250.pt (deflated 13%)
  adding: output/220.pt (deflated 12%)
  adding: output/075.pt (deflated 13%)
  adding: output/136.pt (deflated 11%)
  adding: output/242.pt (deflated 8%)
  adding: output/074.pt (deflated 10%)
  adding: output/251.pt (deflated 9%)
  adding: output/048.pt (deflated 10%)
  adding: output/006.pt (deflated 10%)
  adding: output/170.pt (deflated 10%)
  adding: output/190.pt (deflated 9%)
  adding: output/072.pt (deflated 8%)
  adding: output/240.pt (deflated 8%)
  adding: output/028.pt (deflated 8%)
  adding: output/162.pt (deflated 12%)
  adding: output/029.pt (deflated 8%)
  adding: output/217.pt (deflated 8%)
  adding: output/284.pt (deflated 12%)
  adding: output/094.pt (deflated 11%)
  adding: output/214.pt (deflated 12%)
  adding: output/273.pt (deflated 13%)
  adding: output/110.pt (deflated 9%)
  adding: output/146.pt (deflated 10%)
  adding: output/175.pt (deflated 12%)
  adding: output/153.pt (deflated 11%)
  ad