In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import torch.optim as optim

In [12]:
import numpy as np
import torch

# Build a synthetic regression table whose target is bimodal.
# Every random draw comes from one seeded generator so that a fresh
# "Restart Kernel -> Run All" reproduces exactly the same samples.
SEED = 42
N_SAMPLES = 10000
rng = np.random.default_rng(SEED)

# Two correlated Gaussian input features, shape (N_SAMPLES, 2).
data3 = rng.multivariate_normal([3, 9], [[1, 0.5], [0.5, 1]], N_SAMPLES)
coef1 = [5, 6]
coef2 = [-5, -6]
noise = rng.standard_normal(N_SAMPLES)

# Two linear targets with opposite-sign coefficients; both share the same noise vector.
data4_1 = data3[:, 0] * coef1[0] + data3[:, 1] * coef1[1] + noise
data4_2 = data3[:, 0] * coef2[0] + data3[:, 1] * coef2[1] + noise

# The first half of the rows takes the positive mode, the second half the negative one,
# so the mode is decided by row position and not by the feature values.
half = N_SAMPLES // 2
data4 = np.concatenate([data4_1[:half], data4_2[half:]])

# Stack features and target column-wise: columns 0-1 are inputs, column 2 is the target.
data = np.concatenate([data3, data4.reshape(-1, 1)], axis=1)
dataset = torch.tensor(data).float()

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        embed_dim: Size of the last dimension of query/key/value and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim`` evenly.
    """

    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads  # width of each head

        # Trainable projection matrices
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: Tensor of shape [batch_size, q_len, embed_dim].
            key: Tensor of shape [batch_size, k_len, embed_dim].
            value: Tensor of shape [batch_size, k_len, embed_dim].
            mask: Optional tensor broadcastable to
                [batch_size, num_heads, q_len, k_len]; entries equal to 0 mark
                key positions that must not be attended to.

        Returns:
            Tensor of shape [batch_size, q_len, embed_dim]. A query row whose
            keys are all masked receives zero attention weights (its output is
            the ``out_proj`` bias) instead of NaN.
        """
        batch_size = query.size(0)

        # Linear projections: compute Q, K, V -> [batch_size, seq_len, embed_dim]
        Q = self.q_proj(query)
        K = self.k_proj(key)
        V = self.v_proj(value)

        # Split into heads: [batch_size, num_heads, seq_len, head_dim]
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention scores: [batch_size, num_heads, q_len, k_len]
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        fully_masked = None
        if mask is not None:
            blocked = (mask == 0).expand_as(attention_scores)
            attention_scores = attention_scores.masked_fill(blocked, float('-inf'))
            # A row made only of -inf turns into NaN under softmax (and poisons the
            # gradients). Give such rows finite scores here and zero their weights below.
            fully_masked = blocked.all(dim=-1, keepdim=True)
            attention_scores = attention_scores.masked_fill(fully_masked, 0.0)

        attention_weights = F.softmax(attention_scores, dim=-1)
        if fully_masked is not None:
            attention_weights = attention_weights.masked_fill(fully_masked, 0.0)

        # Weighted sum of values: [batch_size, num_heads, q_len, head_dim]
        attended = torch.matmul(attention_weights, V)

        # Concatenate the heads back to [batch_size, q_len, embed_dim]
        attended = attended.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)

        # Output projection
        output = self.out_proj(attended)
        return output


In [14]:
class AttentionBlock(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    There is no output projection: the result is the attention-weighted sum of
    the projected values.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # One square linear map per role, followed by a softmax over the key axis.
        self.query_proj = nn.Linear(embed_dim, embed_dim)
        self.key_proj = nn.Linear(embed_dim, embed_dim)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value):
        """Return the attention-weighted values for ``query`` against ``key``/``value``."""
        q = self.query_proj(query)
        k = self.key_proj(key)
        v = self.value_proj(value)

        # Similarity of every query with every key, scaled by sqrt(feature width).
        scale = q.size(-1) ** 0.5
        weights = self.softmax((q @ k.transpose(-2, -1)) / scale)

        # Mix the values according to the normalised weights.
        return weights @ v


In [15]:



class UNetWithAttention(nn.Module):
    """U-Net-shaped MLP whose skip connections are routed through attention.

    Three Linear+ReLU encoder stages widen the input; the decoder narrows it
    again, and at each decoder stage the matching encoder activation (projected
    to ``embed_dim``) is attended to and concatenated with the decoder state.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of outputs per sample.
        hidden_dims: Widths of the three encoder stages; entries 0, 1 and 2 are read.
        embed_dim: Common width of the skip embeddings and decoder states.
        num_heads: Head count passed to both ``MultiHeadAttention`` modules.

    NOTE(review): every sample is fed to attention as a sequence of length 1
    (see ``_attend``), so with the ``MultiHeadAttention`` defined in this
    notebook the softmax runs over a single key and the weight is always 1.
    The attended vector then depends only on the skip embedding, not on the
    query. Confirm that this is the intended design.
    """

    # A tuple default (instead of a list) avoids sharing one mutable object between calls.
    def __init__(self, input_dim=2, output_dim=1, hidden_dims=(64, 128, 256), embed_dim=128, num_heads=8):
        super(UNetWithAttention, self).__init__()
        h1, h2, h3 = hidden_dims[0], hidden_dims[1], hidden_dims[2]

        # Encoding layers
        self.encoder1 = nn.Sequential(
            nn.Linear(input_dim, h1),
            nn.ReLU()
        )
        self.encoder2 = nn.Sequential(
            nn.Linear(h1, h2),
            nn.ReLU()
        )
        self.encoder3 = nn.Sequential(
            nn.Linear(h2, h3),
            nn.ReLU()
        )

        # Embedding projection for skip connections
        self.embedding1 = nn.Linear(h1, embed_dim)
        self.embedding2 = nn.Linear(h2, embed_dim)

        # Attention blocks (one per skip connection)
        self.attention1 = MultiHeadAttention(embed_dim, num_heads)
        self.attention2 = MultiHeadAttention(embed_dim, num_heads)

        # Decoding layers
        self.decoder1 = nn.Sequential(
            nn.Linear(h3, embed_dim),
            nn.ReLU()
        )
        self.decoder2 = nn.Sequential(
            nn.Linear(2 * embed_dim, embed_dim),  # decoder state + attended skip
            nn.ReLU()
        )
        self.decoder3 = nn.Linear(embed_dim * 2, output_dim)  # Final layer

    @staticmethod
    def _attend(attention, query, memory):
        """Run ``attention`` treating each row of ``query``/``memory`` as a one-token sequence."""
        tokens = memory.unsqueeze(1)  # [batch, 1, embed_dim]
        return attention(query.unsqueeze(1), tokens, tokens).squeeze(1)

    def forward(self, x):
        """Map ``x`` of shape [batch, input_dim] to predictions of shape [batch, output_dim]."""
        # Encoding
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(enc1)
        enc3 = self.encoder3(enc2)

        # Project the skip activations to the shared embedding width
        emb1 = self.embedding1(enc1)
        emb2 = self.embedding2(enc2)

        # Decoding with attention
        dec1 = self.decoder1(enc3)
        # Attention on the deeper skip connection, then fuse with the decoder state
        attended1 = self._attend(self.attention1, dec1, emb2)
        dec2 = self.decoder2(torch.cat([dec1, attended1], dim=1))

        # Attention on the shallower skip connection, then the final linear head
        attended2 = self._attend(self.attention2, dec2, emb1)
        out = self.decoder3(torch.cat([dec2, attended2], dim=1))

        return out


In [16]:
# Regression loss: Huber loss, quadratic for |error| <= delta and linear beyond it.
# Alternative (mean squared error): criterion = nn.MSELoss()
HUBER_DELTA = 1.0
criterion = nn.HuberLoss(delta=HUBER_DELTA)


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are the first two columns of `data`, the target is the third.
X, y = data[:, :2], data[:, 2]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise inputs and target. Both scalers learn their statistics from the
# training part only and are then applied unchanged to the test part.
scaler_X = StandardScaler().fit(X_train)
scaler_y = StandardScaler().fit(y_train.reshape(-1, 1))

X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)
# The target scaler works on a column; go back to a 1-D array afterwards.
y_train = scaler_y.transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

# Convert to float32 PyTorch tensors; targets become column vectors of shape (n, 1).
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_test_torch = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)


In [18]:
# Sanity check on the training feature tensor; the recorded output below is
# torch.Size([8000, 2]).
X_train_torch.shape

torch.Size([8000, 2])

In [19]:
# Build the model and train it full-batch on the standardised training split.
# X_train_torch / y_train_torch / X_test_torch / y_test_torch are the tensors built
# in the data-preparation cell; rebuilding identical copies here was redundant.
TORCH_SEED = 42
LEARNING_RATE = 1e-3
LOG_EVERY = 10  # evaluate and print every LOG_EVERY epochs

# Seed torch so the weight initialisation, and with it the loss curve, is repeatable.
torch.manual_seed(TORCH_SEED)
model = UNetWithAttention(input_dim=2, output_dim=1, hidden_dims=[64, 128, 256], embed_dim=64)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# NOTE(review): the recorded run plateaus near a loss of 0.5. In the data-generation
# cell the sign of the target is chosen by row position (first vs. second half), not by
# the features, so a deterministic regressor on X alone presumably cannot separate the
# two modes. Confirm whether a mixture / probabilistic output is what is wanted.

# Training loop: one optimizer step per epoch over the whole training set.
epochs = 200
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    y_pred = model(X_train_torch)
    loss = criterion(y_pred, y_train_torch)

    loss.backward()
    optimizer.step()

    if epoch % LOG_EVERY == 0:
        # Periodic held-out evaluation without tracking gradients.
        model.eval()
        with torch.no_grad():
            y_test_pred = model(X_test_torch)
            test_loss = criterion(y_test_pred, y_test_torch)
        print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Test Loss: {test_loss.item():.4f}")


Epoch 0, Train Loss: 0.4958, Test Loss: 0.4957
Epoch 10, Train Loss: 0.4938, Test Loss: 0.4976
Epoch 20, Train Loss: 0.4927, Test Loss: 0.4965
Epoch 30, Train Loss: 0.4915, Test Loss: 0.4969
Epoch 40, Train Loss: 0.4906, Test Loss: 0.4975
Epoch 50, Train Loss: 0.4897, Test Loss: 0.4984
Epoch 60, Train Loss: 0.4884, Test Loss: 0.4986
Epoch 70, Train Loss: 0.4882, Test Loss: 0.4986
Epoch 80, Train Loss: 0.4868, Test Loss: 0.4990
Epoch 90, Train Loss: 0.4859, Test Loss: 0.5004
Epoch 100, Train Loss: 0.4858, Test Loss: 0.5001
Epoch 110, Train Loss: 0.4845, Test Loss: 0.5014
Epoch 120, Train Loss: 0.4843, Test Loss: 0.5006
Epoch 130, Train Loss: 0.4834, Test Loss: 0.5009
Epoch 140, Train Loss: 0.4818, Test Loss: 0.5020
Epoch 150, Train Loss: 0.4816, Test Loss: 0.5018
Epoch 160, Train Loss: 0.4807, Test Loss: 0.5022
Epoch 170, Train Loss: 0.4804, Test Loss: 0.5056
Epoch 180, Train Loss: 0.4802, Test Loss: 0.5062
Epoch 190, Train Loss: 0.4780, Test Loss: 0.5037
