In [4]:
import numpy as np
import torch
import os
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader,TensorDataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split

In [7]:
def load_npy_data_with_labels(folder_path, label):
    data_list = []  # 用于存储所有读取的数据
    labels_list = []  # 用于存储所有的标签
    for file in os.listdir(folder_path):
        if file.endswith('.npy'):
            file_path = os.path.join(folder_path, file)
            data = np.load(file_path)
            data_list.append(data)
            labels_list.append(label)
    return data_list, labels_list

# 使用你的文件夹路径替换这里
base_folder_path = 'train_data'  # e.g., 'path_to_your_data/train_data'

# 加载 language_0 的数据并分配标签 0
language_0_folder_path = os.path.join(base_folder_path, 'language_0')
language_0_data, language_0_labels = load_npy_data_with_labels(language_0_folder_path, 0)

# 加载 language_1 的数据并分配标签 1
language_1_folder_path = os.path.join(base_folder_path, 'language_1')
language_1_data, language_1_labels = load_npy_data_with_labels(language_1_folder_path, 1)

# 合并两种语言的数据和标签
train_data_raw = language_0_data + language_1_data
train_labels = language_0_labels + language_1_labels
train_data_cropped = [matrix[:40] for matrix in train_data_raw]
def min_max_normalize(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    normalized_tensor = (tensor - min_val) / (max_val - min_val)
    return normalized_tensor
def standardize(tensor):
    mean = torch.mean(tensor)
    std = torch.std(tensor)
    standardized_tensor = (tensor - mean) / std
    return standardized_tensor
# 打印出一些信息来确认数据已被加载
print(f"Loaded {len(language_0_data)} samples for language 0 and {len(language_1_data)} samples for language 1.")
train_data_cropped[0]


Loaded 2000 samples for language 0 and 2000 samples for language 1.


array([[-11.512925 , -11.512925 , -11.512925 , ..., -11.512925 ,
        -11.512925 , -11.512925 ],
       [-11.512925 , -11.512925 , -11.512925 , ..., -11.512925 ,
        -11.512925 , -11.512925 ],
       [-11.512925 , -11.512925 , -11.512925 , ..., -11.512925 ,
        -11.512925 , -11.512925 ],
       ...,
       [ -5.672583 ,  -6.862608 ,  -6.7738132, ...,  -6.6783767,
         -7.703056 ,  -8.840825 ],
       [ -6.2198935,  -6.979812 ,  -6.4984074, ...,  -6.659245 ,
         -7.500062 ,  -8.744839 ],
       [ -6.584874 ,  -7.218131 ,  -7.068345 , ...,  -6.657686 ,
         -7.625363 ,  -8.85662  ]], dtype=float32)

In [8]:
# 创建 TensorDataset 和 DataLoader
train_dataset = TensorDataset(standardize(torch.tensor(train_data_cropped)), torch.tensor(train_labels))
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size

# 随机分割成训练集和验证集
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=64,shuffle=True)
train_dataset[0]

  train_dataset = TensorDataset(standardize(torch.tensor(train_data_cropped)), torch.tensor(train_labels))


(tensor([[-0.5467, -0.5467, -0.5467,  ..., -0.5467, -0.5467, -0.5467],
         [-0.5467, -0.5467, -0.5467,  ..., -0.5467, -0.5467, -0.5467],
         [-0.5467, -0.5467, -0.5467,  ..., -0.5467, -0.5467, -0.5467],
         ...,
         [ 2.6199,  2.3542,  2.2991,  ...,  1.2457,  1.2652,  1.2718],
         [ 2.6727,  2.3404,  1.9967,  ...,  1.1394,  1.0364,  0.9644],
         [ 2.5546,  2.1321,  1.9312,  ...,  1.0189,  0.8640,  0.5961]]),
 tensor(0))

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [20]:
class ImprovedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=2, dropout_rate=0.2):
        super(ImprovedRNN, self).__init__()
        # 使用多层 LSTM，并添加 Dropout
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, 
                            batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # x 是一个 PackedSequence
        packed_output, (hidden, _) = self.lstm(x)
        # 使用最后一个时间步的隐藏状态
        out = self.fc(hidden[-1])
        out = self.dropout(out)
        out = torch.sigmoid(out).squeeze()
        return out
model = ImprovedRNN(input_size=80, hidden_size=128, num_classes=1).to(device)  # 适当调整参数

# 定义损失函数和优化器
criterion = nn.BCEWithLogitsLoss()  # 二元分类
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
num_epochs=200
# 训练循环
for epoch in range(num_epochs):
    model.train()
    for sequence,label in train_loader:
        sequence, label = sequence.to(device), label.to(device)
        # 前向传播
        label = label.float()
        outputs = model(sequence)
        loss = criterion(outputs, label)
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    total=0
    correct=0
    with torch.no_grad():
        for sequences, labels in val_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    print('Accuracy of the model on the validation data: %.2f %%' % accuracy)


Epoch 1/200, Loss: 0.7016828060150146
Accuracy of the model on the validation data: 48.12 %
Epoch 2/200, Loss: 0.7181267142295837
Accuracy of the model on the validation data: 48.12 %
Epoch 3/200, Loss: 0.7004460692405701
Accuracy of the model on the validation data: 65.25 %
Epoch 4/200, Loss: 0.6730477809906006
Accuracy of the model on the validation data: 70.88 %
Epoch 5/200, Loss: 0.6367615461349487
Accuracy of the model on the validation data: 74.62 %
Epoch 6/200, Loss: 0.6273844838142395
Accuracy of the model on the validation data: 80.25 %
Epoch 7/200, Loss: 0.5981584787368774
Accuracy of the model on the validation data: 82.25 %
Epoch 8/200, Loss: 0.6228981614112854
Accuracy of the model on the validation data: 82.62 %
Epoch 9/200, Loss: 0.6266378164291382
Accuracy of the model on the validation data: 85.50 %
Epoch 10/200, Loss: 0.5739012360572815
Accuracy of the model on the validation data: 86.25 %
Epoch 11/200, Loss: 0.5736496448516846
Accuracy of the model on the validation 

In [21]:
filename = "model_state_dict.pth"
torch.save(model.state_dict(), filename)

In [5]:
def load_npy_data_with_labels(folder_path, label):
    data_list = []  # 用于存储所有读取的数据
    labels_list = []  # 用于存储所有的标签
    for file in os.listdir(folder_path):
        if file.endswith('.npy'):
            file_path = os.path.join(folder_path, file)
            data = np.load(file_path)
            data_list.append(data)
            
    return data_list, labels_list
test_data_raw,_ = load_npy_data_with_labels('test_data', 0)

In [22]:
test_data_cropped=[matrix[:40] for matrix in test_data_raw]
model.eval()
test_data_cropped=torch.tensor(test_data_cropped)
# 假设 test_loader 是您的无标签测试数据加载器
predictions = []

with torch.no_grad():
    for data in test_data_cropped:
        data = data.to(device)  # 如果使用 GPU
        outputs = model(data)
        predicted_labels = (outputs > 0.5).float()
        # 确保 predicted_labels 是一维数组
        predicted_labels = predicted_labels.view(-1)
        predictions.extend(predicted_labels.cpu().numpy())

In [27]:
len(predictions)

2000

In [24]:
import pandas as pd

# 读取原始 CSV 文件
original_df = pd.read_csv('test.csv')
predictions_int = [int(x) for x in predictions]

# 假设 `predictions` 是包含预测结果的列表
# 检查预测结果的长度与原始 DataFrame 的长度是否一致
if len(predictions) == len(original_df):
    # 将预测结果作为 'label' 列的值
    original_df['label'] = predictions_int

    # 保存修改后的 DataFrame 到 CSV 文件
    original_df.to_csv('test_with_predictions.csv', index=False)
else:
    print("Error: The length of the predictions does not match the original data.")