In [1]:
import torch
import torch.nn as nn
import re
import numpy as np
import json
import os
# from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader,TensorDataset
from torch.utils.tensorboard import SummaryWriter 
# from google.colab import drive
# drive.mount('/content/drive')



In [2]:
def split_get_label_from_filename(filename):
    return label_mapping.get(filename[8:-5],-1)

def get_label_from_filename(filename):
    match = re.search(r'_(\d+_?\d*)\.json$', filename)
    print(match)
    if match:
        label_key = match.group(1)
        return label_mapping.get(label_key, -1)  # 如果标签不匹配返回 -1

# 获取文件夹中的文件及其标签
def load_files_and_labels(folder_path,split=False):
    file_list = []
    labels = []
    for file in os.listdir(folder_path):
        # print("file: ", file)
        if file.endswith(".json"):
            if split:
                label = split_get_label_from_filename(file)
            else: label = get_label_from_filename(file)
            if label != -1:
                file_list.append(os.path.join(folder_path, file))
                labels.append(label)
            #print(f"File: {file}, Label: {label}")

    return file_list, labels




In [3]:
import json
import numpy as np

# 1. 提取字符并编码为整数索引
def hex_to_sequence(hex_feature):
    """
    将十六进制字符串转换为整数索引序列。
    去掉冒号并转换为字符对应的索引。
    """
    hex_chars = hex_feature.replace(":", "")
    char_to_index = {char: idx for idx, char in enumerate("0123456789abcdef")}
    return [char_to_index[char] for char in hex_chars]

def process_json(file_path):
    """
    处理 JSON 文件，提取时间戳、包长度和 raw data 特征，并对特征进行归一化。
    """
    features = []
    try:
        # 打开 JSON 文件
        with open(file_path, 'r', encoding='utf-8') as file:
            datas = json.load(file)

        # 遍历 JSON 数据，提取时间戳和包长度
        initial_timestamp = None
        for data in datas:
            try:
                timestamp = float(data["_source"]["layers"]["frame"]["frame.time_relative"])  # 时间戳
                packet_length = int(data["_source"]["layers"]["frame"]["frame.len"])  # 包长度

                # 获取数据部分，若不存在则为0
                if 'data' in data["_source"]["layers"]:
                    rawdata = data["_source"]["layers"]["data"]["data.data"]
                else:
                    rawdata = '0'

                # 将原始数据转换为整数序列
                data_feature = hex_to_sequence(rawdata)

                # 将数据填充或截断为指定长度 2832
                if len(data_feature) < 2832:  # 如果长度小于 2832，填充 0
                    data_feature += [0] * (2832 - len(data_feature))
                elif len(data_feature) > 2832:  # 如果长度大于 2832，截断
                    data_feature = data_feature[:2832]

                # 如果数据长度不符合预期，打印出来
                if len(data_feature) != 2832:
                    print(f"Data feature length mismatch: {len(data_feature)}")

                # 初始化时间戳
                if initial_timestamp is None:
                    initial_timestamp = timestamp  # 设置初始时间戳

                # 计算相对时间戳
                relative_timestamp = timestamp - initial_timestamp
                # print(type(relative_timestamp))


                timestamp_array = np.array([relative_timestamp], dtype=float)
                packet_length_array = np.array([packet_length], dtype=float)/1512
                data_feature = np.array(data_feature, dtype=float)/15
           
                # 将特征按顺序组合为 [时间戳, 包长度, 数据特征]
                feature = np.hstack((timestamp_array, packet_length_array,data_feature))
                
                # 添加到特征列表
                features.append(feature)

            except (KeyError, ValueError) as e:
                # 跳过有问题的数据包
                print(f"Skipping packet due to error: {e}")
                continue

        features_array=np.array(features)
        max_timestamp = np.max(features_array[:, 0])  # 获取最大时间戳
        # print("max")
        features_array[:, 0] = [feature[0] / max_timestamp for feature in features_array]  # 时间戳归一化
        # print("11") 

    except Exception as e:
        print(Exception)
        print(f"Error processing file {file_path}: {e}")
        features_array = np.zeros((1, 2834))  # 返回空特征以避免程序中断

    return features_array

In [4]:
import nest_asyncio
nest_asyncio.apply()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义文件夹路径 /content/drive/MyDrive/202_project/202_packet_json

split = 3 # 0:100, 1:300, 2:800

if split==1:
    folder_path = "packet_json_split"
elif split==2:
    folder_path = "202_packet_json_new_800"
elif split==3:
    folder_path = "202_packet_json_new_1000"
else:
    folder_path = "202_packet_json_new"

# 定义标签映射
label_mapping = {
    "0": 0,             # static
    "0_2": 1,           # slightly_move
    "1": 2,             # move
    "4": 3              # intensely_move
}

# 调用函数获取文件和标签
data_paths, labels = load_files_and_labels(folder_path,split)
# 用于存储每个样本的特征

print(type(labels))
labels_array=np.array(labels)#[:50]
print(labels_array.shape)
print(labels_array)
labels_tensor=torch.tensor(labels_array,dtype=torch.long)

Using device: cuda
<class 'list'>
(1000,)
[0 1 0 2 2 1 3 1 3 1 1 1 2 1 3 0 0 3 0 3 0 0 2 0 2 3 2 3 3 1 0 0 3 0 3 3 3
 0 0 0 1 2 1 3 1 2 0 2 1 1 3 3 3 0 0 0 2 0 2 2 3 0 3 3 1 2 1 3 1 1 0 0 3 0
 0 3 3 2 0 1 1 1 0 0 3 2 0 1 3 2 3 1 3 3 1 1 0 2 2 0 1 2 3 1 2 3 0 1 0 0 3
 1 1 3 3 0 1 3 1 1 2 0 3 2 3 1 1 0 0 1 0 0 3 3 2 0 2 1 2 2 3 0 3 2 0 3 0 3
 3 1 3 0 0 3 1 3 1 2 0 0 0 0 0 2 0 3 3 0 1 2 3 3 1 0 0 3 3 0 1 2 1 2 2 0 2
 2 0 3 0 0 1 1 1 0 2 1 3 1 0 0 0 3 3 2 1 1 2 3 1 3 0 1 3 1 3 3 3 2 1 3 1 2
 3 0 2 1 1 1 2 2 0 0 0 2 1 2 2 2 2 0 1 2 1 1 1 2 3 0 1 0 2 2 3 2 2 0 2 1 1
 3 2 0 3 2 2 2 1 1 2 3 0 2 1 3 3 0 1 0 1 0 3 2 2 3 1 3 1 2 3 1 3 2 1 3 1 0
 1 0 1 3 1 1 2 0 3 3 1 2 3 3 0 1 3 3 3 3 0 2 2 1 1 0 3 2 2 3 2 3 2 1 1 2 2
 0 1 3 1 3 3 0 0 3 0 2 3 0 2 0 2 0 1 3 0 2 2 0 0 0 2 2 2 1 2 0 0 2 3 3 3 3
 0 1 0 2 2 0 0 2 1 0 1 3 0 3 0 0 1 3 0 3 1 0 0 1 0 1 2 2 2 2 3 2 3 3 2 1 2
 1 3 3 1 1 3 1 3 1 2 1 3 1 1 3 0 3 1 0 3 0 0 2 2 1 2 0 3 1 3 1 0 1 1 2 0 3
 0 0 0 3 3 0 1 0 1 0 0 3 2 3 0 0 1 1 0 0 2 0 3 1 3 3 3 0 0

In [None]:
features_list = []

for data_path in data_paths:

  features = process_json(data_path)
  features_list.append(features)
print(f"生成了 {len(features_list)} 个样本特征")
print(f"第一个样本的特征形状: {features_list[0].shape}")

生成了 1000 个样本特征
第一个样本的特征形状: (1129, 2834)


: 

In [None]:

if split==1:fixed_time_steps = 3000
elif split==2: fixed_time_steps = 1500
elif split==3: fixed_time_steps = 1200
else: fixed_time_steps = 12000

# 截断或补零到固定长度
aligned_features = []
for feature in features_list:
    if feature.shape[0] > fixed_time_steps:
        truncated = feature[:fixed_time_steps, :]  # 截断
    else:
        truncated = np.pad(feature, ((0, fixed_time_steps - feature.shape[0]), (0, 0)), mode='constant')  # 补零
    aligned_features.append(truncated)

# 转为张量
features_tensor =  torch.tensor(np.stack(aligned_features, axis=0),dtype=torch.float32)
print(f"截断后的特征张量形状: {features_tensor.shape}")
print(type(features_tensor))

In [None]:
if split==1:
    train_dataset=TensorDataset(features_tensor[0:210],labels_tensor[0:210])
    eval_dataset =TensorDataset(features_tensor[210:-1],labels_tensor[210:-1])
elif split==2:
    train_dataset=TensorDataset(features_tensor[0:600],labels_tensor[0:600])
    eval_dataset =TensorDataset(features_tensor[600:-1],labels_tensor[600:-1])
elif split==3:
    train_dataset=TensorDataset(features_tensor[0:600],labels_tensor[0:600])
    eval_dataset =TensorDataset(features_tensor[600:-1],labels_tensor[600:-1])
else:
    train_dataset=TensorDataset(features_tensor[0:700],labels_tensor[0:700])
    eval_dataset =TensorDataset(features_tensor[700:-1],labels_tensor[700:-1])
batch_size=16
train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
eval_loader  = DataLoader(eval_dataset,batch_size=batch_size,shuffle=True)

    
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2,dropout=0.2):
        super(LSTMModel, self).__init__()
        
        # Define an LSTM with multiple layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True,dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass through LSTM
        out, (hn, cn) = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Use the last time-step's output for classification
        return out


In [None]:
input_size=2834
hidden_size=128
output_size=4
num_layers = 1     # number of LSTM layers

model = LSTMModel(input_size, hidden_size, output_size, num_layers=num_layers,dropout=0.4)
# model.add(Dense(units))

# inputs=features_tensor
model = model.to(device)
# model = model
# inputs = inputs.to(device)
# labels = labels_tensor
torch.cuda.empty_cache()
# outputs = model(inputs)

# print(f"outputs:{outputs.shape}")



In [None]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))


True
0
NVIDIA GeForce RTX 4090


In [None]:

# 5. 损失函数和优化器
criterion = nn.CrossEntropyLoss()  # 分类任务使用交叉熵损失
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)  # Decay learning rate every 10 epochs
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)  # Decrease LR by 5% every epoch
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode='min',factor=0.95,patience=5)
pretrain=False
model_path="model_epoch_21tran_acc_88.57% val_acc_48.28%.pth"
if pretrain:
    model.load_state_dict(torch.load(model_path))
    model.to(device)
writer = SummaryWriter('runs/lstm_training')  # This will store logs in 'runs/lstm_training'

# 6. 训练循环
num_epochs = 1000  # 训练轮次
max_accu=0.25
for epoch in range(num_epochs):
    model.train()  # 设置模型为训练模式
    total_loss = 0  # 累计损失
    correct_preds = 0  # 记录正确预测的数量
    total_preds = 0  # 总的预测数
   
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # 前向传播
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs)  # 获取模型输出
        loss = criterion(outputs, targets)  # 计算损失

        # 反向传播
        optimizer.zero_grad()  # 清除梯度
        loss.backward()  # 反向传播
        optimizer.step()  # 更新权重
        
        # 累计损失
        total_loss += loss.item()

        # 计算正确预测的数量
        _, predicted = torch.max(outputs, 1)  # 获取预测类别
        correct_preds += (predicted == targets).sum().item()  # 统计正确的数量
        total_preds += targets.size(0)  # 统计总的预测数
    
    # 计算训练损失和准确率
    train_loss = total_loss / len(train_loader)
    train_accuracy = correct_preds / total_preds
    
    current_lr = optimizer.param_groups[0]['lr']
   

      # Log loss to TensorBoard after each epoch
    writer.add_scalar('Loss/train', total_loss / len(train_loader), epoch)
    writer.add_scalar('Learning Rate', current_lr, epoch)

    total_loss=0
    correct_preds = 0  # Reset for evaluation
    total_preds = 0  # Reset for evaluation
    scheduler.step()

    # 7. 评估模型（可选）
    model.eval()  # 设置模型为评估模式
    with torch.no_grad():  # 在评估时不计算梯度
        for batch_idx, (inputs, targets) in enumerate(eval_loader):
            # 前向传播
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)  # 获取模型输出
            loss = criterion(outputs, targets)  # 计算损失
            
            # 累计损失
            total_loss += loss.item()

            # 计算正确预测的数量
            _, predicted = torch.max(outputs, 1)  # 获取预测类别
            correct_preds += (predicted == targets).sum().item()  # 统计正确的数量
            total_preds += targets.size(0)  # 统计总的预测数
        # 计算eval损失和准确率
    eval_loss = total_loss / len(eval_loader)
    eval_accuracy = correct_preds / total_preds
    # scheduler.step(eval_loss)
    if eval_accuracy > max_accu:
        max_accu = max(eval_accuracy,max_accu)  # Update max accuracy
        model_filename = f"_model_epoch_{epoch+1}tran_acc_{train_accuracy*100:.2f}% val_acc_{eval_accuracy*100:.2f}%.pth"
        torch.save(model.state_dict(), model_filename)  # Save model to disko disk
        print(f"Model saved with Accuracy: Traning--{train_accuracy*100:.2f}% and Eval--{eval_accuracy*100:.2f}%")

    print(f"Epoch [{epoch+1}/{num_epochs}], Learning Rate: {current_lr}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy*100:.2f}%")
    print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy--{eval_accuracy*100:.2f}%")
    print("---------------------------------------------------")
    writer.add_scalars('Loss',{'Train':train_loss,'Eval':eval_loss} , epoch)
    writer.add_scalars('Accuracy',{'Train':train_accuracy,'Eval':eval_accuracy} , epoch)

writer.close
# Disable gradient computation
with torch.no_grad():
    test_preds = []  # To store predictions
    test_labels = []  # To store true labels
    
    # Loop through the train_loader (or test_loader if you're evaluating the test set)
    for inputs, targets in train_loader:
        # Move inputs and targets to the same device as the model
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Ensure the input shape is (batch_size, seq_len, input_size) for LSTM
        # Adjust this depending on the actual input shape
        # inputs = inputs.view(inputs.size(0), -1, inputs.size(1))  # Assuming inputs have shape (batch_size, input_size)
        
        # Forward pass through the model
        outputs = model(inputs)
        
        # Get predictions by taking the argmax along the output dimension
        _, predicted = torch.max(outputs, 1)
        
        # Store predictions and true labels as NumPy arrays
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(targets.cpu().numpy())
        
        # Print predictions and labels for debugging (optional)
    print("Predictions: ", test_preds)
    print("True Labels: ", test_labels)

    # You may want to calculate accuracy or other metrics after the loop
    # For example, calculate accuracy:
    accuracy = sum(np.array(test_preds) == np.array(test_labels)) / len(test_labels)
    # print(f"Accuracy: {accuracy * 100:.2f}%")


    # accuracy = accuracy_score(test_labels, test_preds)
    print(f"Test Accuracy: {accuracy*100:.2f}%")

Model saved with Accuracy: Traning--85.33% and Eval--66.78%
Epoch [1/1000], Learning Rate: 0.001
Train Loss: 0.3272, Train Accuracy: 85.33%
Eval Loss: 1.0367, Eval Accuracy--66.78%
---------------------------------------------------
Epoch [2/1000], Learning Rate: 0.001
Train Loss: 0.3042, Train Accuracy: 85.83%
Eval Loss: 1.1011, Eval Accuracy--65.78%
---------------------------------------------------
Model saved with Accuracy: Traning--84.83% and Eval--67.11%
Epoch [3/1000], Learning Rate: 0.001
Train Loss: 0.3516, Train Accuracy: 84.83%
Eval Loss: 1.0673, Eval Accuracy--67.11%
---------------------------------------------------
Model saved with Accuracy: Traning--86.00% and Eval--67.45%
Epoch [4/1000], Learning Rate: 0.001
Train Loss: 0.3018, Train Accuracy: 86.00%
Eval Loss: 1.0626, Eval Accuracy--67.45%
---------------------------------------------------
Epoch [5/1000], Learning Rate: 0.001
Train Loss: 0.2914, Train Accuracy: 86.33%
Eval Loss: 1.1622, Eval Accuracy--65.11%
-------

KeyboardInterrupt: 

In [None]:
# import os
# import json

# # Define folder paths
# input_folder_path = "202_packet_json"  # Input folder containing original JSON files
# output_folder_path = "202_packet_json_10"  # New folder to save filtered files

# # Ensure the output folder exists
# os.makedirs(output_folder_path, exist_ok=True)

# # Iterate over files in the input folder
# for filename in os.listdir(input_folder_path):
#     # Full file path
#     input_file_path = os.path.join(input_folder_path, filename)
    
#     # Check if it is a JSON file
#     if filename.endswith(".json"):
#         # Open and read the original JSON file
#         with open(input_file_path, "r") as file:
#             data = json.load(file)
        
#         # Check if '_source' and 'layers' are present in the structure
#         if "_source" in data and "layers" in data["_source"]:
#             # Check if "frame" is a list of packets
#             frames = data["_source"]["layers"].get("frame", [])
            
#             # List to store filtered packets
#             filtered_packets = []

#             # Extract packets and filter by time range (10s to 20s)
#             for packet in frames:
#                 # Ensure 'frame.time_relative' exists in each packet
#                 if "frame.time_relative" in packet:
#                     timestamp = float(packet["frame.time_relative"])
                    
#                     # Filter packets in the time range from 10s to 20s
#                     if 10 <= timestamp <= 20:
#                         filtered_packets.append(packet)

#             # If we have filtered packets, write them to a new file
#             if filtered_packets:
#                 # Output file path (same name but in the output folder)
#                 output_file_path = os.path.join(output_folder_path, filename)
                
#                 # Write the filtered packets into a new JSON file
#                 with open(output_file_path, "w") as output_file:
#                     json.dump(filtered_packets, output_file, indent=4)

#                 print(f"Processed {filename}, saved to {output_file_path}")
#             else:
#                 print(f"No packets found between 10s and 20s in {filename}")
