In [3]:
import torch
import torch.nn as nn
import re
import numpy as np
import json
import os
from sklearn.decomposition import PCA
from torch.utils.data import Dataset, DataLoader,TensorDataset
from torch.utils.tensorboard import SummaryWriter

In [4]:
def slide_split_get_label_from_filename(filename):
    # print(label_mapping.get(filename[9:-5],-1))
    return label_mapping.get(filename[9:-5],-1)

def split_get_label_from_filename(filename):
    return label_mapping.get(filename[8:-5],-1)

def get_label_from_filename(filename):
    match = re.search(r'_(\d+_?\d*)\.json$', filename)
    print(match)
    if match:
        label_key = match.group(1)
        return label_mapping.get(label_key, -1)  # 如果标签不匹配返回 -1

# 获取文件夹中的文件及其标签
def load_files_and_labels(folder_path,split=False):
    file_list = []
    labels = []
    for file in os.listdir(folder_path):
        # print("file: ", file)
        if file.endswith(".json"):
            if split == 1 or split==2 or split==3:
                label = split_get_label_from_filename(file)
            elif split == 4:
                # print(f"split is {split}")
                label = slide_split_get_label_from_filename(file)
            else:
                label = get_label_from_filename(file)
            if label != -1:
                file_list.append(os.path.join(folder_path, file))
                labels.append(label)
            #print(f"File: {file}, Label: {label}")

    return file_list, labels


In [5]:
import json
import numpy as np

# 1. 提取字符并编码为整数索引
def hex_to_sequence(hex_feature):
    """
    将十六进制字符串转换为整数索引序列。
    去掉冒号并转换为字符对应的索引。
    """
    hex_chars = hex_feature.replace(":", "")
    char_to_index = {char: idx for idx, char in enumerate("0123456789abcdef")}
    return [char_to_index[char] for char in hex_chars]

def process_json(file_path):
    """
    处理 JSON 文件，提取时间戳、包长度和 raw data 特征，并对特征进行归一化。
    """
    features = []
    try:
        # 打开 JSON 文件
        with open(file_path, 'r', encoding='utf-8') as file:
            datas = json.load(file)

        # 遍历 JSON 数据，提取时间戳和包长度
        initial_timestamp = None
        pre_time=None
        for data in datas:
            try:
                timestamp = float(data["_source"]["layers"]["frame"]["frame.time_relative"])  # 时间戳
                packet_length = int(data["_source"]["layers"]["frame"]["frame.len"])  # 包长度

                # 获取数据部分，若不存在则为0
                if 'data' in data["_source"]["layers"]:
                    rawdata = data["_source"]["layers"]["data"]["data.data"]
                else:
                    rawdata = '0'

                # 将原始数据转换为整数序列
                data_feature = hex_to_sequence(rawdata)

                # 将数据填充或截断为指定长度 2832
                if len(data_feature) < 2832:  # 如果长度小于 2832，填充 0
                    data_feature += [0] * (2832 - len(data_feature))
                elif len(data_feature) > 2832:  # 如果长度大于 2832，截断
                    data_feature = data_feature[:2832]

                # 如果数据长度不符合预期，打印出来
                if len(data_feature) != 2832:
                    print(f"Data feature length mismatch: {len(data_feature)}")

                # 初始化时间戳
                if initial_timestamp is None:
                    initial_timestamp = timestamp  # 设置初始时间戳


                # 计算相对时间戳

                relative_timestamp = timestamp - initial_timestamp
                if pre_time==None:
                    pre_time=relative_timestamp

                time_diff=relative_timestamp-pre_time
                pre_time=relative_timestamp

                # print(type(relative_timestamp))


                timestamp_array = np.array([relative_timestamp], dtype=float)
                time_diff_array = np.array([time_diff], dtype=float)

                packet_length_array = np.array([packet_length], dtype=float)/1512
                data_feature = np.array(data_feature, dtype=float)/15

                # 将特征按顺序组合为 [时间戳, 包长度, 数据特征]
                feature = np.hstack((timestamp_array,time_diff_array, packet_length_array,data_feature))

                # 添加到特征列表
                features.append(feature)

            except (KeyError, ValueError) as e:
                # 跳过有问题的数据包
                print(f"Skipping packet due to error: {e}")
                continue

        features_array=np.array(features)
        max_timestamp = np.max(features_array[:, 0])  # 获取最大时间戳
        max_time_diff = np.max(features_array[:, 1])  # 获取最大时间diff
        # print("max")
        features_array[:, 0] = [feature[0] / max_timestamp for feature in features_array]  # 时间戳归一化
        features_array[:, 1] = [feature[1] / max_time_diff for feature in features_array]  # 时间diff归一化
        # print("11")

    except Exception as e:
        print(Exception)
        print(f"Error processing file {file_path}: {e}")
        features_array = np.zeros((1, 2835))  # 返回空特征以避免程序中断

    return features_array

In [6]:
import nest_asyncio
nest_asyncio.apply()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义文件夹路径 /content/drive/MyDrive/202_project/202_packet_json

split = 2 # 0:100, 1:300, 2:800 3:1000 4:3500
# folder_path = "packet_json_split"
if split==1:
    folder_path = "packet_json_split"
elif split==2:
    folder_path = "202_packet_json_new_800"
elif split==3:
    folder_path = "202_packet_json_new_1000"
elif split==4:
    folder_path = "202_packet_json_new_3500"
else:
    folder_path = "202_packet_json_new"

# 定义标签映射
label_mapping = {
    "0": 0,             # static
    "0_2": 1,           # slightly_move
    "1": 2,             # move
    "4": 3              # intensely_move
}

# 调用函数获取文件和标签
data_paths, labels = load_files_and_labels(folder_path,split)
# 用于存储每个样本的特征

print(type(labels))
labels_array=np.array(labels)#[:50]
print(labels_array.shape)
print(labels_array)
labels_tensor=torch.tensor(labels_array,dtype=torch.long)

Using device: cuda
<class 'list'>
(800,)
[1 0 2 2 1 3 1 3 1 1 2 1 0 0 3 0 0 0 2 0 2 3 2 3 0 3 0 3 3 0 0 1 2 1 3 1 2
 0 2 1 3 3 0 0 2 2 2 3 0 3 3 1 2 1 1 3 0 0 3 3 2 0 1 1 1 0 0 3 0 1 3 2 3 3
 1 1 0 2 0 2 3 1 3 0 1 0 0 3 1 3 3 0 1 3 1 1 2 0 3 3 1 1 0 0 1 0 3 3 2 0 2
 2 2 3 0 3 2 3 0 3 1 0 0 3 2 0 0 0 0 2 0 3 0 2 3 3 1 0 3 0 1 2 1 2 0 2 0 3
 0 1 1 0 2 1 3 0 0 0 3 2 1 1 2 3 3 0 1 3 1 3 3 3 2 1 1 2 3 2 1 1 1 2 2 0 0
 0 1 2 2 2 2 0 1 2 1 1 1 2 3 0 1 0 2 3 2 2 0 2 1 1 3 0 3 2 2 1 2 3 0 2 3 0
 1 1 0 3 2 1 3 1 2 3 2 1 3 1 0 0 1 3 1 1 2 0 3 3 1 3 3 0 3 3 3 0 2 2 1 1 0
 3 2 2 3 3 2 1 1 2 2 1 3 3 3 0 0 3 0 2 3 0 2 0 2 0 2 2 0 0 0 2 2 1 2 0 0 2
 3 3 3 1 0 2 2 0 2 1 3 0 0 1 3 0 3 1 0 0 1 0 2 2 2 2 3 2 3 3 2 1 2 1 3 3 1
 1 3 1 1 2 1 1 1 3 0 3 0 3 0 0 2 2 0 3 1 1 0 1 1 0 3 0 0 3 3 0 1 0 1 0 3 3
 0 0 1 1 0 0 2 0 3 1 3 3 3 0 0 2 2 2 2 1 2 2 0 1 0 2 1 1 3 0 2 2 2 2 3 1 1
 1 2 3 1 1 2 1 2 3 0 3 3 3 1 3 2 2 1 1 1 2 2 1 2 2 0 0 0 3 2 3 0 0 2 1 3 3
 3 2 3 0 3 3 1 2 2 0 0 0 0 0 3 2 1 0 0 0 2 0 3 2 1 0 1 3 2 

In [7]:
features_list = []
for data_path in data_paths:
  features = process_json(data_path)
  features_list.append(features)
print(f"生成了 {len(features_list)} 个样本特征")
print(f"第一个样本的特征形状: {features_list[0].shape}")

生成了 800 个样本特征
第一个样本的特征形状: (1486, 2835)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# 对齐特征矩阵的形状
if split == 1:
    fixed_time_steps = 3000
elif split == 2:
    fixed_time_steps = 1600
elif split == 3:
    fixed_time_steps = 1200
elif split == 4:
    fixed_time_steps = 1600
else:
    fixed_time_steps = 12000

# target_dim = 256  # 降到 256 维
# pca = PCA(n_components=target_dim)

# # 对每个特征矩阵进行降维
# reduced_features_list = []
# for feature in features_list:
#     reduced_feature = pca.fit_transform(feature)  # 对特征维度进行降维
#     reduced_features_list.append(reduced_feature)

# 截断或补零到固定时间步长

aligned_features = []
for feature in features_list:
    if feature.shape[0] > fixed_time_steps:
        truncated = feature[:fixed_time_steps, :]
    else:
        truncated = np.pad(feature, ((0, fixed_time_steps - feature.shape[0]), (0, 0)), mode='constant')
    aligned_features.append(truncated)

# Flatten time-step dimension into a single feature vector
aligned_features_flat = [feature.flatten() for feature in aligned_features]
# Assuming you have already processed your data and have features_tensor and labels_tensor as NumPy arrays
features_tensor = np.array(aligned_features_flat)

# Ensure labels_tensor is defined
# Example:
labels_tensor = np.array(labels)  # This should be a NumPy array of the same length as features_tensor

# Convert features_tensor and labels_tensor to PyTorch tensors
features_tensor = torch.tensor(features_tensor, dtype=torch.float32)
labels_tensor = torch.tensor(labels_tensor, dtype=torch.long)

# Ensure that features_tensor and labels_tensor have the same number of samples
assert features_tensor.shape[0] == labels_tensor.shape[0], "The number of features and labels must match."

# Data splitting based on 'split' value
if split == 1:
    train_dataset = TensorDataset(features_tensor[0:210], labels_tensor[0:210])
    eval_dataset = TensorDataset(features_tensor[210:], labels_tensor[210:])
elif split == 2:
    train_dataset = TensorDataset(features_tensor[0:600], labels_tensor[0:600])
    eval_dataset = TensorDataset(features_tensor[600:], labels_tensor[600:])
elif split == 3:
    train_dataset = TensorDataset(features_tensor[0:600], labels_tensor[0:600])
    eval_dataset = TensorDataset(features_tensor[600:], labels_tensor[600:])
elif split == 4:
    train_dataset = TensorDataset(features_tensor[0:3400], labels_tensor[0:3400])
    eval_dataset = TensorDataset(features_tensor[3400:], labels_tensor[3400:])
else:
    train_dataset = TensorDataset(features_tensor[0:70], labels_tensor[0:70])
    eval_dataset = TensorDataset(features_tensor[70:], labels_tensor[70:])

# Create DataLoader instances for training and evaluation
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [11]:
# 定义三层 MLP 模型
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # 第一层全连接
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # 第二层全连接
        self.fc3 = nn.Linear(hidden_size, output_size)  # 输出层
        self.relu = nn.ReLU()  # 激活函数
        self.dropout = nn.Dropout(0.3)  # Dropout 防止过拟合

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# 初始化模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = 2835
hidden_size = 64
output_size = 4

model = SimpleMLP(input_size, hidden_size, output_size).to(device)


# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# 训练和验证
num_epochs = 30

# Loss function
criterion = nn.CrossEntropyLoss()  # No change needed

# Training loop
for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(features)  # Model outputs logits, no need for softmax here
        loss = criterion(outputs, labels)  # Labels are class indices, no one-hot encoding required
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate loss and accuracy
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    train_accuracy = correct / total
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy * 100:.2f}%")

    # Validation phase
    model.eval()
    eval_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in eval_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)

            eval_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    eval_accuracy = correct / total
    print(f"Validation Loss: {eval_loss:.4f}, Validation Accuracy: {eval_accuracy * 100:.2f}%")

# Classification Report
eval_predictions = []
eval_true_labels = []
with torch.no_grad():
    for features, labels in eval_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)
        eval_predictions.extend(predicted.cpu().numpy())
        eval_true_labels.extend(labels.cpu().numpy())

print("\nClassification Report:")
print(classification_report(eval_true_labels, eval_predictions))

AttributeError: 'numpy.ndarray' object has no attribute 'to'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
import numpy as np
import json
import os
import re

# 配置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义标签映射
label_mapping = {
    "0": 0,             # static
    "0_2": 1,           # slightly_move
    "1": 2,             # move
    "4": 3              # intensely_move
}

# 文件路径设置
split = 1
folder_mapping = {
    1: "packet_json_split",
    2: "202_packet_json_new_800",
    3: "202_packet_json_new_1000",
    4: "202_packet_json_new_3500"
}
folder_path = folder_mapping.get(split, "202_packet_json_new")

# 从文件名获取标签
def get_label_from_filename(filename):
    match = re.search(r'_(\d+_?\d*)\.json$', filename)
    if match:
        label_key = match.group(1)
        return label_mapping.get(label_key, -1)  # 标签不匹配时返回 -1
    return -1

# 加载文件及其对应标签
def load_files_and_labels(folder_path):
    file_list = []
    labels = []
    for file in os.listdir(folder_path):
        if file.endswith(".json"):
            label = get_label_from_filename(file)
            if label != -1:
                file_list.append(os.path.join(folder_path, file))
                labels.append(label)
    return file_list, labels

# 将十六进制字符串转换为整数序列
def hex_to_sequence(hex_feature):
    hex_chars = hex_feature.replace(":", "")
    char_to_index = {char: idx for idx, char in enumerate("0123456789abcdef")}
    return [char_to_index[char] for char in hex_chars]

# 处理 JSON 文件，提取并归一化特征
def process_json(file_path, feature_length=2832):
    features = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            datas = json.load(file)

        initial_timestamp = None
        for data in datas:
            try:
                timestamp = float(data["_source"]["layers"]["frame"]["frame.time_relative"])
                packet_length = int(data["_source"]["layers"]["frame"]["frame.len"])
                rawdata = data["_source"]["layers"].get("data", {}).get("data.data", '0')

                data_feature = hex_to_sequence(rawdata)
                data_feature = (data_feature + [0] * feature_length)[:feature_length]  # 填充或截断到指定长度

                if initial_timestamp is None:
                    initial_timestamp = timestamp
                relative_timestamp = timestamp - initial_timestamp

                feature = np.hstack((
                    relative_timestamp,
                    packet_length / 1512,
                    np.array(data_feature, dtype=float) / 15
                ))
                features.append(feature)
            except (KeyError, ValueError):
                continue

        features_array = np.array(features)
        features_array[:, 0] /= np.max(features_array[:, 0])  # 时间戳归一化
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        features_array = np.zeros((1, feature_length + 2))
    return features_array

# 加载数据
data_paths, labels = load_files_and_labels(folder_path)
features_list = [process_json(path) for path in data_paths]
labels_tensor = torch.tensor(labels, dtype=torch.long)

Using device: cuda


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
# 对齐特征矩阵形状
fixed_time_steps = {1: 3000, 2: 1600, 3: 1200, 4: 1600}.get(split, 12000)
aligned_features = []
for feature in features_list:
    if feature.shape[0] > fixed_time_steps:
        aligned_features.append(feature[:fixed_time_steps, :])
    else:
        aligned_features.append(
            np.pad(feature, ((0, fixed_time_steps - feature.shape[0]), (0, 0)), mode='constant')
        )
features_tensor = torch.tensor(np.stack(aligned_features, axis=0), dtype=torch.float32)

# 数据集划分
split_index = {1: 210, 2: 600, 3: 600, 4: 3400}.get(split, 70)
train_dataset = TensorDataset(features_tensor[:split_index], labels_tensor[:split_index])
eval_dataset = TensorDataset(features_tensor[split_index:], labels_tensor[split_index:])

# DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

AssertionError: Size mismatch between tensors

In [None]:
# 定义 MLP 模型
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc3(x)

# 初始化模型
input_size = 2834  # 特征长度 + 时间戳 + 包长度
hidden_size = 64
output_size = 4
model = SimpleMLP(input_size, hidden_size, output_size).to(device)

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# 训练和验证
num_epochs = 30
for epoch in range(num_epochs):
    # 训练阶段
    model.train()
    total_loss, correct, total = 0, 0, 0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)

    train_accuracy = correct / total
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {total_loss:.4f}, Train Accuracy: {train_accuracy:.2%}")

    # 验证阶段
    model.eval()
    eval_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for features, labels in eval_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            eval_loss += criterion(outputs, labels).item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)

    eval_accuracy = correct / total
    print(f"Validation Loss: {eval_loss:.4f}, Validation Accuracy: {eval_accuracy:.2%}")

# 生成分类报告
eval_preds, eval_labels = [], []
with torch.no_grad():
    for features, labels in eval_loader:
        features, labels = features.to(device), labels.to(device)
        eval_preds.extend(model(features).argmax(1).cpu().numpy())
        eval_labels.extend(labels.cpu().numpy())

print("\nClassification Report:")
print(classification_report(eval_labels, eval_preds))


RuntimeError: Expected target size [16, 4], got [16]