In [72]:
import os
import re
import json
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

In [73]:
# 定义设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 标签映射
label_mapping = {
    "0": 0,             # static
    "0_2": 1,           # slightly_move
    "1": 2,             # move
    "4": 3              # intensely_move
}

# split 参数控制数据路径和特征参数
split = 1  # 0:100, 1:300, 2:800, 3:1000, 4:3500
split_configs = {
    1: {"train_folder": "202_packet_json_new_300_train", "eval_folder": "202_packet_json_new_300_eval", "fixed_time_steps": 3000},
    2: {"train_folder": "202_packet_json_new_800_train", "eval_folder": "202_packet_json_new_800_eval", "fixed_time_steps": 1600},
    3: {"train_folder": "202_packet_json_new_1000_train", "eval_folder": "202_packet_json_new_1000_eval", "fixed_time_steps": 1200},
    4: {"train_folder": "202_packet_json_new_3600_train", "eval_folder": "202_packet_json_new_3600_eval", "fixed_time_steps": 1600},
    0: {"train_folder": "202_packet_json_new_train", "eval_folder": "202_packet_json_new_eval", "fixed_time_steps": 12000}
}
config = split_configs[split]
train_folder = config["train_folder"]
eval_folder = config["eval_folder"]
fixed_time_steps = config["fixed_time_steps"]
pca_dim = 512  # 降维目标维度
batch_size = 16


Using device: cuda


In [None]:
def slide_split_get_label_from_filename(filename):
    # print(label_mapping.get(filename[9:-5],-1))
    return label_mapping.get(filename[9:-5],-1)

def split_get_label_from_filename(filename):
    # print(label_mapping.get(filename[8:-5],-1))
    return label_mapping.get(filename[8:-5],-1)

def get_label_from_filename(filename):
    match = re.search(r'_(\d+_?\d*)\.json$', filename)
    if match:
        label_key = match.group(1)
        return label_mapping.get(label_key, -1)

# 获取文件夹中的文件及其标签
def load_files_and_labels(folder_path,split=0):
    file_list = []
    labels = []
    for file in os.listdir(folder_path):
        # print("file: ", file)
        if file.endswith(".json"):
            if split == 0:
                label = get_label_from_filename(file)
            elif split == 4:
                label = slide_split_get_label_from_filename(file)
            elif split==1 or split==2 or split==3:
                label = split_get_label_from_filename(file)
            if label != -1:
                file_list.append(os.path.join(folder_path, file))
                labels.append(label)
    return file_list, labels

In [75]:
# 1. 提取字符并编码为整数索引
def hex_to_sequence(hex_feature):
    """
    将十六进制字符串转换为整数索引序列。
    去掉冒号并转换为字符对应的索引。
    """
    hex_chars = hex_feature.replace(":", "")
    char_to_index = {char: idx for idx, char in enumerate("0123456789abcdef")}
    return [char_to_index[char] for char in hex_chars]

def process_json(file_path):
    """
    处理 JSON 文件，提取时间戳、包长度和 raw data 特征，并对特征进行归一化。
    """
    features = []
    try:
        # 打开 JSON 文件
        with open(file_path, 'r', encoding='utf-8') as file:
            datas = json.load(file)

        # 遍历 JSON 数据，提取时间戳和包长度
        initial_timestamp = None
        pre_time=None
        relative_timestamp = None
        features_1 = []
        features_2 = []
        for data in datas:
            try:
                timestamp = float(data["_source"]["layers"]["frame"]["frame.time_relative"])  # 时间戳
                packet_length = int(data["_source"]["layers"]["frame"]["frame.len"])  # 包长度

                # 获取数据部分，若不存在则为0
                if 'data' in data["_source"]["layers"]:
                    rawdata = data["_source"]["layers"]["data"]["data.data"]
                else:
                    rawdata = '0'

                # 将原始数据转换为整数序列
                data_feature = hex_to_sequence(rawdata)

                # 将数据填充或截断为指定长度 length


                if len(data_feature) < 2832:  # 如果长度小于 2832，填充 0
                    data_feature += [0] * (2832 - len(data_feature))
                elif len(data_feature) > 2832:  # 如果长度大于 2832，截断
                    data_feature = data_feature[:2832]

                # 如果数据长度不符合预期，打印出来
                if len(data_feature) != 2832:
                    print(f"Data feature length mismatch: {len(data_feature)}")

                # 初始化时间戳
                if initial_timestamp is None:
                    initial_timestamp = timestamp  # 设置初始时间戳

                # 计算相对时间戳

                relative_timestamp = timestamp - initial_timestamp
                if pre_time==None:
                    pre_time=relative_timestamp
                times_diff=relative_timestamp-pre_time
                pre_time=relative_timestamp

                timestamp_array = np.array([relative_timestamp], dtype=float)
                time_diff_array = np.array([times_diff], dtype=float)
                packet_length_array = np.array([packet_length], dtype=float)/1512#length 归一化
                data_feature = np.array(data_feature, dtype=float)/15

                # 将特征按顺序组合为 [时间戳,time diff, 包长度, ]
                feature_1 = np.hstack((timestamp_array,time_diff_array, packet_length_array))# ,data_feature

                # 添加到特征列表
                features_1.append(feature_1)
                features_2.append(data_feature)

            except (KeyError, ValueError) as e:
                # 跳过有问题的数据包
                print(f"Skipping packet due to error: {e}")
                continue

        features_1_array=np.array(features_1)
        features_2_array=np.array(features_2)
        max_timestamp = np.max(features_1_array[:, 0])  # 获取最大时间戳
        # print("max")
        features_1_array[:, 0] = [feature[0] / max_timestamp for feature in features_1_array]  # 时间戳归一化
        max_time_diff=np.max(features_1_array[:, 1])
        features_1_array[:, 1] = [feature[1] / max_time_diff for feature in features_1_array]

    except Exception as e:
        print(Exception)
        print(f"Error processing file {file_path}:s {e}")
        features_1_array = np.zeros((1, 3))  # 返回空特征以避免程序中断
        features_2_array = np.zeros((1, 2832))

    return features_1_array,features_2_array

In [76]:
## 数据加载
def load_data(folder_path, fixed_time_steps, pca_dim):
    file_paths, labels = load_files_and_labels(folder_path,split)
    features_1_list, features_2_list = [], []

    print("processing json")
    for file_path in file_paths:
        f1, f2 = process_json(file_path)
        features_1_list.append(f1)
        features_2_list.append(f2)

    print("lowing dimensions")
    # PCA 降维
    pca = PCA(n_components=pca_dim)
    reduced_features_2_list = [pca.fit_transform(f2) for f2 in features_2_list]

    combined_features = []
    for f1, f2 in zip(features_1_list, reduced_features_2_list):
        combined = np.hstack((f1, f2))
        if combined.shape[0] > fixed_time_steps:
            combined = combined[:fixed_time_steps, :]
        else:
            combined = np.pad(combined, ((0, fixed_time_steps - combined.shape[0]), (0, 0)), mode='constant')
        combined_features.append(combined)

    features_tensor = torch.tensor(np.stack(combined_features, axis=0), dtype=torch.float32)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    print(f"生成了 {len(features_1_list)} 个样本特征")
    print(f"第一个样本的特征1形状: {features_1_list[0].shape}")
    print(f"第一个样本的特征2形状: {features_2_list[0].shape}")
    return features_tensor, labels_tensor

In [None]:
# 加载数据
train_features, train_labels = load_data(train_folder, fixed_time_steps, pca_dim)
eval_features, eval_labels = load_data(eval_folder, fixed_time_steps, pca_dim)

# 数据加载器
train_loader = DataLoader(TensorDataset(train_features, train_labels), batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(TensorDataset(eval_features, eval_labels), batch_size=batch_size, shuffle=True)

Here
2
Here
1
Here
1
Here
1
Here
2
Here
3
Here
2
Here
2
Here
0
Here
3
Here
0
Here
3
Here
0
Here
1
Here
2
Here
3
Here
3
Here
0
Here
3
Here
3
Here
1
Here
3
Here
3
Here
1
Here
2
Here
2
Here
3
Here
0
Here
2
Here
0
Here
0
Here
3
Here
2
Here
1
Here
2
Here
0
Here
1
Here
0
Here
0
Here
0
Here
3
Here
3
Here
3
Here
3
Here
2
Here
1
Here
3
Here
2
Here
1
Here
0
Here
2
Here
1
Here
1
Here
3
Here
3
Here
1
Here
2
Here
2
Here
1
Here
2
Here
0
Here
0
Here
0
Here
2
Here
3
Here
2
Here
1
Here
3
Here
1
Here
0
Here
0
Here
1
Here
0
Here
2
Here
2
Here
0
Here
2
Here
0
Here
2
Here
0
Here
0
Here
1
Here
0
Here
2
Here
3
Here
2
Here
0
Here
0
Here
3
Here
2
Here
1
Here
0
Here
0
Here
3
Here
1
Here
0
Here
0
Here
1
Here
1
Here
3
Here
1
Here
0
Here
3
Here
3
Here
1
Here
2
Here
3
Here
3
Here
2
Here
1
Here
2
Here
1
Here
0
Here
0
Here
0
Here
0
Here
0
Here
0
Here
3
Here
2
Here
2
Here
3
Here
2
Here
3
Here
2
Here
1
Here
3
Here
1
Here
2
Here
1
Here
2
Here
1
Here
2
Here
3
Here
0
Here
3
Here
2
Here
2
Here
1
Here
1
Here
0
Here
2
Here
0

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f0de81e0e50>>
Traceback (most recent call last):
  File "/home/nesl/anaconda3/envs/202_proj/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
# 神经网络可调参数
hidden_size = 16
num_layers=1
dropout=0.4
learning_rate=0.001
num_epochs = 1000
best_accuracy = 0.25

In [None]:
# 模型定义
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])

In [None]:
output_size = 4
input_size = pca_dim + 3
model = LSTMModel(input_size, hidden_size, output_size, num_layers=num_layers, dropout=dropout).to(device)

# 损失函数与优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.8, patience=5)

# TensorBoard 日志
writer = SummaryWriter('runs/lstm_pca')



for epoch in range(num_epochs):
    model.train()
    total_loss, correct_preds, total_preds = 0, 0, 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct_preds += (outputs.argmax(1) == targets).sum().item()
        total_preds += targets.size(0)

    train_accuracy = correct_preds / total_preds
    train_loss = total_loss / len(train_loader)

    model.eval()
    with torch.no_grad():
        eval_loss, correct_preds, total_preds = 0, 0, 0
        for inputs, targets in eval_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            eval_loss += criterion(outputs, targets).item()
            correct_preds += (outputs.argmax(1) == targets).sum().item()
            total_preds += targets.size(0)

    eval_accuracy = correct_preds / total_preds
    eval_loss /= len(eval_loader)
    scheduler.step(eval_loss)

    if eval_accuracy > best_accuracy:
        best_accuracy = eval_accuracy
        model_filename = f"model_epoch_{epoch+1}tran_acc_{train_accuracy*100:.2f}% val_acc_{eval_accuracy*100:.2f}%.pth"
        torch.save(model.state_dict(), model_filename)  # Save model to disko disk
        print(f"Model saved with Accuracy: Traning--{train_accuracy*100:.2f}% and Eval--{eval_accuracy*100:.2f}%")
    
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy * 100:.2f}%")
    print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_accuracy * 100:.2f}%")
    writer.add_scalars('Loss', {'Train': train_loss, 'Eval': eval_loss}, epoch)
    writer.add_scalars('Accuracy', {'Train': train_accuracy, 'Eval': eval_accuracy}, epoch)

writer.close()

Epoch 1/1000
Train Loss: 1.4214, Train Accuracy: 5.77%
Eval Loss: 1.3935, Eval Accuracy: 0.00%




Epoch 2/1000
Train Loss: 1.3456, Train Accuracy: 13.46%
Eval Loss: 1.3660, Eval Accuracy: 13.04%
Epoch 3/1000
Train Loss: 1.2930, Train Accuracy: 32.69%
Eval Loss: 1.3320, Eval Accuracy: 13.04%
Model saved with Accuracy: Traning--57.69% and Eval--43.48%
Epoch 4/1000
Train Loss: 1.2554, Train Accuracy: 57.69%
Eval Loss: 1.2917, Eval Accuracy: 43.48%
Model saved with Accuracy: Traning--84.62% and Eval--60.87%
Epoch 5/1000
Train Loss: 1.2056, Train Accuracy: 84.62%
Eval Loss: 1.2642, Eval Accuracy: 60.87%
Model saved with Accuracy: Traning--98.08% and Eval--65.22%
Epoch 6/1000
Train Loss: 1.1599, Train Accuracy: 98.08%
Eval Loss: 1.2209, Eval Accuracy: 65.22%
Model saved with Accuracy: Traning--100.00% and Eval--78.26%
Epoch 7/1000
Train Loss: 1.1009, Train Accuracy: 100.00%
Eval Loss: 1.2085, Eval Accuracy: 78.26%
Model saved with Accuracy: Traning--100.00% and Eval--82.61%
Epoch 8/1000
Train Loss: 1.0790, Train Accuracy: 100.00%
Eval Loss: 1.1708, Eval Accuracy: 82.61%
Epoch 9/1000
Trai

KeyboardInterrupt: 

In [None]:
pretrain=True
model_path="model_epoch_24tran_acc_100.00% val_acc_100.00%.pth"
if pretrain:
    model.load_state_dict(torch.load(model_path))
    model.to(device)

with torch.no_grad():
    test_preds = []  # To store predictions
    test_labels = []  # To store true labels

    # Loop through the train_loader (or test_loader if you're evaluating the test set)
    for inputs, targets in train_loader:
        # Move inputs and targets to the same device as the model
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Ensure the input shape is (batch_size, seq_len, input_size) for LSTM
        # Adjust this depending on the actual input shape
        # inputs = inputs.view(inputs.size(0), -1, inputs.size(1))  # Assuming inputs have shape (batch_size, input_size)

        # Forward pass through the model
        outputs = model(inputs)

        # Get predictions by taking the argmax along the output dimension
        _, predicted = torch.max(outputs, 1)

        # Store predictions and true labels as NumPy arrays
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(targets.cpu().numpy())

        # Print predictions and labels for debugging (optional)
    print("Predictions: ", test_preds)
    print("True Labels: ", test_labels)

    # You may want to calculate accuracy or other metrics after the loop
    # For example, calculate accuracy:

    accuracy = sum(np.array(test_preds) == np.array(test_labels)) / len(test_labels)
    # print(f"Accuracy: {accuracy * 100:.2f}%")
    diff_indices = [i for i in range(len(test_labels)) if test_labels[i] != test_preds[i]]

    print(diff_indices)



    # accuracy = accuracy_score(test_labels, test_preds)
    print(f"Test Accuracy: {accuracy*100:.2f}%")

Predictions:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
True Labels:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[]
Test Accuracy: 100.00%


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_metrics(y_true, y_pred, class_names):
    """
    Calculate and display confusion matrix and metrics.
    :param y_true: Ground truth labels.
    :param y_pred: Predicted labels.
    :param class_names: List of class names.
    """
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)

    # Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')  # Macro-average for multiclass
    recall = recall_score(y_true, y_pred, average='macro')        # Macro-average for multiclass

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision (Macro): {precision:.2f}")
    print(f"Recall (Macro/Sensitivity): {recall:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names))

# Gather predictions and labels for training set
train_preds, train_labels = [], []
with torch.no_grad():
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        train_preds.extend(predicted.cpu().numpy())
        train_labels.extend(targets.cpu().numpy())

# Gather predictions and labels for validation set
eval_preds, eval_labels = [], []
with torch.no_grad():
    for inputs, targets in eval_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        eval_preds.extend(predicted.cpu().numpy())
        eval_labels.extend(targets.cpu().numpy())

# Combine training and validation predictions for overall metrics
combined_preds = train_preds + eval_preds
combined_labels = train_labels + eval_labels

# Define class names for display
class_names = ["Static", "Slightly Move", "Move", "Intensely Move"]

# Calculate and display metrics for training set
print("Training Set Metrics:")
calculate_metrics(train_labels, train_preds, class_names)

# Calculate and display metrics for validation set
print("\nValidation Set Metrics:")
calculate_metrics(eval_labels, eval_preds, class_names)

# Calculate and display metrics for combined data
print("\nOverall Metrics:")
calculate_metrics(combined_labels, combined_preds, class_names)


Training Set Metrics:
Accuracy: 0.89
Precision (Macro): 0.92
Recall (Macro/Sensitivity): 0.89

Classification Report:
                precision    recall  f1-score   support

        Static       1.00      0.83      0.91        18
 Slightly Move       1.00      0.93      0.97        15
          Move       0.69      1.00      0.82        18
Intensely Move       1.00      0.79      0.88        19

      accuracy                           0.89        70
     macro avg       0.92      0.89      0.89        70
  weighted avg       0.92      0.89      0.89        70


Validation Set Metrics:
Accuracy: 0.45
Precision (Macro): 0.46
Recall (Macro/Sensitivity): 0.46

Classification Report:
                precision    recall  f1-score   support

        Static       0.45      0.71      0.56         7
 Slightly Move       0.50      0.30      0.38        10
          Move       0.60      0.43      0.50         7
Intensely Move       0.29      0.40      0.33         5

      accuracy              