In [1]:
import numpy as np
import pandas as pd
# dnn模型构建
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn.functional as F
import random
import logging
from imblearn.over_sampling import SMOTE

logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

In [2]:
# 固定随机数种子，确保实验的可重复性
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)

def get_fft_and_scaler(data, start=5192, end=8192):
    data = np.fft.fft(data)
    data = np.abs(data)
    data = data/np.expand_dims(data.max(axis=1), axis=1)
    return data[:, start:end]

In [3]:
class DNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.dnn = nn.Sequential(
            nn.BatchNorm1d(300),
            nn.Linear(300, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        x = self.dnn(x)
        return F.softmax(x, dim=1)

In [8]:
# 搭建DNN模型
def model_train(train_loader, model, optimizer, criterion, labels):
    model.train()
    train_total_acc = 0
    train_loss = 0
    for feature, label in train_loader:
        feature = feature.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        preds = model(feature)

        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()

        train_total_acc += model(feature).argmax(dim=1).eq(label).sum().item()
        train_loss += loss.item()

        feature.cpu()
        label.cpu()

    print(
        f'Training loss: {train_loss/len(train_loader):.4f}',
        f'Training  acc: {train_total_acc/len(labels):.4f}',
         )

def predict(val_loader, model, criterion, labels):
    model.eval()
    val_total_acc = 0
    val_loss = 0
    for feature, label in val_loader:
        feature = feature.to(device)
        label = label.to(device)
        preds = model(feature)
        loss = criterion(preds, label)

        val_total_acc += model(feature).argmax(dim=1).eq(label).sum().item()
        val_loss += loss.item()

        feature.cpu()
        label.cpu()

    print(
        f'Val loss: {val_loss/len(val_loader):.4f}',
        f'Val  acc:{val_total_acc/len(labels):.4f}'
    )
    return val_loss

# 使用boost ing的想法，让神经网络学习错误的类别
# boosting 的想法：训练后面几次，分类错误的数据
# 重新定义一个新的分类器进行学习错误的数据, 保存这些模型的参数
# 然后使用模型融合
#print('Stage4: start training')
# 这里设置boost 的num数，设置为多少就会训练多少个dnn模型
#boost_epoch_num = 4
#for boost_num in range(boost_epoch_num):
#    # 分类器学习, 更新训练集
#    boost_feature = []
#    boost_label = []
#    optimizer = optim.Adam(model.parameters(), lr=lr)
#    criterion = nn.CrossEntropyLoss()
#    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
#    train_best = float('inf')
#    best_model = None
#    print('--'*8 + f'boost round: {boost_num}/{boost_epoch_num - 1}' + '--'*8)
#    print(f'train shape: {train_tensor.shape}')
#    print('--'*24)
#    for epoch in range(epochs):
#        print('='*20 + f' Epoch: {epoch} '+ '='*20)
#        model_train(train_loader, model, optimizer, criterion=criterion, labels=y_train_tensor)
#        loss = predict(val_loader, model, criterion=criterion, labels=val_label)
#        if loss <= train_best:
#            train_best = loss
#            best_model = model
#    # 模型保存
#    torch.save(best_model.state_dict(), f'./best_model{str(boost_num)}.point')
#    get_boost_data(train_tensor.to(device), y_train_tensor.to(device), model=best_model)
#    # 开始boosting
#    train_tensor = torch.cat(boost_feature, dim=0)
#    y_train_tensor = torch.cat(boost_label, dim=0)

#print('Stage5: model score')
#for i in range(boost_epoch_num):
#    model_name = f'./best_model{i}.point'
#    # 重新初始化模型
#    model = DNN().to(device)
#    model.load_state_dict(torch.load(model_name))
#    # 这里一定要开启验证模式
#    model.eval()
#    print('--'*24)
#    print(f'Model name: {model_name}')
#    # 这里只能用验证集来看准确率
#    preds = model(torch.FloatTensor(val_sp).to(device)).argmax(dim=1).cpu().numpy()
#    score = (preds == val_label).sum()/len(val_label)
#    print(f'Score: {score}')

In [4]:
print('Stage1: load data')
# 读取训练集，测试集和验证集
train = np.load('../train/10type_sort_train_data_8192.npy')
val = np.load('../val/10type_sort_eval_data_8192.npy')

# 读取训练集和验证集的标签，测试集是没有标签的，需要你使用模型进行分类，并将结果进行提交
train_label = np.load('../train/10type_sort_train_label_8192.npy')
val_label = np.load('../val/10type_sort_eval_label_8192.npy')

print('Stage2: data over_sampling')
smote = SMOTE(random_state=42, n_jobs=-1)
x_train, y_train = smote.fit_resample(train, train_label)

train_sp = get_fft_and_scaler(x_train, start=6892, end=7192)
val_sp = get_fft_and_scaler(val, start=6892, end=7192)

Stage1: load data
Stage2: data over_sampling


In [5]:
# 将数据转换成pytorch的tensor
print('Stage3: transform numpy data to tensor')
batch_size = 128

train_tensor = torch.tensor(train_sp).float()
y_train_tensor = torch.tensor(y_train).long()
val_tensor = torch.tensor(val_sp).float()
y_val_tensor = torch.tensor(val_label).long()

# 使用Dataloader对数据进行封装
train_dataset = TensorDataset(train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
val_dataset = TensorDataset(val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

Stage3: transform numpy data to tensor


In [6]:
lr = 0.0001
gamma = 0.9
step_size = 1
epochs = 15
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

In [7]:
model = DNN().to(device)

In [9]:
model.load_state_dict(torch.load('./best_model0.point'))

<All keys matched successfully>

In [11]:
model.eval()
preds = model(torch.FloatTensor(val_sp).to(device)).argmax(dim=1).cpu().numpy()
score = (preds == val_label).sum()/len(val_label)
print(f'Score: {score}')

Score: 0.6086826475238217


In [12]:
print(model)

DNN(
  (dnn): Sequential(
    (0): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=300, out_features=1024, bias=True)
    (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): ReLU()
    (4): Dropout(p=0.2, inplace=False)
    (5): Linear(in_features=1024, out_features=256, bias=True)
    (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=256, out_features=10, bias=True)
  )
)


In [14]:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

In [31]:
model.dnn[9].register_forward_hook(get_activation(model.dnn[9]))

<torch.utils.hooks.RemovableHandle at 0x7f4989079400>

In [33]:
model(torch.FloatTensor(val_sp).to(device)).argmax(dim=1).cpu().numpy()

array([1, 4, 4, ..., 0, 8, 6])

In [64]:
activation['9'].shape

KeyError: '9'

In [40]:
for name, module in model.named_children():
    print(name)
    print(module)

dnn
Sequential(
  (0): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (1): Linear(in_features=300, out_features=1024, bias=True)
  (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): ReLU()
  (4): Dropout(p=0.2, inplace=False)
  (5): Linear(in_features=1024, out_features=256, bias=True)
  (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): ReLU()
  (8): Dropout(p=0.2, inplace=False)
  (9): Linear(in_features=256, out_features=10, bias=True)
)


# 已知源识别

流程：
1. 构建训练集特征向量字典
    - 因为训练集准确率没有到100，所以有些分类错误的向量不能加入进去
        - 思路：将预测值和标签concate起来，变成11维度的向量，然后去除错误的
2. 求出每个测试集的特征向量
3. 测试集样本特征向量与训练集特征向量余弦相似度计算



##    训练集特征向量构建

In [272]:
# 构建训练集手机特征向量字典
train_feature = {}
#n_class = 10 # 10 phones
#database = {str(i):[] for i in range(n_class)}

In [273]:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

In [274]:
# 抽取中间特征
model = DNN().to(device)
model.load_state_dict(torch.load('./best_model12.point'))
model.eval()
# 开始提取
model.dnn[9].register_forward_hook(get_activation(model.dnn[9]))
model(torch.FloatTensor(train_sp).to(device)).argmax(dim=1).cpu().numpy()

array([0, 0, 0, ..., 9, 9, 9])

In [275]:
(preds == y_train).sum() / len(y_train)

0.799615988828766

In [288]:
preds = activation[model.dnn[9]].cpu().argmax(dim=1).numpy()

In [289]:
all_data = np.concatenate([activation[model.dnn[9]].cpu(), np.expand_dims(preds, axis=1)], axis=1)

In [293]:
true_data = all_data[all_data[:, -1] == y_train, :]

In [322]:
# 获得分类正确的标签
for i in range(n_class):
    train_feature[str(i)] = torch.FloatTensor(true_data[true_data[:, -1]== i, :-1])

## 测试集的特征向量

In [323]:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

In [324]:
model.eval()
# 开始提取
model.dnn[9].register_forward_hook(get_activation(model.dnn[9]))
model(torch.FloatTensor(val_sp).to(device)).argmax(dim=1).cpu().numpy()

array([1, 4, 8, ..., 0, 8, 6])

In [325]:
len(val_sp)

23403

In [326]:
val_feature = activation[model.dnn[9]].cpu()

In [327]:
val_feature.shape

torch.Size([23403, 10])

## 计算余弦相似度

In [328]:
train_feature['0'].shape

torch.Size([15573, 10])

In [329]:
def get_similarity(source_tensor, object_tensor):
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    output = cos(source_tensor.unsqueeze(dim=0), object_tensor)
    scaler_output = output.sum()/object_tensor.size(0)
    return scaler_output.item()

In [330]:
train_feature['0'].shape

torch.Size([15573, 10])

In [331]:
get_similarity(val_feature[0], torch.FloatTensor(train_feature['0']))

0.6705741286277771

In [332]:
similarity_list = []
for i in range(n_class):
    similarity = get_similarity(val_feature[0], train_feature[str(i)])
    similarity_list.append(similarity)

In [333]:
similarity_list

[0.6705741286277771,
 0.8546786904335022,
 0.2372012883424759,
 0.752787709236145,
 0.4196513593196869,
 0.14867010712623596,
 0.4547554552555084,
 0.5406652688980103,
 0.520589292049408,
 0.2931250035762787]

In [334]:
from tqdm import tqdm

In [335]:
preds_list = []
for source_tensor in tqdm(val_feature):
    similarity_list = []
    for j in range(n_class):
        similarity = get_similarity(source_tensor, train_feature[str(j)])
        similarity_list.append(similarity)
    ans = similarity_list.index(max(similarity_list))
    preds_list.append(ans)

100%|██████████| 23403/23403 [01:05<00:00, 357.02it/s]


In [336]:
len(val_label), len(preds_list)

(23403, 23403)

In [337]:
(np.array(preds_list) == val_label).sum()/len(val_label)

0.5819766696577362