In [3]:
import numpy as np
import pandas as pd
import mindspore as ms
import mindspore.nn as nn
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import dtype as mstype
from mindspore.common.initializer import Normal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [4]:
# 读取数据集文件
dataset =  pd.read_csv('./data/processed_data/dataset6classes.csv')
dataset.head(3)

In [5]:
dataset = dataset.drop('Unnamed: 0', 1)
dataset[" Label"].unique()      # 查看Label的类别

array([0, 2, 4, 1, 3])

In [6]:
feature_lenth = dataset.shape[1]
data = dataset.iloc[:,:feature_lenth-1] # 获取特征属性列
label = dataset[' Label']               # 获取Label属性列，

# 分割数据集为训练集、验证集和测试集
X_train, X_test, y_train, y_test = train_test_split(data, label,test_size=0.1, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,test_size=0.2, random_state=1)

In [7]:
# 将训练集的均值和方差作为总样本的参数分布
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)

# 对数据集进行标准化处理
X_train = standard_scaler.transform(X_train)
X_val = standard_scaler.transform(X_val)
X_test = standard_scaler.transform(X_test)

# 保存缩放器模型
scaler_save_path = "./model_saved/StandardScaler.save"
joblib.dump(standard_scaler, scaler_save_path)

['./StandardScaler2.save']

In [8]:
print('train examples:', len(X_train))
print( 'validation examples', len(X_val))
print('test examples:', len(X_test) )

train examples: 473152
validation examples 118288
test examples: 65716


In [9]:

# 自定义数据源
class MyAccessible:
    def __init__(self, X, y):
        self._data = np.array(X)
        self._label = np.array(y)

    def __getitem__(self, index):
        return self._data[index], self._label[index]

    def __len__(self):
        return len(self._data)

# 创建数据集，使之符合模型输入的要求
def Ceate_dataset(X, y, shuffle_size = 200, batch_size=128):
    ds.config.set_seed(123)     # 设置随机种子，保证每次随机化处理的结果相同
    dataset = ds.GeneratorDataset(source=MyAccessible(X, y), column_names=["data", "label"])
    dataset = dataset.map(operations=C.TypeCast(mstype.float32), input_columns=["data"])    # 特征属性列的数据类型统一为float32
    dataset = dataset.map(operations=C.TypeCast(mstype.int32), input_columns=["label"])     # 类别属性的数据类型设为int32
    dataset = dataset.shuffle(shuffle_size)
    dataset = dataset.batch(batch_size)
    return dataset

# 得到训练集、验证集和测试集
data_train = Ceate_dataset(X_train, y_train)
data_val = Ceate_dataset(X_val, y_val)
data_test = Ceate_dataset(X_test, y_test)

# # 查看数据集中的前五条数据，观察数据类型和大小
# iterator = data_test.create_dict_iterator()
# i =0
# for item in iterator:
#     # item is a dict
#     i +=1
#     print(item)
#     if i==10:
#         break

In [10]:

class DNN(nn.Cell):
    """定义深度神经网络结构"""
    def __init__(self, input_dims=11, output_dims=5, dropout_ratio=0.2):
        super(DNN, self).__init__()
        self.input_dims = input_dims
        self.output_dims = output_dims
        self.dropout_ratio = dropout_ratio
        # 定义所需要的运算
        self.fc1 = nn.Dense(self.input_dims, 512, weight_init=Normal(0.02))
        self.fc2 = nn.Dense(512, 256, weight_init=Normal(0.02))
        self.fc3 = nn.Dense(256, 128, weight_init=Normal(0.02))
        self.fc4 = nn.Dense(128, 64, weight_init=Normal(0.02))
        self.fc5 = nn.Dense(64, self.output_dims, weight_init=Normal(0.02))
        self.relu = nn.ReLU()
        self.log_softmax = nn.LogSoftmax()
        self.dropout = nn.Dropout(keep_prob=dropout_ratio)

    # 使用定义好的运算构建前向网络
    def construct(self, x):

        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc3(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc4(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc5(x)
        x = self.log_softmax(x)
        return x
# 网络实例化
net = DNN()

In [11]:
from mindspore import Model
from mindspore.nn import SoftmaxCrossEntropyWithLogits
from mindspore.nn import SGD, Adam
from mindspore.nn.metrics import Accuracy

lr = 0.0001   # 学习率
#定义损失函数
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
#定义优化器（待更新的参数，学习率，权重衰减）
optim = Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=0.01)
#实例化模型对象（网络结构，损失函数，优化器，评价指标）
model = Model(network=net,loss_fn=loss,optimizer=optim,metrics={'accuracy': Accuracy()})

In [12]:
from mindspore.train.callback import LossMonitor, TimeMonitor, Callback,CheckpointConfig,ModelCheckpoint

# 自定义回调类（继承Callback类），用于计算输出验证集准确率
class ValCallBack(Callback):
    def __init__(self, model, val_dataset, epochs_to_val, per_val, dataset_sink_mode=False):
        self.model = model
        self.val_dataset = val_dataset      # 输出验证集准确率的epoch间隔数
        self.epochs_to_val = epochs_to_val  # 一个记录当前epoch和val_acc值的字典
        self.per_val = per_val
        self.dataset_sink_mode = dataset_sink_mode

    def epoch_end(self, run_context):
        # 获取到现在的epoch数
        cb_param = run_context.original_args()
        cur_epoch = cb_param.cur_epoch_num
        # 如果达到进行验证的epoch数，则进行以下验证操作
        if cur_epoch % self.epochs_to_val == 0:
            # 此处model设定的metrics是准确率Accuracy
            acc = self.model.eval(self.val_dataset, dataset_sink_mode=self.dataset_sink_mode)
            self.per_val["epoch"].append(cur_epoch)
            self.per_val["acc"].append(acc)
            print("验证集准确率为: {}".format(acc))

#实例化回调对象
epochs_to_val = 2                    # 每隔2个epoch在验证集上进行一次验证
val_acc = {'epoch':[], 'acc':[]}     # 记录当前epoch数和acc值
valAcc_cb = ValCallBack(model,data_val,epochs_to_val, val_acc)
# 回调训练过程中的loss值
loss_cb = LossMonitor(per_print_times=1000)

epoches = 20
print('---------------开始训练-------------')
model.train(epoches, data_train, callbacks=[loss_cb,valAcc_cb], dataset_sink_mode=False)
print('---------------训练结束-------------')


---------------开始训练-------------
epoch: 1 step: 1000, loss is 0.9638172388076782
epoch: 1 step: 2000, loss is 0.8211514353752136
epoch: 1 step: 3000, loss is 0.5580129623413086
epoch: 2 step: 303, loss is 0.37132251262664795
epoch: 2 step: 1303, loss is 0.3668070137500763
epoch: 2 step: 2303, loss is 0.3831019103527069
epoch: 2 step: 3303, loss is 0.34128567576408386
验证集准确率为: {'accuracy': 0.9162298796158528}
epoch: 3 step: 606, loss is 0.404092937707901
epoch: 3 step: 1606, loss is 0.5582598447799683
epoch: 3 step: 2606, loss is 0.35104385018348694
epoch: 3 step: 3606, loss is 0.38596564531326294
epoch: 4 step: 909, loss is 0.294807493686676
epoch: 4 step: 1909, loss is 0.4008302390575409
epoch: 4 step: 2909, loss is 0.27310097217559814
验证集准确率为: {'accuracy': 0.9305762207493575}
epoch: 5 step: 212, loss is 0.3179163634777069
epoch: 5 step: 1212, loss is 0.2543521523475647
epoch: 5 step: 2212, loss is 0.29405146837234497
epoch: 5 step: 3212, loss is 0.26114270091056824
epoch: 6 step: 515

In [13]:
# 使用验证集评估模型，打印总体准确率
acc = model.eval(data_test)
print('The accuracy of the test dataset is {}'.format(acc))

The accuracy of the test dataset is {'accuracy': 0.9307058608949417}


In [14]:
ms.save_checkpoint(net, "DNN.ckpt")