In [None]:
import paddle
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.dygraph import Linear
import numpy as np
import os
import random
import pandas as pd

In [None]:
# Continuous feature columns fed to the model; the order defines the input layout.
test_col = [
    'recency',
    'frequency',
    'monetary',
    'avg_discount',
    'items_total',
    'goods_price_last',
    'goods_price_max',
    'goods_price_min',
    'goods_price_avg',
    'goods_price_std',
    'payment_max',
    'payment_min',
    'payment_avg',
    'payment_std',
    'gender',
    'order_pay_time_diff_end-to-last2',
    'order_double11_sum',
    'rs',
    'fs',
    'ms',
    'RFM',
    'rfms',
    'order_interval',
]
# The training table carries the same features plus the target as its last column.
train_col = [*test_col, 'labels']

In [None]:
# Keep only the model columns of the raw (un-normalized) feature tables; every
# column not listed in train_col / test_col is dropped by the selection.
# NOTE: both selections are bound to the same name `d`, so once this cell has run
# only the test frame is left -- the training frame is discarded on the rebind.
d = pd.read_csv('train_features2.csv')[train_col]
# Optional export of the un-normalized training columns: d.to_csv('train2.csv', index=False)

d = pd.read_csv('test_features2.csv')[test_col]
# Optional export of the un-normalized test columns: d.to_csv('test2.csv', index=False)

In [None]:
# Mean-normalize the features and write the model-ready CSV files.
# Each split is re-read from its own feature file: the shared frame `d` holds only
# the test features by the time this cell runs, so normalizing `d` twice would
# write test rows (without a label column) into train_1.csv.
def mean_normalize(frame, skip=()):
    """Return a copy of ``frame`` with each column mapped to (x - mean) / (max - min).

    Args:
        frame: DataFrame of numeric columns.
        skip: names of columns to pass through unchanged (e.g. the target).

    Returns:
        A new DataFrame; ``frame`` itself is not modified, so the cell can be
        re-run without compounding the transformation.
    """
    scaled = frame.copy()
    for col in scaled.columns:
        if col in skip:
            continue
        spread = scaled[col].max() - scaled[col].min()
        centered = scaled[col] - scaled[col].mean()
        # A constant column has zero spread; keep it centred at 0 instead of producing NaN.
        scaled[col] = centered / spread if spread != 0 else centered
    return scaled


# 'labels' is the class target consumed by the cross-entropy loss, so it is left unscaled.
train_scaled = mean_normalize(pd.read_csv('train_features2.csv')[train_col], skip=('labels',))
# The header row is written on purpose: the readers in this notebook call
# pd.read_csv with its default header handling, which would otherwise consume
# the first sample as column names.
train_scaled.to_csv('train_1.csv', index=False)

test_scaled = mean_normalize(pd.read_csv('test_features2.csv')[test_col])
test_scaled.to_csv('test_1.csv', index=False)

In [None]:
# Read a CSV into a NumPy matrix and optionally hold out the tail rows.
def load_data(path, istrain):
    """Load ``path`` with pandas and return ``(training_data, test_data)``.

    The file is parsed with pandas' defaults, so its first line is taken as the
    header. No shuffling or scaling happens inside this function.

    Args:
        path: location of the CSV file.
        istrain: when equal to ``True`` the first 80% of the rows are returned as
            the training part and the remaining rows as the held-out part;
            otherwise every row is returned and the held-out part is ``None``.

    Returns:
        Tuple ``(training_data, test_data)``; both are 2-D arrays, except that
        ``test_data`` is ``None`` when no split was requested.
    """
    frame = pd.read_csv(path)
    n_columns = len(frame.columns)
    # Force an [N, n_columns] layout before slicing rows.
    matrix = np.array(frame).reshape([-1, n_columns])

    # Equality test (not truthiness) is kept so the set of accepted flags is unchanged.
    if not (istrain == True):
        return matrix, None

    # The two parts are disjoint: rows before the cut train, rows after it validate.
    split_at = int(matrix.shape[0] * 0.8)
    return matrix[:split_at], matrix[split_at:]

In [None]:
# Load the normalized training file, split into a training part and a held-out part.
training_data, test_data = load_data(path='train_1.csv', istrain=True)
print('train set done.')

# Load the normalized prediction file as a whole; the second return value is None.
pre_data, none = load_data(path='test_1.csv', istrain=False)
print('test set done.')

In [None]:
# Two-layer fully connected network that outputs one probability per sample.
class Regressor(fluid.dygraph.Layer):
    """Feed-forward classifier: Linear + ReLU followed by Linear + sigmoid.

    Args:
        name_scope: forwarded to the ``fluid.dygraph.Layer`` constructor.
        input_dim: number of features per sample. The default of 23 is the
            width this notebook was written for; pass another value when the
            feature list changes.
        hidden_dim: width of the hidden layer (default 128).
    """

    def __init__(self, name_scope, input_dim=23, hidden_dim=128):
        super(Regressor, self).__init__(name_scope)
        # Hidden layer: input_dim must equal the number of feature columns.
        self.fc1 = Linear(input_dim=input_dim, output_dim=hidden_dim, act='relu')
        # Output layer: a single sigmoid unit.
        self.fc2 = Linear(input_dim=hidden_dim, output_dim=1, act='sigmoid')

    def forward(self, inputs):
        """Run the forward pass and return the sigmoid output of the last layer."""
        hidden = self.fc1(inputs)
        return self.fc2(hidden)

In [None]:
# Instantiate the network and its optimizer inside a dygraph context.
with dygraph.guard():
    model = Regressor(name_scope="Regressor")
    # Put the layer into training mode.
    model.train()
    # Adam optimizer over all model parameters with a learning rate of 1e-5.
    opt = fluid.optimizer.Adam(parameter_list=model.parameters(), learning_rate=1e-5)

In [None]:
# Custom class-weighted binary cross-entropy.
def wce_loss(pred, label, w=80, epsilon=1e-05):
    """Mean weighted cross-entropy between ``pred`` and ``label``.

    Args:
        pred: predicted probabilities.
        label: targets, same layout as ``pred``.
        w: weight applied to the ``label * log(pred)`` term; larger values put
            more emphasis on the y=1 class.
        epsilon: both inputs are clipped to ``[epsilon, 1 - epsilon]`` so the
            logarithms stay finite.

    Returns:
        The loss averaged over all elements.
    """
    lower, upper = epsilon, 1 - epsilon
    target = fluid.layers.clip(label, lower, upper)
    prob = fluid.layers.clip(pred, lower, upper)

    positive_term = w * target * fluid.layers.log(prob)
    negative_term = (1 - target) * fluid.layers.log(1 - prob)
    return fluid.layers.reduce_mean(-1 * (positive_term + negative_term))

In [None]:
# Train the model with mini-batches on CPU and save its parameters afterwards.
with dygraph.guard(fluid.CPUPlace()):
    EPOCH_NUM = 30   # number of passes over the training data
    BATCH_SIZE = 4096  # rows per mini-batch
    
    # Outer loop: one iteration per epoch.
    for epoch_id in range(EPOCH_NUM):
        # Shuffle the training rows in place before every epoch.
        np.random.shuffle(training_data)
        # Cut the shuffled rows into consecutive slices of at most BATCH_SIZE rows.
        mini_batches = [training_data[k:k+BATCH_SIZE] for k in range(0, len(training_data), BATCH_SIZE)]
        
        # Inner loop: one optimizer step per mini-batch.
        for iter_id, mini_batch in enumerate(mini_batches):
            x = np.array(mini_batch[:, :-1]).astype('float32') # all columns but the last: features
            y = np.array(mini_batch[:, -1:]).astype('float32') # last column, kept 2-D: target

            # Convert the NumPy batches into dygraph variables.
            buyer_features = dygraph.to_variable(x)
            result = dygraph.to_variable(y)
            
            # Forward pass and weighted cross-entropy.
            predicts = model(buyer_features)
            loss = wce_loss(predicts, result)
            # wce_loss already reduces to a mean; this second mean is applied to that result.
            avg_loss = fluid.layers.mean(loss)
            

            # Report the loss on every 20th iteration of an epoch.
            if iter_id % 20 == 0:
                print("epoch: {}, iter: {}, loss is: {}".format(epoch_id, iter_id, avg_loss.numpy()),)
     
            # Backward pass.
            avg_loss.backward()
            # Apply the optimizer update that minimizes the loss.
            opt.minimize(avg_loss)
            # Reset the gradients before the next batch.
            model.clear_gradients()
    # Save the trained parameters under the 'DNN_model' prefix.
    fluid.save_dygraph(model.state_dict(), 'DNN_model')
    print("模型保存成功，模型参数保存在DNN_model中")

In [None]:
# Restore the saved parameters and score the prediction set.
with dygraph.guard():
    # 'DNN_model' is the path prefix the parameters were saved under.
    model_dict, _ = fluid.load_dygraph('DNN_model')
    model.load_dict(model_dict)
    # Switch the layer to evaluation mode before inference.
    model.eval()

    # Every row of the normalized prediction file is scored (no hold-out split).
    pre_data, _ = load_data('test_1.csv', istrain=False)
    pre = dygraph.to_variable(pre_data.astype('float32'))
    results = model(pre)

In [None]:
# Assemble the submission file: one predicted score per customer, ordered by id.
# test_1.csv only holds the normalized feature columns, so the ids are taken from
# the feature file it was derived from.
# NOTE(review): assumes test_features2.csv has a 'customer_id' column and the same
# row order as test_1.csv -- confirm.
customer_ids = pd.read_csv('test_features2.csv', usecols=['customer_id'])['customer_id']

# Accept either the dygraph output or an array left behind by an earlier run of
# this cell, and flatten the (N, 1) predictions to one value per row.
scores = np.asarray(results.numpy() if hasattr(results, 'numpy') else results).ravel()

# Fail loudly instead of writing scores next to the wrong customers.
if len(scores) != len(customer_ids):
    raise ValueError(
        'got {} predictions for {} customers; test_1.csv and test_features2.csv are out of sync'
        .format(len(scores), len(customer_ids)))

temp = pd.DataFrame({'customer_id': customer_ids.to_numpy(), 'result': scores})
# sort_values returns a new frame; the result must be kept for the ordering to reach the CSV.
temp = temp.sort_values('customer_id', ascending=True).reset_index(drop=True)
temp.to_csv('submission_DNN.csv', index=False)