# 前期设置（所需库的导入和设置）

In [1]:
import os
import random as rn
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import *

from sklearn import preprocessing

import tensorflow as tf
from tensorflow import set_random_seed

from keras import backend as K
from keras.layers.core import Activation
from keras.models import Sequential, load_model, Model
from keras.layers import Dense, Dropout, LSTM, Input, Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Pin every random-number source used by this notebook so that repeated runs
# give the same results.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it from inside a running kernel may not affect this process --
# confirm whether it should be exported before the kernel is launched.
os.environ['PYTHONHASHSEED'] = '0'
rn.seed(12345)           # Python's built-in `random` module
np.random.seed(42)       # NumPy global generator
tf.set_random_seed(1234) # TensorFlow graph-level seed

# Restrict TensorFlow to a single thread for both intra-op and inter-op work:
# multi-threaded execution is a potential source of non-reproducible results.
# Background: https://stackoverflow.com/questions/42022950
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)
# Hand the single-threaded session to Keras so it is used as the backend session.
sess = tf.Session(config=session_conf, graph=tf.get_default_graph())
K.set_session(sess)

In [3]:
% matplotlib inline
# Default figure size and white tick labels for every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (16, 6),
    'xtick.color': 'w',
    'ytick.color': 'w',
})
# NOTE(review): the style sheet is applied *after* the overrides above; if the
# 'ggplot' sheet defines any of these keys it will replace those values --
# confirm that this ordering is intended.
mpl.style.use('ggplot')

# 数据集导入

In [4]:
# Training sets (one file per FD00x subset)
train1_path = "./data/train_FD001.txt"
train2_path = "./data/train_FD002.txt"
train3_path = "./data/train_FD003.txt"
train4_path = "./data/train_FD004.txt"

# Test sets
test1_path = "./data/test_FD001.txt"
test2_path = "./data/test_FD002.txt"
test3_path = "./data/test_FD003.txt"
test4_path = "./data/test_FD004.txt"

# Labels for the test sets
rul1_path = "./data/RUL_FD001.txt"
rul2_path = "./data/RUL_FD002.txt"
rul3_path = "./data/RUL_FD003.txt"
rul4_path = "./data/RUL_FD004.txt"

In [5]:
# ---- Read the raw FD001 train / test / label files ----
# Column layout shared by the train and test files.
_raw_column_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']


def _read_raw_table(path):
    """Read one space-separated, header-less data file and name its 26 columns."""
    table = pd.read_csv(path, sep=" ", header=None)
    # Columns 26 and 27 are empty (per the original author's note), so drop them.
    table = table.drop(table.columns[[26, 27]], axis=1)
    table.columns = list(_raw_column_names)
    return table


# Training data, ordered by unit id and then by cycle.
train_df = _read_raw_table(train1_path).sort_values(['id', 'cycle'])

# Test data, kept in the order it appears in the file.
test_df = _read_raw_table(test1_path)

# Labels for the test set; column 1 of the parsed file is dropped.
truth_df = pd.read_csv(rul1_path, sep=" ", header=None)
truth_df = truth_df.drop(truth_df.columns[[1]], axis=1)

# 训练集处理

In [6]:
# ---- Training-set preparation ----
# Largest cycle number reached by every unit (original note: shape (100, 2)).
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
# Rename the aggregated 'cycle' column to 'max'.
rul.columns = ['id', 'max']
# Attach each unit's 'max' to all of its rows; the value is constant within a unit.
train_df = train_df.merge(rul, on=['id'], how='left')
# Remaining useful life = last cycle of the unit - current cycle.
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df = train_df.drop('max', axis=1)
# Copy of the cycle counter that is scaled together with the other features.
train_df['cycle_norm'] = train_df['cycle']
# Every column except the identifiers and the target is min-max scaled.
cols_normalize = train_df.columns.difference(['id','cycle','RUL'])
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]),
                             columns=cols_normalize,
                             index=train_df.index)
# Recombine the untouched columns with the scaled ones ...
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
# ... and restore the original column order.
train_df = join_df.reindex(columns = train_df.columns)

# Cap RUL at 130. A single .loc[row_mask, column] assignment writes into
# train_df itself; the previous chained form (train_df['RUL'].loc[...] = 130)
# raised SettingWithCopyWarning and is not guaranteed to modify the frame.
# cycle_norm is independent of RUL and is therefore left untouched.
train_df.loc[train_df['RUL'] > 130, 'RUL'] = 130

# Make sure the output directory exists before writing the processed table.
if not os.path.isdir('./data/processed_data'):
    os.makedirs('./data/processed_data')
train_df.to_csv('./data/processed_data/train_df.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20631 entries, 0 to 20630
Data columns (total 28 columns):
id            20631 non-null int64
cycle         20631 non-null int64
setting1      20631 non-null float64
setting2      20631 non-null float64
setting3      20631 non-null float64
s1            20631 non-null float64
s2            20631 non-null float64
s3            20631 non-null float64
s4            20631 non-null float64
s5            20631 non-null float64
s6            20631 non-null float64
s7            20631 non-null float64
s8            20631 non-null float64
s9            20631 non-null float64
s10           20631 non-null float64
s11           20631 non-null float64
s12           20631 non-null float64
s13           20631 non-null float64
s14           20631 non-null float64
s15           20631 non-null float64
s16           20631 non-null float64
s17           20631 non-null float64
s18           20631 non-null float64
s19           20631 non-null float64
s20    

# 测试集处理

In [8]:
# ---- Test-set preparation ----
test_df['cycle_norm'] = test_df['cycle']
# Scale with the scaler that was fitted on the training data (transform only).
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]),
                            columns=cols_normalize,
                            index=test_df.index)
# Recombine untouched and scaled columns, then restore the original column order.
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns = test_df.columns)
test_df = test_df.reset_index(drop=True)
# Largest cycle number observed for every unit in the test data.
rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
# The single label column holds the cycles remaining after the last observation.
truth_df.columns = ['more']
# truth_df is indexed from 0, so index + 1 produces ids starting at 1.
truth_df['id'] = truth_df.index + 1
# Last observed cycle + remaining cycles = total life of the unit.
# Original author's example: the first unit is observed for 31 cycles and has
# 112 cycles left, giving a maximum of 143.
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)
# Attach the total life to every row of the unit; constant within a unit.
test_df = test_df.merge(truth_df, on=['id'], how='left')
# RUL at every observed cycle.
test_df['RUL'] = test_df['max'] - test_df['cycle']
# 'max' was only needed to derive RUL.
test_df.drop('max', axis=1, inplace=True)
# Unlike the training set, RUL is not capped at 130 here (the capping
# statement was left disabled by the original author).

# Make sure the output directory exists before writing the processed table.
if not os.path.isdir('./data/processed_data'):
    os.makedirs('./data/processed_data')
test_df.to_csv('./data/processed_data/test_df.csv')

In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13096 entries, 0 to 13095
Data columns (total 28 columns):
id            13096 non-null int64
cycle         13096 non-null int64
setting1      13096 non-null float64
setting2      13096 non-null float64
setting3      13096 non-null float64
s1            13096 non-null float64
s2            13096 non-null float64
s3            13096 non-null float64
s4            13096 non-null float64
s5            13096 non-null float64
s6            13096 non-null float64
s7            13096 non-null float64
s8            13096 non-null float64
s9            13096 non-null float64
s10           13096 non-null float64
s11           13096 non-null float64
s12           13096 non-null float64
s13           13096 non-null float64
s14           13096 non-null float64
s15           13096 non-null float64
s16           13096 non-null float64
s17           13096 non-null float64
s18           13096 non-null float64
s19           13096 non-null float64
s20    

In [10]:
sequence_length = 30


def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` consecutive rows.

    Args:
        id_df: DataFrame to slice (the callers pass the rows of a single id).
        seq_length: number of rows in every window.
        seq_cols: list of column names to keep, in that order.

    Yields:
        2-D arrays of shape (seq_length, len(seq_cols)). Exactly
        ``len(id_df) - seq_length`` windows are produced, so the window that
        would end on the final row is not emitted and nothing is yielded when
        the frame has ``seq_length`` rows or fewer.
    """
    values = id_df[seq_cols].values
    window_count = values.shape[0] - seq_length
    for first_row in range(window_count):
        yield values[first_row:first_row + seq_length, :]

# Feature columns: the three settings, the scaled cycle counter and sensors s1..s21.
sensor_cols = ['s' + str(k) for k in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm'] + sensor_cols

# Build the windows of every training unit and stack them into one array of
# shape (total windows, sequence_length, len(sequence_cols)).
# NOTE(review): the original note quoted (15631, 50, 25), which matches a window
# of 50; with sequence_length = 30 the first two dimensions differ -- confirm.
seq_gen = (
    list(gen_sequence(train_df[train_df['id'] == unit_id], sequence_length, sequence_cols))
    for unit_id in train_df['id'].unique()
)
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)

def gen_labels(id_df, seq_length, label):
    """Return the label rows that pair with the windows from ``gen_sequence``.

    Args:
        id_df: DataFrame to slice (the callers pass the rows of a single id).
        seq_length: window length; the first ``seq_length`` rows are skipped.
        label: list of label column names.

    Returns:
        2-D array of shape (len(id_df) - seq_length, len(label)) holding the
        label values from row ``seq_length`` to the end.

    NOTE(review): paired by position with ``gen_sequence``, the i-th label is
    the row immediately after the i-th window rather than its last row --
    confirm this offset is intended.
    """
    return id_df[label].values[seq_length:, :]

# One label block per training unit, stacked into shape (total windows, 1).
# NOTE(review): the original note quoted (15631, 1), which matches a window of
# 50 rather than the current sequence_length -- confirm.
label_gen = [
    gen_labels(train_df[train_df['id'] == unit_id], sequence_length, ['RUL'])
    for unit_id in train_df['id'].unique()
]
label_array = np.concatenate(label_gen).astype(np.float32)


# Keep only the test units that have at least sequence_length rows; shorter
# units cannot fill a whole window and are skipped.
# NOTE(review): the original note quoted a shape of (93, 50, 25), i.e. a window
# of 50; the number of retained units with sequence_length = 30 may differ.
_test_unit_ids = test_df['id'].unique()
y_mask = [len(test_df[test_df['id'] == unit_id]) >= sequence_length
          for unit_id in _test_unit_ids]

# Last sequence_length rows of every retained unit:
# shape (retained units, sequence_length, len(sequence_cols)).
seq_array_test_last = np.asarray(
    [test_df[test_df['id'] == unit_id][sequence_cols].values[-sequence_length:]
     for unit_id, long_enough in zip(_test_unit_ids, y_mask) if long_enough]
).astype(np.float32)

# RUL on the final row of every retained unit, as a (retained units, 1) column.
label_array_test_last = test_df.groupby('id')['RUL'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(-1, 1).astype(np.float32)
# ---- Modelling ----

# Number of input features per time step: the trailing axis of the 3-D window
# array (original note: 25).
nb_features = seq_array.shape[-1]
# Number of regression targets: the trailing axis of the 2-D label array
# (original note: 1).
nb_out = label_array.shape[-1]