In [1]:
# import os
# os.environ['KERAS_BACKEND']='theano'
# os.environ["MKL_THREADING_LAYER"] = "GNU"
import numpy as np
import tensorflow as tf
import random as rn

# Reproducibility setup: pin the hash seed, seed the NumPy / Python / TensorFlow
# random number generators, and run TensorFlow on a single thread.
#
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/fchollet/keras/issues/2280#issuecomment-306959926
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts;
# confirm that assigning it here (after start-up) has the intended effect.

import os
os.environ['PYTHONHASHSEED'] = '0'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
# (This seed is replaced by the second np.random.seed() call further down.)

np.random.seed(42)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
# (This seed is replaced by the set_random_seed(2) call further down.)

tf.set_random_seed(1234)

# Create the session on the default graph with the single-thread config and
# register it as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

import numpy as np
# Setting seed for reproducibility
# NOTE: the two calls below re-seed NumPy and TensorFlow, overriding the seeds
# set above (42 and 1234). From here on the seeds in effect are 1234 for NumPy
# and 2 for TensorFlow.
np.random.seed(1234)
from tensorflow import set_random_seed
set_random_seed(2)

import keras
import keras.backend as K
from keras.layers.core import Activation
from keras.models import Sequential, load_model, Model
from keras.layers import Dense, Dropout, LSTM, Input, Bidirectional

import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn import preprocessing
from pylab import *
# Use the SimHei font for sans-serif text in matplotlib
# (presumably so that Chinese plot labels render correctly -- confirm).
mpl.rcParams['font.sans-serif'] = ['SimHei']

# ---- Load the training set ------------------------------------------------
# The file is space separated and has no header row. Columns 26 and 27 are
# discarded (presumably empty columns created by trailing separators -- confirm).
train_df = pd.read_csv('./data/train_FD001.txt', sep=" ", header=None)
train_df = train_df.drop(train_df.columns[[26, 27]], axis=1)
train_df.columns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
                    + ['s%d' % n for n in range(1, 22)])
# Order rows by engine id, then by cycle within each engine.
train_df = train_df.sort_values(['id', 'cycle'])

# ---- Load the test set ----------------------------------------------------
# Same layout as the training file; the rows are left in file order.
test_df = pd.read_csv('./data/test_FD001.txt', sep=" ", header=None)
test_df = test_df.drop(test_df.columns[[26, 27]], axis=1)
test_df.columns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
                   + ['s%d' % n for n in range(1, 22)])

# ---- Load the ground-truth labels for the test set ------------------------
# Only the first column is kept; column 1 is dropped.
truth_df = pd.read_csv('./data/RUL_FD001.txt', sep=" ", header=None)
truth_df = truth_df.drop(truth_df.columns[[1]], axis=1)

# ---- Training-set preprocessing --------------------------------------------
# Largest cycle number recorded for each engine id: one row per id.
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
# Rename the aggregated 'cycle' column to 'max'.
rul.columns = ['id', 'max']
# Attach the per-engine maximum to every row of that engine (constant within an id).
train_df = train_df.merge(rul, on=['id'], how='left')
# Remaining useful life = engine's last recorded cycle minus the current cycle.
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df.drop('max', axis=1, inplace=True)
# Copy of the cycle counter that gets scaled below and used as a feature.
train_df['cycle_norm'] = train_df['cycle']
# Every column except id, cycle and RUL is min-max scaled.
cols_normalize = train_df.columns.difference(['id','cycle','RUL'])
min_max_scaler = preprocessing.MinMaxScaler()
# Fit the scaler on the training data; the same fitted scaler is reused for
# the test set later in the file.
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]),
                             columns=cols_normalize,
                             index=train_df.index)
# Recombine the unscaled columns (id, cycle, RUL) with the scaled ones.
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
# Restore the original column order.
train_df = join_df.reindex(columns = train_df.columns)

# Cap RUL at 130 (cycle_norm is independent of RUL and is left untouched).
# A single .loc call on the frame replaces the previous chained form
# train_df['RUL'].loc[...] = 130, which raised SettingWithCopyWarning and
# does not modify train_df when pandas Copy-on-Write is active.
train_df.loc[train_df['RUL'] > 130, 'RUL'] = 130
# train_df.to_csv('train_df.csv')

# ---- Test-set preprocessing -------------------------------------------------
# Add the cycle_norm feature and scale with the scaler fitted on the training set.
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(
    min_max_scaler.transform(test_df[cols_normalize]),
    columns=cols_normalize,
    index=test_df.index,
)
# Recombine unscaled and scaled columns, restore the column order, and
# renumber the rows from zero.
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns=test_df.columns).reset_index(drop=True)

# Last observed cycle of every engine in the test set.
rul = test_df.groupby('id')['cycle'].max().reset_index()
rul.columns = ['id', 'max']

# The truth file holds one value per engine; call that column 'more'
# (cycles remaining after the last observed test cycle).
truth_df.columns = ['more']
# Engine ids start at 1 while the truth frame's index starts at 0.
truth_df['id'] = truth_df.index + 1
# Final cycle of each engine = last observed cycle + remaining cycles
# (the two frames are added row by row on their shared index).
truth_df['max'] = rul['max'] + truth_df['more']
truth_df = truth_df.drop('more', axis=1)

# Attach the per-engine final cycle to every test row and derive the RUL of each row.
test_df = test_df.merge(truth_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
# The helper column is no longer needed once RUL is computed.
test_df = test_df.drop('max', axis=1)
# Note: unlike the training labels, the test RUL is not capped at 130 here
# (the capping statement is disabled).
# test_df['RUL'].loc[test_df['RUL'] >130]=130
# test_df.to_csv('test_df.csv')

# Number of consecutive rows in one input window.
sequence_length = 30

# Turn one engine's rows into windows of shape (seq_length, number of columns).
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows over the selected columns of ``id_df``.

    Args:
        id_df: DataFrame holding the rows of a single engine.
        seq_length: number of rows in each window.
        seq_cols: list of column names to include in every window.

    Yields:
        2-D arrays of ``seq_length`` consecutive rows. Window ``i`` covers rows
        ``i`` .. ``i + seq_length - 1`` for ``i`` in ``0 .. n - seq_length - 1``
        (``n`` = number of rows), so the window ending on the very last row is
        not produced. Nothing is yielded when ``n <= seq_length``.
    """
    values = id_df[seq_cols].values
    window_count = values.shape[0] - seq_length
    for first in range(window_count):
        yield values[first:first + seq_length, :]

# Feature columns: the three settings, the scaled cycle counter and sensors s1..s21
# (25 columns in total).
sensor_cols = ['s%d' % n for n in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm'] + sensor_cols

# Build the windows engine by engine and stack them into a single float32 array
# of shape (number of windows, sequence_length, 25).
# (An earlier note quoted (15631, 50, 25), which corresponds to a window of 50,
# not the sequence_length of 30 used here.)
seq_gen = (
    list(gen_sequence(train_df.loc[train_df['id'] == unit], sequence_length, sequence_cols))
    for unit in train_df['id'].unique()
)
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)

# Produce the labels that line up with the windows from gen_sequence.
def gen_labels(id_df, seq_length, label):
    """Return the label rows of ``id_df`` from position ``seq_length`` onwards.

    Args:
        id_df: DataFrame holding the rows of a single engine.
        seq_length: number of leading rows to skip (the window size).
        label: list of label column names, e.g. ``['RUL']``.

    Returns:
        2-D array with one row per remaining record and one column per label;
        it has zero rows when ``id_df`` has at most ``seq_length`` rows.
    """
    label_values = id_df[label].values
    return label_values[seq_length:, :]

# Collect the labels of every engine and stack them into one float32 array
# of shape (number of windows, 1), row-aligned with seq_array.
label_gen = [
    gen_labels(train_df.loc[train_df['id'] == unit], sequence_length, ['RUL'])
    for unit in train_df['id'].unique()
]
label_array = np.concatenate(label_gen).astype(np.float32)


# One flag per test engine: True when the engine has at least sequence_length rows.
y_mask = [len(test_df[test_df['id'] == unit]) >= sequence_length
          for unit in test_df['id'].unique()]

# Last window of every test engine that is long enough; shorter engines are
# skipped, so the first dimension can be smaller than the number of engines.
# Resulting shape: (number of kept engines, sequence_length, number of features).
seq_array_test_last = np.asarray([
    test_df.loc[test_df['id'] == unit, sequence_cols].values[-sequence_length:]
    for unit, keep in zip(test_df['id'].unique(), y_mask)
    if keep
]).astype(np.float32)

# RUL of the final row of each kept engine, as a float32 column vector
# of shape (number of kept engines, 1).
label_array_test_last = (
    test_df.groupby('id')['RUL'].nth(-1)[y_mask].values
    .reshape(-1, 1)
    .astype(np.float32)
)

# ---- Model dimensions -------------------------------------------------------
# Number of input features per time step (size of the last axis of seq_array).
nb_features = seq_array.shape[-1]
# Number of regression outputs (size of the last axis of label_array).
nb_out = label_array.shape[-1]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
