In [76]:
import keras
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM, Flatten, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Reshape
import progressbar
import numpy as np
import os

def data_generator():
    # train_file = '../data/train.csv'
    chunk_folder = '../data/chunks/'
    while True:
        chunkx = np.zeros((4096, 2))
        chunky = np.zeros((4096, ))
        ii = 0
        for file in os.listdir(chunk_folder):
            df = pd.read_csv(os.path.join(chunk_folder, file))
            data_chunk = df.values[:,0]
            target = np.mean(df.iloc[:,1])
            chunkx = np.zeros((4096, 2 ))
            chunkx[:df.shape[0], :] = df.iloc[:min(df.shape[0], 4096), :]
            chunky = np.zeros((4096, ))
            chunky[:df.shape[0]] = df.iloc[:min(df.shape[0], 4096), 1]
            yield chunkx[:df.shape[0]], chunky[:df.shape[0]]

def generate_single(file):
    input_length = 32
    df = pd.read_csv(os.path.join(chunk_folder, file))
    data_chunk = df.values[:,0]
    target = np.mean(df.iloc[:,1])
    chunkx = np.zeros((4096, input_length ))
    chunky = np.zeros((4096, ))
    imax = 0
    for i in range(df.shape[0]-input_length):
        chunkx[i, :] = df.iloc[i:i+input_length, 0]
        chunky[i] = df.iloc[i, 1]
        imax=i+1
    return chunkx[:imax, :], chunky[:imax]

def get_model():
    # max_features = 2
    input_length = 32
    num_internal_dimensions = 64
    dropout_rate = 0.1
    model = Sequential()
    model.add(Reshape((1, input_length), input_shape=(input_length, )))
    model.add(Dropout(dropout_rate))
    # model.add(Dropout(dropout_rate))
    model.add(Dense(num_internal_dimensions, input_shape=(1, 2), activation='relu'))
    # model.add(Conv1D(64, 2, activation='relu', input_shape=(1, 2), padding='causal'))
    # model.add(Conv1D(64, 2, activation='relu', padding='causal'))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(num_internal_dimensions))
    model.add(Dense(num_internal_dimensions, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Reshape((1, num_internal_dimensions)))
    model.add(LSTM(num_internal_dimensions))
    model.add(Dense(num_internal_dimensions, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Reshape((1, num_internal_dimensions)))
    model.add(Dense(num_internal_dimensions, activation='relu'))
    # model.add(MaxPooling1D(2))
    # model.add(Conv1D(128, 2, activation='relu', padding='causal'))
    # model.add(Conv1D(128, 2, activation='relu', padding='causal'))
    model.add(Flatten())
    # model.add(LSTM(128))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_absolute_error',
              optimizer='adam',
              metrics=['accuracy'])
    try:
        model.load_weights('best_weights.hf5')
    except:
        print('could not load weights')
    print(model.summary())
    return model

model = get_model()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_37 (Reshape)         (None, 1, 32)             0         
_________________________________________________________________
dropout_49 (Dropout)         (None, 1, 32)             0         
_________________________________________________________________
dense_76 (Dense)             (None, 1, 64)             2112      
_________________________________________________________________
dropout_50 (Dropout)         (None, 1, 64)             0         
_________________________________________________________________
lstm_39 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_77 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_51 (Dropout)         (None, 64)                0         
__________

In [74]:
%matplotlib notebook
import matplotlib.pyplot as plt

checkpoint = keras.callbacks.ModelCheckpoint('best_weights.hf5', monitor='mean_absolute_error', verbose=0, save_best_only=False, mode='min')
# model.fit_generator(data_generator(), steps_per_epoch=4096, epochs=10000, callbacks=[checkpoint])
chunk_folder = '../data/chunks/'
train_files = os.listdir(chunk_folder)
i_arr = []
loss_arr = []
fig = plt.figure()
ax = fig.add_subplot(111)
plt.ion()
for i, file in enumerate(train_files):
    # print(str(i) + ' ' + file)
    fitx, fity = generate_single(file)
    hist = model.fit(fitx, fity, callbacks=[checkpoint], verbose=0)
    # print(hist.history)
    loss_arr.append(hist.history['loss'][-1])
    i_arr.append(i)
    ax.clear()
    ax.loglog(i_arr, loss_arr)
    ax.grid()
    fig.canvas.draw()

<IPython.core.display.Javascript object>

  "Data has no positive values, and therefore cannot be "
in singular transformations; automatically expanding.
left=1.0, right=1.0
  self.set_xlim(upper, lower, auto=None)


KeyboardInterrupt: 

In [78]:
def get_single(file):
    input_length = 32
    df = pd.read_csv(file)
    data_chunk = df.values[:,0]
    target = np.mean(df.iloc[:,1])
    chunkx = np.zeros((4096, input_length ))
    imax = 0
    for i in range(min(df.shape[0]-input_length, 4096)):
        chunkx[i, :] = df.iloc[i:i+input_length, 0]
        imax=i+1
    return chunkx[:imax, :]


files = os.listdir('../data/test/')
for i, file in enumerate(files):
    pcolname = 'time_to_failure'
    # segid = file.replace('.csv','')
    fullname = os.path.join('../data/test', file)
    xarr = get_single(fullname)
    df = pd.read_csv(fullname)
    ypred = model.predict(xarr)
    print(str(i) + ' ' + file + ' ' + str(ypred[-1][0]))
    # for i in range(len(ypred)):
    #    df['time_to_failure'].iloc[i] = ypred[i]
    df['time_to_failure'] = ypred[-1][0]
    df.to_csv(fullname, index=False)

0 seg_00030f.csv [5.2971272]


ValueError: Length of values does not match length of index

In [28]:
import progressbar
files = os.listdir('../data/test/')
bar = progressbar.ProgressBar(len(files))
bar.start()
lines = []
for i, file in enumerate(files):
    bar.update(i)
    pcolname = 'time_to_failure'
    segid = file.replace('.csv','')
    fullname = os.path.join('../data/test', file)
    df = pd.read_csv(fullname)
    ttf = df['time_to_failure'].mean()
    lines.append({'seg_id': segid, 'time_to_failure': ttf})
out_df = pd.DataFrame(lines)
out_df.to_csv('prediction.csv', index=False)

- |                        #                       | 2623 Elapsed Time: 0:03:04