In [8]:
import numpy as np
import csv
import datetime
import matplotlib.pyplot as plt

import pandas as pd
from pandas import concat

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

def calc_avg(lst):
    total = 0
    for element in lst:
        total += element
    if len(lst) < 1:
        return 0
    else:
        return total/len(lst)
    
def read_data(filename):
    nprocs = []
    runtime = []
    total_jobs = []
    submit_time = []
    
    core_count = []
    r = []
    
    
    with open(filename) as file:
        tsv_file = csv.reader(file, delimiter="\t")
        field_count = 0
        while int(field_count) < 20:
            field_count = int(len(next(tsv_file)))

        last_time = -1
        last_hour = -1
        job_count = 0
        last_submitted = -1
        i = 0
        for line in tsv_file:
            i += 1
            #if i > 30000:
                # break
            if (float(line.__getitem__(3)) > -0.5) and (float(line.__getitem__(4)) > -0.5):
                submitted = int(line.__getitem__(1))
                dt = datetime.datetime.fromtimestamp(submitted)
                time_hour = dt.hour

                if last_hour == -1:
                    last_hour = time_hour
                    last_time = dt
                
                if last_hour != time_hour:
                    runtime.append(calc_avg(r))
                    nprocs.append(calc_avg(core_count))
                    total_jobs.append(job_count)
                    submit_time.append(dt.replace(minute=0, second=0, microsecond=0))
                    last_time = dt
                    job_count = 0
                    core_count.clear
                    r.clear
                    last_submitted = -1
                
                core_count.append(float(line.__getitem__(4))) # number of allocated processors
                r.append(float(line.__getitem__(3))) # runtime of the job
                job_count += 1
                last_submitted = submitted
                last_hour = time_hour
    return submit_time, runtime, nprocs, total_jobs

def read_dataframe():
    submit_time, runtime, nprocs, total_jobs = read_data('anon_jobs_Sharc.gwf') #SharcNet
    df = pd.DataFrame(list(zip(submit_time, runtime, nprocs, total_jobs)), columns=['ds', 'RunTime', 'NProcs', 'TotalJobs'])
    df.to_pickle('job_traces')
    return df

def load_dataframe():
    return pd.read_pickle('total_jobs_dataframe')

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [5]:
dataframe = read_dataframe()
dataframe.set_index('ds')
values = dataset.values
print(dataframe)
# integer encode direction
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])

                      ds       RunTime     NProcs  TotalJobs
0    2006-01-24 17:00:00     13.600000   5.400000          5
1    2006-01-24 18:00:00     77.000000  19.500000          9
2    2006-01-24 19:00:00    148.625000  27.062500          2
3    2006-01-24 20:00:00    119.105263  48.947368         22
4    2006-01-24 21:00:00    171.384615  73.948718          1
...                  ...           ...        ...        ...
7071 2007-01-15 21:00:00  31829.883378   3.013483        254
7072 2007-01-15 22:00:00  31828.379609   3.013501         58
7073 2007-01-15 23:00:00  31826.919224   3.013525         57
7074 2007-01-16 00:00:00  31825.394948   3.013849         59
7075 2007-01-16 01:00:00  31825.058342   3.013981         13

[7076 rows x 4 columns]


In [21]:
tf.random.set_seed(7)
dataset = dataframe[['TotalJobs']].values
print(dataset)
dataset = dataset.astype('float32')

[[ 5]
 [ 9]
 [ 2]
 ...
 [57]
 [59]
 [13]]


In [22]:
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

# train test split
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))

4740 2336


In [23]:
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

# reshape into X=t and Y=t+1
look_back = 1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

In [24]:
# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

In [None]:
# create and fit the LSTM network
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

Epoch 1/100
4738/4738 - 5s - loss: 9.1769e-04 - 5s/epoch - 1ms/step
Epoch 2/100
4738/4738 - 4s - loss: 9.1173e-04 - 4s/epoch - 891us/step
Epoch 3/100
4738/4738 - 4s - loss: 9.1304e-04 - 4s/epoch - 864us/step
Epoch 4/100


In [None]:
print(trainY)

In [None]:
# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform(trainY)
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform(testY)
# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))

In [None]:
plt.plot(range(0,24), testPredict[:24])
plt.plot(range(0,24), testY[0][:24])

In [None]:
# shift train predictions for plotting
trainPredictPlot = np.empty_like(dataset)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = np.empty_like(dataset)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)