diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f11b75 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ diff --git a/Data_preparation.py b/Data_preparation.py deleted file mode 100644 index 0d847a1..0000000 --- a/Data_preparation.py +++ /dev/null @@ -1,175 +0,0 @@ -import pickle -import numpy as np -import pandas as pd -from tqdm import tqdm -from keras.preprocessing.sequence import pad_sequences - -# After exporting the relational database to separate tables with .csv extension, the transformation can begin -# The first step is to read the cvs files as Dataframes -df_taskset = pd.read_csv('TaskSet.csv') # import task-sets -# print(df_taskset.head()) if you want to see how the data look like - -df_task = pd.read_csv('Task.csv') # import tasks -# print(df_task.head()) - -df_job = pd.read_csv('Job.csv') # import jobs -# print(df_job.head()) - - -# 2. data transformation - -# here starts data transformation -ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task -ntn1 = [] -for n in ntn: - ntn1.append(n[0]) -print(np.unique(ntn1)) # print the unique values - - - -# PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label -PKGs = {} -PKGs['pi'] = 0 -PKGs['hey'] = 1 -PKGs['tumatmul'] = 2 -PKGs['cond_mod'] = 3 - -# INteger encoding for Exit_Values from Jobs -Exit_Values = {} -Exit_Values['EXIT'] = 1 -Exit_Values['EXIT_CRITICAL'] = 0 - - -# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 -Arg_Values = {} -Arg_Values[1] = 1 -Arg_Values[4096] = 2 -Arg_Values[8192] = 3 -Arg_Values[16384] = 4 -Arg_Values[32768] = 5 -Arg_Values[65536] = 6 -Arg_Values[131072] = 7 -Arg_Values[262144] = 8 -Arg_Values[524288] = 9 -Arg_Values[1048576] = 10 -Arg_Values[2097152] = 11 -Arg_Values[847288609443] = 12 -Arg_Values[2541865828329] = 13 -Arg_Values[7625597484987] = 14 -Arg_Values[22876792454961] = 15 -Arg_Values[68630377364883] = 16 -Arg_Values[205891132094649] = 17 - - - -# 3. 
Features and Labels extraction -i = 0 - -features = [] # create an empty list for features -labels = [] # create an empty list for labels -# loop in the task-set -with tqdm ( total=len ( - list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) - for index, row in df_taskset.iterrows (): - - try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID - first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id - second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id - third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id - tasks = [] # empty list of tasks where features are saved later - - if first_task != -1: # if the first task exists in this task-set then : - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) # save the priority - tasks.append(int(task_info['Period']/1000)) # save the period in seconds - tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) #save the numerical value of PKG - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) #save the scaled value of Arg - tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds - # for each job in that is in the task and has this task_set id - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value - - if second_task != -1: # if the second task exists in this task-set then : - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - print(tasks) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: # if the third task exists in this task-set then : - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - if fourth_task != -1: # if the fourth task exists in this task-set then : - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == 
first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - tasks = np.array(tasks) # to save the task list as numpy array - features.append(tasks) # values in tasks are features - labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set - except Exception as e: # exception handler - print(e) - pass - pbar.update(1) - - - -labels = np.array(labels) # to save the labels list as numpy array - -# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value -features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') - -#print(features.shape) # the dimensionality of features -#print(labels.shape) # the dimensionality of labels - -# save both files for the training -with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary' - pickle.dump(features, outfile) - -with open ( '56_labels', 'wb' ) as outfile: - pickle.dump(labels, outfile) diff --git a/README.md b/README.md index 70d2e70..c488bc9 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The first step is to preprocess the data. The database was imported and transfor 5. Task Critical time: Integer 6. Number of Jobs: Integer From Jobs only one feature was selected: Job Exit_Value: String. -After exporting all tables, start with Data_preparation.py. Line 165 is responsible for the length of the feature vector. +After exporting all tables, start with Data_preparation.py. Features and labels are saved at the end. @@ -35,7 +35,7 @@ CuDNNLSTM.py. When using CPU, install Tensorflow and replace CuDNNLSTM with LSTM Evaluation.py. Evaluation prints the confusion matrix and classification report. Tensorboard can be launched by typing tensorboard -–logdir=logs/ into the terminal and logs from trained models can be visualized **4. Prediction:** -predictin.py. A CSV file will be save with actual and predictied values. The trained model should be loaded first. +prediction.py. A CSV file will be saved with actual and predicted values. The trained model should be loaded first. **5. Plotting:** Plotting.py. Another way to visualize the model built. diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 0000000..434703f --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,73 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# All Vagrant configuration is done below. The "2" in Vagrant.configure +# configures the configuration version (we support older styles for +# backwards compatibility). Please don't change it unless you know what +# you're doing. +Vagrant.configure("2") do |config| + # The most common configuration options are documented and commented below. + # For a complete reference, please see the online documentation at + # https://docs.vagrantup.com. + + # Every Vagrant development environment requires a box. You can search for + # boxes at https://atlas.hashicorp.com/search. + config.vm.box = "ubuntu/xenial64" + + # Disable automatic box update checking. If you disable this, then + # boxes will only be checked for updates when the user runs + # `vagrant box outdated`. This is not recommended. + # config.vm.box_check_update = false + + # Create a forwarded port mapping which allows access to a specific port + # within the machine from a port on the host machine. 
In the example below, + # accessing "localhost:8080" will access port 80 on the guest machine. + # config.vm.network "forwarded_port", guest: 80, host: 8080 + + # Create a private network, which allows host-only access to the machine + # using a specific IP. + # config.vm.network "private_network", ip: "192.168.33.10" + # config.vm.network "public_network", ip: "127.0.0.1", bridge: "enp0s25" + + + # Create a public network, which generally matched to bridged network. + # Bridged networks make the machine appear as another physical device on + # your network. + config.vm.network "public_network", :mac => "0A0100000000", :auto_config => false + + # Share an additional folder to the guest VM. The first argument is + # the path on the host to the actual folder. The second argument is + # the path on the guest to mount the folder. And the optional third + # argument is a set of non-required options. + # config.vm.synced_folder "../data", "/vagrant_data" + + # Provider-specific configuration so you can fine-tune various + # backing providers for Vagrant. These expose provider-specific options. + # Example for VirtualBox: + # + config.vm.provider "virtualbox" do |vb| + # # Display the VirtualBox GUI when booting the machine + vb.gui = false + # + # # Customize the amount of memory on the VM: + vb.memory = 4096 + vb.cpus = 2 + end + # + # View the documentation for the provider you are using for more + # information on available options. + + # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies + # such as FTP and Heroku are also available. See the documentation at + # https://docs.vagrantup.com/v2/push/atlas.html for more information. + # config.push.define "atlas" do |push| + # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" + # end + + # Enable provisioning with a shell script. Additional provisioners such as + # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the + # documentation for more information about their specific syntax and use. 
+ config.vm.provision "shell", path:"provision.sh", privileged:false; + config.vm.provision "shell", path:"bootstrap.sh" , run:"always"; + +end diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100644 index 0000000..f5e64fd --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,15 @@ + +############################## +# +# This is a bootstrap script which is +# run at every startup of the vagrant machine +# If you want to run something just once at provisioning +# and first bootup of the vagrant machine please see +# provision.sh +# +# Contributor: Bernhard Blieninger +############################## + +python3 -m venv lstm-virtenv +source lstm-virtenv/bin/activate +pip3 install -r python3-lstm/requirements.txt diff --git a/prediction.py b/prediction.py deleted file mode 100644 index 3280154..0000000 --- a/prediction.py +++ /dev/null @@ -1,153 +0,0 @@ -import pickle -import numpy as np -import pandas as pd -from tqdm import tqdm -from keras.preprocessing.sequence import pad_sequences -from keras.models import load_model -import csv - -df_taskset = pd.read_csv ( 'TaskSet.csv' ) -# df_taskset = df_taskset.sample(frac=0.0001, random_state=99) -df_task = pd.read_csv ( 'Task.csv' ) -df_job = pd.read_csv ( 'Job.csv' ) - -ntn = df_task[['PKG']].values -ntn1 = [] -for n in ntn: - ntn1.append ( n[0] ) - -PKGs = {} -PKGs['pi'] = 0 -PKGs['hey'] = 1 -PKGs['tumatmul'] = 2 -PKGs['cond_mod'] = 3 - -Exit_Values = {} -Exit_Values['EXIT'] = 1 -Exit_Values['EXIT_CRITICAL'] = 0 - -Arg_Values = {} -Arg_Values[1] = 1 -Arg_Values[4096] = 2 -Arg_Values[8192] = 3 -Arg_Values[16384] = 4 -Arg_Values[32768] = 5 -Arg_Values[65536] = 6 -Arg_Values[131072] = 7 -Arg_Values[262144] = 8 -Arg_Values[524288] = 9 -Arg_Values[1048576] = 10 -Arg_Values[2097152] = 11 -Arg_Values[847288609443] = 12 -Arg_Values[2541865828329] = 13 -Arg_Values[7625597484987] = 14 -Arg_Values[22876792454961] = 15 -Arg_Values[68630377364883] = 16 -Arg_Values[205891132094649] = 17 - -i = 0 -features = [] -labels = [] -with tqdm(total=len(list(df_taskset.iterrows()))) as pbar: - for index, row in df_taskset.iterrows(): - - try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) - res = int(df_taskset.loc[index, 'Successful']) - print(grid) - first_task = int(df_taskset.loc[index, 'TASK1_ID']) - second_task = int(df_taskset.loc[index, 'TASK2_ID']) - third_task = int(df_taskset.loc[index, 'TASK3_ID']) - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) - tasks = [] - - if first_task != -1: - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if second_task != -1: - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - 
tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append( int ( task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if fourth_task != -1: - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'])) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'])) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - labels = np.array(int(df_taskset.loc[index, 'Successful'])) - - tasks = np.array(tasks) - features.append(tasks) - labels.append(res) - except Exception as e: - print(e) - pass - pbar.update(1) - -labels = np.array(labels) -features = pad_sequences(features, maxlen=42, value=-1, padding='post', truncating='post') - -model = load_model('My_LSTM_Model.h5') -X = np.expand_dims(features, axis=2) -preds = model.predict(X) - -arr = [] -for i in range(len(labels)): - l = labels[i] - p = np.argmax(preds[i]) - print ( "the actual value is{0}and the predicted value is {1}".format(l, p)) - arr.append([i + 1, l, p]) - -csvfile = "Predicion_results.csv" - -i = 0 -with open(csvfile, "w") as output: - writer = csv.writer(output, lineterminator='\n') - if (i == 0): - writer.writerow(["TaskSet ID", "Actual Value", "Predicted Value"]) - i += 1 - writer.writerows(arr) diff --git a/provision.sh b/provision.sh new file mode 100755 index 0000000..5b4f244 --- /dev/null +++ b/provision.sh @@ -0,0 +1,17 @@ +#!/bin/bash +####################### +# +# This is a provision script +# it will be called once when the vagrant vm is first provisioned +# If you have commands that you want to run always please have a +# look at the bootstrap.sh script +# +# Contributor: Bernhard Blieninger, Robert Hamsch +###################### + +sudo apt update -qq + +sudo apt install python3.5 python3-pip tmux -qq + +sudo apt install python3-venv +#pip3 install --user virtualenv diff --git a/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py similarity index 83% rename from CuDNNLSTM.py rename to python3-lstm/CuDNNLSTM.py index 5fd9629..1a28871 100644 --- a/CuDNNLSTM.py +++ b/python3-lstm/CuDNNLSTM.py @@ -1,5 +1,10 @@ import pickle import time +import warnings +warnings.filterwarnings('ignore',category=FutureWarning) +#ignore deprecation warnings to get a better and cleaner output +from tensorflow.python.util import deprecation +deprecation._PRINT_DEPRECATION_WARNINGS = False import tensorflow as tf import numpy as np from keras.callbacks import TensorBoard @@ -10,6 +15,8 @@ from keras.optimizers import Adam from sklearn.model_selection import train_test_split + + name = "logname-{}".format ( int ( time.time () ) ) # both metrics and early stopping conditions are defined here and then saved in the log42 file @@ -18,9 +25,9 @@ es = EarlyStopping 
( monitor='val_loss', mode='min', verbose=1 ) # define early stopping criteria # Importing the the extracted features and labels -with open ( '56_features', 'rb' ) as fp: +with open ( '42_features', 'rb' ) as fp: X = pickle.load ( fp ) -with open ( '56_labels', 'rb' ) as fp: +with open ( '42_labels', 'rb' ) as fp: y = pickle.load ( fp ) # LSTM’s input shape argument expects a three-dimensional array as an input in this order: Samples, timestamps and features. This is why we need to add another dimention to the numpy array. @@ -39,15 +46,14 @@ # print ( count ) # devide data into training and test sets -X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 ) +X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 ,random_state=42) # print ( X_train.shape ) # LSTM input is fifty-six time-steps and one feature at each time-step is represented by the notation: (56,1). -input = Input ( shape=(56, 1) ) +input = Input ( shape=(42, 1) ) # the first LSTM layer has 64 cells, the number must be equal/bigger than the input size. If you are using a CPU then change CuDNNLSTM to LSTM -lstm = CuDNNLSTM ( 64, return_sequences=True ) ( - input ) # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM +lstm = CuDNNLSTM ( 64, return_sequences=True ) ( input ) # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM lstm = CuDNNLSTM ( 128, return_sequences=True ) ( lstm ) lstm = CuDNNLSTM ( 256 ) ( lstm ) diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py new file mode 100644 index 0000000..0f8bbad --- /dev/null +++ b/python3-lstm/Data_preparation.py @@ -0,0 +1,176 @@ + +import warnings +warnings.filterwarnings('ignore',category=FutureWarning) +import pickle +import sys +import numpy as np +from keras.preprocessing.sequence import pad_sequences +import sqlite3 + +debug = False + + +# PKG has a fixed set of labels. 
Integer encoding is used where integer # value is assigned to each label +PKGs = { + 'pi' : 0, + 'hey' : 1, + 'tumatmul' : 2, + 'cond_mod' : 3 + } + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT' : 1, + 'EXIT_CRITICAL' : 0, + 'EXIT_PERIOD' : 2, + 'OUT_OF_CAPS' : 3, + 'OUT_OF_QUOTA' : 4, + 'EXIT_ERROR' : 5 + } + +# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 +Arg_Values = { + 1 : 1, + 4096 : 2, + 8192 : 3, + 16384 : 4, + 32768 : 5, + 65536 : 6, + 131072 : 7, + 262144 : 8, + 524288 : 9, + 1048576 : 10, + 2097152 : 11, + 847288609443 : 12, + 2541865828329 : 13, + 7625597484987 : 14, + 22876792454961 : 15, + 68630377364883 : 16, + 205891132094649 : 17 + } + + +if debug: + print("Doing writing") + +DB_PATH = sys.argv[1] +TASKS_DICT = {} + +def taskToFeatureList(task): + #returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): #c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period']/1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME']/1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: + try: + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + try: + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError: + features += [Exit_Values['EXIT_ERROR']] + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful'\ + ' FROM TaskSet JOIN Job'\ + ' ON TaskSet.Set_ID = Job.Set_ID and'\ + ' (TaskSet.TASK1_ID == Job.Task_ID or'\ + ' TaskSet.TASK2_ID == Job.Task_ID or'\ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] == currentTset: + #then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + 
finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # process last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + + +TASKS_DICT = getTaskFeatures(DB_PATH) + +if debug: + print('Tasks have been added to TASKS_DICT') + print('length of taskdict: ', len(TASKS_DICT)) + print('example task 222:',TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + +labels = np.array(labels) # to save the labels list as numpy array + +# To make a fixed-length vector: if the vector is shorter than 42, pad the missing values with -1; if longer than 42, truncate it +features = pad_sequences(features, maxlen=42, value=-1, padding='post', truncating='post') + +if debug: + print(features.shape) # the dimensionality of features + input() + print(labels.shape) # the dimensionality of labels + input() + +# save both files for the training +with open ( '42_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary' + pickle.dump(features, outfile) + +with open ( '42_labels', 'wb' ) as outfile: + pickle.dump(labels, outfile) + diff --git a/Evaluation.py b/python3-lstm/Evaluation.py similarity index 88% rename from Evaluation.py rename to python3-lstm/Evaluation.py index ffd9811..1ffccc8 100644 --- a/Evaluation.py +++ b/python3-lstm/Evaluation.py @@ -12,9 +12,9 @@ from sklearn import metrics from sklearn.model_selection import train_test_split -with open ( '56_features', 'rb' ) as fp: +with open ( '42_features', 'rb' ) as fp: X = pickle.load ( fp ) -with open ( '56_labels', 'rb' ) as fp: +with open ( '42_labels', 'rb' ) as fp: y = pickle.load ( fp ) X = np.expand_dims ( X, axis=2 ) @@ -30,7 +30,7 @@ y = np.array ( newy ) print ( count ) -X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 ) +X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3, random_state=42) print ( X_train.shape ) model = load_model ( 'My_LSTM_Model.h5' ) # loading saved model @@ -58,8 +58,8 @@ plt.figure ( figsize=(5.5, 4) ) sns.heatmap ( cm_df, annot=True, fmt='g' ) -plt.title ( 'Confusoin Matrix \n Accuracy:{0:.3f}'.format ( accuracy_score ( yt, yp ) ) ) +plt.title ( 'Confusion Matrix \n Accuracy:{0:.3f}'.format ( accuracy_score ( yt, yp ) ) ) plt.ylabel ( 'True label' ) plt.xlabel ( 'Predicted label' ) plt.show () -plt.savefig ( 'Confusoin_Matrix.png' ) +plt.savefig ( 'Confusion_Matrix.png' ) diff --git a/Plotting.py b/python3-lstm/Plotting.py similarity index 100% rename from Plotting.py rename to python3-lstm/Plotting.py diff --git a/parallel_search.py b/python3-lstm/parallel_search.py similarity index 100% rename from parallel_search.py rename to python3-lstm/parallel_search.py diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py new file mode 100644 index 0000000..8dc2274 --- /dev/null +++ b/python3-lstm/prediction.py @@ -0,0 +1,31 @@ +import numpy as np +import pickle +from keras.models import load_model +import csv + +with open ( '42_features', 'rb' ) as outfile: # 'rb' is the file mode, it means 'read binary' + features = pickle.load(outfile, fix_imports=True) + +with open ( '42_labels', 'rb' ) as outfile: + labels = pickle.load(outfile, fix_imports=True) + +model = load_model('My_LSTM_Model.h5') +X = np.expand_dims(features, axis=2) +preds = model.predict(X) + +arr = [] +for i in range(len(labels)): + l = labels[i] + p = np.argmax(preds[i]) + print 
( "the actual value is {0} and the predicted value is {1}".format(l, p)) + arr.append([i + 1, l, p]) + +csvfile = "Prediction_results.csv" + +i = 0 +with open(csvfile, "w") as output: + writer = csv.writer(output, lineterminator='\n') + if (i == 0): + writer.writerow(["TaskSet ID", "Actual Value", "Predicted Value"]) + i += 1 + writer.writerows(arr) diff --git a/python3-lstm/requirements.txt b/python3-lstm/requirements.txt new file mode 100644 index 0000000..99bb79b --- /dev/null +++ b/python3-lstm/requirements.txt @@ -0,0 +1,8 @@ +keras==2.2.5 +matplotlib==3.1.1 +numpy==1.17.2 +pandas==0.25.1 +seaborn==0.9.0 +scikit_learn==0.21.3 +tensorboard==1.14.0 +tensorflow==1.14.0
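Usage note: the reworked python3-lstm/Data_preparation.py no longer reads the exported TaskSet.csv/Task.csv/Job.csv tables; it queries the SQLite database directly, taking the database path as its first command-line argument (python3 Data_preparation.py <path-to-database>), and pickles the results as 42_features and 42_labels. CuDNNLSTM.py, Evaluation.py and prediction.py expect those two files in their working directory. The snippet below is only an illustrative sanity-check sketch of that pickled output, assuming the files sit next to the scripts; the fixed vector length of 42 comes from the pad_sequences(maxlen=42, ...) call in Data_preparation.py.

    import pickle
    import numpy as np

    # load the padded feature matrix and labels written by Data_preparation.py
    with open('42_features', 'rb') as fp:
        X = pickle.load(fp)
    with open('42_labels', 'rb') as fp:
        y = pickle.load(fp)

    # every task-set should be a fixed-length vector of 42 values, padded with -1
    assert X.shape == (len(y), 42)

    # CuDNNLSTM.py and Evaluation.py feed the model one feature per time-step,
    # so the network input ends up shaped (samples, 42, 1)
    X = np.expand_dims(X, axis=2)
    print(X.shape, y.shape)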