In [None]:
from __future__ import absolute_import, division, print_function

import math
import os
import parser

import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier

import generar_jobs
import time

In [None]:
# Problem parameters.
# TODO receive as parameters
task_amount = 128
machine_amount = 4
task_heterogeneity = 0
machine_heterogeneity = 0
consistency_type = 0
# Accuracy of every classifier and the classifiers themselves (one per task).
accuracy_scores = []
classifiers = []
# Makespan bookkeeping.
# A vector is needed where each index holds the makespan of one validation instance
# (there will be as many entries as there are validation instances).
# In the end there are two such structures: one for the makespan values of the
# heuristic and one for the values obtained with the classifiers.
# One independent list per machine. NOTE: `[[]] * machine_amount` must not be used
# here, because it produces machine_amount references to the *same* list, so that
# appending for one machine would append for all of them.
makespan_per_machine_heuristic = [[] for _ in range(machine_amount)]
makespan_per_machine_prediction = [[] for _ in range(machine_amount)]
# Classifier configuration.
CLASSIFIER_STRING_ANN = 'ann'
CLASSIFIER_STRING_SVM = 'svm'
classifier_types = [CLASSIFIER_STRING_ANN, CLASSIFIER_STRING_SVM]
current_classifier_index = 0 # Only modify this.
current_classifier_str = classifier_types[current_classifier_index]
# Base path for classifier persistence, e.g. './models/ann/128x4-000/'.
# The trailing '/' matters: file names are appended by plain string concatenation.
model_base_path = './models/{}/{}x{}-{}{}{}/'.format(
    current_classifier_str, task_amount, machine_amount,
    task_heterogeneity, machine_heterogeneity, consistency_type)
# Base path of the pre-processed data sets, e.g. './data-processed/128x4-000/'.
baseDir = './data-processed/{}x{}-{}{}{}/'.format(
    task_amount, machine_amount,
    task_heterogeneity, machine_heterogeneity, consistency_type)
model_file_prefix = 'clf-' + current_classifier_str
model_file_extension = '.pkl'

if current_classifier_str == CLASSIFIER_STRING_ANN:
    dimension = task_amount * machine_amount
    # Reference: https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
    ns = 600  # Amount of training examples.
    ni = dimension  # Amount of input neurons (one per ETC matrix entry).
    no = 1  # Amount of output neurons.
    alpha = 2
    # True division is intended here (the notebook imports it from __future__).
    hidden_layer_amount = int(math.ceil(ns / (alpha * (ni + no))))
    # Every hidden layer gets the same neuron amount, computed from task_amount
    # and the output layer size.
    # A tuple is generated to set up the MLPClassifier.
    hidden_layer_neuron_amount = tuple([int(math.ceil((task_amount - no) / 2))]
                                       * hidden_layer_amount)
elif current_classifier_str == CLASSIFIER_STRING_SVM:
    # No config necessary for SVC method.
    pass

In [None]:
# Generate or load the classifiers (if they already exist).
# TODO maybe specify classifier configuration along with this (so as to not specify something that might already exist)
# Empty the list in place first, so that re-running this cell doesn't append a
# second batch of classifiers behind the first one.
del classifiers[:]
for i in range(0, task_amount):
    model_path = model_base_path + model_file_prefix + str(i) + model_file_extension
    try:
        # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files that come from a trusted source.
        classifier = joblib.load(model_path)
    except Exception:
        # Deliberately best-effort: any failure to load falls back to a new,
        # untrained classifier of the configured type.
        print('The classifier for output ' + str(i) + ' didn\'t exist.')
        if current_classifier_str == CLASSIFIER_STRING_ANN:
            classifier = MLPClassifier(solver='lbfgs', alpha=1e-5,
                hidden_layer_sizes=hidden_layer_neuron_amount, random_state=1)
        elif current_classifier_str == CLASSIFIER_STRING_SVM:
            classifier = svm.SVC()
        else:
            # Without this, an unknown type would silently re-append the previous
            # iteration's classifier (or fail with a NameError on the first one).
            raise ValueError('Unknown classifier type: ' + str(current_classifier_str))
    # Append classifier to classifier list (in memory). This is done after the
    # try block rather than in a `finally`, so nothing is appended when the
    # creation above failed.
    classifiers.append(classifier)

Se entrena cada clasificador y, una vez entrenado, para cada uno se hace lo siguiente:
* Se evalúa la accuracy usando el test set (conjunto de validación).
* Se recorre cada instancia del test set y se calcula el makespan que aporta el clasificador correspondiente.
    * O sea que se va a obtener un vector, donde cada entrada es el makespan para una instancia de validación distinta.
    * Como este vector eventualmente se va a obtener para cada clasificador, se va a tener una matriz, donde el primer índice accede a un clasificador y el segundo a un makespan.

In [None]:
# No threading version
start = time.time()

# Every accumulator is (re)created here, so re-running this cell doesn't pile new
# results on top of those of a previous run. Comprehensions are used instead of
# `[[]] * machine_amount`, which would yield references to one shared list.
accuracy_scores = []
# makespan_per_machine_*[machine][task] -> list with the time of every test
# instance for which `task` was assigned to `machine`.
makespan_per_machine_prediction = [[] for _ in range(machine_amount)]
makespan_per_machine_heuristic = [[] for _ in range(machine_amount)]
# makespan_for_instances_*[task][instance] -> time that `task` contributes for
# that test instance on the machine chosen by the classifier / the heuristic.
makespan_for_instances_prediction = []
makespan_for_instances_heuristic = []

# Classifier directory is generated if it doesn't exist (once; it is the same
# directory for every classifier).
generar_jobs.generate_dir(model_base_path)

for i in range(0, task_amount): # For each task/classifier
    print("Training classifier " + str(i) + "...")
    # Data is loaded. The last column is the target, the rest is the ETC matrix.
    TRAINING_FILE = baseDir + 'training/' + str(i) + '.csv' # Training file for current classifier
    TEST_FILE = baseDir + 'test/' + str(i) + '.csv' # Test file for current classifier
    df_training = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
    df_test = pd.read_csv(TEST_FILE, header=None, delimiter=',')
    y_training = df_training.iloc[:, -1]
    # Validation/testing data: features and the classification of the heuristic.
    x_test = df_test.iloc[:, :-1]
    y_test = df_test.iloc[:, -1]
    # Classifier is trained using the data.
    classifiers[i].fit(df_training.iloc[:, :-1], y_training)
    # Classifier is persisted.
    joblib.dump(classifiers[i], model_base_path + model_file_prefix + str(i) \
                + model_file_extension)
    print("    Doing makespan stuff...")
    # Every test example is classified in a single batch call; the classifications
    # are kept as floats, as they are later compared against y_test.
    results = [float(label) for label in classifiers[i].predict(x_test.values)]
    # Times of the current task on every machine, one row per test instance.
    # Assumes the ETC matrix is flattened task by task, so that the times of task i
    # start at column i * machine_amount -- TODO confirm against the data generator.
    current_task_index = i * machine_amount # Column index within etc matrix
    task_times = x_test.iloc[:, current_task_index:current_task_index + machine_amount].values
    # Times for only this task, grouped by the machine it was assigned to.
    current_task_makespan_prediction = [[] for _ in range(machine_amount)]
    current_task_makespan_heuristic = [[] for _ in range(machine_amount)]
    # Times for only this task, one entry per test instance.
    current_task_instances_prediction = []
    current_task_instances_heuristic = []
    for j in range(0, len(df_test)): # For every validation instance
        # Class labels are used directly as machine indices (assumed to be
        # 0 .. machine_amount - 1 -- TODO confirm).
        predicted_machine = int(results[j])
        heuristic_machine = int(y_test.iloc[j])
        # Time the current task takes on the machine picked by the classifier
        # and on the one picked by the heuristic.
        time_prediction = task_times[j, predicted_machine]
        time_heuristic = task_times[j, heuristic_machine]
        current_task_makespan_prediction[predicted_machine].append(time_prediction)
        current_task_makespan_heuristic[heuristic_machine].append(time_heuristic)
        current_task_instances_prediction.append(time_prediction)
        current_task_instances_heuristic.append(time_heuristic)
    # Store the per-machine lists of the current task (one new entry per machine).
    for j in range(0, machine_amount):
        makespan_per_machine_prediction[j].append(current_task_makespan_prediction[j])
        makespan_per_machine_heuristic[j].append(current_task_makespan_heuristic[j])
    makespan_for_instances_prediction.append(current_task_instances_prediction)
    makespan_for_instances_heuristic.append(current_task_instances_heuristic)
    # Actual results are compared to expected values.
    accuracy = accuracy_score(y_test, results)
    print("    Classifier accuracy: " + str(accuracy))
    # Calculated accuracy is added to accuracies list.
    accuracy_scores.append(accuracy)
end = time.time()
print('The execution took ' + str(end - start) + ' seconds')

In [None]:
# Compares, instance by instance, the total time of the heuristic against the total
# time of the predictions. Requires makespan_for_instances_heuristic and
# makespan_for_instances_prediction to already exist in the kernel, indexed as
# [task][instance] (first index is the task, second one a particular instance).
print(len(makespan_for_instances_heuristic))
validation_instance_amount = len(makespan_for_instances_prediction[0])
# One entry per validation instance: heuristic total minus prediction total.
makespan_differences = []
for instance_idx in range(validation_instance_amount):
    instance_label = str(instance_idx)
    print("Instance " + instance_label + ":")
    # Add up the contribution of every task for this instance (heuristic).
    heuristic_total = sum(
        (makespan_for_instances_heuristic[t][instance_idx] for t in range(task_amount)),
        0.0)
    print("    Total makespan for heuristic and instance " + instance_label + ": " + str(heuristic_total))
    # Same, for the assignments made by the classifiers.
    prediction_total = sum(
        (makespan_for_instances_prediction[t][instance_idx] for t in range(task_amount)),
        0.0)
    print("    Total makespan for prediction and instance " + instance_label + ": " + str(prediction_total))
    makespan_differences.append(heuristic_total - prediction_total)
avg = np.mean(makespan_differences)
print("The average difference between makespan values between heuristic and prediction is " + str(avg))
# A positive average means the heuristic's totals are larger on average.
if avg > 0:
    verdict = "Heuristic takes longer."
elif avg < 0:
    verdict = "Prediction takes longer."
else:
    verdict = "Same performance."
print(verdict)

In [None]:
# Quick inspection: size of entry [1][0] of the heuristic per-machine structure.
inspected_entry = makespan_per_machine_heuristic[1][0]
print(len(inspected_entry))

In [None]:
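# # Multithreading version (disabled; kept for reference only)
# # NOTE(review): before re-enabling, be aware that training_set/test_set below are
# # read from TEST_FILE/TRAINING_FILE respectively (swapped with respect to the
# # non-threaded cell), and that os.write() is given str objects, which Python 3
# # rejects (it expects bytes).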
# import os

# def train_and_persist_classifier(classifier_index):
#     # Data is loaded.
#     TRAINING_FILE = baseDir + 'training/' + str(classifier_index) + '.csv'
#     TEST_FILE = baseDir + 'test/' + str(classifier_index) + '.csv'
#     training_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
#     test_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
#     # Create dataframe for data and separate target.
#     df_training = pd.DataFrame(training_set)
#     y_training = df_training.iloc[:, -1]
#     # Validation/testing data is loaded.
#     df_test = pd.DataFrame(test_set)
#     y_test = df_test.iloc[:, -1]
#     # Classifier is trained using the data.
#     classifiers[classifier_index].fit(df_training.iloc[:, :-1], y_training)
#     # Classifier directory is generated if it doesn't exist.
#     generar_jobs.generate_dir(model_base_path)
#     # Classifier is persisted.
#     joblib.dump(classifiers[classifier_index], model_base_path + model_file_prefix + str(classifier_index) \
#                 + model_file_extension)
#     # Classifier accuracy is determined using test data.
#     results = []
#     for i in range(0, len(df_test)):
#         # Every test example is classified, and its classification is appended
#         # to a results array.
#         results.append(classifiers[classifier_index].predict(
#             df_test.iloc[i][:-1].values.reshape(1, -1)))
#     # Actual results are compared to expected values.
#     accuracy = accuracy_score(y_test, results)
#     os.write(1,'Classifier ' + str(classifier_index) + ':\n') # Print directly to console
#     os.write(1, 'Accuracy: ' + str(accuracy) + ', ') # Print directly to console
#     # Calculated accuracy is added to accuracies list.
#     accuracy_scores.append(accuracy)
# #     os.write(1, 'Training of classifier ' + str(classifier_index) + ' finished.\n') 
#     return

# from joblib import Parallel, delayed
# import multiprocessing

# if __name__ == '__main__':
#     ##### VERSION 1 #####
# #     jobs = []
# #     for i in range(0, task_amount):
# #         print('Starting training of classifier ' + str(i))
# #         p = multiprocessing.Process(target=train_and_persist_classifier(i))
# #         jobs.append(p)
# #         p.start()
#     ##### END VERSION 1 #####
#     ##### VERSION 2 #####
#     start = time.time()
#     num_cores = multiprocessing.cpu_count() * 4
#     # For every task, train a classifier.
#     Parallel(n_jobs=num_cores)(delayed(train_and_persist_classifier)(i) for i in range(0,task_amount))
#     end = time.time()
#     print('The execution took ' + str(end - start) + ' seconds')    
    

In [None]:
# Mean accuracy over every classifier (unrelated to the threading experiments).
score_amount = len(accuracy_scores)
# Float start value keeps the accumulation in floating point, as before.
promedio = sum(accuracy_scores, 0.) / score_amount
print('The average accuracy is {}'.format(promedio))