In [1]:
from __future__ import absolute_import, division, print_function

import math
import os
import parser

import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier

import generar_jobs
import time

from sklearn.preprocessing import StandardScaler

In [2]:
# Problem parameters.
# TODO receive as parameters
task_amount = 128
machine_amount = 4
task_heterogeneity = 0
machine_heterogeneity = 0
consistency_type = 0
accuracy_scores = []
classifiers = []
# Classifier configuration.
CLASSIFIER_STRING_ANN = 'ann'
CLASSIFIER_STRING_SVM = 'svm'
classifier_types = [CLASSIFIER_STRING_ANN, CLASSIFIER_STRING_SVM]
current_classifier_index = 0 # Only modify this.
current_classifier_str = classifier_types[current_classifier_index]
# Base path for classifier persistence.
model_base_path = './models/' + current_classifier_str + '/' + str(task_amount) + 'x' + str(machine_amount) \
    + '-' + str(task_heterogeneity) + str(machine_heterogeneity) \
    + str(consistency_type) + '/'
baseDir = './data-processed/' + str(task_amount) + 'x' \
    + str(machine_amount) + '-' + str(task_heterogeneity) \
    + str(machine_heterogeneity) + str(consistency_type) + '/'
model_file_prefix = 'clf-' + current_classifier_str
model_file_extension = '.pkl'

if current_classifier_str == CLASSIFIER_STRING_ANN:
    dimension = task_amount * machine_amount
    # Reference: https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
    ns = 600  # Amount of training examples.
    ni = dimension
    no = 1  # Amount of output neurons.
    alpha = 2
    hidden_layer_amount = 2 #int(math.ceil(ns / (alpha * (ni + no)))) # Con 2 hardcodeado parece aprender mejor
    # Each hidden layer has an intermediate amount of neurons (between the neuron amount
    # present in the output layer and the input layer).
    # A tuple is generated to set up the MLPClassifier.
    hidden_layer_neuron_amount = tuple([int(math.ceil((task_amount - no) / 2))]
                                       * hidden_layer_amount)  
elif current_classifier_str == CLASSIFIER_STRING_SVM:
    # No config necessary for SVC method.
    pass

In [3]:
# Generate or load the classifiers (if they already exist).
# TODO maybe specify classifier configuration along with this (so as to not specify something that might already exist)
for i in range(0, task_amount):
    try:
        classifier = joblib.load(model_base_path + model_file_prefix + str(i) \
                                 + model_file_extension)
    except Exception:
        print('The classifier for output ' + str(i) + ' didn\'t exist.')
        if current_classifier_str == CLASSIFIER_STRING_ANN:
            classifier = MLPClassifier(solver='lbfgs', alpha=1e-2, 
                hidden_layer_sizes=hidden_layer_neuron_amount, random_state=1)
        elif current_classifier_str == CLASSIFIER_STRING_SVM:
            classifier = svm.SVC()
    finally:
        # Append classifier to classifier list (in memory).
        classifiers.append(classifier)  

The classifier for output 14 didn't exist.
The classifier for output 15 didn't exist.
The classifier for output 16 didn't exist.
The classifier for output 17 didn't exist.
The classifier for output 18 didn't exist.
The classifier for output 19 didn't exist.
The classifier for output 20 didn't exist.
The classifier for output 21 didn't exist.
The classifier for output 22 didn't exist.
The classifier for output 23 didn't exist.
The classifier for output 24 didn't exist.
The classifier for output 25 didn't exist.
The classifier for output 26 didn't exist.
The classifier for output 27 didn't exist.
The classifier for output 28 didn't exist.
The classifier for output 29 didn't exist.
The classifier for output 30 didn't exist.
The classifier for output 31 didn't exist.
The classifier for output 32 didn't exist.
The classifier for output 33 didn't exist.
The classifier for output 34 didn't exist.
The classifier for output 35 didn't exist.
The classifier for output 36 didn't exist.
The classif

Se entrena cada clasificador, y para cada uno se hace lo siguiente (post-entrenamiento)
* Se evalua la accuracy usando el training set
* Se recorre cada instancia del training set y se calcula el makespan que aporta el clasificador correspondiente
    * O sea que se va a obtener un vector, donde cada entrada es el makespan para una instancia de entrenamiento distinta
    * Como este vector eventualmente se va a obtener para cada clasificador, se va a tener una matriz, donde el primer indice accede a un clasificador, y el segundo a un makespan

In [4]:
# No threading version
start = time.time()
# Each index corresponds to an instance
makespan_instance_machines_heuristic = []
makespan_instance_machines_prediction = []
SCALE_DATA = True
# Within each index, there'll be an array of machine_amount elements, in which each element
# is the time during which each machine is running
# Something along the lines of [[10,20,9,40], [99,88,22,11], ..., [10,9,21,35]]
for i in range(0, task_amount): # For each task/classifier
    print("Training classifier " + str(i) + "...")
    # Data is loaded.
    TRAINING_FILE = baseDir + 'training/' + str(i) + '.csv' # Training file for current classifier
    TEST_FILE = baseDir + 'test/' + str(i) + '.csv' # Test file for current classifier
    training_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
    test_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
    
    
    # Create dataframe for data and separate target.
    df_training = pd.DataFrame(training_set)
    df_training_input = df_training.iloc[:, :-1] 
    df_training_output = df_training.iloc[:, -1] #y_training
    # print("##### BEFORE #####")
    # print(df_training_input)
    if SCALE_DATA:
        # Scale data because http://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use
        scaler = StandardScaler()  
        # Don't cheat - fit only on training data
        scaler.fit(df_training_input)
        # Reconvert input training data to dataframe after scaling (which converts it to an array of arrays).
        df_training_input = pd.DataFrame(scaler.transform(df_training_input))
        # print("##### AFTER #####")
        # print(df_training_input)

    # print("#############")
    # print(df_training)
    # y_training = df_training.iloc[:, -1]
    # Validation/testing data is loaded.
    df_test = pd.DataFrame(test_set)
    df_test_input = df_test.iloc[:, :-1]
    df_test_output = df_test.iloc[:, -1] #y_test
    
    if SCALE_DATA:
        scaler = StandardScaler()  
        scaler.fit(df_test_input)
        # Scale test data.
        df_test_input = pd.DataFrame(scaler.transform(df_test_input))
    
    
    
    # Classifier is trained using the data.
    classifiers[i].fit(df_training_input, df_training_output)
    # Classifier directory is generated if it doesn't exist.
    generar_jobs.generate_dir(model_base_path)
    # Classifier is persisted.
    joblib.dump(classifiers[i], model_base_path + model_file_prefix + str(i) \
                + model_file_extension)
    # Classifier accuracy is determined using test data.
    results = []
    # Go through every test instance manually to calculate makespan for each
    # problem-classifier/task pair
    current_task_index = i * machine_amount # Column index within etc matrix
    print("    Doing makespan stuff...")
    test_instance_amount = len(df_test)
    for j in range(0, test_instance_amount): # For every validation instance
#         # df_test.iloc[j] is an ETC matrix + the corresponding classification for one task
#         problem_instance = df_test.iloc[j]
#         etc_matrix = problem_instance[:-1]
#         classification_heuristic = float(problem_instance[-1:]) # float format
        etc_matrix = df_test_input.iloc[[j]] # etc_matrix for j problem instance
#         print("########## DF_TEST_INPUT ##########")
#         print(df_test_input)
#         print("Current etc_matrix is: " + str(etc_matrix))
        classification_heuristic = float(df_test_output[j])
#         print("Classification is: " + str(classification_heuristic))
        # Every test example is classified, and its classification is appended
        # to a results array.
        # Make prediction for current problem instance or etc matrix
        prediction_pandas = float(classifiers[i].predict(etc_matrix.values.reshape(1, -1)))
        results.append(prediction_pandas)
        prediction = float(prediction_pandas) # To work in floats
        #os.write(1, 'Prediction pandas: ' + str(prediction_pandas) + ', Prediction: ' + str(prediction) + '\n');
#         print('Prediction pandas: ' + str(prediction_pandas) + ', Prediction: ' + str(prediction));

        # TODO ver que pasa con las dimensiones de etc_matrix, porque no agarra bien la subrow
        sub_row_for_current_task = etc_matrix[current_task_index:current_task_index + machine_amount]
        os.write(1, "########### ETC_MATRIX:\n" + str(etc_matrix))
        os.write(1, "########### SUBROW:\n" + str(sub_row_for_current_task))
        # Makespan value for prediction (tiempo que me incurre hacer la prediccion)
        current_makespan_prediction = sub_row_for_current_task[current_task_index + prediction]
        # Makespan value for heuristic (tiempo en el que incurre la heuristica al hacer esta asignacion)
        current_makespan_heuristic = sub_row_for_current_task[current_task_index + classification_heuristic]
        if len(makespan_instance_machines_prediction) <= j: # Si todavia no tengo entrada para esta instancia
            makespan_instance_machines_prediction.append([0.0] * machine_amount) # Inicializo entrada para esta instancia, con el makespan
            makespan_instance_machines_heuristic.append([0.0] * machine_amount)
            # de cada maquina
#         print("Makespan pred: \n" + str(current_makespan_prediction) + "\nHeuristic: " + str(current_makespan_heuristic))
        makespan_instance_machines_prediction[j][int(prediction)] += current_makespan_prediction
        makespan_instance_machines_heuristic[j][int(classification_heuristic)] += current_makespan_heuristic
    # Actual results are compared to expected values.
    accuracy = accuracy_score(df_test_output, results)
    print("    Classifier accuracy: " + str(accuracy))
    # Calculated accuracy is added to accuracies list.
    accuracy_scores.append(accuracy)
end = time.time()
print('The execution took ' + str(end - start) + ' seconds')

Training classifier 0...
    Doing makespan stuff...
    Classifier accuracy: 0.34
Training classifier 1...
    Doing makespan stuff...
    Classifier accuracy: 0.35
Training classifier 2...


KeyboardInterrupt: 

In [14]:
# # Array that holds makespan values for the prediction.
# makespan_prediction = []
# for i in range(0, len(makespan_instance_machines_prediction)):
#     makespan_prediction.append(np.max(makespan_instance_machines_prediction[i]))
# # Array that holds makespan values for the heuristic
# makespan_heuristic = []
# for i in range(0, len(makespan_instance_machines_heuristic)):
#     makespan_heuristic.append(np.max(makespan_instance_machines_heuristic[i]))
# # Array that holds the difference between heuristic and prediction makespan.
# makespan_diff = []
# for i in range(0, len(makespan_prediction)):
#     makespan_diff.append(makespan_prediction[i] - makespan_heuristic[i])
# # Calculate average difference between methods.
# avg_difference_between_methods = np.mean(makespan_diff)
# print('Average difference between techniques: ' + str(avg_difference_between_methods))
# if avg_difference_between_methods > 0:
#     print('The heuristic works better on average')
# elif avg_difference_between_methods < 0:
#     print('Savant works better on average')
# else:
#     print('Both techniques work equivalently on average')


[[0   NaN
  Name: 1, dtype: float64,
  Series([], Name: 7, dtype: float64),
  Series([], Name: 14, dtype: float64),
  Series([], Name: 10, dtype: float64)],
 [1   NaN
  Name: 0, dtype: float64,
  Series([], Name: 22, dtype: float64),
  Series([], Name: 14, dtype: float64),
  Series([], Name: 17, dtype: float64)],
 [2   NaN
  Name: 0, dtype: float64,
  Series([], Name: 35, dtype: float64),
  Series([], Name: 16, dtype: float64),
  Series([], Name: 66, dtype: float64)],
 [3   NaN
  Name: 0, dtype: float64,
  Series([], Name: 12, dtype: float64),
  Series([], Name: 30, dtype: float64),
  Series([], Name: 100, dtype: float64)],
 [4   NaN
  Name: 0, dtype: float64,
  Series([], Name: 16, dtype: float64),
  Series([], Name: 24, dtype: float64),
  Series([], Name: 14, dtype: float64)],
 [Series([], Name: 22, dtype: float64),
  Series([], Name: 4, dtype: float64),
  5   NaN
  Name: 1, dtype: float64,
  Series([], Name: 70, dtype: float64)],
 [Series([], Name: 5, dtype: float64),
  Series([], N

In [None]:
# TODO update this to include makespan calculation

# # Multithreading version
# import os

# def train_and_persist_classifier(classifier_index):
#     # Data is loaded.
#     TRAINING_FILE = baseDir + 'training/' + str(classifier_index) + '.csv'
#     TEST_FILE = baseDir + 'test/' + str(classifier_index) + '.csv'
#     training_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
#     test_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
#     # Create dataframe for data and separate target.
#     df_training = pd.DataFrame(training_set)
#     y_training = df_training.iloc[:, -1]
#     # Validation/testing data is loaded.
#     df_test = pd.DataFrame(test_set)
#     y_test = df_test.iloc[:, -1]
#     # Classifier is trained using the data.
#     classifiers[classifier_index].fit(df_training.iloc[:, :-1], y_training)
#     # Classifier directory is generated if it doesn't exist.
#     generar_jobs.generate_dir(model_base_path)
#     # Classifier is persisted.
#     joblib.dump(classifiers[classifier_index], model_base_path + model_file_prefix + str(classifier_index) \
#                 + model_file_extension)
#     # Classifier accuracy is determined using test data.
#     results = []
#     for i in range(0, len(df_test)):
#         # Every test example is classified, and its classification is appended
#         # to a results array.
#         results.append(classifiers[classifier_index].predict(
#             df_test.iloc[i][:-1].values.reshape(1, -1)))
#     # Actual results are compared to expected values.
#     accuracy = accuracy_score(y_test, results)
#     os.write(1,'Classifier ' + str(classifier_index) + ':\n') # Print directly to console
#     os.write(1, 'Accuracy: ' + str(accuracy) + ', ') # Print directly to console
#     # Calculated accuracy is added to accuracies list.
#     accuracy_scores.append(accuracy)
# #     os.write(1, 'Training of classifier ' + str(classifier_index) + ' finished.\n') 
#     return

# from joblib import Parallel, delayed
# import multiprocessing

# if __name__ == '__main__':
#     ##### VERSION 1 #####
# #     jobs = []
# #     for i in range(0, task_amount):
# #         print('Starting training of classifier ' + str(i))
# #         p = multiprocessing.Process(target=train_and_persist_classifier(i))
# #         jobs.append(p)
# #         p.start()
#     ##### END VERSION 1 #####
#     ##### VERSION 2 #####
#     start = time.time()
#     num_cores = multiprocessing.cpu_count() * 4
#     # For every task, train a classifier.
#     Parallel(n_jobs=num_cores)(delayed(train_and_persist_classifier)(i) for i in range(0,task_amount))
#     end = time.time()
#     print('The execution took ' + str(end - start) + ' seconds')    
    

In [None]:
# Average accuracy (for all classifiers) is calculated (nothing to do with threading).
promedio = 0.
score_amount = len(accuracy_scores)
for i in range(0, score_amount):
    promedio += accuracy_scores[i]
promedio /= score_amount
print ('The average accuracy is {}'.format(promedio))

In [33]:
i = 0
print("Training classifier " + str(i) + "...")
# Data is loaded.
TRAINING_FILE = baseDir + 'training/' + str(i) + '.csv' # Training file for current classifier
TEST_FILE = baseDir + 'test/' + str(i) + '.csv' # Test file for current classifier
training_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
test_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
# print("############# TRAINING SET #############")
# print(training_set)

# print("############# SCALED TRAINING SET #############")
# print(pd.DataFrame(training_set))
# apply same transformation to test data
# test_set = scaler.transform(test_set)      



# Create dataframe for data and separate target.
df_training = pd.DataFrame(training_set)
df_training_input = df_training.iloc[:, :-1]
print("##### BEFORE #####")
print(df_training_input.iloc[:,:])
df_training_output = df_training.iloc[:, -1]
# Scale data because http://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use
scaler = StandardScaler()  
# Don't cheat - fit only on training data
scaler.fit(df_training_input)
# Reconvert input training data to dataframe after scaling (which converts it to an array of arrays).
df_training_input = pd.DataFrame(scaler.transform(df_training_input))
print("##### AFTER #####")
print(df_training_input)

# print("#############")
# print(df_training)
# y_training = df_training.iloc[:, -1]
# Validation/testing data is loaded.
df_test = pd.DataFrame(test_set)
y_test = df_test.iloc[:, -1]

Training classifier 0...
##### BEFORE #####
        0       1       2       3       4       5       6       7       8    \
0    180.80  242.71  495.44  526.07  125.71  271.89  274.12  373.41    2.33   
1     12.73   25.58  439.26  676.48   91.42  110.97  168.78  321.79   67.61   
2    306.20  355.49  377.72  462.72   42.92  106.53  545.97  757.45   19.48   
3    155.29  238.72  274.79  466.53    9.52   36.11   53.07   65.96   75.63   
4      9.20   82.30  152.59  197.13   37.05  105.36  180.67  230.68  115.33   
5    122.52  164.22  201.54  218.26   62.69  342.09  377.68  497.12    0.68   
6     34.61   41.60   50.97   67.81  267.76  400.21  438.12  445.82   18.99   
7    135.06  432.79  503.92  799.80  107.89  215.49  405.02  929.46   61.02   
8    155.96  245.41  426.66  706.89   46.77   92.23   92.98  128.60  182.87   
9    352.45  388.65  537.12  654.71   38.96   68.38   87.11  173.51  160.33   
10   129.75  146.74  281.30  383.85   36.69   54.80  350.68  500.77    2.76   
11   180