In [1]:
from __future__ import absolute_import, division, print_function

import math
import os
import parser

import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

import generar_jobs
import time

from sklearn.preprocessing import StandardScaler

In this section, the problem parameters are established (should make this dynamic)

In [2]:
# TODO receive as parameters
task_amount = 128
machine_amount = 4
task_heterogeneity = 0
machine_heterogeneity = 0
consistency_type = 0
accuracy_scores = []
classifiers = []
# Classifier configuration.
CLASSIFIER_STRING_ANN = 'ann'
CLASSIFIER_STRING_SVM = 'svm'
classifier_types = [CLASSIFIER_STRING_ANN, CLASSIFIER_STRING_SVM]
current_classifier_index = 1 # Only modify this.
current_classifier_str = classifier_types[current_classifier_index]
# Base path for classifier persistence.
model_base_path = './models/' + current_classifier_str + '/' + str(task_amount) + 'x' + str(machine_amount) \
    + '-' + str(task_heterogeneity) + str(machine_heterogeneity) \
    + str(consistency_type) + '/'
baseDir = './data-processed/' + str(task_amount) + 'x' \
    + str(machine_amount) + '-' + str(task_heterogeneity) \
    + str(machine_heterogeneity) + str(consistency_type) + '/'
model_file_prefix = 'clf-' + current_classifier_str
model_file_extension = '.pkl'

if current_classifier_str == CLASSIFIER_STRING_ANN:
    dimension = task_amount * machine_amount
    # Reference: https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
    ns = 600  # Amount of training examples.
    ni = dimension
    no = 1  # Amount of output neurons.
    alpha = 2
    hidden_layer_amount = 2 #int(math.ceil(ns / (alpha * (ni + no)))) # Con 2 hardcodeado parece aprender mejor
    # Each hidden layer has an intermediate amount of neurons (between the neuron amount
    # present in the output layer and the input layer).
    # A tuple is generated to set up the MLPClassifier.
    hidden_layer_neuron_amount = tuple([int(math.ceil((task_amount - no) / 2))]
                                       * hidden_layer_amount)  
elif current_classifier_str == CLASSIFIER_STRING_SVM:
    # No mandatory config for SVC method.
    pass

In the following section, classifiers are loaded (or generated if they don't exist)

In [3]:
# TODO maybe specify classifier configuration along with this (so as to not specify something that might already exist)
for i in range(0, task_amount):
    try:
        classifier = joblib.load(model_base_path + model_file_prefix + str(i) \
                                 + model_file_extension)
    except Exception:
        print('The classifier for output ' + str(i) + ' didn\'t exist.')
        if current_classifier_str == CLASSIFIER_STRING_ANN:
            classifier = MLPClassifier(solver='lbfgs', alpha=1e-2, 
                hidden_layer_sizes=hidden_layer_neuron_amount, random_state=1)
        elif current_classifier_str == CLASSIFIER_STRING_SVM:
            classifier = svm.SVC()
    finally:
        # Append classifier to classifier list (in memory).
        classifiers.append(classifier)  

The classifier for output 0 didn't exist.
The classifier for output 1 didn't exist.
The classifier for output 2 didn't exist.
The classifier for output 3 didn't exist.
The classifier for output 4 didn't exist.
The classifier for output 5 didn't exist.
The classifier for output 6 didn't exist.
The classifier for output 7 didn't exist.
The classifier for output 8 didn't exist.
The classifier for output 9 didn't exist.
The classifier for output 10 didn't exist.
The classifier for output 11 didn't exist.
The classifier for output 12 didn't exist.
The classifier for output 13 didn't exist.
The classifier for output 14 didn't exist.
The classifier for output 15 didn't exist.
The classifier for output 16 didn't exist.
The classifier for output 17 didn't exist.
The classifier for output 18 didn't exist.
The classifier for output 19 didn't exist.
The classifier for output 20 didn't exist.
The classifier for output 21 didn't exist.
The classifier for output 22 didn't exist.
The classifier for ou

In the following section, each classifier is trained, and the following is done for each one (after training it):
* Its accuracy is determined using the training set
* Each training set instance is iterated over, and the time each machine uses in execution is stored (for calculating the makespan afterwards, in another section) in an array
    * Each entry of the array will be an array of machine_amount elements, in which each element corresponds to the time each machine uses up

In [None]:
# No threading version.
start = time.time()
# Each index corresponds to an instance.
makespan_instance_machines_heuristic = []
makespan_instance_machines_prediction = []
SCALE_DATA = True
# Within each index, there'll be an array of machine_amount elements, in which each element
# is the time during which each machine is running
# Something along the lines of [[10,20,9,40], [99,88,22,11], ..., [10,9,21,35]]
for i in task_amount: # For each task/classifier
    print("Training classifier " + str(i) + "...")
    # Data is loaded.
    TRAINING_FILE = baseDir + 'training/' + str(i) + '.csv' # Training file for current classifier
    TEST_FILE = baseDir + 'test/' + str(i) + '.csv' # Test file for current classifier
    training_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
    test_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
    
    # Create dataframe for data and separate target.
    df_training = pd.DataFrame(training_set)
    df_training_input = df_training.iloc[:, :-1] 
    df_training_output = df_training.iloc[:, -1]
    
    # Validation/testing data is loaded.
    df_test = pd.DataFrame(test_set)
    df_test_input = df_test.iloc[:, :-1]
    df_test_output = df_test.iloc[:, -1]
    
    if SCALE_DATA:
        # Scale data because http://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use
        scaler = StandardScaler()  
        # Fit only on training data.
        scaler.fit(df_training_input)
        # Reconvert input training data to dataframe after scaling (which converts it to an array of arrays).
        df_training_input = pd.DataFrame(scaler.transform(df_training_input))
        # Re-init scaler just in case.
        scaler = StandardScaler()  
        scaler.fit(df_test_input)
        # Scale test data.
        df_test_input = pd.DataFrame(scaler.transform(df_test_input))        

    
    print("######### df_training_input #########")
    print(df_training_input)
    
    # Classifier is trained using the data.
    classifiers[i].fit(df_training_input, df_training_output)
    # Classifier directory is generated if it doesn't exist.
    generar_jobs.generate_dir(model_base_path)
    # Classifier is persisted.
    joblib.dump(classifiers[i], model_base_path + model_file_prefix + str(i) \
                + model_file_extension)
    # Classifier accuracy is determined using test data.
    results = []
    # Go through every test instance manually to calculate makespan for each
    # problem-classifier/task pair
    current_task_index = i * machine_amount # Column index within etc matrix
    print("    Doing makespan stuff...")
    test_instance_amount = len(df_test)
    for j in range(0, test_instance_amount): # For every validation instance
        # df_test.iloc[j] is an ETC matrix + the corresponding classification for one task
        etc_matrix_scaled = df_test_input.iloc[j] # Scaled data for classification (since classifiers were
        # trained using scaled data)
        # Non-scaled data is used to calculate real makespan, using the original units of the problem.
        etc_matrix = df_test.iloc[j][:-1] # Get j problem instance, ignoring last column (the output/classification).
        classification_heuristic = float(df_test_output[j])
        # Every test example is classified, and its classification is appended
        # to a results array.
        # Make prediction for current problem instance or etc matrix (using scaled data).
        prediction_pandas = float(classifiers[i].predict(etc_matrix_scaled.values.reshape(1, -1)))
        results.append(prediction_pandas)
        prediction = float(prediction_pandas) # To work in floats.

        # Get subrow from original input data, to get the task/machine times right.
        sub_row_for_current_task = etc_matrix[current_task_index:current_task_index + machine_amount]
        # Makespan value for prediction
        current_makespan_prediction = sub_row_for_current_task[current_task_index + prediction]
        # Makespan value for heuristic
        current_makespan_heuristic = sub_row_for_current_task[current_task_index + classification_heuristic]
        if len(makespan_instance_machines_prediction) <= j: # If there's no entry for this problem instance.
            # Init entry for problem instance, with each machine's makespan starting at 0.0.
            makespan_instance_machines_prediction.append([0.0] * machine_amount)
            makespan_instance_machines_heuristic.append([0.0] * machine_amount)
        makespan_instance_machines_prediction[j][int(prediction)] += current_makespan_prediction
        makespan_instance_machines_heuristic[j][int(classification_heuristic)] += current_makespan_heuristic
    print("    Done with makespan stuff...")
    # Actual classification results are compared to expected values.
    accuracy = accuracy_score(df_test_output, results)
    print("    Classifier accuracy: " + str(accuracy))
    # Calculated accuracy is added to accuracies list.
    accuracy_scores.append(accuracy)
end = time.time()
print('The execution took ' + str(end - start) + ' seconds')

The following section takes the makespan data (which determines how much time each machine takes for each problem instance) and determines an average makespan for all of the problem instances (how much time the slowest machine takes in completing the tasks).

In [None]:
# Array that holds makespan values for the prediction.
makespan_prediction = []
for i in range(0, len(makespan_instance_machines_prediction)):
    makespan_prediction.append(np.max(makespan_instance_machines_prediction[i]))
# Array that holds makespan values for the heuristic
makespan_heuristic = []
for i in range(0, len(makespan_instance_machines_heuristic)):
    makespan_heuristic.append(np.max(makespan_instance_machines_heuristic[i]))
# Array that holds the difference between heuristic and prediction makespan.
makespan_diff = []
for i in range(0, len(makespan_prediction)):
    makespan_diff.append(makespan_prediction[i] - makespan_heuristic[i])
# Calculate average difference between methods.
avg_difference_between_methods = np.mean(makespan_diff)
print('Average difference between techniques: ' + str(avg_difference_between_methods))
if avg_difference_between_methods > 0:
    print('The heuristic works better on average')
elif avg_difference_between_methods < 0:
    print('Savant works better on average')
else:
    print('Both techniques work equivalently on average')


The following section has multithreading code, needs to be reviewed and updated to match the non-multithreading version of the code (besides it isn't certain that this actually works on a cluster).

In [None]:
# TODO update this to include makespan calculation

# # Multithreading version
# import os

# def train_and_persist_classifier(classifier_index):
#     # Data is loaded.
#     TRAINING_FILE = baseDir + 'training/' + str(classifier_index) + '.csv'
#     TEST_FILE = baseDir + 'test/' + str(classifier_index) + '.csv'
#     training_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
#     test_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
#     # Create dataframe for data and separate target.
#     df_training = pd.DataFrame(training_set)
#     y_training = df_training.iloc[:, -1]
#     # Validation/testing data is loaded.
#     df_test = pd.DataFrame(test_set)
#     y_test = df_test.iloc[:, -1]
#     # Classifier is trained using the data.
#     classifiers[classifier_index].fit(df_training.iloc[:, :-1], y_training)
#     # Classifier directory is generated if it doesn't exist.
#     generar_jobs.generate_dir(model_base_path)
#     # Classifier is persisted.
#     joblib.dump(classifiers[classifier_index], model_base_path + model_file_prefix + str(classifier_index) \
#                 + model_file_extension)
#     # Classifier accuracy is determined using test data.
#     results = []
#     for i in range(0, len(df_test)):
#         # Every test example is classified, and its classification is appended
#         # to a results array.
#         results.append(classifiers[classifier_index].predict(
#             df_test.iloc[i][:-1].values.reshape(1, -1)))
#     # Actual results are compared to expected values.
#     accuracy = accuracy_score(y_test, results)
#     os.write(1,'Classifier ' + str(classifier_index) + ':\n') # Print directly to console
#     os.write(1, 'Accuracy: ' + str(accuracy) + ', ') # Print directly to console
#     # Calculated accuracy is added to accuracies list.
#     accuracy_scores.append(accuracy)
# #     os.write(1, 'Training of classifier ' + str(classifier_index) + ' finished.\n') 
#     return

# from joblib import Parallel, delayed
# import multiprocessing

# if __name__ == '__main__':
#     ##### VERSION 1 #####
# #     jobs = []
# #     for i in range(0, task_amount):
# #         print('Starting training of classifier ' + str(i))
# #         p = multiprocessing.Process(target=train_and_persist_classifier(i))
# #         jobs.append(p)
# #         p.start()
#     ##### END VERSION 1 #####
#     ##### VERSION 2 #####
#     start = time.time()
#     num_cores = multiprocessing.cpu_count() * 4
#     # For every task, train a classifier.
#     Parallel(n_jobs=num_cores)(delayed(train_and_persist_classifier)(i) for i in range(0,task_amount))
#     end = time.time()
#     print('The execution took ' + str(end - start) + ' seconds')    
    

This section determines the average accuracy for the created classifiers.

In [None]:
# Average accuracy (for all classifiers) is calculated (nothing to do with threading).
promedio = 0.
score_amount = len(accuracy_scores)
for i in range(0, score_amount):
    promedio += accuracy_scores[i]
promedio /= score_amount
print ('The average accuracy is {}'.format(promedio))

Utilities

In [None]:
def logToConsole(msg):
    '''
    Logs messages to console from within a Jupyter Notebook.
    '''
    os.write(1, msg + '\n')

In [7]:
# SVM parameter search bla
clf = classifiers[0]

# use a full grid over all parameters
param_grid = [
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
 ]
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)
report(grid_search.cv_results_)
print(time() - start)
# param_grid = [
#   {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
#  ]
# TODO usar esto dentro del entrenamiento a ver que pasa

In [8]:
# No threading version.
start = time.time()
# Each index corresponds to an instance.
makespan_instance_machines_heuristic = []
makespan_instance_machines_prediction = []
SCALE_DATA = True
# Within each index, there'll be an array of machine_amount elements, in which each element
# is the time during which each machine is running
# Something along the lines of [[10,20,9,40], [99,88,22,11], ..., [10,9,21,35]]
for i in range(0, 1): # For each task/classifier
    print("Training classifier " + str(i) + "...")
    # Data is loaded.
    TRAINING_FILE = baseDir + 'training/' + str(i) + '.csv' # Training file for current classifier
    TEST_FILE = baseDir + 'test/' + str(i) + '.csv' # Test file for current classifier
    training_set = pd.read_csv(TRAINING_FILE, header=None, delimiter=',')
    test_set = pd.read_csv(TEST_FILE, header=None, delimiter=',')
    
    # Create dataframe for data and separate target.
    df_training = pd.DataFrame(training_set)
    df_training_input = df_training.iloc[:, :-1] 
    df_training_output = df_training.iloc[:, -1]
    
    # Validation/testing data is loaded.
    df_test = pd.DataFrame(test_set)
    df_test_input = df_test.iloc[:, :-1]
    df_test_output = df_test.iloc[:, -1]
    
    if SCALE_DATA:
        # Scale data because http://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use
        scaler = StandardScaler()  
        # Fit only on training data.
        scaler.fit(df_training_input)
        # Reconvert input training data to dataframe after scaling (which converts it to an array of arrays).
        df_training_input = pd.DataFrame(scaler.transform(df_training_input))
        # Re-init scaler just in case.
        scaler = StandardScaler()  
        scaler.fit(df_test_input)
        # Scale test data.
        df_test_input = pd.DataFrame(scaler.transform(df_test_input))        

    
#     print("######### df_training_input #########")
#     print(df_training_input)
    
    ##########################################
    #SVM SELECTION PARAMETERS
    ##########################################
    # use a full grid over all parameters
    param_grid = [
      {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
     ]
    # run grid search
    grid_search = GridSearchCV(classifiers[i], param_grid=param_grid)
    grid_search.fit(df_training_input, df_training_output)
    print('########START CUSTOM RESULTS########')
    print(classifiers[i])
    print('####################')
    print(grid_search.estimator)
    # Should use grid_search as new classifier, persist it, and use it for prediction
    # as a normal classifier (according to documentation it uses the best estimator)
    # However, it fits every possible estimator with the data, so that's something of note.
    # summarize the results of the grid search
#     print(grid_search.best_score_)
    print('########END CUSTOM RESULTS########')    
#     report(grid_search.cv_results_)    
    ##########################################
    #SVM SELECTION PARAMETERS
    ##########################################    
    
    
    # Classifier is trained using the data.
    classifiers[i].fit(df_training_input, df_training_output)
    # Classifier directory is generated if it doesn't exist.
    generar_jobs.generate_dir(model_base_path)
    # Classifier is persisted.
    joblib.dump(classifiers[i], model_base_path + model_file_prefix + str(i) \
                + model_file_extension)
    # Classifier accuracy is determined using test data.
    results = []
    # Go through every test instance manually to calculate makespan for each
    # problem-classifier/task pair
    current_task_index = i * machine_amount # Column index within etc matrix
    print("    Doing makespan stuff...")
    test_instance_amount = len(df_test)
    for j in range(0, test_instance_amount): # For every validation instance
        # df_test.iloc[j] is an ETC matrix + the corresponding classification for one task
        etc_matrix_scaled = df_test_input.iloc[j] # Scaled data for classification (since classifiers were
        # trained using scaled data)
        # Non-scaled data is used to calculate real makespan, using the original units of the problem.
        etc_matrix = df_test.iloc[j][:-1] # Get j problem instance, ignoring last column (the output/classification).
        classification_heuristic = float(df_test_output[j])
        # Every test example is classified, and its classification is appended
        # to a results array.
        # Make prediction for current problem instance or etc matrix (using scaled data).
        prediction_pandas = float(classifiers[i].predict(etc_matrix_scaled.values.reshape(1, -1)))
        results.append(prediction_pandas)
        prediction = float(prediction_pandas) # To work in floats.

        # Get subrow from original input data, to get the task/machine times right.
        sub_row_for_current_task = etc_matrix[current_task_index:current_task_index + machine_amount]
        # Makespan value for prediction
        current_makespan_prediction = sub_row_for_current_task[current_task_index + prediction]
        # Makespan value for heuristic
        current_makespan_heuristic = sub_row_for_current_task[current_task_index + classification_heuristic]
        if len(makespan_instance_machines_prediction) <= j: # If there's no entry for this problem instance.
            # Init entry for problem instance, with each machine's makespan starting at 0.0.
            makespan_instance_machines_prediction.append([0.0] * machine_amount)
            makespan_instance_machines_heuristic.append([0.0] * machine_amount)
        makespan_instance_machines_prediction[j][int(prediction)] += current_makespan_prediction
        makespan_instance_machines_heuristic[j][int(classification_heuristic)] += current_makespan_heuristic
    print("    Done with makespan stuff...")
    # Actual classification results are compared to expected values.
    accuracy = accuracy_score(df_test_output, results)
    print("    Classifier accuracy: " + str(accuracy))
    # Calculated accuracy is added to accuracies list.
    accuracy_scores.append(accuracy)
end = time.time()
print('The execution took ' + str(end - start) + ' seconds')

Training classifier 0...
########START CUSTOM RESULTS########
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
####################
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
########END CUSTOM RESULTS########
    Doing makespan stuff...
    Done with makespan stuff...
    Classifier accuracy: 0.48
The execution took 20.8246250153 seconds


In [4]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")