# AutoML tested on every file individually (4 cores, 60 seconds per file)
AutoML's data preprocessing is applied to these tests. Results from each model's tests are compiled into a single average.

In [1]:
import sys
sys.path.append("../../")
from time import process_time
from os import listdir, chdir

def warn(*args, **kwargs):
  pass
import warnings
warnings.warn = warn

from modules.NetworkTraffic import NetworkTraffic
from sklearn import model_selection, metrics
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

FilesToTest = list()
chdir("../../data")
for file in listdir():
  if file.endswith(".csv"):
    FilesToTest.append(file)

TestSize = [0.6]
ModelsToTest = [AutoSklearn2Classifier(time_left_for_this_task=30, memory_limit=4096, n_jobs=-1)]
OutputResults = dict()
ModelResults = dict()

In [2]:
def testModel(model, x_train, x_test, y_train, y_test):
  start = process_time()

  ### Begin timing
  temp_clf = model
  temp_clf.fit(x_train, y_train)

  y_pred = temp_clf.predict(x_test)
  ### End timing

  stop = process_time()

  # Results
  tempDict = {
    "Accuracy": metrics.accuracy_score(y_test, y_pred),
    "Balanced Accuracy": metrics.balanced_accuracy_score(y_test, y_pred),
    "F1 Micro": metrics.f1_score(y_test, y_pred, average='micro'),
    "Precision Micro": metrics.f1_score(y_test, y_pred, average='micro'),
    "Recall Micro": metrics.recall_score(y_test, y_pred, average='micro'),
    "Runtime": stop-start,
  }
  # try:
  #   tempDict["Leaderboard"] = str(temp_clf.leaderboard())
  # except KeyError:
  #   tempDict["Leaderboard"] = None
  try:
    tempDict["Final Ensemble"] = temp_clf.show_models()
  except KeyError:
    tempDict["Final Ensemble"] = None
  return tempDict

In [3]:
def updateModelResults(size, model, results):
  def changeKeyValue():
    try:
      ModelResults[size][model][key] += results[key]
    except KeyError:
      ModelResults[size][model][key] = results[key]
  
  # For each metric, attempt to set or add to the value
  for key in results:
    if key not in ["Leaderboard", "Final Ensemble"]:
      try:
        changeKeyValue()
      except KeyError:
        ModelResults[size][model] = dict()
        changeKeyValue()

# Divide each metric by the total number of files tested
def findAveragesForModelResults(fileCount):
  for size in ModelResults:
    for model in ModelResults[size]:
      for metric in ModelResults[size][model]:
        if metric not in ["Leaderboard", "Final Ensemble"]:
          ModelResults[size][model][metric] /= fileCount

In [4]:
OutputResults.clear()
ModelResults.clear()

for size in TestSize:
  print(f"\nSearching with test size of {size*100}%...")
  OutputResults[size] = dict()
  ModelResults[size] = dict()

  for file in FilesToTest:
    OutputResults[size][file] = dict()
    currentFileData = NetworkTraffic(file)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(currentFileData.data, currentFileData.target)

    for model in ModelsToTest:
      print(f"{file} : {str(model)}", end=", ")
      results = testModel(model, x_train, x_test, y_train, y_test)
      OutputResults[size][file].update({str(model): results})
      updateModelResults(size, str(model), results)

findAveragesForModelResults(len(FilesToTest))


Searching with test size of 60.0%...
b5000d100.csv : AutoSklearn2Classifier(memory_limit=4096, n_jobs=-1, time_left_for_this_task=30), b5000d30.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                       per_run_time_limit=12, time_left_for_this_task=30), b100d10.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                       per_run_time_limit=12, time_left_for_this_task=30), 

ValueError: Some errors were detected !
    Line #2 (got 5 columns instead of 25)
    Line #3 (got 4 columns instead of 25)
    Line #4 (got 5 columns instead of 25)
    Line #5 (got 4 columns instead of 25)

In [None]:
import json
with open("EveryFileIndividually_Untuned_AllTestResults.json", "a") as f:
  f.write(json.dumps(OutputResults, indent=2))
with open("EveryFileIndividually_Untuned_ModelResults.json", "a") as f2:
  f2.write(json.dumps(ModelResults, indent=2))

TypeError: Object of type int64 is not JSON serializable

In [None]:
with open("EveryFileIndividual_Untuned_ModelResults.csv", "w") as f3:
  f3.write("Test Size,Model,Accuracy,Runtime\n")
  for size in ModelResults:
    for model in ModelResults[size]:
      f3.write(f"{size},{model},{ModelResults[size][model]['Accuracy']},{ModelResults[size][model]['Runtime']}\n")