# AutoML (askl2) trained on every file individually, tested on the rest (4 cores, 30 seconds per file)
AutoML's data preprocessing is applied in these tests. The model is trained on file A and tested on files B, C, D, ...; then trained on file B and tested on files A, C, D, ...; and so on.

In [1]:
import sys
sys.path.append("../../")
from time import process_time
from os import listdir, chdir
from copy import deepcopy

import warnings


def warn(*args, **kwargs):
  """Accept any arguments and do nothing; used to replace ``warnings.warn``."""
  return None


# Silence every warning routed through ``warnings.warn`` by swapping in the
# no-op above.
warnings.warn = warn

from modules.NetworkTraffic import NetworkTraffic
from sklearn import model_selection, metrics
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

# Switch into the data directory first; the CSV names collected below are bare
# file names relative to it.
chdir("../../data")
FilesToTest = [entry for entry in listdir() if entry.endswith(".csv")]

# Experiment grid: a single test-size value and a single classifier instance.
TestSize = [0.4]
ModelsToTest = [
  AutoSklearn2Classifier(
    time_left_for_this_task=300,
    memory_limit=4096,
    n_jobs=-1,
  ),
]

# Per-run results, filled in as OutputResults[size][file][model name].
OutputResults = {}
#ModelResults = dict()
#ModelResults = dict()

In [2]:
def testModel(model, x_train, x_test, y_train, y_test):
  """Fit ``model`` on the training data, predict the test data and score it.

  Args:
    model: Estimator exposing ``fit`` and ``predict``. ``show_models`` and
      ``leaderboard`` are called as optional extras. The estimator is fitted
      in place; no copy is made.
    x_train: Features passed to ``fit``.
    x_test: Features passed to ``predict``.
    y_train: Labels passed to ``fit``.
    y_test: True labels the predictions are scored against.

  Returns:
    dict with the keys "Accuracy", "Balanced Accuracy", "F1 Micro",
    "Precision Micro", "Recall Micro", "Runtime" and "Final Ensemble"
    (``None`` when ``show_models`` raises ``KeyError``). "Leaderboard" is
    added only when ``leaderboard()`` succeeds.
  """
  # NOTE(review): process_time() reports CPU time of the current process, not
  # wall-clock time; work done in other processes is presumably not counted.
  start = process_time()

  ### Begin timing
  temp_clf = model
  temp_clf.fit(x_train, y_train)

  y_pred = temp_clf.predict(x_test)
  ### End timing

  stop = process_time()

  # Results
  tempDict = {
    "Accuracy": metrics.accuracy_score(y_test, y_pred),
    "Balanced Accuracy": metrics.balanced_accuracy_score(y_test, y_pred),
    "F1 Micro": metrics.f1_score(y_test, y_pred, average='micro'),
    # Fixed: this entry used to call f1_score, so it did not report precision.
    "Precision Micro": metrics.precision_score(y_test, y_pred, average='micro'),
    "Recall Micro": metrics.recall_score(y_test, y_pred, average='micro'),
    "Runtime": stop-start,
  }

  try:
    tempDict["Final Ensemble"] = temp_clf.show_models()
  except KeyError:
    tempDict["Final Ensemble"] = None

  # The leaderboard is a best-effort extra: on failure the key is simply left
  # out. Catch Exception rather than everything so that KeyboardInterrupt and
  # SystemExit still propagate.
  try:
    tempDict["Leaderboard"] = str(temp_clf.leaderboard())
  except Exception:
    pass
  return tempDict

In [3]:
def updateModelResults(size, model, results):
  """Add one run's metrics to the running totals in ``ModelResults``.

  Every entry of ``results`` except "Leaderboard" and "Final Ensemble" is
  summed into ``ModelResults[size][model]``; the per-model dict is created on
  first use. ``ModelResults[size]`` must already exist, otherwise a
  ``KeyError`` is raised.
  """
  notAccumulated = ("Leaderboard", "Final Ensemble")

  for name, value in results.items():
    if name in notAccumulated:
      continue

    # Create the per-model bucket lazily, only once there is something to store.
    totals = ModelResults[size].setdefault(model, dict())
    if name in totals:
      totals[name] += value
    else:
      totals[name] = value

# Turn the accumulated totals into averages by dividing by the file count
def findAveragesForModelResults(fileCount):
  """Divide every accumulated metric in ``ModelResults`` by ``fileCount``.

  The "Leaderboard" and "Final Ensemble" entries are left untouched. The
  division happens in place.
  """
  notAveraged = ("Leaderboard", "Final Ensemble")

  for perModel in ModelResults.values():
    for totals in perModel.values():
      # Only values are reassigned, so iterating the dict while updating is safe.
      for name in totals:
        if name in notAveraged:
          continue
        totals[name] /= fileCount

In [4]:
# Drop results left over from a previous run of this cell.
OutputResults.clear()
#ModelResults.clear()

for size in TestSize:
  print(f"\nSearching with test size of {size*100}%...")
  OutputResults[size] = dict()
  #ModelResults[size] = dict()

  for index, file in enumerate(FilesToTest):
    print(file, end=', ')
    resultsForFile = dict()
    OutputResults[size][file] = resultsForFile

    # Train on the current file and test on every other file.
    # NOTE(review): presumably NetworkTraffic accepts either one file name or a
    # list of names -- confirm against modules.NetworkTraffic.
    currentFileData = NetworkTraffic(file, testSize=size, doNorm=True, doNormAll=True)
    restOfFiles = FilesToTest[:index] + FilesToTest[index + 1:]
    restOfTheFilesData = NetworkTraffic(restOfFiles, testSize=size, doNorm=True, doNormAll=True)

    x_train = currentFileData.data
    y_train = currentFileData.target
    x_test = restOfTheFilesData.data
    y_test = restOfTheFilesData.target

    for model in ModelsToTest:
      print(f"{file} : {str(model)}", end=", ")
      results = testModel(model, x_train, x_test, y_train, y_test)
      # str(model) is evaluated after fitting on purpose: it is the storage key
      # and must match what the original code recorded.
      resultsForFile[str(model)] = results
      #updateModelResults(size, str(model), results)

#findAveragesForModelResults(len(FilesToTest))


Searching with test size of 40.0%...
b5000d100.csv, b5000d100.csv : AutoSklearn2Classifier(memory_limit=4096, n_jobs=-1,
                       time_left_for_this_task=300), b5000d30.csv, b5000d30.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                       per_run_time_limit=120, time_left_for_this_task=300), b100d10.csv, b100d10.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                       per_run_time_limit=120, time_left_for_this_task=300), b1000d10.csv, b1000d10.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                       per_run_time_limit=120, time_left_for_this_task=300), b1000d100.csv, b1000d100.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                       per_run_time_limit=120, time_left_for_this_task=300), b100d100.csv, b100d100.csv : AutoSklearn2Classifier(memory_limit=4096, metric=accuracy, n_jobs=-1,
                      

RuntimeError: No model found. Try increasing 'time_left_for_this_task'.

In [None]:
# Build a JSON-serialisable copy of OutputResults.
# The original assignment `copyOfOutput = OutputResults` only created an alias,
# so the conversion below overwrote the real results in place. A new nested
# structure is built instead and OutputResults is left untouched.
copyOfOutput = dict()
for size, perSize in OutputResults.items():
  copyOfOutput[size] = dict()
  for file, perFile in perSize.items():
    copyOfOutput[size][file] = dict()
    for model, attributes in perFile.items():
      # Exact type check (not isinstance) keeps the original rule: anything
      # that is not precisely a str, int or float is stored as its str() form.
      copyOfOutput[size][file][model] = {
        attribute: value if type(value) in (str, int, float) else str(value)
        for attribute, value in attributes.items()
      }

In [None]:
import json

# Open with "w", not "a": appending a second JSON document on a re-run leaves
# a file that can no longer be parsed as JSON. The CSV export also overwrites.
with open("EveryFileIndividually_Untuned_AllTestResults.json", "w") as f:
  json.dump(copyOfOutput, f, indent=2)

In [None]:
import csv

# Write one row per (training file, model) with its accuracy and runtime.
# The model column holds the estimator's string form, which contains commas
# and line breaks, so csv.writer is used to quote fields correctly instead of
# joining them by hand.
with open("EveryFileTransfer_Untuned_ModelResults.csv", "w", newline="") as f3:
  # lineterminator="\n" keeps the row endings the hand-written version used.
  writer = csv.writer(f3, lineterminator="\n")
  writer.writerow(["File Trained On", "Model", "Accuracy", "Runtime"])
  for size in OutputResults:
    for file, perModel in OutputResults[size].items():
      for model, scores in perModel.items():
        writer.writerow([file, model, scores["Accuracy"], scores["Runtime"]])