# Traditional Methods tested on every file individually (untuned)
MinMaxScaler normalization is applied to the data in these tests. All models are untuned (constructed with their default parameters). Each model's results across all files are averaged into a single set of scores per model.

In [1]:
import sys
sys.path.append("../../")
from time import process_time
from os import listdir, chdir

import warnings


def warn(*args, **kwargs):
  """Accept any arguments, do nothing, and return None."""
  return None


# Swap the no-op in for warnings.warn so calls made through warnings.warn(...) are silenced.
warnings.warn = warn

from modules.NetworkTraffic import NetworkTraffic
from sklearn import model_selection, metrics

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier

# Names of the CSV data files to test, collected from the data directory.
FilesToTest = []
# NOTE: this changes the working directory for the rest of the session, so every
# relative path used afterwards (inputs and outputs) resolves inside ../../data.
chdir("../../data")
FilesToTest.extend(name for name in listdir() if name.endswith(".csv"))

# Test-set fractions to evaluate.
TestSize = [0.4]
# One instance of each model, all constructed with default parameters.
ModelsToTest = [
  RandomForestClassifier(),
  GradientBoostingClassifier(),
  DecisionTreeClassifier(),
  MLPClassifier(),
  LinearSVC(),
]
# OutputResults: per-size, per-file, per-model metrics.
# ModelResults: per-size, per-model metrics accumulated over files.
OutputResults = {}
ModelResults = {}

In [2]:
def testModel(model, x_train, x_test, y_train, y_test):
  """Fit `model` on the training split and score its predictions on the test split.

  Parameters:
    model: estimator exposing fit(x, y) and predict(x); it is fitted in place.
    x_train, y_train: features and labels passed to fit.
    x_test, y_test: features passed to predict and the labels the predictions
      are scored against.

  Returns:
    dict with the keys "Accuracy", "Balanced Accuracy", "F1 Micro",
    "Precision Micro", "Recall Micro" and "Runtime". "Runtime" is the
    process_time() difference measured around fit + predict only; metric
    computation is not included in it.
  """
  # Only fitting and predicting are timed.
  start = process_time()
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  stop = process_time()

  return {
    "Accuracy": metrics.accuracy_score(y_test, y_pred),
    "Balanced Accuracy": metrics.balanced_accuracy_score(y_test, y_pred),
    "F1 Micro": metrics.f1_score(y_test, y_pred, average='micro'),
    # Was computed with f1_score; precision must come from precision_score.
    "Precision Micro": metrics.precision_score(y_test, y_pred, average='micro'),
    "Recall Micro": metrics.recall_score(y_test, y_pred, average='micro'),
    "Runtime": stop - start,
  }

In [3]:
def updateModelResults(size, model, results):
  """Add one run's metrics onto the running totals kept in ModelResults.

  Parameters:
    size: key of the outer ModelResults level (the test size).
    model: key of the second level (the model's name).
    results: dict mapping metric name -> value for a single run.

  Totals live in ModelResults[size][model][metric]. The first value seen for a
  metric is stored as-is; later values are added to it. Missing levels are
  created on demand. Returns None.
  """
  # With nothing to record, leave ModelResults untouched (no empty model entry).
  if not results:
    return

  totals = ModelResults.setdefault(size, {}).setdefault(model, {})
  for key, value in results.items():
    if key in totals:
      totals[key] += value
    else:
      totals[key] = value

def findAveragesForModelResults(fileCount):
  """Divide every accumulated metric in ModelResults by `fileCount`, in place.

  Parameters:
    fileCount: the divisor, i.e. the total number of files that were tested.
  """
  for perModel in ModelResults.values():
    for metricTotals in perModel.values():
      # Only values are reassigned, so iterating the dict's keys here is safe.
      for name in metricTotals:
        metricTotals[name] = metricTotals[name] / fileCount

In [4]:
# Empty both tables first so re-running this cell does not add onto earlier totals.
OutputResults.clear()
ModelResults.clear()

for size in TestSize:
  print(f"\nSearching with test size of {size*100}%...")
  OutputResults[size] = {}
  ModelResults[size] = {}

  for file in FilesToTest:
    print(file, end=', ')
    OutputResults[size][file] = {}
    currentFileData = NetworkTraffic(file, testSize=size, doNorm=True, doNormAll=False, doTransform=True)
    x_train, x_test = currentFileData.x_train, currentFileData.x_test
    y_train, y_test = currentFileData.y_train, currentFileData.y_test

    for model in ModelsToTest:
      results = testModel(model, x_train, x_test, y_train, y_test)
      # The model's string form is the key in both result tables.
      modelName = str(model)
      OutputResults[size][file][modelName] = results
      updateModelResults(size, modelName, results)

# Each model's metrics were summed once per file; dividing by the file count gives averages.
findAveragesForModelResults(len(FilesToTest))


Searching with test size of 40.0%...
b5000d100.csv, b5000d30.csv, b100d10.csv, b1000d10.csv, b1000d100.csv, b100d100.csv, b5000d10.csv, b1000d30.csv, b100d30.csv, 

In [5]:
import json
# Write each result table to its own pretty-printed JSON file.
for path, table in (
  ("EveryFileIndividually_Untuned_AllTestResults.json", OutputResults),
  ("EveryFileIndividually_Untuned_ModelResults.json", ModelResults),
):
  with open(path, "w") as f:
    f.write(json.dumps(table, indent=2))

In [6]:
# Export only the Accuracy and Runtime columns of ModelResults as CSV, one row per (test size, model).
with open("EveryFileIndividual_Untuned_ModelResults.csv", "w") as f3:
  f3.write("Test Size,Model,Accuracy,Runtime\n")
  for size, perModel in ModelResults.items():
    for model, averages in perModel.items():
      f3.write(f"{size},{model},{averages['Accuracy']},{averages['Runtime']}\n")