In [7]:
import re
import ast
import numpy as np

In [8]:
# This script fixes the formatting of the results.txt file, created after 30+ hours of training.
# It creates one .csv file per machine learning technique, containing the useful information.

# Dictionary used to store the results of each machine learning technique.
# Keys are the technique names found at position 2 of each results.txt line;
# a line naming any other technique raises KeyError.
results_dict = {'Linear SVC': [], 'AdaBoost': [], 'Decision Tree': [], 'Random Forest': [], 'Bernoulli NB': [], 'Gaussian NB': []}

# A float as numpy prints it inside an array: "0.98869048", "1." (trailing
# zeros are trimmed, so a perfect score shows up as "1.") or scientific
# notation such as "6.66666667e-01".
_FLOAT = r"\d+\.\d*(?:[eE][-+]?\d+)?"

# The patterns are compiled once instead of on every line. numpy right-aligns
# the numbers of an array with padding spaces, so whitespace is accepted around
# every number and not only at the positions seen in one sample line.
_FLOAT_PAIR_RE = re.compile(r"array\(\[\s*(" + _FLOAT + r")\s*,\s*(" + _FLOAT + r")\s*\]\)")
_INT_PAIR_RE = re.compile(r"array\(\[\s*(\d+)\s*,\s*(\d+)\s*\]\)")
_CONFUSION_RE = re.compile(r"array\(\[\[\s*(\d+)\s*,\s*(\d+)\s*\]\s*,\s*\[\s*(\d+)\s*,\s*(\d+)\s*\]\]\)")


def flatten_arrays(line):
    """Replace the numpy array reprs found in *line* by their bare numbers.

    Three shapes are handled; anything else is left untouched so that the
    later literal evaluation fails loudly instead of silently shifting columns:

    * precision, recall and f-score (two floats), e.g.
      array([0.98869048, 0.70039387]) => 0.98869048, 0.70039387
    * support (two integers), e.g.
      array([749969, 183231]) => 749969, 183231
    * confusion matrix (2x2 integers), e.g.
      array([[674891,  75078],       [  7720, 175511]]) => 674891, 75078, 7720, 175511

    Returns the rewritten line as a string.
    """
    # Raw strings: "\g" is not a valid string escape, so a plain literal
    # triggers a warning on recent Python versions.
    line = _FLOAT_PAIR_RE.sub(r"\g<1>, \g<2>", line)
    line = _INT_PAIR_RE.sub(r"\g<1>, \g<2>", line)
    return _CONFUSION_RE.sub(r"\g<1>, \g<2>, \g<3>, \g<4>", line)


def parse_result_line(line):
    """Parse one line of results.txt.

    The line must be the repr of a Python list which, once its arrays are
    flattened, has the technique name at position 2 and the metrics at the
    positions read below.

    Returns a (technique, row) tuple where row is
    [accuracy, attack precision, attack recall, attack f-score, fit time, test time].

    Raises ValueError or SyntaxError (from ast.literal_eval) if the line cannot
    be evaluated, and IndexError if it holds fewer than 18 values.
    """
    # ast.literal_eval accepts single-quoted strings, so no quote rewriting is
    # needed. Surrounding whitespace (including the newline) is dropped first.
    res_array = ast.literal_eval(flatten_arrays(line.strip()))

    # Accuracy.
    acc = res_array[3]

    # Only the minority class (attacks) is analyzed: the weighted versions of
    # precision, recall and f-score are not used anymore. The normal-class
    # values sit at positions 4, 6 and 8, the supports at 10-11 and the
    # confusion matrix at 12-15.
    pre_attack = res_array[5]
    rec_attack = res_array[7]
    fsc_attack = res_array[9]

    # Fit and test times.
    fit_time = res_array[16]
    tst_time = res_array[17]

    return res_array[2], [acc, pre_attack, rec_attack, fsc_attack, fit_time, tst_time]


# __name__ is '__main__' both in a notebook and when run as a script, so the
# file is still processed there; the guard only keeps the helpers importable
# without results.txt being present.
if __name__ == '__main__':
    with open('results.txt', 'r') as f:
        for line in f:
            # A blank line (e.g. at the end of the file) carries no result.
            if not line.strip():
                continue

            # Dump the results to results_dict.
            technique, metrics = parse_result_line(line)
            results_dict[technique].append(metrics)
    
# Writes one "<technique>.csv" file per entry of results_dict. Every stored
# result becomes one row whose fields are separated by ", "; a technique
# without results still gets an (empty) file.
for technique in results_dict:
    rows = results_dict[technique]
    with open(technique + '.csv', 'w') as csv_file:
        csv_file.writelines(', '.join(map(str, row)) + '\n' for row in rows)
            