Construct Graph function

In [4]:
from graph import Node

def construct_graph(command_list, command_dict={}):

    filter_empty = lambda x: (len(x) > 0)
    cur_node = None
    child_node = None
    
    for session in command_list:
        # remove commands of length 0 ( not good to modify list within loop )
        
        session = list(filter(filter_empty, session))

        try: 
            first_cmd = session[0]
        except Exception as inst:
            print(session)
            continue

        program = session[0].split()[0]

        if command_dict.get(program) is None:
            cur_node = Node(program=program, frequency=1)
            command_dict[program] = cur_node 
        else:
            cur_node = command_dict.get(program)
            cur_node.frequency += 1

        if cur_node.commands.get(first_cmd) is None:
            cur_node.commands[first_cmd] = 1
        else:
            cur_node.commands[first_cmd] += 1

        for cmd in range(1, len(session)):

            if session[cmd].isprintable() is False:
                break
            else:
                program = session[cmd].split()[0]

            if cur_node.children.get(program) is None:
                child_node = Node(program = program, frequency=1)
                cur_node.children[program] = child_node
            else:
                child_node = cur_node.children.get(program)
                child_node.frequency += 1
            
            if child_node.commands.get(session[cmd]) is None:
                child_node.commands[session[cmd]] = 1
            else:
                child_node.commands[session[cmd]] += 1
            
            cur_node = child_node
        
                
        
    return command_dict


Get Prediction function

In [5]:
def get_prediction(command_list, graph):
    if len(command_list) == 0:
        return None

    commands = command_list[-3:-1]
    previous_command = command_list[-1]

    if len(commands) == 0:
        return None

    program = commands[0].split()[0]
    
    if graph.get(program) is None:
        return None
    else:
        node = graph[program]

    for command in commands[1:]:
        program = command.split()[0]
        if node.children.get(program) is not None:
            node = node.children[program]
        else:
            return None

    return node.get_prediction(previous_command)

Accuracy function

In [51]:
from thefuzz import fuzz
from thefuzz import process
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

def get_accuracy(command_subsets, fail_list = [], print_fails = False):
    train_data, test_data = train_test_split(command_subsets, test_size=0.2, train_size=0.8, random_state=42)

    train_graph = construct_graph(train_data)

    test_size = len(test_data)

    correct = 0
    has_prediction = 0
    incorrect = 0
    none_count = 0

    for commands in test_data:
        results = get_prediction(commands, train_graph)
        
        prev_correct = correct

        if results is not None:
            has_prediction += 1
            
            for result in results:
                if fuzz.ratio(result[0], commands[-1]) > 75:
                    correct += 1
                    break
            if prev_correct == correct:
                incorrect += 1
        else:
            none_count += 1

        if prev_correct == correct and print_fails:
            fail_list.append(("results:", results, "expected:", commands[-1], "command sequence:", commands))
                    

    return 'Correct Proportion: {:.2f}% | Has Prediction and is Correct: {:.2f}% | Incorrect Proportion: {:.2f}% | None Proportion {:.2f}%'.format(100 * correct/test_size, 100 * correct/has_prediction, 100 * incorrect/test_size, 100 * none_count/test_size)

In [7]:
from parse import Parser

parser = Parser()

100%|██████████| 52/52 [00:17<00:00,  3.03it/s]
100%|██████████| 36/36 [00:09<00:00,  3.86it/s]
100%|██████████| 25/25 [00:03<00:00,  6.78it/s]
100%|██████████| 56/56 [00:10<00:00,  5.50it/s]


In [58]:
science = parser.filter_commands_with_pipe(parser.scientists_commands)
experienced = parser.filter_commands_with_pipe(parser.experienced_commands)
non = parser.filter_commands_with_pipe(parser.non_programmers_commands)
novice = parser.filter_commands_with_pipe(parser.novice_commands)

science_pipes = parser.expand_piped_commands(science)
experienced_pipes = parser.expand_piped_commands(experienced)
non_pipes = parser.expand_piped_commands(non)
novice_pipes = parser.expand_piped_commands(novice)

print(get_accuracy(science_pipes))
print(get_accuracy(experienced_pipes))
print(get_accuracy(non_pipes))
print(get_accuracy(novice_pipes))

"""
old, returning all potential commands - args not replaced

91.19%
96.35%
90.31%
93.18%
"""

science_pipes_expand = parser.replace_arg_expanded_pipe(science_pipes)
experienced_pipes_expand = parser.replace_arg_expanded_pipe(experienced_pipes)
non_pipes_expand = parser.replace_arg_expanded_pipe(non_pipes)
novice_pipes_expand = parser.replace_arg_expanded_pipe(novice_pipes)

Correct Proportion: 87.42% | Has Prediction and is Correct: 88.72% | Incorrect Proportion: 11.11% | None Proportion 1.47%
Correct Proportion: 90.41% | Has Prediction and is Correct: 90.41% | Incorrect Proportion: 9.59% | None Proportion 0.00%
Correct Proportion: 88.76% | Has Prediction and is Correct: 93.47% | Incorrect Proportion: 6.20% | None Proportion 5.04%
Correct Proportion: 77.27% | Has Prediction and is Correct: 79.07% | Incorrect Proportion: 20.45% | None Proportion 2.27%


In [60]:
print(get_accuracy(science_pipes_expand))
print(get_accuracy(experienced_pipes_expand))
print(get_accuracy(non_pipes_expand))
print(get_accuracy(novice_pipes_expand))

"""
[0:5]

89.52%
95.43%
93.80%
90.91%
"""

"""
[0:15]

85.53%
95.43%
93.80%
90.91%
"""

[]
Correct Proportion: 84.28% | Has Prediction and is Correct: 85.71% | Incorrect Proportion: 14.05% | None Proportion 1.68%
Correct Proportion: 91.78% | Has Prediction and is Correct: 91.78% | Incorrect Proportion: 8.22% | None Proportion 0.00%
Correct Proportion: 87.60% | Has Prediction and is Correct: 92.43% | Incorrect Proportion: 7.17% | None Proportion 5.23%
Correct Proportion: 86.36% | Has Prediction and is Correct: 86.36% | Incorrect Proportion: 13.64% | None Proportion 0.00%


'\n[0:15]\n\n85.53%\n95.43%\n93.80%\n90.91%\n'

In [33]:
subset_size = 5

science_replaced = parser.replace_args(parser.scientists_commands)
science_replaced = parser.parse_commands_into_subsets(science_replaced, subset_size)

experienced_replaced = parser.replace_args(parser.experienced_commands)
experienced_replaced = parser.parse_commands_into_subsets(experienced_replaced, subset_size)

non_replaced = parser.replace_args(parser.non_programmers_commands)
non_replaced = parser.parse_commands_into_subsets(non_replaced, subset_size)

novice_replaced = parser.replace_args(parser.novice_commands)
novice_replaced = parser.parse_commands_into_subsets(novice_replaced, subset_size)

In [61]:
print("Scientist replaced:", get_accuracy(science_replaced))
print("Experienced replaced:", get_accuracy(experienced_replaced))
print("Non programmer replaced:", get_accuracy(non_replaced))
print("Novice replaced:", get_accuracy(novice_replaced))

"""
[0:15]:

Scientist replaced accuracy: 91.83%
Experienced replaced accuracy: 92.64%
Non programmer replaced accuracy: 94.70%
Novice replaced accuracy: 95.65%
"""

"""
Returning all children commands to match

Scientist replaced accuracy: 91.83%
Experienced replaced accuracy: 92.64%
Non programmer replaced accuracy: 94.70%
Novice replaced accuracy: 95.65%
"""

"""
Top 5 returned to match

Scientist replaced accuracy: 85.06%
Experienced replaced accuracy: 86.43%
Non programmer replaced accuracy: 89.36%
Novice replaced accuracy: 90.15%
"""

Scientist replaced: Correct Proportion: 96.88% | Has Prediction and is Correct: 96.88% | Incorrect Proportion: 3.12% | None Proportion 0.00%
Experienced replaced: Correct Proportion: 97.17% | Has Prediction and is Correct: 97.17% | Incorrect Proportion: 2.83% | None Proportion 0.00%
Non programmer replaced: Correct Proportion: 96.85% | Has Prediction and is Correct: 96.87% | Incorrect Proportion: 3.13% | None Proportion 0.02%
Novice replaced: Correct Proportion: 98.15% | Has Prediction and is Correct: 98.15% | Incorrect Proportion: 1.85% | None Proportion 0.00%


'\nTop 5 returned to match\n\nScientist replaced accuracy: 85.06%\nExperienced replaced accuracy: 86.43%\nNon programmer replaced accuracy: 89.36%\nNovice replaced accuracy: 90.15%\n'

Save not matching or none result to file

In [17]:
from tqdm import tqdm

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced", "Scientist pipes replaced", "Experienced pipes replaced"," Non programmer pipes replaced", "Novice pipes replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced, science_pipes_expand, experienced_pipes_expand, non_pipes_expand, novice_pipes_expand]

fail_list = []

for i in tqdm(range(len(datasets))):
    with open("./fails/" + dataset_names[i] + ".txt", "w") as file_object:
        get_accuracy(datasets[i], fail_list = fail_list, print_fails=True)
        file_object.write(pp.pformat(fail_list))


 50%|█████     | 4/8 [00:24<00:22,  5.54s/it]

[]


100%|██████████| 8/8 [00:27<00:00,  3.41s/it]


Save training graphs to JSON files. Files are 48.9 MB and very hard to open.

Instead of this, we can just save it into a list.

In [39]:
import jsonpickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced", "Scientist pipes replaced", "Experienced pipes replaced","Non programmer pipes replaced", "Novice pipes replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced, science_pipes_expand, experienced_pipes_expand, non_pipes_expand, novice_pipes_expand]

train_graphs = []
test_graphs = []

for i in tqdm(range(len(datasets))):
    train_data, test_data = train_test_split(datasets[i], test_size=0.2, train_size=0.8, random_state=42)

    train_graph = construct_graph(train_data)
    test_graph = construct_graph(test_data)

    train_graphs.append(train_graph)
    test_graphs.append(test_graph)

    train_object = jsonpickle.encode(train_graph)
    test_object = jsonpickle.encode(test_graph)

    with open("./train_graphs/" + dataset_names[i] + " train.json", "w") as file_object:
        file_object.write(train_object)

    with open ("./test_graphs/" + dataset_names[i] + " test.json", "w") as file_object:
        file_object.write(test_object)

 50%|█████     | 4/8 [02:06<02:05, 31.49s/it]

[]


100%|██████████| 8/8 [04:11<00:00, 31.38s/it]


In [None]:
import itertools

all = list(itertools.combinations(parser.parse_commands_into_subsets(parser.novice_commands, 5), 5))


In [None]:
import sys

sys.getsizeof(all)