Construct Graph function

In [8]:
from graph import Node

def construct_graph(command_list, command_dict={}):

    filter_empty = lambda x: (len(x) > 0)
    cur_node = None
    child_node = None
    
    for session in command_list:
        # remove commands of length 0 ( not good to modify list within loop )
        
        session = list(filter(filter_empty, session))

        try: 
            first_cmd = session[0]
        except Exception as inst:
            print(session)
            continue

        program = session[0].split()[0]

        if command_dict.get(program) is None:
            cur_node = Node(program=program, frequency=1)
            command_dict[program] = cur_node 
        else:
            cur_node = command_dict.get(program)
            cur_node.frequency += 1

        if cur_node.commands.get(first_cmd) is None:
            cur_node.commands[first_cmd] = 1
        else:
            cur_node.commands[first_cmd] += 1

        for cmd in range(1, len(session)):

            if session[cmd].isprintable() is False:
                break
            else:
                program = session[cmd].split()[0]

            if cur_node.children.get(program) is None:
                child_node = Node(program = program, frequency=1)
                cur_node.children[program] = child_node
            else:
                child_node = cur_node.children.get(program)
                child_node.frequency += 1
            
            if child_node.commands.get(session[cmd]) is None:
                child_node.commands[session[cmd]] = 1
            else:
                child_node.commands[session[cmd]] += 1
            
            cur_node = child_node
        
                
        
    return command_dict


Get Prediction function

In [9]:
def get_prediction(command_list, graph, result_size=5):
    if len(command_list) == 0:
        return None

    commands = command_list[-3:-1]
    previous_command = command_list[-1]

    if len(commands) == 0:
        return None

    program = commands[0].split()[0]
    
    if graph.get(program) is None:
        return None
    else:
        node = graph[program]

    for command in commands[1:]:
        program = command.split()[0]
        if node.children.get(program) is not None:
            node = node.children[program]
        else:
            return None

    return node.get_prediction(previous_command, num_to_return=result_size)

Accuracy function

In [46]:
from thefuzz import fuzz
from thefuzz import process
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

def append_list(lst, results, commands):
    # lst.append(("results:", results, "expected:", commands[-1], "command sequence:", commands))
    lst.append({"Results": results, "Expected": commands[-1], "Command Sequence": commands})

def get_accuracy(command_subsets, fail_list = [], succeed_list = [], correct_15_not_5 = [], print_fails = False):
    train_data, test_data = train_test_split(command_subsets, test_size=0.2, train_size=0.8, random_state=42)

    train_graph = construct_graph(train_data)

    test_size = len(test_data)

    return_5 = 5
    return_15 = 15

    correct = 0
    correct_with_15 = 0
    first_prediction = 0
    has_prediction = 0
    incorrect = 0
    none_count = 0

    for commands in test_data:
        results = get_prediction(commands, train_graph, return_5)
        results_15 = get_prediction(commands, train_graph, return_15)
        
        prev_correct = correct

        if results is not None:
            has_prediction += 1

            for i in range(len(results)):
                if fuzz.ratio(results[i][0], commands[-1]) > 90:
                    correct += 1

                    if i == 0:
                        first_prediction += 1

                    if correct <= 100:
                        append_list(succeed_list, results, commands)

                    break

            if prev_correct == correct:
                incorrect += 1

            for i in range(len(results_15)):
                if fuzz.ratio(results_15[i][0], commands[-1]) > 90:
                    correct_with_15 += 1

                    if correct != correct_with_15:
                        append_list(correct_15_not_5, results_15, commands)

                    break

        else:
            none_count += 1

        if prev_correct == correct and print_fails:
            append_list(fail_list, results, commands)                    

    return 'Correct Proportion: {:.2f}% |\n Correct in 15 not 5: {:.2f}% |\n Has Prediction and is Correct: {:.2f}% |\n Incorrect Proportion: {:.2f}% |\n None Proportion: {:.2f}% |\n First Prediction: {:.2f}%'.format(100 * correct/test_size, 100 * correct_with_15/test_size, 100 * correct/has_prediction, 100 * incorrect/test_size, 100 * none_count/test_size, 100 * first_prediction/test_size)

In [11]:
from parse import Parser

parser = Parser()

100%|██████████| 52/52 [00:10<00:00,  5.11it/s]
100%|██████████| 36/36 [00:05<00:00,  6.35it/s]
100%|██████████| 25/25 [00:02<00:00, 10.81it/s]
100%|██████████| 56/56 [00:06<00:00,  8.77it/s]


In [37]:
science = parser.filter_commands_with_pipe(parser.scientists_commands)
experienced = parser.filter_commands_with_pipe(parser.experienced_commands)
non = parser.filter_commands_with_pipe(parser.non_programmers_commands)
novice = parser.filter_commands_with_pipe(parser.novice_commands)

science_pipes = parser.expand_piped_commands(science)
experienced_pipes = parser.expand_piped_commands(experienced)
non_pipes = parser.expand_piped_commands(non)
novice_pipes = parser.expand_piped_commands(novice)

science_pipes_expand = parser.replace_arg_expanded_pipe(science_pipes)
experienced_pipes_expand = parser.replace_arg_expanded_pipe(experienced_pipes)
non_pipes_expand = parser.replace_arg_expanded_pipe(non_pipes)
novice_pipes_expand = parser.replace_arg_expanded_pipe(novice_pipes)

# print(get_accuracy(science_pipes))
# print(get_accuracy(experienced_pipes))
# print(get_accuracy(non_pipes))
# print(get_accuracy(novice_pipes))

"""
old, returning all potential commands - args not replaced

91.19%
96.35%
90.31%
93.18%
"""

'\nold, returning all potential commands - args not replaced\n\n91.19%\n96.35%\n90.31%\n93.18%\n'

In [47]:
print("Science pipes metrics:\n", get_accuracy(science_pipes_expand), "\n")
print("Experienced pipes metrics:\n", get_accuracy(experienced_pipes_expand), "\n")
print("Non programmers pipes metrics:\n", get_accuracy(non_pipes_expand), "\n")
print("Novice pipes metrics:\n", get_accuracy(novice_pipes_expand), "\n")

"""
Fuzzy match ratio == 90

Correct Proportion: 54.09% | Has Prediction and is Correct: 55.13% | Incorrect Proportion: 44.03% | None Proportion: 1.89% | First Prediction: 37.11%
Correct Proportion: 69.86% | Has Prediction and is Correct: 70.51% | Incorrect Proportion: 29.22% | None Proportion: 0.91% | First Prediction: 54.79%
Correct Proportion: 73.06% | Has Prediction and is Correct: 77.10% | Incorrect Proportion: 21.71% | None Proportion: 5.23% | First Prediction: 58.72%
Correct Proportion: 25.00% | Has Prediction and is Correct: 25.00% | Incorrect Proportion: 75.00% | None Proportion: 0.00% | First Prediction: 25.00%
"""

[]
Science pipes metrics:
 Correct Proportion: 58.07% |
 Correct in 15 not 5: 67.51% |
 Has Prediction and is Correct: 59.19% |
 Incorrect Proportion: 40.04% |
 None Proportion: 1.89% |
 First Prediction: 38.78% 

Experienced pipes metrics:
 Correct Proportion: 73.06% |
 Correct in 15 not 5: 86.76% |
 Has Prediction and is Correct: 73.73% |
 Incorrect Proportion: 26.03% |
 None Proportion: 0.91% |
 First Prediction: 48.40% 

Non programmers pipes metrics:
 Correct Proportion: 73.26% |
 Correct in 15 not 5: 80.43% |
 Has Prediction and is Correct: 77.30% |
 Incorrect Proportion: 21.51% |
 None Proportion: 5.23% |
 First Prediction: 58.72% 

Novice pipes metrics:
 Correct Proportion: 47.73% |
 Correct in 15 not 5: 70.45% |
 Has Prediction and is Correct: 47.73% |
 Incorrect Proportion: 52.27% |
 None Proportion: 0.00% |
 First Prediction: 25.00% 



'\nFuzzy match ratio == 90\n\nCorrect Proportion: 54.09% | Has Prediction and is Correct: 55.13% | Incorrect Proportion: 44.03% | None Proportion: 1.89% | First Prediction: 37.11%\nCorrect Proportion: 69.86% | Has Prediction and is Correct: 70.51% | Incorrect Proportion: 29.22% | None Proportion: 0.91% | First Prediction: 54.79%\nCorrect Proportion: 73.06% | Has Prediction and is Correct: 77.10% | Incorrect Proportion: 21.71% | None Proportion: 5.23% | First Prediction: 58.72%\nCorrect Proportion: 25.00% | Has Prediction and is Correct: 25.00% | Incorrect Proportion: 75.00% | None Proportion: 0.00% | First Prediction: 25.00%\n'

In [23]:
subset_size = 5

science_replaced = parser.replace_args(parser.scientists_commands)
science_replaced = parser.parse_commands_into_subsets(science_replaced, subset_size)

experienced_replaced = parser.replace_args(parser.experienced_commands)
experienced_replaced = parser.parse_commands_into_subsets(experienced_replaced, subset_size)

non_replaced = parser.replace_args(parser.non_programmers_commands)
non_replaced = parser.parse_commands_into_subsets(non_replaced, subset_size)

novice_replaced = parser.replace_args(parser.novice_commands)
novice_replaced = parser.parse_commands_into_subsets(novice_replaced, subset_size)

In [48]:
print("Scientist replaced:\n", get_accuracy(science_replaced))
print("Experienced replaced:\n", get_accuracy(experienced_replaced))
print("Non programmer replaced:\n", get_accuracy(non_replaced))
print("Novice replaced:\n", get_accuracy(novice_replaced))

"""
Fuzz ratio == 90

Scientist replaced:
 Correct Proportion: 81.90% | Has Prediction and is Correct: 82.97% | Incorrect Proportion: 16.81% | None Proportion: 1.30% | First Prediction: 58.64%
Experienced replaced:
 Correct Proportion: 83.89% | Has Prediction and is Correct: 85.11% | Incorrect Proportion: 14.68% | None Proportion: 1.44% | First Prediction: 61.73%
Non programmer replaced:
 Correct Proportion: 85.04% | Has Prediction and is Correct: 85.77% | Incorrect Proportion: 14.11% | None Proportion: 0.85% | First Prediction: 60.01%
Novice replaced:
 Correct Proportion: 88.03% | Has Prediction and is Correct: 89.08% | Incorrect Proportion: 10.79% | None Proportion: 1.18% | First Prediction: 64.17%
"""

Scientist replaced:
 Correct Proportion: 81.62% |
 Correct in 15 not 5: 89.69% |
 Has Prediction and is Correct: 82.69% |
 Incorrect Proportion: 17.08% |
 None Proportion: 1.30% |
 First Prediction: 58.42%
Experienced replaced:
 Correct Proportion: 83.80% |
 Correct in 15 not 5: 90.88% |
 Has Prediction and is Correct: 85.02% |
 Incorrect Proportion: 14.77% |
 None Proportion: 1.44% |
 First Prediction: 61.64%
Non programmer replaced:
 Correct Proportion: 85.02% |
 Correct in 15 not 5: 92.22% |
 Has Prediction and is Correct: 85.75% |
 Incorrect Proportion: 14.13% |
 None Proportion: 0.85% |
 First Prediction: 59.97%
Novice replaced:
 Correct Proportion: 88.03% |
 Correct in 15 not 5: 94.16% |
 Has Prediction and is Correct: 89.08% |
 Incorrect Proportion: 10.79% |
 None Proportion: 1.18% |
 First Prediction: 64.15%


'\nFuzz ratio == 90\n\nScientist replaced:\n Correct Proportion: 81.90% | Has Prediction and is Correct: 82.97% | Incorrect Proportion: 16.81% | None Proportion: 1.30% | First Prediction: 58.64%\nExperienced replaced:\n Correct Proportion: 83.89% | Has Prediction and is Correct: 85.11% | Incorrect Proportion: 14.68% | None Proportion: 1.44% | First Prediction: 61.73%\nNon programmer replaced:\n Correct Proportion: 85.04% | Has Prediction and is Correct: 85.77% | Incorrect Proportion: 14.11% | None Proportion: 0.85% | First Prediction: 60.01%\nNovice replaced:\n Correct Proportion: 88.03% | Has Prediction and is Correct: 89.08% | Incorrect Proportion: 10.79% | None Proportion: 1.18% | First Prediction: 64.17%\n'

Save not matching or none result to file

In [19]:
from tqdm import tqdm

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced", "Scientist pipes replaced", "Experienced pipes replaced","Non programmer pipes replaced", "Novice pipes replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced, science_pipes_expand, experienced_pipes_expand, non_pipes_expand, novice_pipes_expand]

for i in tqdm(range(len(datasets))):
    fail_list = []
    succeed_list = []
    correct_15_not_5 = []

    get_accuracy(datasets[i], fail_list=fail_list, succeed_list=succeed_list, correct_15_not_5=correct_15_not_5, print_fails=True)

    with open("./fails/" + dataset_names[i] + ".txt", "w") as file_object:
        file_object.write(pp.pformat(fail_list))
    
    with open("./successes/" + dataset_names[i] + ".txt", "w") as file_object:
        file_object.write(pp.pformat(succeed_list))

    with open("./correct_with_15/" + dataset_names[i] + ".txt", "w") as file_object:
        file_object.write(pp.pformat(correct_15_not_5))

 50%|█████     | 4/8 [00:32<00:29,  7.48s/it]

[]


100%|██████████| 8/8 [00:33<00:00,  4.21s/it]


Save training graphs to JSON files. Files are 48.9 MB and very hard to open.

Instead of this, we can just save it into a list.

In [None]:
import jsonpickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced", "Scientist pipes replaced", "Experienced pipes replaced","Non programmer pipes replaced", "Novice pipes replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced, science_pipes_expand, experienced_pipes_expand, non_pipes_expand, novice_pipes_expand]

train_graphs = []
test_graphs = []

# for i in tqdm(range(len(datasets))):
#     train_data, test_data = train_test_split(datasets[i], test_size=0.2, train_size=0.8, random_state=42)

#     train_graph = construct_graph(train_data)
#     test_graph = construct_graph(test_data)

#     train_graphs.append(train_graph)
#     test_graphs.append(test_graph)

#     train_object = jsonpickle.encode(train_graph)
#     test_object = jsonpickle.encode(test_graph)

#     with open("./train_graphs/" + dataset_names[i] + " train.json", "w") as file_object:
#         file_object.write(train_object)

#     with open ("./test_graphs/" + dataset_names[i] + " test.json", "w") as file_object:
#         file_object.write(test_object)

In [None]:
import itertools

all = []

# don't run this, it'll destroy the computer
# for comb in itertools.combinations(parser.parse_commands_into_subsets(parser.novice_commands, 5), 5):
#     all.append(list(comb))


In [None]:
import sys

sys.getsizeof(parser.scientists_commands[1])

58