Construct Graph Function

In [4]:
from graph import Node

def construct_graph(command_list, command_dict={}):

    filter_empty = lambda x: (len(x) > 0)
    cur_node = None
    child_node = None
    
    for session in command_list:
        # remove commands of length 0 ( not good to modify list within loop )
        
        session = list(filter(filter_empty, session))

        try: 
            first_cmd = session[0]
        except Exception as inst:
            print(session)
            continue

        program = session[0].split()[0]

        if command_dict.get(program) is None:
            cur_node = Node(program=program, frequency=1)
            command_dict[program] = cur_node 
        else:
            cur_node = command_dict.get(program)
            cur_node.frequency += 1

        if cur_node.commands.get(first_cmd) is None:
            cur_node.commands[first_cmd] = 1
        else:
            cur_node.commands[first_cmd] += 1

        for cmd in range(1, len(session)):

            if session[cmd].isprintable() is False:
                break
            else:
                program = session[cmd].split()[0]

            if cur_node.children.get(program) is None:
                child_node = Node(program = program, frequency=1)
                cur_node.children[program] = child_node
            else:
                child_node = cur_node.children.get(program)
                child_node.frequency += 1
            
            if child_node.commands.get(session[cmd]) is None:
                child_node.commands[session[cmd]] = 1
            else:
                child_node.commands[session[cmd]] += 1
            
            cur_node = child_node
        
                
        
    return command_dict


Get Prediction Function

In [5]:
def get_prediction(command_list: list[str], graph, result_size=5, graph_depth=4):
    if len(command_list) == 0:
        return None

    commands = command_list[:graph_depth]
    previous_command = command_list[graph_depth]

    if len(commands) == 0:
        return None

    program = commands[0].split()[0]
    
    if graph.get(program) is None:
        return None
    else:
        node = graph[program]

    for command in commands[1:]:
                
        program = command.split()[0]
        if node.children.get(program) is not None:
            node = node.children[program]
        else:
            return None

    return node.get_prediction(previous_command, num_to_return=result_size)

In [14]:
from thefuzz import fuzz
from thefuzz import process

import pprint
pp = pprint.PrettyPrinter(indent=4)

def append_list(lst, results, commands, graph_depth):
    lst.append({"Results": results, "Expected": commands[graph_depth], "Command Sequence": commands})

def get_accuracy_colearn(command_subsets, test_data, fail_list = [], succeed_list = [], correct_15_not_5 = [], print_fails=False, graph_depth=4):
    train_graph = construct_graph(command_subsets)
    test_data = [x for x in test_data if len(x) > graph_depth]
    test_size = len(test_data)

    return_5 = 5
    return_15 = 15

    correct = 0
    correct_with_15 = 0
    first_prediction = 0
    has_prediction = 0
    incorrect = 0
    none_count = 0

    for commands in test_data:
        results = get_prediction(commands, train_graph, return_5, graph_depth)
        results_15 = get_prediction(commands, train_graph, return_15, graph_depth)
        
        prev_correct = correct

        if results is not None:
            has_prediction += 1

            for i in range(len(results)):
                if fuzz.ratio(results[i][0], commands[graph_depth]) > 85:
                    correct += 1

                    if i == 0:
                        first_prediction += 1

                    if correct <= 100:
                        append_list(succeed_list, results, commands, graph_depth)

                    break

            if prev_correct == correct:
                incorrect += 1

            for i in range(len(results_15)):
                if fuzz.ratio(results_15[i][0], commands[graph_depth]) > 85:
                    correct_with_15 += 1

                    if correct != correct_with_15 and correct_with_15 <= 100:
                        append_list(correct_15_not_5, results_15, commands, graph_depth)

                    break

        else:
            none_count += 1

        if prev_correct == correct and print_fails:
            append_list(fail_list, results, commands, graph_depth)  
                      
    return 'Correct Proportion: {:.2f}% |\n Correct in 15 not 5: {:.2f}% |\n Has Prediction and is Correct: {:.2f}% |\n Incorrect Proportion: {:.2f}% |\n None Proportion: {:.2f}% |\n First Prediction: {:.2f}%'.format(100 * correct/test_size, 100 * correct_with_15/test_size, 100 * correct/has_prediction, 100 * incorrect/test_size, 100 * none_count/test_size, 100 * first_prediction/test_size)

In [7]:
from parse import Parser

parser = Parser()

100%|██████████| 52/52 [00:18<00:00,  2.76it/s]
100%|██████████| 36/36 [00:10<00:00,  3.50it/s]
100%|██████████| 25/25 [00:03<00:00,  6.28it/s]
100%|██████████| 56/56 [00:11<00:00,  4.97it/s]


Colearning train tests, all but last file

In [8]:
science_session_train = parser.parse_commands_per_session(parser.scientists_files[:-1])
science_session_train = parser.parse_commands_into_subsets_sliding_window(science_session_train, 5)
science_session_train = parser.replace_args_nested(science_session_train)

experienced_session_train = parser.parse_commands_per_session(parser.experienced_files[:-1])
experienced_session_train = parser.parse_commands_into_subsets_sliding_window(experienced_session_train, 5)
experienced_session_train = parser.replace_args_nested(experienced_session_train)

non_session_train = parser.parse_commands_per_session(parser.non_programmers_files[:-1])
non_session_train = parser.parse_commands_into_subsets_sliding_window(non_session_train, 5)
non_session_train = parser.replace_args_nested(non_session_train)

novice_session_train = parser.parse_commands_per_session(parser.novice_files[:-1])
novice_session_train = parser.parse_commands_into_subsets_sliding_window(novice_session_train, 5)
novice_session_train = parser.replace_args_nested(novice_session_train)

100%|██████████| 51/51 [00:18<00:00,  2.78it/s]
100%|██████████| 35/35 [00:10<00:00,  3.30it/s]
100%|██████████| 24/24 [00:03<00:00,  6.03it/s]
100%|██████████| 55/55 [00:11<00:00,  4.67it/s]


Colearning test set, last file

In [12]:
science_session_test = parser.parse_commands_per_session(parser.scientists_files[-1:])
science_session_test = parser.parse_commands_into_subsets_sliding_window(science_session_test, 5)
science_session_test = parser.replace_args_nested(science_session_test)

experienced_session_test = parser.parse_commands_per_session(parser.experienced_files[-1:])
experienced_session_test = parser.parse_commands_into_subsets_sliding_window(experienced_session_test, 5)
experienced_session_test = parser.replace_args_nested(experienced_session_test)

non_session_test = parser.parse_commands_per_session(parser.non_programmers_files[-1:])
non_session_test = parser.parse_commands_into_subsets_sliding_window(non_session_test, 5)
non_session_test = parser.replace_args_nested(non_session_test)

novice_session_test = parser.parse_commands_per_session(parser.novice_files[-1:])
novice_session_test = parser.parse_commands_into_subsets_sliding_window(novice_session_test, 5)
novice_session_test = parser.replace_args_nested(novice_session_test)

100%|██████████| 1/1 [00:00<00:00,  6.98it/s]
100%|██████████| 1/1 [00:00<00:00,  3.23it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
100%|██████████| 1/1 [00:00<00:00,  5.81it/s]


Accuracy predicting N+1 command from N commands with all commands in graph

TODO: rename/add train/test args

In [15]:
for i in range(1, 5):
    print("Accuracy predicting command {} from commands {} through {}:\n".format(i+1, 1, i))

    print("Scientist session:\n", get_accuracy_colearn(science_session_train, science_session_test, graph_depth=i))
    print("Experienced session:\n", get_accuracy_colearn(experienced_session_train, experienced_session_test, graph_depth=i))
    print("Non programmer session:\n", get_accuracy_colearn(non_session_train, non_session_test, graph_depth=i))
    print("Novice session:\n", get_accuracy_colearn(novice_session_train, novice_session_test, graph_depth=i))

Accuracy predicting command 2 from commands 1 through 1:

Scientist session:
 Correct Proportion: 56.48% |
 Correct in 15 not 5: 65.41% |
 Has Prediction and is Correct: 56.84% |
 Incorrect Proportion: 42.88% |
 None Proportion: 0.65% |
 First Prediction: 35.97%
Experienced session:
 Correct Proportion: 42.01% |
 Correct in 15 not 5: 53.63% |
 Has Prediction and is Correct: 47.72% |
 Incorrect Proportion: 46.03% |
 None Proportion: 11.96% |
 First Prediction: 27.21%
Non programmer session:
 Correct Proportion: 52.99% |
 Correct in 15 not 5: 65.25% |
 Has Prediction and is Correct: 53.41% |
 Incorrect Proportion: 46.22% |
 None Proportion: 0.79% |
 First Prediction: 28.23%
Novice session:
 Correct Proportion: 55.93% |
 Correct in 15 not 5: 63.46% |
 Has Prediction and is Correct: 57.48% |
 Incorrect Proportion: 41.38% |
 None Proportion: 2.69% |
 First Prediction: 39.58%
Accuracy predicting command 3 from commands 1 through 2:

Scientist session:
 Correct Proportion: 50.09% |
 Correct i