Construct Graph function

In [1]:
from graph import Node

def construct_graph(command_list, command_dict={}):

    filter_empty = lambda x: (len(x) > 0)
    cur_node = None
    child_node = None
    
    for session in command_list:
        # remove commands of length 0 ( not good to modify list within loop )
        
        session = list(filter(filter_empty, session))

        try: 
            first_cmd = session[0]
        except Exception as inst:
            print(session)
            continue

        program = session[0].split()[0]

        if command_dict.get(program) is None:
            cur_node = Node(program=program, frequency=1)
            command_dict[program] = cur_node 
        else:
            cur_node = command_dict.get(program)
            cur_node.frequency += 1

        if cur_node.commands.get(first_cmd) is None:
            cur_node.commands[first_cmd] = 1
        else:
            cur_node.commands[first_cmd] += 1

        for cmd in range(1, len(session)):

            if session[cmd].isprintable() is False:
                break
            else:
                program = session[cmd].split()[0]

            if cur_node.children.get(program) is None:
                child_node = Node(program = program, frequency=1)
                cur_node.children[program] = child_node
            else:
                child_node = cur_node.children.get(program)
                child_node.frequency += 1
            
            if child_node.commands.get(session[cmd]) is None:
                child_node.commands[session[cmd]] = 1
            else:
                child_node.commands[session[cmd]] += 1
            
            cur_node = child_node
        
                
        
    return command_dict


Get Prediction function

In [2]:
def get_prediction(command_list, graph, result_size=5):
    if len(command_list) == 0:
        return None

    commands = command_list[-3:-1]
    previous_command = command_list[-1]

    if len(commands) == 0:
        return None

    program = commands[0].split()[0]
    
    if graph.get(program) is None:
        return None
    else:
        node = graph[program]

    for command in commands[1:]:
        program = command.split()[0]
        if node.children.get(program) is not None:
            node = node.children[program]
        else:
            return None

    return node.get_prediction(previous_command, num_to_return=result_size)

Accuracy function

In [3]:
from thefuzz import fuzz
from thefuzz import process
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

def append_list(lst, results, commands):
    # lst.append(("results:", results, "expected:", commands[-1], "command sequence:", commands))
    lst.append({"Results": results, "Expected": commands[-1], "Command Sequence": commands})

def get_accuracy(command_subsets, fail_list = [], succeed_list = [], correct_15_not_5 = [], print_fails = False):
    train_data, test_data = train_test_split(command_subsets, test_size=0.2, train_size=0.8, random_state=42)

    train_graph = construct_graph(train_data)

    test_size = len(test_data)

    return_5 = 5
    return_15 = 15

    correct = 0
    correct_with_15 = 0
    first_prediction = 0
    has_prediction = 0
    incorrect = 0
    none_count = 0

    for commands in test_data:
        results = get_prediction(commands, train_graph, return_5)
        results_15 = get_prediction(commands, train_graph, return_15)
        
        prev_correct = correct

        if results is not None:
            has_prediction += 1

            for i in range(len(results)):
                if fuzz.ratio(results[i][0], commands[-1]) > 85:
                    correct += 1

                    if i == 0:
                        first_prediction += 1

                    if correct <= 100:
                        append_list(succeed_list, results, commands)

                    break

            if prev_correct == correct:
                incorrect += 1

            for i in range(len(results_15)):
                if fuzz.ratio(results_15[i][0], commands[-1]) > 85:
                    correct_with_15 += 1

                    if correct != correct_with_15:
                        append_list(correct_15_not_5, results_15, commands)

                    break

        else:
            none_count += 1

        if prev_correct == correct and print_fails:
            append_list(fail_list, results, commands)                    

    return 'Correct Proportion: {:.2f}% |\n Correct in 15 not 5: {:.2f}% |\n Has Prediction and is Correct: {:.2f}% |\n Incorrect Proportion: {:.2f}% |\n None Proportion: {:.2f}% |\n First Prediction: {:.2f}%'.format(100 * correct/test_size, 100 * correct_with_15/test_size, 100 * correct/has_prediction, 100 * incorrect/test_size, 100 * none_count/test_size, 100 * first_prediction/test_size)

In [4]:
from parse import Parser

parser = Parser()

100%|██████████| 52/52 [00:16<00:00,  3.16it/s]
100%|██████████| 36/36 [00:09<00:00,  3.84it/s]
100%|██████████| 25/25 [00:03<00:00,  6.70it/s]
100%|██████████| 56/56 [00:10<00:00,  5.43it/s]


Expand pipes and replace args

In [6]:
science = parser.filter_commands_with_pipe(parser.scientists_commands)
experienced = parser.filter_commands_with_pipe(parser.experienced_commands)
non = parser.filter_commands_with_pipe(parser.non_programmers_commands)
novice = parser.filter_commands_with_pipe(parser.novice_commands)

science_pipes = parser.expand_piped_commands(science)
experienced_pipes = parser.expand_piped_commands(experienced)
non_pipes = parser.expand_piped_commands(non)
novice_pipes = parser.expand_piped_commands(novice)

science_pipes_expand = parser.replace_arg_expanded_pipe(science_pipes)
experienced_pipes_expand = parser.replace_arg_expanded_pipe(experienced_pipes)
non_pipes_expand = parser.replace_arg_expanded_pipe(non_pipes)
novice_pipes_expand = parser.replace_arg_expanded_pipe(novice_pipes)

# print(get_accuracy(science_pipes))
# print(get_accuracy(experienced_pipes))
# print(get_accuracy(non_pipes))
# print(get_accuracy(novice_pipes))

"""
old, returning all potential commands - args not replaced

91.19%
96.35%
90.31%
93.18%
"""

'\nold, returning all potential commands - args not replaced\n\n91.19%\n96.35%\n90.31%\n93.18%\n'

In [8]:
print("Science pipes metrics:\n", get_accuracy(science_pipes_expand), "\n")
print("Experienced pipes metrics:\n", get_accuracy(experienced_pipes_expand), "\n")
print("Non programmers pipes metrics:\n", get_accuracy(non_pipes_expand), "\n")
print("Novice pipes metrics:\n", get_accuracy(novice_pipes_expand), "\n")

[]
Science pipes metrics:
 Correct Proportion: 78.41% |
 Correct in 15 not 5: 83.23% |
 Has Prediction and is Correct: 83.48% |
 Incorrect Proportion: 15.51% |
 None Proportion: 6.08% |
 First Prediction: 63.10% 

Experienced pipes metrics:
 Correct Proportion: 91.32% |
 Correct in 15 not 5: 94.06% |
 Has Prediction and is Correct: 93.02% |
 Incorrect Proportion: 6.85% |
 None Proportion: 1.83% |
 First Prediction: 74.43% 

Non programmers pipes metrics:
 Correct Proportion: 88.18% |
 Correct in 15 not 5: 91.28% |
 Has Prediction and is Correct: 93.05% |
 Incorrect Proportion: 6.59% |
 None Proportion: 5.23% |
 First Prediction: 68.60% 

Novice pipes metrics:
 Correct Proportion: 88.64% |
 Correct in 15 not 5: 90.91% |
 Has Prediction and is Correct: 92.86% |
 Incorrect Proportion: 6.82% |
 None Proportion: 4.55% |
 First Prediction: 52.27% 



In [11]:
from tqdm import tqdm

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist pipes replaced", "Experienced pipes replaced","Non programmer pipes replaced", "Novice pipes replaced"]
datasets = [science_pipes_expand, experienced_pipes_expand, non_pipes_expand, novice_pipes_expand]

for i in tqdm(range(len(datasets))):
    fail_list = []
    succeed_list = []
    correct_15_not_5 = []

    get_accuracy(datasets[i], fail_list=fail_list, succeed_list=succeed_list, correct_15_not_5=correct_15_not_5, print_fails=False)

    with open("fails/" + dataset_names[i] + ".txt", "w") as file_object:
        file_object.write(pp.pformat(fail_list))
    
    with open("successes/" + dataset_names[i] + ".txt", "w") as file_object:
        file_object.write(pp.pformat(succeed_list))

    with open("correct_with_15/" + dataset_names[i] + ".txt", "w") as file_object:
        file_object.write(pp.pformat(correct_15_not_5))



 25%|██▌       | 1/4 [00:00<00:00,  6.11it/s]

[['cat $0 $1 > $2', 'nroff -me > $3 < $4'], ['cat $0 $1 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -o35-38 -me < $3'], ['cat $0 $1 $2 > $3', 'tbl < $4 > $3', 'itroff -me < $4'], ['cat $0 $1 > $2', 'dpic < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'dpic < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'dpic < $3'], ['spell $0 > $1', 'more < $2'], ['spell $0 > $1', 'mor

 50%|█████     | 2/4 [00:00<00:00,  6.56it/s]

[['cat $0 $1 > $2', 'nroff -me > $3 < $4'], ['cat $0 $1 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -o35-38 -me < $3'], ['cat $0 $1 $2 > $3', 'tbl < $4 > $3', 'itroff -me < $4'], ['cat $0 $1 > $2', 'dpic < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'dpic < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'dpic < $3'], ['spell $0 > $1', 'more < $2'], ['spell $0 > $1', 'mor

100%|██████████| 4/4 [00:00<00:00,  5.71it/s]

[]
[['cat $0 $1 > $2', 'nroff -me > $3 < $4'], ['cat $0 $1 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me > $4 < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -me < $3'], ['cat $0 $1 > $2', 'tbl < $3 > $2', 'nroff -o35-38 -me < $3'], ['cat $0 $1 $2 > $3', 'tbl < $4 > $3', 'itroff -me < $4'], ['cat $0 $1 > $2', 'dpic < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'dpic < $3 > $2', 'itroff -me < $3'], ['cat $0 $1 > $2', 'dpic < $3'], ['spell $0 > $1', 'more < $2'], ['spell $0 > $1', '




Subsets of 5

In [7]:
subset_size = 5

science_replaced = parser.replace_args(parser.scientists_commands)
science_replaced = parser.parse_commands_into_subsets(science_replaced, subset_size)

experienced_replaced = parser.replace_args(parser.experienced_commands)
experienced_replaced = parser.parse_commands_into_subsets(experienced_replaced, subset_size)

non_replaced = parser.replace_args(parser.non_programmers_commands)
non_replaced = parser.parse_commands_into_subsets(non_replaced, subset_size)

novice_replaced = parser.replace_args(parser.novice_commands)
novice_replaced = parser.parse_commands_into_subsets(novice_replaced, subset_size)

In [8]:
print("Scientist replaced:\n", get_accuracy(science_replaced))
print("Experienced replaced:\n", get_accuracy(experienced_replaced))
print("Non programmer replaced:\n", get_accuracy(non_replaced))
print("Novice replaced:\n", get_accuracy(novice_replaced))

"""
Fuzz ratio == 90

Scientist replaced:
 Correct Proportion: 81.90% | Has Prediction and is Correct: 82.97% | Incorrect Proportion: 16.81% | None Proportion: 1.30% | First Prediction: 58.64%
Experienced replaced:
 Correct Proportion: 83.89% | Has Prediction and is Correct: 85.11% | Incorrect Proportion: 14.68% | None Proportion: 1.44% | First Prediction: 61.73%
Non programmer replaced:
 Correct Proportion: 85.04% | Has Prediction and is Correct: 85.77% | Incorrect Proportion: 14.11% | None Proportion: 0.85% | First Prediction: 60.01%
Novice replaced:
 Correct Proportion: 88.03% | Has Prediction and is Correct: 89.08% | Incorrect Proportion: 10.79% | None Proportion: 1.18% | First Prediction: 64.17%
"""

Scientist replaced:
 Correct Proportion: 83.56% |
 Correct in 15 not 5: 90.48% |
 Has Prediction and is Correct: 84.92% |
 Incorrect Proportion: 14.84% |
 None Proportion: 1.60% |
 First Prediction: 60.78%
Experienced replaced:
 Correct Proportion: 84.89% |
 Correct in 15 not 5: 91.15% |
 Has Prediction and is Correct: 86.21% |
 Incorrect Proportion: 13.58% |
 None Proportion: 1.52% |
 First Prediction: 62.36%
Non programmer replaced:
 Correct Proportion: 84.74% |
 Correct in 15 not 5: 91.94% |
 Has Prediction and is Correct: 85.40% |
 Incorrect Proportion: 14.49% |
 None Proportion: 0.77% |
 First Prediction: 59.51%
Novice replaced:
 Correct Proportion: 88.41% |
 Correct in 15 not 5: 94.58% |
 Has Prediction and is Correct: 89.24% |
 Incorrect Proportion: 10.65% |
 None Proportion: 0.93% |
 First Prediction: 64.56%


'\nFuzz ratio == 90\n\nScientist replaced:\n Correct Proportion: 81.90% | Has Prediction and is Correct: 82.97% | Incorrect Proportion: 16.81% | None Proportion: 1.30% | First Prediction: 58.64%\nExperienced replaced:\n Correct Proportion: 83.89% | Has Prediction and is Correct: 85.11% | Incorrect Proportion: 14.68% | None Proportion: 1.44% | First Prediction: 61.73%\nNon programmer replaced:\n Correct Proportion: 85.04% | Has Prediction and is Correct: 85.77% | Incorrect Proportion: 14.11% | None Proportion: 0.85% | First Prediction: 60.01%\nNovice replaced:\n Correct Proportion: 88.03% | Has Prediction and is Correct: 89.08% | Incorrect Proportion: 10.79% | None Proportion: 1.18% | First Prediction: 64.17%\n'

Save not matching or none result to file

In [27]:
from tqdm import tqdm

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced]

# for i in tqdm(range(len(datasets))):
#     fail_list = []
#     succeed_list = []
#     correct_15_not_5 = []

#     get_accuracy(datasets[i], fail_list=fail_list, succeed_list=succeed_list, correct_15_not_5=correct_15_not_5, print_fails=True)

#     with open("./fails/" + dataset_names[i] + ".txt", "w") as file_object:
#         file_object.write(pp.pformat(fail_list))
    
#     with open("./successes/" + dataset_names[i] + ".txt", "w") as file_object:
#         file_object.write(pp.pformat(succeed_list))

#     with open("./correct_with_15/" + dataset_names[i] + ".txt", "w") as file_object:
#         file_object.write(pp.pformat(correct_15_not_5))

 50%|█████     | 4/8 [00:58<00:53, 13.37s/it]

[]


100%|██████████| 8/8 [00:59<00:00,  7.46s/it]


Save training graphs to JSON files. Files are 48.9 MB and very hard to open.

Instead of this, we can just save it into a list.

In [10]:
import jsonpickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced", "Scientist pipes replaced", "Experienced pipes replaced","Non programmer pipes replaced", "Novice pipes replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced, science_pipes_expand, experienced_pipes_expand, non_pipes_expand, novice_pipes_expand]

train_graphs = []
test_graphs = []

# for i in tqdm(range(len(datasets))):
#     train_data, test_data = train_test_split(datasets[i], test_size=0.2, train_size=0.8, random_state=42)

#     train_graph = construct_graph(train_data)
#     test_graph = construct_graph(test_data)

#     train_graphs.append(train_graph)
#     test_graphs.append(test_graph)

#     train_object = jsonpickle.encode(train_graph)
#     test_object = jsonpickle.encode(test_graph)

#     with open("./train_graphs/" + dataset_names[i] + " train.json", "w") as file_object:
#         file_object.write(train_object)

#     with open ("./test_graphs/" + dataset_names[i] + " test.json", "w") as file_object:
#         file_object.write(test_object)

Sliding window subsets and results

In [12]:
science_session = parser.parse_commands_per_session(parser.scientists_files)
science_session = parser.parse_commands_into_subsets_sliding_window(science_session, 5)
science_session = parser.replace_args_nested(science_session)

experienced_session = parser.parse_commands_per_session(parser.experienced_files)
experienced_session = parser.parse_commands_into_subsets_sliding_window(experienced_session, 5)
experienced_session = parser.replace_args_nested(experienced_session)

non_session = parser.parse_commands_per_session(parser.non_programmers_files)
non_session = parser.parse_commands_into_subsets_sliding_window(non_session, 5)
non_session = parser.replace_args_nested(non_session)

novice_session = parser.parse_commands_per_session(parser.novice_files)
novice_session = parser.parse_commands_into_subsets_sliding_window(novice_session, 5)
novice_session = parser.replace_args_nested(novice_session)

100%|██████████| 52/52 [00:17<00:00,  3.05it/s]
100%|██████████| 36/36 [00:09<00:00,  3.75it/s]
100%|██████████| 25/25 [00:03<00:00,  6.43it/s]
100%|██████████| 56/56 [00:10<00:00,  5.11it/s]


In [13]:
print("Scientist session:\n", get_accuracy(science_session))
print("Experienced session:\n", get_accuracy(experienced_session))
print("Non programmer session:\n", get_accuracy(non_session))
print("Novice session:\n", get_accuracy(novice_session))

Scientist session:
 Correct Proportion: 86.00% |
 Correct in 15 not 5: 94.86% |
 Has Prediction and is Correct: 86.04% |
 Incorrect Proportion: 13.95% |
 None Proportion: 0.05% |
 First Prediction: 60.97%
Experienced session:
 Correct Proportion: 88.93% |
 Correct in 15 not 5: 96.05% |
 Has Prediction and is Correct: 88.97% |
 Incorrect Proportion: 11.02% |
 None Proportion: 0.05% |
 First Prediction: 65.22%
Non programmer session:
 Correct Proportion: 86.34% |
 Correct in 15 not 5: 94.22% |
 Has Prediction and is Correct: 86.37% |
 Incorrect Proportion: 13.63% |
 None Proportion: 0.04% |
 First Prediction: 60.92%
Novice session:
 Correct Proportion: 90.79% |
 Correct in 15 not 5: 96.89% |
 Has Prediction and is Correct: 90.85% |
 Incorrect Proportion: 9.14% |
 None Proportion: 0.06% |
 First Prediction: 66.81%


In [14]:
from random import sample
import math

subset_size = 5

science_session_sample = parser.parse_commands_per_session(sample(parser.scientists_files, math.floor(0.2 * len(parser.scientists_files))))
science_session_sample = parser.parse_commands_into_subsets_sliding_window(science_session_sample, 5)
science_session_sample = parser.replace_args_nested(science_session_sample)

experienced_session_sample = parser.parse_commands_per_session(sample(parser.experienced_files, math.floor(0.2 * len(parser.experienced_files))))
experienced_session_sample = parser.parse_commands_into_subsets_sliding_window(experienced_session_sample, 5)
experienced_session_sample = parser.replace_args_nested(experienced_session_sample)

non_session_sample = parser.parse_commands_per_session(sample(parser.non_programmers_files, math.floor(0.2 * len(parser.non_programmers_files))))
non_session_sample = parser.parse_commands_into_subsets_sliding_window(non_session_sample, 5)
non_session_sample = parser.replace_args_nested(non_session_sample)

novice_session_sample = parser.parse_commands_per_session(sample(parser.novice_files, math.floor(0.2 * len(parser.novice_files))))
novice_session_sample = parser.parse_commands_into_subsets_sliding_window(novice_session_sample, 5)
novice_session_sample = parser.replace_args_nested(novice_session_sample)

100%|██████████| 10/10 [00:02<00:00,  3.54it/s]
100%|██████████| 7/7 [00:02<00:00,  3.44it/s]
100%|██████████| 5/5 [00:00<00:00,  6.38it/s]
100%|██████████| 11/11 [00:02<00:00,  4.47it/s]


In [15]:
print("Scientist session:\n", get_accuracy(science_session_sample))
print("Experienced session:\n", get_accuracy(experienced_session_sample))
print("Non programmer session:\n", get_accuracy(non_session_sample))
print("Novice session:\n", get_accuracy(novice_session_sample))

Scientist session:
 Correct Proportion: 87.77% |
 Correct in 15 not 5: 96.35% |
 Has Prediction and is Correct: 87.77% |
 Incorrect Proportion: 12.23% |
 None Proportion: 0.00% |
 First Prediction: 65.11%
Experienced session:
 Correct Proportion: 91.32% |
 Correct in 15 not 5: 97.49% |
 Has Prediction and is Correct: 91.33% |
 Incorrect Proportion: 8.67% |
 None Proportion: 0.01% |
 First Prediction: 65.74%
Non programmer session:
 Correct Proportion: 81.75% |
 Correct in 15 not 5: 91.94% |
 Has Prediction and is Correct: 81.75% |
 Incorrect Proportion: 18.25% |
 None Proportion: 0.00% |
 First Prediction: 59.64%
Novice session:
 Correct Proportion: 93.18% |
 Correct in 15 not 5: 98.21% |
 Has Prediction and is Correct: 93.18% |
 Incorrect Proportion: 6.82% |
 None Proportion: 0.00% |
 First Prediction: 70.18%


Objects are 84 MB instead of 53 MB with sampling 10% of the data

In [16]:
import jsonpickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import pprint
pp = pprint.PrettyPrinter(indent=4)

dataset_names = ["Scientist replaced", "Experienced replaced", "Non programmer replaced", "Novice replaced"]
datasets = [science_replaced, experienced_replaced, non_replaced, novice_replaced]

train_graphs = []
test_graphs = []

# for i in tqdm(range(len(datasets))):
#     train_data, test_data = train_test_split(datasets[i], test_size=0.2, train_size=0.8, random_state=42)

#     train_graph = construct_graph(train_data)
#     print("Train graph", sys.getsizeof(train_graph)/1000000.0)
#     test_graph = construct_graph(test_data)
#     print("Test graph", sys.getsizeof(test_graph)/1000000.0)

#     train_graphs.append(train_graph)
#     test_graphs.append(test_graph)

#     train_object = jsonpickle.encode(train_graph)
#     print("Train JSON", sys.getsizeof(train_object)/1000000.0)
#     test_object = jsonpickle.encode(test_graph)
#     print("Test JSON", sys.getsizeof(test_object)/1000000.0)

#     with open("./train_graphs/" + dataset_names[i] + " train.json", "w") as file_object:
#         file_object.write(train_object)

#     with open ("./test_graphs/" + dataset_names[i] + " test.json", "w") as file_object:
#         file_object.write(test_object)

Map Reduce to see most common how often command sequences occur

In [15]:
def map_reduce(parsed_nested_list):
    reduced = dict()

    for command_seq in parsed_nested_list:
        command_seq = tuple(command_seq)

        if command_seq in reduced:
            reduced[command_seq] += 1
        else:
            reduced[command_seq] = 1

    return reduced

def len_n_commands(session_dict, command_len):
    len_n_dict = dict()

    for command_seq, freq in session_dict.items():
        if freq > 1 and len(command_seq) == command_len:
            len_n_dict[command_seq] = freq
    
    return len_n_dict

def popular_seq_by_user(file_name, command_len, num_to_return=5):
    if type(file_name) != list:
        file_name = [file_name]

    session_subsets = parser.parse_commands_per_session(file_name)
    session_subsets = parser.parse_commands_into_subsets_sliding_window(session_subsets, 5)
    session_subsets = parser.replace_args_nested(session_subsets)

    session_dict = map_reduce(session_subsets)

    session_dict = len_n_commands(session_dict, command_len)

    return sorted(session_dict.items(), key=lambda item : item[1], reverse=True)[:num_to_return]


Save most common commands of length `command_len` to file. Uses parse commands by subset.

In [29]:
%%capture

import pprint
pp = pprint.PrettyPrinter(indent=4)

files = [parser.scientists_files, parser.experienced_files, parser.non_programmers_files, parser.novice_files]
dataset_name = ["scientists", "experienced", "non_programmers", "novice"]

command_len = 2

for file_list in range(len(files)):
    for file in files[file_list]:
        with open("./highest_freq_len_5/" + dataset_name[file_list] + ".txt", "a") as file_object:
            # file_object.write(pp.pformat([file.split('/')[3], popular_seq_by_user(file)]))
            file_name = file.split('/')[3]
            seq_freq = popular_seq_by_user(file, command_len)
            seq_freq_lst = []
            
            for seq, freq in seq_freq:
                seq_freq_lst.append(str(seq) + ": " + str(freq) + '\n')

            line = "{}:\n\t {}\n".format(file_name, ', '.join(seq_freq_lst))

            file_object.write(line)

Overall Top 10 by type of User

In [16]:
files = [parser.scientists_files, parser.experienced_files, parser.non_programmers_files, parser.novice_files]

for i in range(len(files)):
    pp.pprint(popular_seq_by_user(files[i], 2, num_to_return=6)) 

100%|██████████| 52/52 [00:10<00:00,  4.97it/s]


[   (('cd $0', 'ls', 'cd $0', 'ls'), 1016),
    (('ls', 'cd $0', 'ls', 'cd $0'), 569),
    (('average', 'average', 'average', 'average'), 443),
    (('lpq -Pip', 'lpq -Pip', 'lpq -Pip', 'lpq -Pip'), 191),
    (('cd $0', 'cd $0', 'ls', 'cd $0'), 189)]


100%|██████████| 36/36 [00:05<00:00,  6.01it/s]


[   (('cd $0', 'ls', 'cd $0', 'ls'), 668),
    (('ls', 'cd $0', 'ls', 'cd $0'), 467),
    (('cd $0', 'cd $0', 'ls', 'cd $0'), 170),
    (('jobs', 'jobs', 'jobs', 'jobs'), 164),
    (('ls', 'cd $0', 'cd $0', 'ls'), 134)]


100%|██████████| 25/25 [00:02<00:00, 10.35it/s]


[   (('emacs $0', 'emacs $0', 'emacs $0', 'emacs $0'), 194),
    (('cd $0', 'ls', 'cd $0', 'ls'), 151),
    (('rm $0', 'rm $0', 'rm $0', 'rm $0'), 113),
    (('ls', 'cd $0', 'ls', 'e $0'), 109),
    (('ls', 'cd $0', 'ls', 'cd $0'), 101)]


100%|██████████| 56/56 [00:06<00:00,  8.36it/s]


[   (('umacs $0', 'pix $0', 'umacs $0', 'pix $0'), 4536),
    (('pix $0', 'umacs $0', 'pix $0', 'umacs $0'), 4079),
    (('pix $0', 'umacs $0', 'pix $0', 'pix $0'), 865),
    (('pix $0', 'pix $0', 'umacs $0', 'pix $0'), 813),
    (('pix $0', 'pix $0', 'pix $0', 'pix $0'), 757)]
