In [1]:
import bashlex
import copy
import re
import os

from tqdm import tqdm

In [2]:
re_command_filter = "C"
re_error_filter = "X"
re_start_filter = "S" 

In [3]:
scientists_dir = "./unix-data/computer-scientists/"
experienced_dir = "./unix-data/experienced-programmers/"
non_programmers_dir = "./unix-data/non-programmers/"
novice_dir = "./unix-data/novice-programmers/"

scientists_files = os.listdir(scientists_dir)
experienced_files = os.listdir(experienced_dir)
non_programmers_files = os.listdir(non_programmers_dir)
novice_files = os.listdir(novice_dir)

scientist_commands = []
experienced_commands = []
non_programmers_commands = []
novice_commands = []

scientist_parsed = []
experienced_parsed = []
non_programmers_parsed = []
novice_parsed = []

Generates list of files

In [4]:
for i in range(len(scientists_files)):
    scientists_files[i] = scientists_dir + scientists_files[i]

for i in range(len(experienced_files)):
    experienced_files[i] = experienced_dir + experienced_files[i]

for i in range(len(non_programmers_files)):
    non_programmers_files[i] = non_programmers_dir + non_programmers_files[i]

for i in range(len(novice_files)):
    novice_files[i] = novice_dir + novice_files[i]

Parse function that returns nested list of parsed commands based on session

In [5]:
def parse_commands_per_session(command_list, parsed_list, files_list):
    for file_path in tqdm(files_list):
        file1 = open(file_path, encoding="ISO-8859-1")
        lines = file1.readlines()

        command_sublist = []
        parsed_sublist = []

        for line in range(len(lines)):
            
            if re.match(re_start_filter, lines[line]) is not None:
                if len(command_sublist) != 0 and len(parsed_sublist) != 0:
                    command_list.append(copy.deepcopy(command_sublist))
                    parsed_list.append(copy.deepcopy(parsed_sublist))


                command_sublist = []
                parsed_sublist = []
            
            if re.match(re_command_filter, lines[line]) is not None:
                command_sublist.append(lines[line][2:-1])
                
                try:
                    parts = list(bashlex.split(command_sublist[-1]))
                    parsed_sublist.append(parts)
                except Exception as inst:
                    command_sublist.pop(-1)
                    
    
    # Not necessary as list is created by ref parameter 
    return command_list
            

Function that parses just by command

In [6]:
def parse_commands(command_list, parsed_list, files_list):
    for file_path in tqdm(files_list): 
        file1 = open(file_path, encoding="ISO-8859-1")
        lines = file1.readlines()

        for line in range(len(lines)):
            if re.match(re_command_filter, lines[line]) is not None:
                command_list.append(lines[line][2:-1])
                
                try:
                    parts = list(bashlex.split(command_list[-1]))
                    parsed_list.append(parts)
                except Exception as inst:
                    command_list.pop(-1)

    return command_list

In [7]:
scientist_commands = parse_commands_per_session(scientist_commands, scientist_parsed, scientists_files)
experienced_commands = parse_commands_per_session(experienced_commands, experienced_parsed, experienced_files)
non_programmers_commands = parse_commands_per_session(non_programmers_commands, non_programmers_parsed, non_programmers_files)
novice_commands = parse_commands_per_session(novice_commands, novice_parsed, novice_files)

filter_empty = lambda x: (len(x) > 0)

scientist_commands = list(filter(filter_empty, scientist_commands))
experienced_commands = list(filter(filter_empty, experienced_commands))
non_programmers_commands = list(filter(filter_empty, non_programmers_commands))
novice_commands = list(filter(filter_empty, novice_commands))

100%|██████████| 52/52 [00:05<00:00,  9.27it/s]
100%|██████████| 36/36 [00:03<00:00, 11.66it/s]
100%|██████████| 25/25 [00:01<00:00, 19.57it/s]
100%|██████████| 56/56 [00:03<00:00, 15.48it/s]


In [8]:
print(len(scientist_parsed))
print(len(experienced_parsed))
print(len(non_programmers_parsed))
print(len(novice_parsed))

print(scientist_commands[1])

7699
3823
1881
5109
['cat pre cr31 | tbl | nroff -me > cr31out &', 'f', 'mail ling', 'more cr31out']


Defunct, predicts next argument in command with this structure

In [9]:
from graph import Node

def construct_graph_defunct(parsed_commands, command_dict = {}):

    for cmd in parsed_commands:

        cur_node = None

        if command_dict.get(cmd[0]) is None:
            cur_node = Node(command = cmd[0], frequency = 1)
            command_dict[cmd[0]] = cur_node
        else:
            command_dict.get(cmd[0]).frequency += 1
            cur_node = command_dict.get(cmd[0])

        for i in range(1, len(cmd)):
            child_node = None

            if cur_node.children.get(cmd[i]) is None:
                child_node = Node(cmd[i], 1)
                cur_node.children[cmd[i]] = child_node
            else:
                cur_node.children.get(cmd[i]).frequency += 1
                child_node = cur_node.children.get(cmd[i])
            
            cur_node = child_node
        
    return command_dict


Graph structure for next command. Use command list instead of bashlex output since we are comparing whole commands as nodes in a graph.

Making command dict key the program (e.g. `cat`) and value the node with that program and have that nodes children be the full command. And then have the children of those nodes be determined by fuzzy matching. So a node can be a child to many parents if its fuzzy matched.

In [10]:
from graph import Node

def construct_graph(command_list, command_dict={}):

    filter_empty = lambda x: (len(x) > 0)
    cur_node = None
    child_node = None
    
    for session in command_list:
        # remove commands of length 0 ( not good to modify list within loop )
        
        session = list(filter(filter_empty, session))

        

        first_cmd = session[0]

        program = session[0].split()[0]

        if command_dict.get(program) is None:
            cur_node = Node(program=program, frequency=1)
            command_dict[program] = cur_node 
        else:
            cur_node = command_dict.get(program)
            cur_node.frequency += 1

        if cur_node.commands.get(first_cmd) is None:
            cur_node.commands[first_cmd] = 1
        else:
            cur_node.commands[first_cmd] += 1

        for cmd in range(1, len(session)):

            if session[cmd].isprintable() is False:
                break
            else:
                program = session[cmd].split()[0]

            if cur_node.children.get(program) is None:
                child_node = Node(program = program, frequency=1)
                cur_node.children[program] = child_node
            else:
                child_node = cur_node.children.get(program)
                child_node.frequency += 1
            
            if child_node.commands.get(session[cmd]) is None:
                child_node.commands[session[cmd]] = 1
            else:
                child_node.commands[session[cmd]] += 1
            
            cur_node = child_node
        
                
        
    return command_dict

In [12]:
import pprint
pp = pprint.PrettyPrinter(indent = 4)
scientist_graph = construct_graph(scientist_commands)
