In [1]:
from enum import Enum
import input
import re

# Line classifiers, keyed purely on how a line begins.
match_file_pattern = re.compile(r'^\d')      # line starts with a digit
match_dir_pattern = re.compile(r'^dir')      # line starts with "dir"
match_command_pattern = re.compile(r'^\$')   # line starts with "$"

# Raw puzzle text, one list entry per line.
# NOTE(review): `input` is the local helper module; the (7, True) arguments are
# presumably the puzzle day and a sample-data flag -- confirm against that module.
input_data = input.read_input(7, True).splitlines()

class LineType(Enum):
    """Category assigned to an input line: a `$` command, a `dir` entry or a file entry."""

    COMMAND, DIR, FILE = 1, 2, 3

def parse_input(input_data):
    """Tag every recognised input line with its LineType.

    Each line is tested against the command, dir and file patterns in that
    order; the first pattern that matches decides the tag. Lines matching none
    of the patterns are left out of the result.

    Args:
        input_data: iterable of text lines.

    Returns:
        A list of ``(LineType, line)`` tuples in input order.
    """
    classifiers = (
        (match_command_pattern, LineType.COMMAND),
        (match_dir_pattern, LineType.DIR),
        (match_file_pattern, LineType.FILE),
    )

    tagged = []
    for text in input_data:
        kind = next(
            (line_type for pattern, line_type in classifiers if pattern.match(text)),
            None,
        )
        if kind is not None:
            tagged.append((kind, text))
    return tagged
  
# Classify the raw lines once; the tree-building cell below iterates over `data`.
data = parse_input(input_data)
# Bare last expression: the notebook displays the first six tagged lines.
data[:6]

[(<LineType.COMMAND: 1>, '$ cd /'),
 (<LineType.COMMAND: 1>, '$ ls'),
 (<LineType.DIR: 2>, 'dir a'),
 (<LineType.FILE: 3>, '14848514 b.txt'),
 (<LineType.FILE: 3>, '8504156 c.dat'),
 (<LineType.DIR: 2>, 'dir d')]

In [2]:
class Node:
  """One entry (file or directory) in a parent/children tree.

  Attributes are plain fields:
    name     -- entry name
    parent   -- owning Node, or None
    children -- child Nodes in insertion order
    size     -- the entry's own size; nodes whose own size is 0 are treated
                as directories by query_max / query_min
  """

  def __init__(self, name, parent=None, size=0):
    self.name = name
    self.parent = parent
    self.children = []
    self.size = size

  def print(self, indent=0, last=False, prefix=None):
    """Print this node and its subtree as a box-drawing tree.

    Args:
      indent: depth of this node; 0 prints the name without any connector.
      last: True when this node is the final child of its parent.
      prefix: text drawn in front of this node's connector. The recursion
        supplies it; when omitted it falls back to one rail per ancestor
        level, which is what a direct call with only `indent` used to draw.
    """
    branch = '│   '
    blank = '    '
    tee = '├── '
    cap = '└── '

    size = self.get_size()
    size_label = f'({size})' if size > 0 else ''

    if indent == 0:
      print(f'{self.name} {size_label}')
      child_prefix = ''
    else:
      if prefix is None:
        prefix = branch * (indent - 1)
      connector = cap if last else tee
      print(f'{prefix}{connector}{self.name} {size_label}')
      # Below a last sibling the vertical rail must stop; previously every
      # ancestor column was drawn as a rail, leaving a dangling '│' there.
      child_prefix = prefix + (blank if last else branch)

    last_index = len(self.children) - 1
    for i, child in enumerate(self.children):
      child.print(indent + 1, i == last_index, child_prefix)

  def find_child(self, name):
    """Return the first direct child called `name`, or None if there is none."""
    for child in self.children:
      if child.name == name:
        return child
    return None

  def get_size(self):
    """Return this node's own size plus the total size of all descendants."""
    return self.size + sum(child.get_size() for child in self.children)

  def query_max(self, max_size_limit):
    """Collect directories in this subtree whose total size is <= max_size_limit.

    The result is in pre-order (this node first, then each child's matches).
    """
    result = []
    # Own size 0 is the directory marker.
    # NOTE(review): a zero-byte file would also pass this check.
    if self.get_size() <= max_size_limit and self.size == 0:
      result.append(self)
    for child in self.children:
      result.extend(child.query_max(max_size_limit))
    return result

  def query_min(self, min_size_limit):
    """Collect directories in this subtree whose total size is >= min_size_limit.

    The result is in pre-order (this node first, then each child's matches).
    """
    result = []
    # Same directory marker as query_max: own size 0.
    if self.get_size() >= min_size_limit and self.size == 0:
      result.append(self)
    for child in self.children:
      result.extend(child.query_min(min_size_limit))
    return result

In [3]:
# Rebuild the directory tree from the tagged terminal lines.
# `root` is the top of the tree; `current_node` follows the working directory.
root = Node('/')
current_node = root

for line_type, line in data:
  if line_type == LineType.COMMAND:
    # Tokenise the command instead of substring-matching it: `'ls' in line`
    # also matched e.g. `$ cd tools` and silently skipped that `cd`.
    tokens = line[1:].split(maxsplit=1)
    command = tokens[0] if tokens else ''
    argument = tokens[1].strip() if len(tokens) > 1 else ''

    if command == 'ls':
      # The listing itself arrives as the following DIR / FILE lines.
      continue

    if command != 'cd' or not argument:
      raise ValueError(f'unsupported command line: {line!r}')

    if argument == '..':
      # Going up from the root is a no-op.
      if current_node.parent:
        current_node = current_node.parent
      continue

    if argument == '/':
      current_node = root
      continue

    # cd into a directory, creating it when it has not been listed yet
    child = current_node.find_child(argument)
    if child is None:
      child = Node(argument, current_node)
      current_node.children.append(child)
    current_node = child
  elif line_type == LineType.DIR:
    pattern = r'dir (.*)'
    dir_name = re.match(pattern, line).group(1)
    # Skip entries that already exist so that listing the same directory
    # twice (or cd-ing into it before listing) does not duplicate it.
    if current_node.find_child(dir_name) is None:
      current_node.children.append(Node(dir_name, current_node))
  elif line_type == LineType.FILE:
    pattern = r'(\d+) (.*)'
    size, file_name = re.match(pattern, line).group(1, 2)
    size = int(size)
    # Same guard as for directories: a repeated listing must not count the
    # file's size twice.
    if current_node.find_child(file_name) is None:
      current_node.children.append(Node(file_name, current_node, size))

In [4]:
# Visual sanity check: draw the reconstructed tree with cumulative sizes.
root.print()

/ (48381165)
├── a (94853)
│   ├── e (584)
│   │   └── i (584)
│   ├── f (29116)
│   ├── g (2557)
│   └── h.lst (62596)
├── b.txt (14848514)
├── c.dat (8504156)
└── d (24933642)
│   ├── j (4060174)
│   ├── d.log (8033020)
│   ├── d.ext (5626152)
│   └── k (7214296)


In [5]:
# Part 1: add up the total sizes of all directories of at most 100_000.
# A directory nested inside another match contributes to both totals.
small_dirs = root.query_max(100_000)
sum(node.get_size() for node in small_dirs)

95437

In [6]:
# Part 2: find the smallest directory whose removal frees enough room for the update.
used_space = root.get_size()
max_space = 70_000_000
update_size = 30_000_000

# Shortfall = what the update needs minus what is free right now. The former
# abs(...) reported a positive requirement even when there was already a
# surplus of free space; clamp at 0 instead.
free_space = max_space - used_space
space_needed = max(0, update_size - free_space)

large_dirs = root.query_min(space_needed)
large_dirs = sorted(large_dirs, key=lambda node: node.get_size())
if space_needed == 0:
  print('enough free space already; nothing to delete')
elif large_dirs:
  print(large_dirs[0].name, large_dirs[0].get_size())
else:
  print('no single directory is large enough')

d 24933642
