In [1]:
import os
import glob
import pathlib
from typing import (
    List,
    Set,
    Generator,
    Iterator,
    Tuple,
    Union
)

import regex as re

# Relative to absolute path

In [2]:
%%timeit
os.path.abspath(".")

10.7 µs ± 76.6 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [3]:
%%timeit
pathlib.Path("../").resolve()

15.3 µs ± 84.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [26]:
#!rm -rf ./path

./path  [error opening dir]

0 directories, 0 files


In [5]:
PATH_TO_DIR = './path/to/dir'

In [6]:
%%bash

# Build the sample tree that the examples in this notebook search.
DIR='./path/to/dir'
mkdir -p "${DIR}"
# "nofile.txt" is created as a DIRECTORY with a file-like name, so the examples
# can show which APIs return it alongside real files.
mkdir -p "./path/nofile.txt"
for run in 1 2; do
  touch "./path/to/kid${run}" "./path/to/kid${run}.log"
  touch "${DIR}/grandkid${run}" "${DIR}/grandkid${run}.txt"
done

# List all files in a directory tree

## os.walk

* [os.walk(top, topdown=True, onerror=None, followlinks=False)](https://docs.python.org/3/library/os.html#os.walk)

> Generate the file names in a directory tree by walking the tree either top-down or bottom-up. For each directory in the tree rooted at directory top (including top itself), it yields a 3-tuple ```(dirpath: str, dirnames: List[str], filenames: List[str])```.

In [7]:
!tree --filesfirst ./path

[01;34m./path[0m
├── [01;34mnofile.txt[0m
└── [01;34mto[0m
    ├── [00mkid1[0m
    ├── [00mkid1.log[0m
    ├── [00mkid2[0m
    ├── [00mkid2.log[0m
    └── [01;34mdir[0m
        ├── [00mgrandkid1[0m
        ├── [00mgrandkid1.txt[0m
        ├── [00mgrandkid2[0m
        └── [00mgrandkid2.txt[0m

3 directories, 8 files


In [8]:
#%%timeit
# One set of column widths drives both the header and the rows, so the
# columns always line up.
column_widths = (6, 20, 25, 50)
row_format = "".join(f"{{:<{width}}}" for width in column_widths)

print(row_format.format('index', 'dir', 'directories', 'files'))
print("-" * 110)
# `dirpath` (not `dir`) so the builtin dir() is not shadowed in later cells.
for index, (dirpath, dirnames, filenames) in enumerate(os.walk('./path')):
    print(row_format.format(index, dirpath, str(dirnames), str(filenames)))
    # Only show the first four directories visited by the walk.
    if index > 2:
        break

index dir                      directories         files                                             
--------------------------------------------------------------------------------------------------------------
0     ./path              ['nofile.txt', 'to']     []                                                
1     ./path/nofile.txt   []                       []                                                
2     ./path/to           ['dir']                  ['kid2', 'kid1.log', 'kid2.log', 'kid1']          
3     ./path/to/dir       []                       ['grandkid2', 'grandkid1', 'grandkid2.txt', 'grandkid1.txt']


In [9]:
def list_files_in_directory_tree(directory: str):
    """Collect the path of every file found anywhere beneath ``directory``.

    Args:
        directory: root of the tree to walk; must be an existing, readable
            directory.

    Returns:
        A list of paths, each built by joining the folder reported by
        ``os.walk`` with a file name it lists. Sub-directories themselves are
        not included, even when their name looks like a file name.

    Raises:
        AssertionError: if ``directory`` is not a directory or is not readable.
    """
    assert os.path.isdir(directory) and os.access(directory, os.R_OK)

    collected = []
    for folder, _subfolders, names in os.walk(directory):
        collected.extend(os.path.join(folder, name) for name in names)
    return collected

In [10]:
# excludes "nofile.txt" which is a directory
list_files_in_directory_tree("./path")

['./path/to/kid2',
 './path/to/kid1.log',
 './path/to/kid2.log',
 './path/to/kid1',
 './path/to/dir/grandkid2',
 './path/to/dir/grandkid1',
 './path/to/dir/grandkid2.txt',
 './path/to/dir/grandkid1.txt']

In [11]:
%%timeit
list_files_in_directory_tree("./path")

63.6 µs ± 426 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


# List files with name matching pattern

## os.walk + regexp

In [12]:
def find_files_in_directory(
        path: str,
        filename_pattern: str = r'^\w+\.(txt|log)',
        recursive: bool = True,
        as_tuple: bool = False
) -> Union[
    Generator[str, None, None],
    Generator[Tuple[str, str], None, None]
]:
    """Lazily find the files under a directory whose name matches a regexp.

    The directory check runs immediately when the function is called; the
    matching itself happens only as the returned generator is consumed.

    Args:
        path: path to the directory to start from
        filename_pattern: regexp that the WHOLE file name must match
            (it is applied with ``fullmatch``)
        recursive: search the entire directory tree if True, otherwise only
            the files directly inside ``path``
        as_tuple: yield (folder, filename) tuples if True, else 'folder/filename'

    Returns:
        A generator of matching files, as joined paths or (folder, filename)
        tuples depending on ``as_tuple``. Directories are never yielded.

    Raises:
        AssertionError: if ``path`` is not a directory or is not readable.
    """
    assert os.path.isdir(path) and os.access(path, os.R_OK), \
        f"directory [{path}] is not a directory or not readable."

    if recursive:
        walk_entries = os.walk(path)
    else:
        # Only the first os.walk entry, i.e. the top-level directory itself.
        walk_entries = [next(os.walk(path))]

    def _matching_files():
        for folder, _subfolders, filenames in walk_entries:
            for filename in filenames:
                if not re.fullmatch(pattern=filename_pattern, string=filename):
                    continue
                if as_tuple:
                    yield folder, filename
                else:
                    yield os.path.join(folder, filename)

    return _matching_files()


In [13]:
list(find_files_in_directory(path=PATH_TO_DIR, filename_pattern=r'^\w+\.(txt|log)', as_tuple=True))

[('./path/to/dir', 'grandkid2.txt'), ('./path/to/dir', 'grandkid1.txt')]

In [14]:
%%timeit
list(find_files_in_directory(path=PATH_TO_DIR, filename_pattern=r'^\w+\.(txt|log)', as_tuple=True))

32.1 µs ± 440 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## glob.glob

glob **cannot** match multiple patterns in a single call, and it cannot distinguish files from directories: if ```a.txt``` is a directory, it gets returned as well.


In [15]:
def find_files_in_directory_tree(directory: str, pattern='*.txt') -> List[str]:
    """Glob the directory tree for entries whose name matches ``pattern``.

    No validation is performed on ``directory``. The search uses
    ``glob.glob`` recursively, which matches on names only, so a directory
    whose name matches the pattern is returned together with the files.

    Args:
        directory: target directory
        pattern: glob filename pattern (do not include '**')

    Returns:
        List of matching paths found under ``directory`` at any depth.
    """
    # Builds '<directory>/**/<pattern>'; '**' spans any number of sub-directories.
    search_expression = os.path.sep.join((directory, '**', pattern))
    matches = glob.glob(search_expression, recursive=True)
    return matches

In [16]:
# glob would also return a directory named like "nofile.txt"; none appears here because it is not under PATH_TO_DIR
find_files_in_directory_tree(directory=PATH_TO_DIR, pattern="*.txt")

['./path/to/dir/grandkid2.txt', './path/to/dir/grandkid1.txt']

In [17]:
%%timeit
find_files_in_directory_tree(directory=PATH_TO_DIR, pattern="*.txt")
find_files_in_directory_tree(directory=PATH_TO_DIR, pattern="*.log")

56.3 µs ± 587 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [18]:
%%timeit
list(glob.glob(os.path.sep.join([PATH_TO_DIR, '**', '*.txt']), recursive=True))
list(glob.glob(os.path.sep.join([PATH_TO_DIR, '**', '*.log']), recursive=True))

55.9 µs ± 293 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## pathlib.Path.rglob

* [Path.rglob(pattern, *, case_sensitive=None)](https://docs.python.org/3/library/pathlib.html#pathlib.Path.rglob)

In [19]:
def search_files_in_directory_tree(directory: str, pattern='*.txt') -> Iterator[pathlib.Path]:
    """Lazily search the directory tree for entries whose name matches ``pattern``.

    This is a thin wrapper over ``pathlib.Path(directory).rglob(pattern)``.
    No validation is performed on ``directory``. Matching is by name only, so
    a directory whose name matches the pattern is yielded together with files.

    Args:
        directory: target directory
        pattern: glob filename pattern (do not include '**')

    Returns:
        A lazy iterator of ``pathlib.Path`` objects (not a list of strings);
        wrap the call in ``list(...)`` to materialise the matches.
    """
    return pathlib.Path(directory).rglob(pattern)

In [20]:
# rglob would also return a directory named like "nofile.txt"; none appears here because it is not under PATH_TO_DIR
list(search_files_in_directory_tree(directory=PATH_TO_DIR, pattern="*.txt"))

[PosixPath('path/to/dir/grandkid2.txt'),
 PosixPath('path/to/dir/grandkid1.txt')]

In [21]:
%%timeit
list(search_files_in_directory_tree(directory=PATH_TO_DIR, pattern="*.txt"))
list(search_files_in_directory_tree(directory=PATH_TO_DIR, pattern="*.log"))

62.2 µs ± 1.87 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [22]:
%%timeit
list(pathlib.Path(PATH_TO_DIR).rglob('*.log'))
list(pathlib.Path(PATH_TO_DIR).rglob('*.txt'))

61.5 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [23]:
next(os.walk("./path/to/dir"))[2]

['grandkid2', 'grandkid1', 'grandkid2.txt', 'grandkid1.txt']

In [24]:
pathlib.Path(PATH_TO_DIR).rglob('*.log')


<generator object Path.rglob at 0x1041b9040>

In [25]:
"/a/b/c/".rstrip('/').lstrip('/a')

'b/c'