# Pathlib

## Object oriented Pythonic paths
aka
"The Right Way to do Paths"

https://docs.python.org/3/library/pathlib.html

In [None]:
# Pathlib is a standard Python library
# You will usually want to import Path and/or PurePath

import pathlib
from pathlib import Path, PurePath

In [None]:
# Let's do a few more imports for later
import os
import shutil

## What _are_ paths, really?

Depends on the context.  

Is this a path?
It's certainly a URL...
https://autumn-data.com/runs/sm_sir/

How about just this bit?  
sm_sir/malaysia/1634732134/ec182f

Paths are abstract representations of a nested, tree-like structure  
Such things include filesystems, bits of web addresses, certain representations of S3 etc etc... 

Pathlib deals with all of these things.  It also deals with the parts that are specifically about filesystems, but keeps a clear distinction between abstract paths and real filesystem paths (for good reasons).

## Filesystems
### Pathlib vs os.* vs string manipulation

In [None]:
cwd = Path()

In [None]:
cwd

In [None]:
# Listdir
os.listdir(cwd)

In [None]:
cwd.glob('*')

In [None]:
list(cwd.glob('*'))

In [None]:
# Making glob a little easier to work with; the autumn display module contains
# some hooks for Jupyter notebooks

from autumn.tools.utils import display

In [None]:
cwd.glob('*')

In [None]:
cd ..

In [None]:
cwd.glob('*')

In [None]:
cd workshops/

In [None]:
cwd = cwd.absolute()
cwd

In [None]:
# This will work, but is a bad idea in most cases

(cwd / '..')

In [None]:
# Better

cwd.parent

In [None]:
# Like str.split, but better and safer...
cwd.parts

In [None]:
some_file = Path("path/to/file.txt")
tgz_file = Path("file.tar.gz")

In [None]:
# Strings are the worst...

In [None]:
str(some_file).split('.')[0], str(some_file).split('.')[-1]

In [None]:
just_filename = str(some_file).split('/')[-1]
just_filename.split('.')[0], just_filename.split('.')[-1]

In [None]:
str(tgz_file).split('.')[0], str(tgz_file).split('.')[-1]

In [None]:
# os.path is ... okish...

In [None]:
os.path.splitext(some_file)

In [None]:
os.path.splitext(tgz_file)

In [None]:
# Pathlib

some_file.stem, some_file.suffix

In [None]:
tgz_file.stem, tgz_file.suffix

In [None]:
tgz_file.suffixes

## A note on being a good programmer...

The string examples above tell us something: we all make assumptions that turn into heuristics, and they work well - until they don't.

### It's not about being a "rockstar"... more like a Buddhist monk with a management job

Learn how to delegate - use system libraries! Someone else has thought about this a lot more than you ever will (or will ever want to)
### ...although having a bit of London cabbie helps - read the documentation! Drive the roads! (well, use the library until you don't need to look up the documentation...)

## File handling ergonomics...

In [None]:
test_path = cwd / "test_files"

In [None]:
test_path.mkdir(exist_ok=True)

In [None]:
# Create file0.txt ... file4.txt inside test_path, each holding one line of text
for i in range(5):
    target = test_path / f"file{i}.txt"
    target.write_text(f"Some example file contents for file {i}")

In [None]:
# Map each .txt file found exactly one directory below cwd to its text contents
contents_map = {
    txt_file: txt_file.read_text()
    for txt_file in cwd.glob("*/*.txt")
}

In [None]:
contents_map

In [None]:
# Bonus Python 3.8 syntax - the Walrus operator :=
{f: contents for f in cwd.glob("*/*.txt") if "3" in (contents := f.read_text())}

In [None]:
# We also have access to real properties of the filesystem - like file size etc

f = test_path / "file1.txt"
f.lstat()

In [None]:
test_path.rmdir()

In [None]:
os.rmdir(test_path)

In [None]:
# Still need to use shutil - same as it ever was
# https://docs.python.org/3/library/shutil.html

shutil.rmtree(test_path)

In [None]:
test_path

In [None]:
test_path.glob('*')

In [None]:
test_path.exists()

## Writing functions, calling functions

In [None]:
def do_something1(path_to_file):
    """Report whether ``path_to_file`` exists, using the ``os.path`` API.

    The argument is handed straight to ``os.path.exists``, so both plain
    strings and ``Path`` objects are accepted.
    """
    found = os.path.exists(path_to_file)
    return found

In [None]:
def do_something2(path_to_file):
    """Report whether ``path_to_file`` exists by asking the object itself.

    The argument must provide an ``exists()`` method (as ``pathlib.Path``
    does); nothing is converted, so a plain ``str`` is not accepted.
    """
    path_exists = path_to_file.exists()
    return path_exists

In [None]:
do_something1(test_path)

In [None]:
do_something2(test_path)

In [None]:
a_file = "this is not a file"

In [None]:
do_something1(a_file)

In [None]:
do_something2(a_file)

### Use type annotations!  (You should be doing this anyway)

In [None]:
def do_something3(path_to_file: Path) -> bool:
    """Return True when ``path_to_file`` exists.

    The ``Path`` annotation tells callers what this function expects;
    the argument is neither checked nor converted at runtime.
    """
    is_present = path_to_file.exists()
    return is_present

In [None]:
# This will fail - but it's the user's fault now (in the nicest possible way...)

do_something3(a_file)

In [None]:
do_something3(Path(a_file))

In [None]:
from typing import Union

In [None]:
def do_something4(path_to_file: Union[Path, str]) -> bool:
    """Return True when ``path_to_file`` exists on the filesystem.

    Args:
        path_to_file: The location to check, given as a ``Path`` or a ``str``.

    Returns:
        True if the path exists, False otherwise.
    """
    # Path() accepts a str, an existing Path, or any other os.PathLike object,
    # so convert unconditionally rather than special-casing str
    return Path(path_to_file).exists()

In [None]:
do_something4(a_file), do_something4(Path(a_file)), 

In [None]:
# Bonus Python 3.10 version...

PathOrStr = Path|str

def do_something5(path_to_file: PathOrStr) -> bool:
    # Now we handle both cases
    path_to_file = Path(path_to_file) if isinstance(path_to_file, str) else path_to_file
    return path_to_file.exists()

In [None]:
do_something5

## PurePaths

PurePaths are 'pure' in that they are  
  

a) Abstract representations unencumbered by the weight of the real world...  
b) Functionally pure (ie they can't produce side effects)  

In [None]:
from pathlib import PurePath, PurePosixPath, PureWindowsPath

In [None]:
# Use PurePath directly if you want abstract paths in the flavour of the system you're running on...

pure_test = PurePath(test_path)
pure_test

In [None]:
pure_test.mkdir()

In [None]:
# Specify the path type if you have a particular filesystem in mind...

win_path = PureWindowsPath(test_path)
win_path

In [None]:
str(win_path)

In [None]:
str("C:" / win_path / "MicroSoft Style Folder Name (95)")

In [None]:
s3_bucket = PurePosixPath("autumn-data")

In [None]:
import s3fs

In [None]:
fs = s3fs.S3FileSystem()

In [None]:
fs.ls(s3_bucket)

In [None]:
fs.ls(s3_bucket / "sm_sir" / "malaysia")

In [None]:
# If you were just using 'Path' on a Windows system - 
# you'd have a WindowsPath object, and this would happen...
# That's why we use PurePosixPath - because the system we're talking to is Posix-like

fs.ls(PureWindowsPath(s3_bucket) / "sm_sir")

In [None]:
# Bonus round - glob is awesome

fs.glob(str(s3_bucket / "*" / "malaysia"))