<a href="https://colab.research.google.com/github/nceder/qpb4e/blob/main/code/Chapter%2020/Chapter_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 20 Basic file wrangling

# 20.2 Scenario: The product feed from hell

In [None]:
# Run this cell first: it creates the empty sample files that the listings below archive.

! touch item_info.txt item_attributes.txt related_items.txt

In [None]:
import pathlib
# Pattern selecting the sample files, and the directory to search (the current one).
FILE_PATTERN = "*.txt"
cur_path = pathlib.Path(".")

# glob() hands back an iterator, so unpack it into a list to display the matches.
path_list = cur_path.glob(FILE_PATTERN)
print([*path_list])

[PosixPath('related_items.txt'), PosixPath('item_attributes.txt'), PosixPath('item_info.txt')]


### Listing 20.1 File files_01.py


In [None]:
# Listing 20.1 File files_01.py

import datetime
import pathlib

FILE_PATTERN = "*.txt"             #A glob pattern selecting the files to archive
ARCHIVE = "archive"                # name of the destination directory


def main():
    """Move each file matching FILE_PATTERN into ARCHIVE with a date-stamped name.

    Works relative to the current directory. A file ``name.ext`` ends up as
    ``ARCHIVE/name_YYYY-MM-DD.ext``, where the date is today's date.
    """
    stamp = f"{datetime.date.today():%Y-%m-%d}"    #B today's date as YYYY-MM-DD

    here = pathlib.Path(".")
    archive_dir = here / ARCHIVE
    archive_dir.mkdir(exist_ok=True)        #C no error if the directory already exists

    for source in here.glob(FILE_PATTERN):
        stamped_name = f"{source.stem}_{stamp}{source.suffix}"
        destination = archive_dir / stamped_name        #D
        source.rename(destination)                      #E move the file into the archive


if __name__ == '__main__':
    main()

# 20.3 More organization

### Listing 20.2 File files_02.py

In [None]:
# Run this before running the cell below:
# delete the files left in the archive directory and recreate the sample files.

! rm archive/*
! touch item_info.txt item_attributes.txt related_items.txt

In [None]:
# Listing 20.2 File files_02.py

import datetime
import pathlib

FILE_PATTERN = "*.txt"     # glob pattern selecting the files to archive
ARCHIVE = "archive"        # top-level archive directory


def main():
    """Move each file matching FILE_PATTERN into a per-day archive subdirectory.

    Works relative to the current directory. Files keep their names and end
    up in ``ARCHIVE/YYYY-MM-DD/``, where the date is today's date.
    """
    here = pathlib.Path(".")

    archive_root = here / ARCHIVE
    archive_root.mkdir(exist_ok=True)                             #A archive directory itself
    dated_dir = archive_root / f"{datetime.date.today():%Y-%m-%d}"
    dated_dir.mkdir(exist_ok=True)            #B today's subdirectory inside it

    for source in here.glob(FILE_PATTERN):
        destination = dated_dir / source.name
        source.rename(destination)


if __name__ == '__main__':
    main()

## 20.4.1 Compressing files

### Listing 20.3 File files_03.py

In [None]:
# Run this before running the cell below:
# empty the archive directory (subdirectories included) and recreate the sample files.

! rm -rf archive/*
! touch item_info.txt item_attributes.txt related_items.txt

In [None]:
# Listing 20.3 File files_03.py

import datetime
import pathlib
import zipfile          #A

FILE_PATTERN = "*.txt"     # glob pattern selecting the files to archive
ARCHIVE = "archive"        # directory that receives the daily zip file


def main():
    """Compress every file matching FILE_PATTERN into ``ARCHIVE/YYYY-MM-DD.zip``.

    Works relative to the current directory. The zip file is named after
    today's date and replaces any existing zip of the same name. The source
    files are deleted only after the archive has been written and closed,
    so a failure while zipping leaves the originals in place.
    """
    date_string = datetime.date.today().strftime("%Y-%m-%d")

    cur_path = pathlib.Path(".")
    archive_path = cur_path.joinpath(ARCHIVE)
    archive_path.mkdir(exist_ok=True)

    # Take a snapshot of the matches: the same files are iterated twice
    # (once to zip, once to delete).
    paths = list(cur_path.glob(FILE_PATTERN))

    zip_file_path = archive_path.joinpath(date_string + ".zip")   #B

    # The context manager guarantees the archive is finalized and closed.
    # ZIP_DEFLATED actually compresses the members; the default (ZIP_STORED)
    # would only store them uncompressed.
    with zipfile.ZipFile(zip_file_path, "w",
                         compression=zipfile.ZIP_DEFLATED) as zip_file:       #C
        for path in paths:
            zip_file.write(path)                                 #D

    # Remove the originals only once the archive is safely on disk.
    for path in paths:
        path.unlink()             #E


if __name__ == '__main__':
    main()

## 20.4.2 Grooming files

### Listing 20.4 File files_04.py

In [None]:
# run this before running cell below
# create zip files in archive directory
from datetime import datetime, timedelta


def populate_archive(zip_file_path, current_date):
    """Create placeholder ``YYYY-MM-DD.zip`` files for trying out archive grooming.

    One file is written for each of the days 30 through 39 before
    ``current_date``. Each file contains the text "Test"; they are not real
    zip archives, only names to groom.

    Args:
        zip_file_path: pathlib.Path of the directory to populate; it is
            created if it does not exist yet.
        current_date: date or datetime that the day offsets are counted
            back from.
    """
    # Make the function usable even when the archive directory is missing.
    zip_file_path.mkdir(parents=True, exist_ok=True)
    for days in range(30, 40):
        zip_date = current_date - timedelta(days=days)
        new_zip_path = zip_file_path.joinpath(f"{zip_date.strftime('%Y-%m-%d')}.zip")
        new_zip_path.write_text("Test")

# Fill the archive directory with placeholder zip files dated relative to now.
current_date = datetime.today()
cur_path = pathlib.Path(".")
zip_file_path = cur_path / ARCHIVE
populate_archive(zip_file_path, current_date)

In [None]:
# Listing 20.4 File files_04.py

from datetime import datetime, timedelta
import pathlib
import zipfile

FILE_PATTERN = "*.zip"     # glob pattern selecting the archives to groom
ARCHIVE = "archive"        # directory holding the daily zip files
ARCHIVE_WEEKDAY = 1        # datetime.weekday() value to keep; Monday is 0, so 1 is Tuesday


def main():
    """Groom ARCHIVE by deleting old daily zip files, keeping one weekday's archives.

    Works relative to the current directory. Each ``*.zip`` file in ARCHIVE
    is expected to be named ``YYYY-MM-DD.zip``. A file is deleted when its
    date lies more than 30 days before the current moment and does not fall
    on ARCHIVE_WEEKDAY. Zip files whose names are not dates are left
    untouched instead of aborting the run.
    """
    cur_path = pathlib.Path(".")
    zip_file_path = cur_path.joinpath(ARCHIVE)

    paths = zip_file_path.glob(FILE_PATTERN)
    current_date = datetime.today()    #A
    max_age = timedelta(days=30)

    for path in paths:
        name = path.stem              #B
        try:
            path_date = datetime.strptime(name, "%Y-%m-%d")     #C
        except ValueError:
            # Not a date-named archive: its age is unknown, so keep it and
            # carry on grooming the remaining files.
            continue
        path_timedelta = current_date - path_date          #D
        if (path_timedelta > max_age
                and path_date.weekday() != ARCHIVE_WEEKDAY):    #E
            path.unlink()


if __name__ == '__main__':
    main()