In [None]:
import os
import logging
import pandas as pd
from pathlib import Path

In [None]:
# Comma-separated log records: timestamp, level, module, file name, line number, message.
_LOG_FORMAT = '%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s'

# Configure the root logger at DEBUG and keep a handle to it for the rest of the notebook.
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)
logger = logging.getLogger()

In [None]:
# Path of the text file that lists the parquet objects to process, one key per line.
# Alternative input list (currently disabled): FILES_TO_FIX = "files.txt"
FILES_TO_FIX: str = "missing_files.txt"

In [None]:
# Load the object keys, one per line. Blank lines are dropped -- including the
# empty entry that a trailing newline would otherwise produce -- so that an
# empty key is never handed to handle_file. `files` is bound once, as a list.
files = [
    line.strip()
    for line in Path(FILES_TO_FIX).read_text().splitlines()
    if line.strip()
]
logger.info(f"there {len(files)} files")

In [None]:
# Display the first entry of the list as a quick sanity check of the key format.
files[0]

In [None]:
%%time
import time
import multiprocessing as mp
def handle_file(params):
    """Re-cast selected columns of one parquet object and write it under ``reddit-project/``.

    The object is read from ``s3://bigdatateaching/<f>``, the columns
    ``edited`` (to float), ``created_utc`` (to int) and ``retrieved_on``
    (to int) are converted when present, and the result is written with zstd
    compression to ``s3://bigdatateaching/reddit-project/<f>``.

    Args:
        params: A ``(f, idx, num_files)`` tuple. ``f`` is the object key
            relative to the bucket; ``idx`` and ``num_files`` are used only
            in the progress log messages.

    Returns:
        None. The outcome is the parquet object written to the destination path.
    """
    f, idx, num_files = params
    started = time.perf_counter()

    s3_path = f"s3://bigdatateaching/{f}"
    s3_dest_path = f"s3://bigdatateaching/reddit-project/{f}"
    logger.info(f"{idx} of {num_files}, start for {s3_path}")

    df = pd.read_parquet(s3_path)

    # Every column is optional; only those present in this file are converted.
    for column, dtype in (("edited", float), ("created_utc", int), ("retrieved_on", int)):
        if column in df.columns:
            df[column] = df[column].astype(dtype)

    df.to_parquet(s3_dest_path, compression='zstd')

    tt = time.perf_counter() - started
    logger.info(f"{idx} of {num_files}, finished handling for {s3_path}, written to {s3_dest_path}, time taken = {tt}")

# Wall-clock start of the whole run; read again after the pool finishes.
overall_st = time.perf_counter()
num_files = len(files)
# Number of worker processes for the pool created further down in this cell.
# No pool is created here: an extra mp.Pool at this point would start worker
# processes that are never used and never closed.
processes = 1


def exclude_file(f):
    """Return True when ``f`` contains the name of any file on the exclusion list.

    Matching is by substring, so ``f`` may carry a directory prefix in front
    of the listed file name.
    """
    excluded_names = (
        # 2023-07
        "comments_RC_2023-07.zst_74.parquet",
        "comments_RC_2023-07.zst_75.parquet",
        # 2023-11
        "comments_RC_2023-11.zst_5.parquet",
        "comments_RC_2023-11.zst_47.parquet",
        "comments_RC_2023-11.zst_48.parquet",
        "comments_RC_2023-11.zst_65.parquet",
        "comments_RC_2023-11.zst_163.parquet",
        "comments_RC_2023-11.zst_222.parquet",
        "comments_RC_2023-11.zst_223.parquet",
        # 2024-02
        "comments_RC_2024-02.zst_197.parquet",
        # 2024-05
        "comments_RC_2024-05.zst_200.parquet",
        "comments_RC_2024-05.zst_251.parquet",
    )
    return any(name in f for name in excluded_names)
# Pair each non-excluded file with its 1-based position in `files` and the
# total count; handle_file uses the last two only for its progress messages.
file_tuples = [(f, i+1, num_files) for i, f in enumerate(files) if not exclude_file(f)]
logger.info(f"there are {len(file_tuples)} to handle")

# map() blocks until every file has been processed; the context manager then
# shuts the pool down.
with mp.Pool(processes) as p:
    result = p.map(handle_file, file_tuples)

# Measure from `overall_st` (set before the pool started). `st` and `tt` are
# locals of handle_file and are not defined at this level.
overall_total_time = time.perf_counter() - overall_st
# Report the number of files actually dispatched, which excludes skipped files.
logger.info(f"{len(file_tuples)} files handled, overall time taken = {overall_total_time}")

    
