From b06cdf4fba2100901951e978b9bedb239dd0d4f5 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 29 May 2024 10:00:34 -0700 Subject: [PATCH 1/3] nit --- scripts/create-config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/create-config.py b/scripts/create-config.py index 66be0f1..a073609 100644 --- a/scripts/create-config.py +++ b/scripts/create-config.py @@ -253,6 +253,9 @@ def parse_markdown_to_dict(md_content, filename): ): text = prep_for_tts(text) print(f"Rewrote index {total_index} with AI for TTS formatting.") + # sometimes has bugs with commas, + if " , " in text: + text = text.replace(" , ", ", ") # remove :, -, and leading space from text text = text.replace(":", ",") From 354d5d74b66557ca4898a55212fd52e892d15ec5 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 29 May 2024 11:20:59 -0700 Subject: [PATCH 2/3] add start idx to image generate --- scripts/ttv-generate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/ttv-generate.py b/scripts/ttv-generate.py index 53b1bd6..84f4558 100644 --- a/scripts/ttv-generate.py +++ b/scripts/ttv-generate.py @@ -115,6 +115,7 @@ def get_image(idx, inputs, vivid=True, hd=True, rewrite=True, no_sleep=False): parser = argparse.ArgumentParser() parser.add_argument("--input", type=str, required=True, help="input text file dir") parser.add_argument("--do_not_gen", action="store_true", default=False, help="only download images") + parser.add_argument("--start_idx", type=int, default=0, help="start index for generation") args = parser.parse_args() # load yml file at args.input + config.yml @@ -167,7 +168,8 @@ def get_image(idx, inputs, vivid=True, hd=True, rewrite=True, no_sleep=False): # if --do_not_gen, do not do this if not args.do_not_gen: with Pool(processes=3) as pool: - pool.starmap(get_image, enumerate(zip(prompts, title))) + # enumerate based on start index + pool.starmap(get_image, enumerate(zip(prompts, title), start=args.start_idx)) # move all images from temp-images to args.input/images os.system(f"mv temp-images/* {args.input}images") From 78b9c4bb64fee7c5a974000c2df7fca1df66eb89 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 29 May 2024 11:47:48 -0700 Subject: [PATCH 3/3] up --- scripts/create-config.py | 4 ++-- scripts/list-artifacts.py | 37 +++++++++++++++++++++++++------------ scripts/ttv-generate.py | 4 +++- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/scripts/create-config.py b/scripts/create-config.py index a073609..d5600a3 100644 --- a/scripts/create-config.py +++ b/scripts/create-config.py @@ -23,7 +23,7 @@ ">": "", "**": "", "*": "", - "~": "approximately ", # for numbers + "~": "approximately ", # for numbers " | ": " ", "\\ ": " ", "e.g.": "e g", @@ -259,7 +259,7 @@ def parse_markdown_to_dict(md_content, filename): # remove :, -, and leading space from text text = text.replace(":", ",") - text = text.replace("--", ",") # simpler pause + text = text.replace("--", ",") # simpler pause if text.startswith(" "): text = text[1:] diff --git a/scripts/list-artifacts.py b/scripts/list-artifacts.py index c3f81a6..5217132 100644 --- a/scripts/list-artifacts.py +++ b/scripts/list-artifacts.py @@ -1,26 +1,28 @@ import argparse + from huggingface_hub import get_collection + def process_collection(collection_name, index, print_idx=False): collection = get_collection(collection_name) if index < 0 or index >= len(collection.items): return f"Error: invalid index: {index} for length of collection: {len(collection.items)}" - + markdown_list = f"\n# Artifacts Log N\n\n" - categories = {'model': [], 'dataset': [], 'Space': []} + categories = {"model": [], "dataset": [], "Space": []} for idx, item in enumerate(collection.items[index:]): - author, model_name = item.item_id.split('/') - if item.item_type == 'model': + author, model_name = item.item_id.split("/") + if item.item_type == "model": model_link = f"https://huggingface.co/{item.item_id}" else: model_link = f"https://huggingface.co/{item.item_type}s/{item.item_id}" entry = f"- **[{model_name}]({model_link})** by [{author}](https://huggingface.co/{author}): TODO\n" - + if print_idx: entry = f"- {idx + index} [{model_name}]({model_link}) by {author}\n" - + if item.item_type in categories: categories[item.item_type].append(entry) @@ -33,16 +35,27 @@ def process_collection(collection_name, index, print_idx=False): markdown_list += "\n References: ([2024 artifacts](https://huggingface.co/collections/natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988), [2023 artifacts](https://huggingface.co/collections/natolambert/2023-interconnects-artifacts-661b19d27082ad0b43d67b17), [MMLU vs training compute model](https://docs.google.com/spreadsheets/d/13LMlSGQQ3_qxbjIcEkgqofr2Ay1JT0XEH4S-AWQi8so/edit?usp=sharing)) \n" return markdown_list + def main(): - parser = argparse.ArgumentParser(description='Process a Hugging Face collection into a Markdown list.') - parser.add_argument('collection_name', nargs='?', default='natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988', - help='The name of the Hugging Face collection (default: natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988)') - parser.add_argument('--index', type=int, default=0, help='The start index of the collection list (to take the most recent elements)') - parser.add_argument('--print_idx', action='store_true', help='Print the index of the collection list') + parser = argparse.ArgumentParser(description="Process a Hugging Face collection into a Markdown list.") + parser.add_argument( + "collection_name", + nargs="?", + default="natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988", + help="The name of the Hugging Face collection (default: natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988)", + ) + parser.add_argument( + "--index", + type=int, + default=0, + help="The start index of the collection list (to take the most recent elements)", + ) + parser.add_argument("--print_idx", action="store_true", help="Print the index of the collection list") args = parser.parse_args() markdown_list = process_collection(args.collection_name, args.index, args.print_idx) print(markdown_list) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/scripts/ttv-generate.py b/scripts/ttv-generate.py index 84f4558..0504dd9 100644 --- a/scripts/ttv-generate.py +++ b/scripts/ttv-generate.py @@ -169,7 +169,9 @@ def get_image(idx, inputs, vivid=True, hd=True, rewrite=True, no_sleep=False): if not args.do_not_gen: with Pool(processes=3) as pool: # enumerate based on start index - pool.starmap(get_image, enumerate(zip(prompts, title), start=args.start_idx)) + pool.starmap( + get_image, enumerate(zip(prompts[args.start_idx :], title[args.start_idx :]), start=args.start_idx) + ) # move all images from temp-images to args.input/images os.system(f"mv temp-images/* {args.input}images")