natolambert · natolambert · May 29, 2024 · May 29, 2024 · May 29, 2024
diff --git a/scripts/create-config.py b/scripts/create-config.py
@@ -23,7 +23,7 @@
     ">": "",
     "**": "",
     "*": "",
-    "~": "approximately ", # for numbers
+    "~": "approximately ",  # for numbers
     " | ": " ",
     "\\ ": " ",
     "e.g.": "e g",
@@ -253,10 +253,13 @@ def parse_markdown_to_dict(md_content, filename):
                 ):
                     text = prep_for_tts(text)
                     print(f"Rewrote index {total_index} with AI for TTS formatting.")
+                    # the AI rewrite sometimes leaves a space before a comma (" , "); collapse it to ", "
+                    if " , " in text:
+                        text = text.replace(" , ", ", ")
 
                 # remove :, -, and leading space from text
                 text = text.replace(":", ",")
-                text = text.replace("--", ",") # simpler pause
+                text = text.replace("--", ",")  # simpler pause
                 if text.startswith(" "):
                     text = text[1:]
 

diff --git a/scripts/list-artifacts.py b/scripts/list-artifacts.py
@@ -1,26 +1,28 @@
 import argparse
+
 from huggingface_hub import get_collection
 
+
 def process_collection(collection_name, index, print_idx=False):
     collection = get_collection(collection_name)
 
     if index < 0 or index >= len(collection.items):
         return f"Error: invalid index: {index} for length of collection: {len(collection.items)}"
-    
+
     markdown_list = f"\n# Artifacts Log N\n\n"
-    categories = {'model': [], 'dataset': [], 'Space': []}
+    categories = {"model": [], "dataset": [], "Space": []}
 
     for idx, item in enumerate(collection.items[index:]):
-        author, model_name = item.item_id.split('/')
-        if item.item_type == 'model':
+        author, model_name = item.item_id.split("/")
+        if item.item_type == "model":
             model_link = f"https://huggingface.co/{item.item_id}"
         else:
             model_link = f"https://huggingface.co/{item.item_type}s/{item.item_id}"
         entry = f"- **[{model_name}]({model_link})** by [{author}](https://huggingface.co/{author}): TODO\n"
-        
+
         if print_idx:
             entry = f"- {idx + index} [{model_name}]({model_link}) by {author}\n"
-        
+
         if item.item_type in categories:
             categories[item.item_type].append(entry)
 
@@ -33,16 +35,27 @@ def process_collection(collection_name, index, print_idx=False):
     markdown_list += "\n References: ([2024 artifacts](https://huggingface.co/collections/natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988), [2023 artifacts](https://huggingface.co/collections/natolambert/2023-interconnects-artifacts-661b19d27082ad0b43d67b17), [MMLU vs training compute model](https://docs.google.com/spreadsheets/d/13LMlSGQQ3_qxbjIcEkgqofr2Ay1JT0XEH4S-AWQi8so/edit?usp=sharing)) \n"
     return markdown_list
 
+
 def main():
-    parser = argparse.ArgumentParser(description='Process a Hugging Face collection into a Markdown list.')
-    parser.add_argument('collection_name', nargs='?', default='natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988',
-                        help='The name of the Hugging Face collection (default: natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988)')
-    parser.add_argument('--index', type=int, default=0, help='The start index of the collection list (to take the most recent elements)')
-    parser.add_argument('--print_idx', action='store_true', help='Print the index of the collection list')
+    parser = argparse.ArgumentParser(description="Process a Hugging Face collection into a Markdown list.")
+    parser.add_argument(
+        "collection_name",
+        nargs="?",
+        default="natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988",
+        help="The name of the Hugging Face collection (default: natolambert/2024-interconnects-artifacts-6619a19e944c1e47024e9988)",
+    )
+    parser.add_argument(
+        "--index",
+        type=int,
+        default=0,
+        help="The start index of the collection list (to take the most recent elements)",
+    )
+    parser.add_argument("--print_idx", action="store_true", help="Print the index of the collection list")
     args = parser.parse_args()
 
     markdown_list = process_collection(args.collection_name, args.index, args.print_idx)
     print(markdown_list)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/scripts/ttv-generate.py b/scripts/ttv-generate.py
@@ -115,6 +115,7 @@ def get_image(idx, inputs, vivid=True, hd=True, rewrite=True, no_sleep=False):
     parser = argparse.ArgumentParser()
     parser.add_argument("--input", type=str, required=True, help="input text file dir")
     parser.add_argument("--do_not_gen", action="store_true", default=False, help="only download images")
+    parser.add_argument("--start_idx", type=int, default=0, help="start index for generation")
     args = parser.parse_args()
 
     # load yml file at args.input + config.yml
@@ -167,7 +168,10 @@ def get_image(idx, inputs, vivid=True, hd=True, rewrite=True, no_sleep=False):
     # if --do_not_gen, do not do this
     if not args.do_not_gen:
         with Pool(processes=3) as pool:
-            pool.starmap(get_image, enumerate(zip(prompts, title)))
+            # skip the first start_idx items, but number from start_idx so indices match the full list
+            pool.starmap(
+                get_image, enumerate(zip(prompts[args.start_idx :], title[args.start_idx :]), start=args.start_idx)
+            )
 
         # move all images from temp-images to args.input/images
         os.system(f"mv temp-images/* {args.input}images")