
Commit

scraper pipeline halfway there
EricR401S committed May 8, 2024
1 parent 42c4e01 commit a3f58e7
Showing 13,351 changed files with 8,214 additions and 17,654 deletions.
Note: this diff is too large to display in full; only the first 3,000 changed files were loaded.
2 changes: 1 addition & 1 deletion app_archive/app.py
@@ -2,7 +2,7 @@
 import numpy as np
 from PIL import Image
 import time
-from pipeline import scrape_archetypes
+from scraping_functions.pipeline import scrape_archetypes
 import pandas as pd
 import os
 # import requests
30 changes: 20 additions & 10 deletions experiment_scraper.py
@@ -1,6 +1,7 @@
 from scraping_functions.pipeline import scrape_archetypes, dataset_cleaner
 from scraping_functions.card_sampler import build_sample
 import os
+import pandas as pd
 
 if __name__ == "__main__":
 
@@ -10,14 +11,23 @@
     {"archetype" : "All", "img_path" : "training_data_final/sampled_training_images", "csv_path" : "training_data_final", "csv_name" : "sampled_all_training_cards.csv"}]
 
     for archetypes in experiment_archetypes:
-        scrape_archetypes(archetypes["archetype"], data_path=archetypes["img_path"], csv_path=archetypes["csv_path"], csv_name=archetypes["csv_name"])
-        print("Scraping for", archetypes["archetype"], "completed!")
-        print("Starting to clean " + archetypes["archetype"])
-        dataset_cleaner(dataset_path = archetypes["img_path"], csv_path = archetypes["csv_path"] + os.path.sep + archetypes["csv_name"])
-
-        # if archetypes["archetype"].lower() == "all":
-        #     img_path = "training_data_final/training_images"
-        #     csv_path = "training_data_final/all_training_cards.csv"
-        #     build_sample(img_path, csv_path, random_state=42)
-        #     print("Sampled data created!")
+        # scrape the archetype and remove the bad images/records.
+        archetype = archetypes["archetype"]
+        img_path = archetypes["img_path"]
+        csv_path = archetypes["csv_path"]
+        csv_name = archetypes["csv_name"]
+        scrape_archetypes(archetype, data_path=img_path, csv_path=csv_path, csv_name=csv_name)
+        print("Scraping for", archetype, "completed!")
+        print("Starting to clean " + archetype)
+        dataset_cleaner(dataset_path = img_path, csv_path = csv_path + os.path.sep + csv_name)
+
+        if archetypes["archetype"].lower() == "all":
+            # Sample from the larger dataset, but only half of it.
+            img_path = archetypes["img_path"]
+            csv_path = archetypes["csv_path"] + os.path.sep + archetypes["csv_name"]
+            df = pd.read_csv(csv_path)
+            print("The dataset contains {} images".format(len(os.listdir(img_path))))
+            print("The csv has ", df.shape)
+            print("Creating sampled data")
+            build_sample(img_path, csv_path, random_state=42)
+            print("Sampled data created!")
14 changes: 14 additions & 0 deletions final_todos.txt
@@ -0,0 +1,14 @@
+remove training_data_final_original
+remove apikey
+remove scraping scripts
+remove test_all_scraper, test_scraper
+remove test ipynbs
+
+pipeline
+scrape data
+prepare subsets for GAN
+prepare the huggingface pushing
+
+
+app pushing
+
18 changes: 8 additions & 10 deletions hf_pusher.py
@@ -3,15 +3,13 @@
 from datasets import load_dataset
 
 
-for a_dataset in ["darkmagician", "blueeyes", "elementalhero", "all"]:
-    # imagefolder as a name cleared out the errors for us
-    test_dataset = load_dataset("imagefolder", data_dir=f"training_data_final/{a_dataset}_data")
+if __name__ == "__main__":
 
-    # modify with your directory
-    test_dataset.push_to_hub(f"steamcyclone/{a_dataset}_data", private=True)
+    huggingface_username = "steamcyclone"
 
-# # imagefolder as a name cleared out the errors for us
-# test_dataset = load_dataset("imagefolder", data_dir="training_data_final/darkmagician_data")
-
-# # modify with your directory
-# test_dataset.push_to_hub("steamcyclone/darkmagician", private=True)
+    for a_dataset in ["darkmagician", "blueeyes", "elementalhero", "all"]:
+        # imagefolder as a name cleared out the errors for us
+        test_dataset = load_dataset("imagefolder", data_dir=f"training_data_final/{a_dataset}_data")
+
+        # modify with your directory
+        test_dataset.push_to_hub(f"{huggingface_username}/{a_dataset}_data", private=True)
Empty file removed lib/lib.py
17 changes: 0 additions & 17 deletions magician_scraper.py

This file was deleted.

18 changes: 6 additions & 12 deletions card_sampler.py → scraping_functions/card_sampler.py
@@ -1,4 +1,4 @@
-""""""
+"""Functions to help sample all the cards to comply with Fair Use Policy."""
 import pandas as pd
 import os
 
@@ -19,7 +19,7 @@ def classifier_type(card):
     elif "monster" in card.lower():
         return "monster"
     else:
-        return "unknown"
+        return "unknown"
 
 def sample_from_cards_csv(csv_path, random_state=42):
     """Samples from a csv file with all cards.
@@ -32,13 +32,12 @@ def sample_from_cards_csv(csv_path, random_state=42):
     # Step 1: Group your data by the strata you want to sample from
     df = pd.read_csv(csv_path)
 
-    df["simplified_type"] = df["type"].apply(classifier_type)
-
-    groups = df.groupby('simplified_type')
+    groups = df.groupby('type')
 
     # Step 2: Define a sampling function
     def stratified_sample(group):
-        return group.sample(frac=0.5, replace=False, random_state = random_state) # You can adjust the fraction and other parameters as needed
+        # You can adjust the fraction and other parameters as needed
+        return group.sample(frac=0.5, replace=False, random_state = random_state)
 
     # Step 3: Apply the sampling function to each group
     sampled_groups = groups.apply(stratified_sample)
@@ -72,17 +71,12 @@ def build_sample(img_path, csv_path, random_state=42):
     for image in os.listdir(img_path):
         if image not in allowed_images:
             os.remove(os.path.join(img_path, image))
-            # if image in allowed_images:
-            #     copy image to new folder
-            #     os.rename(os.path.join(img_path, image), os.path.join(new_image_path, image))
-            # os.remove(os.path.join(img_path, image))
-    # sampled_csv.rename(columns={"image_id" : "file_name"}, inplace=True)
 
     sampled_csv.to_csv(csv_path, index=False)
 
     return
 
-
-
 if __name__ == "__main__":
     img_path = "training_data_final/training_images"
     csv_path = "training_data_final/all_training_cards.csv"
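The groupby change above is the heart of the Fair Use reduction: group the card list by type, then keep half of each group so the type mix survives the downsampling. A self-contained sketch of the same pattern (toy data, mirroring the committed code):

import pandas as pd

df = pd.DataFrame({
    "image_id": ["a.jpg", "b.jpg", "c.jpg", "d.jpg"],
    "type": ["monster", "monster", "spell", "spell"],
})

# keep 50% of every type, without replacement, reproducibly
sampled = df.groupby("type").apply(
    lambda g: g.sample(frac=0.5, replace=False, random_state=42)
).reset_index(drop=True)
print(sampled)  # one monster row and one spell row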
43 changes: 37 additions & 6 deletions pipeline.py → scraping_functions/pipeline.py
@@ -1,5 +1,3 @@
-from lib.scraper_scripts.card_image_scraper import *
-from lib.scraper_scripts.card_info_extractor import *
 import requests
 import json
 import csv
@@ -9,7 +7,7 @@
 import pandas as pd
 import cv2
 
-
+# Processes the input of archetypes
 def process_archetype_input(archetypes):
     """Processes the input of archetypes.
     Args:
@@ -24,7 +22,7 @@ def process_archetype_input(archetypes):
     archetype_set = set(archetype_list)
     return archetype_set
 
-
+# Searches for the archetype in the card name or description
 def search_archetype(card_from_request, archetype_set):
     """Returns True if the card contains the archetype in
     its name or description.
@@ -42,7 +40,7 @@ def search_archetype(card_from_request, archetype_set):
             return True
     return False
 
-
+# Get card data from the api
 def extract_info(json_data, archetypes):
     """Extracts the card information from the JSON data.
     Args:
@@ -83,7 +81,29 @@ def extract_info(json_data, archetypes):
         card_info.append(copy_card_dict)
     return card_info
 
-
+# Draws the download progress bar for images
+def draw_loader(file, total, count):
+    """
+    Draws a download progress bar.
+    Args:
+        file (str): The name of the file being downloaded.
+        total (int): Total number of files to download.
+        count (int): Number of files already downloaded.
+    """
+    os.system('cls' if os.name == 'nt' else 'clear')
+    print('Downloading images\n')
+    print(file, end='')
+    print(' ' + str(count) + ' of ' + str(total))
+    percent = (count/total) * 100
+    print('[', end='')
+    for i in range(30):
+        if (i/30) * 100 >= percent:
+            print('-', end='')
+        else:
+            print('#', end='')
+    print(']{:.2f}%'.format(percent))
+
+# Downloads the images of the cards to the data path
 def download_images(card_info, data_path="training_images", card_types = ["spell", "trap", "monster", "token"]):
     """Downloads the images of the cards to the data path
     and adds the path to the card information.
@@ -122,7 +142,7 @@ def download_images(card_info, data_path="training_images", card_types = ["spell", "trap", "monster", "token"]):
             pass
     print("Process Finished!")
 
-
+# Scrapes text and images from archetypes
 def scrape_archetypes(archetypes, data_path="training_images", csv_path="training_images",
                       csv_name="training_cards.csv", card_types = ["spell", "trap", "monster", "token"]):
     """
@@ -165,9 +185,20 @@
     except requests.exceptions.RequestException as e:
         print("Error fetching data:", e)
 
+# Removes images and records that cannot be read by cv2
 def dataset_cleaner(dataset_path, csv_path):
+    """
+    Removes images and records that cannot be read by cv2
+    Args:
+        dataset_path (str): The path to the image folder.
+        csv_path (str): The path to the csv file.
+    Returns:
+        None
+    """
     df = pd.read_csv(csv_path)
     erased_images_list = []
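Two quick illustrations of the additions above. First, the progress bar: '#' marks the completed portion and '-' the remainder, so a hypothetical halfway call (values invented) renders like this:

draw_loader("dark_magician.jpg", total=30, count=15)
# clears the terminal, then prints:
# Downloading images
#
# dark_magician.jpg 15 of 30
# [###############---------------]50.00%

Second, the body of dataset_cleaner is cut off by the truncated diff; per its docstring it drops whatever cv2 cannot read. The core check presumably resembles this sketch (an assumption based on the docstring, not the committed code):

import cv2

def is_readable(image_path):
    # cv2.imread returns None for corrupt or unsupported files
    return cv2.imread(image_path) is not None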
