
Commit

scraper pipeline halfway there
EricR401S committed May 8, 2024
1 parent 42c4e01 commit a3f58e7
Showing 13,351 changed files with 8,214 additions and 17,654 deletions.
Note: this diff is too large to display in full; only the first 3,000 changed files were loaded.
2 changes: 1 addition & 1 deletion app_archive/app.py
@@ -2,7 +2,7 @@
 import numpy as np
 from PIL import Image
 import time
-from pipeline import scrape_archetypes
+from scraping_functions.pipeline import scrape_archetypes
 import pandas as pd
 import os
 # import requests
30 changes: 20 additions & 10 deletions experiment_scraper.py
@@ -1,6 +1,7 @@
 from scraping_functions.pipeline import scrape_archetypes, dataset_cleaner
 from scraping_functions.card_sampler import build_sample
 import os
+import pandas as pd
 
 if __name__ == "__main__":
 
@@ -10,14 +11,23 @@
     {"archetype" : "All", "img_path" : "training_data_final/sampled_training_images", "csv_path" : "training_data_final", "csv_name" : "sampled_all_training_cards.csv"}]
 
     for archetypes in experiment_archetypes:
-        scrape_archetypes(archetypes["archetype"], data_path=archetypes["img_path"], csv_path=archetypes["csv_path"], csv_name=archetypes["csv_name"])
-        print("Scraping for", archetypes["archetype"], "completed!")
-        print("Starting to clean " + archetypes["archetype"])
-        dataset_cleaner(dataset_path = archetypes["img_path"], csv_path = archetypes["csv_path"] + os.path.sep + archetypes["csv_name"])
-
-        # if archetypes["archetype"].lower() == "all":
-        #     img_path = "training_data_final/training_images"
-        #     csv_path = "training_data_final/all_training_cards.csv"
-        #     build_sample(img_path, csv_path, random_state=42)
-        #     print("Sampled data created!")
+        # scrape the archetype and remove the bad images/records.
+        archetype = archetypes["archetype"]
+        img_path = archetypes["img_path"]
+        csv_path = archetypes["csv_path"]
+        csv_name = archetypes["csv_name"]
+        scrape_archetypes(archetype, data_path=img_path, csv_path=csv_path, csv_name=csv_name)
+        print("Scraping for", archetype, "completed!")
+        print("Starting to clean " + archetype)
+        dataset_cleaner(dataset_path = img_path, csv_path = csv_path + os.path.sep + csv_name)
+
+        if archetypes["archetype"].lower() == "all":
+            # Sample from the larger dataset, but only half of it.
+            img_path = archetypes["img_path"]
+            csv_path = archetypes["csv_path"] + os.path.sep + archetypes["csv_name"]
+            df = pd.read_csv(csv_path)
+            print("The dataset contains {} images".format(len(os.listdir(img_path))))
+            print("The csv has ", df.shape)
+            print("Creating sampled data")
+            build_sample(img_path, csv_path, random_state=42)
+            print("Sampled data created!")
14 changes: 14 additions & 0 deletions final_todos.txt
@@ -0,0 +1,14 @@
+remove training_data_final_original
+remove apikey
+remove scraping scripts
+remove test_all_scraper, test_scraper
+remove test ipynbs
+
+pipeline
+scrape data
+prepare subsets for GAN
+prepare the huggingface pushing
+
+
+app pushing
+
18 changes: 8 additions & 10 deletions hf_pusher.py
@@ -3,15 +3,13 @@
 from datasets import load_dataset
 
 
-for a_dataset in ["darkmagician", "blueeyes", "elementalhero", "all"]:
-    # imagefolder as a name cleared out the errors for us
-    test_dataset = load_dataset("imagefolder", data_dir=f"training_data_final/{a_dataset}_data")
+if __name__ == "__main__":
 
-    # modify with your directory
-    test_dataset.push_to_hub(f"steamcyclone/{a_dataset}_data", private=True)
+    huggingface_username = "steamcyclone"
 
-# # imagefolder as a name cleared out the errors for us
-# test_dataset = load_dataset("imagefolder", data_dir="training_data_final/darkmagician_data")
-
-# # modify with your directory
-# test_dataset.push_to_hub("steamcyclone/darkmagician", private=True)
+    for a_dataset in ["darkmagician", "blueeyes", "elementalhero", "all"]:
+        # imagefolder as a name cleared out the errors for us
+        test_dataset = load_dataset("imagefolder", data_dir=f"training_data_final/{a_dataset}_data")
+
+        # modify with your directory
+        test_dataset.push_to_hub(f"{huggingface_username}/{a_dataset}_data", private=True)
Empty file removed lib/lib.py
17 changes: 0 additions & 17 deletions magician_scraper.py

This file was deleted.

18 changes: 6 additions & 12 deletions card_sampler.py → scraping_functions/card_sampler.py
@@ -1,4 +1,4 @@
-""""""
+"""Functions to help sample all the cards to comply with Fair Use Policy."""
 import pandas as pd
 import os
 
@@ -19,7 +19,7 @@ def classifier_type(card):
     elif "monster" in card.lower():
         return "monster"
     else:
-        return "unknown"
+        return "unknown"
 
 def sample_from_cards_csv(csv_path, random_state=42):
     """Samples from a csv file with all cards.
@@ -32,13 +32,12 @@ def sample_from_cards_csv(csv_path, random_state=42):
     # Step 1: Group your data by the strata you want to sample from
     df = pd.read_csv(csv_path)
 
-    df["simplified_type"] = df["type"].apply(classifier_type)
-
-    groups = df.groupby('simplified_type')
+    groups = df.groupby('type')
 
     # Step 2: Define a sampling function
     def stratified_sample(group):
-        return group.sample(frac=0.5, replace=False, random_state = random_state) # You can adjust the fraction and other parameters as needed
+        # You can adjust the fraction and other parameters as needed
+        return group.sample(frac=0.5, replace=False, random_state = random_state)
 
     # Step 3: Apply the sampling function to each group
     sampled_groups = groups.apply(stratified_sample)
@@ -72,17 +71,12 @@ def build_sample(img_path, csv_path, random_state=42):
     for image in os.listdir(img_path):
         if image not in allowed_images:
             os.remove(os.path.join(img_path, image))
-            # if image in allowed_images:
-            #     copy image to new folder
-            #     os.rename(os.path.join(img_path, image), os.path.join(new_image_path, image))
-            # os.remove(os.path.join(img_path, image))
-    # sampled_csv.rename(columns={"image_id" : "file_name"}, inplace=True)
 
     sampled_csv.to_csv(csv_path, index=False)
 
     return
 
-
-
 if __name__ == "__main__":
     img_path = "training_data_final/training_images"
     csv_path = "training_data_final/all_training_cards.csv"
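The groupby change above is the heart of the Fair Use reduction: group the card list by type, then keep half of each group so the type mix survives the downsampling. A self-contained sketch of the same pattern (toy data, mirroring the committed code):

import pandas as pd

df = pd.DataFrame({
    "image_id": ["a.jpg", "b.jpg", "c.jpg", "d.jpg"],
    "type": ["monster", "monster", "spell", "spell"],
})

# keep 50% of every type, without replacement, reproducibly
sampled = df.groupby("type").apply(
    lambda g: g.sample(frac=0.5, replace=False, random_state=42)
).reset_index(drop=True)
print(sampled)  # one monster row and one spell row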
43 changes: 37 additions & 6 deletions pipeline.py → scraping_functions/pipeline.py
@@ -1,5 +1,3 @@
-from lib.scraper_scripts.card_image_scraper import *
-from lib.scraper_scripts.card_info_extractor import *
 import requests
 import json
 import csv
@@ -9,7 +7,7 @@
 import pandas as pd
 import cv2
 
-
+# Processes the input of archetypes
 def process_archetype_input(archetypes):
     """Processes the input of archetypes.
     Args:
@@ -24,7 +22,7 @@ def process_archetype_input(archetypes):
     archetype_set = set(archetype_list)
     return archetype_set
 
-
+# Searches for the archetype in the card name or description
 def search_archetype(card_from_request, archetype_set):
     """Returns True if the card contains the archetype in
     its name or description.
@@ -42,7 +40,7 @@ def search_archetype(card_from_request, archetype_set):
             return True
     return False
 
-
+# Get card data from the api
 def extract_info(json_data, archetypes):
     """Extracts the card information from the JSON data.
     Args:
@@ -83,7 +81,29 @@ def extract_info(json_data, archetypes):
         card_info.append(copy_card_dict)
     return card_info
 
-
+# Draws the download progress bar for images
+def draw_loader(file, total, count):
+    """
+    Draws a download progress bar.
+    Args:
+        file (str): The name of the file being downloaded.
+        total (int): Total number of files to download.
+        count (int): Number of files already downloaded.
+    """
+    os.system('cls' if os.name == 'nt' else 'clear')
+    print('Downloading images\n')
+    print(file, end='')
+    print(' ' + str(count) + ' of ' + str(total))
+    percent = (count/total) * 100
+    print('[', end='')
+    for i in range(30):
+        if (i/30) * 100 >= percent:
+            print('-', end='')
+        else:
+            print('#', end='')
+    print(']{:.2f}%'.format(percent))
+
+# Downloads the images of the cards to the data path
 def download_images(card_info, data_path="training_images", card_types = ["spell", "trap", "monster", "token"]):
     """Downloads the images of the cards to the data path
     and adds the path to the card information.
@@ -122,7 +142,7 @@ def download_images(card_info, data_path="training_images", card_types = ["spell", "trap", "monster", "token"]):
             pass
     print("Process Finished!")
 
-
+# Scrapes text and images from archetypes
 def scrape_archetypes(archetypes, data_path="training_images", csv_path="training_images",
                       csv_name="training_cards.csv", card_types = ["spell", "trap", "monster", "token"]):
     """
@@ -165,9 +185,20 @@
     except requests.exceptions.RequestException as e:
         print("Error fetching data:", e)
 
+# Removes images and records that cannot be read by cv2
 def dataset_cleaner(dataset_path, csv_path):
+    """
+    Removes images and records that cannot be read by cv2
+    Args:
+        dataset_path (str): The path to the image folder.
+        csv_path (str): The path to the csv file.
+    Returns:
+        None
+    """
     df = pd.read_csv(csv_path)
     erased_images_list = []
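Two quick illustrations of the additions above. First, the progress bar: '#' marks the completed portion and '-' the remainder, so a hypothetical halfway call (values invented) renders like this:

draw_loader("dark_magician.jpg", total=30, count=15)
# clears the terminal, then prints:
# Downloading images
#
# dark_magician.jpg 15 of 30
# [###############---------------]50.00%

Second, the body of dataset_cleaner is cut off by the truncated diff; per its docstring it drops whatever cv2 cannot read. The core check presumably resembles this sketch (an assumption based on the docstring, not the committed code):

import cv2

def is_readable(image_path):
    # cv2.imread returns None for corrupt or unsupported files
    return cv2.imread(image_path) is not None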
