In [1]:
import pandas as pd

In [2]:
# Load the retired-sets dataset from a relative path.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, so the CSV appears to
# carry a previously saved index — consider index_col=0 if that column is not needed.
df_lego_retirados = pd.read_csv('../01_Data_Cleaning/df_lego_final_retirados.csv')

In [3]:
# Display the loaded DataFrame as the cell output (pandas truncates long/wide frames).
df_lego_retirados

Unnamed: 0,SetID,Number,YearFrom,Category,Theme,Subtheme,SetName,ImageFilename,USRetailPrice,Pieces,...,YearsSinceExit,PriceChange,ResaleDemand,AppreciationTrend,SizeCategory,Exclusivity,ThemePopularity,InvestmentScore,AnnualPriceIncrease,AnnualPercentageIncrease
0,7530,10,1973,Normal,Duplo,Unknown,Pre-School Set,Unknown,0.0,17.0,...,51,0.0,0.000000,0.0,Small,Regular,0.0,0.0,0.000000,0.0
1,7531,20,1973,Normal,Duplo,Unknown,Building Set,Unknown,0.0,25.0,...,51,0.0,0.000000,0.0,Small,Regular,0.0,0.0,0.000000,0.0
2,1119,28,1979,Normal,Duplo,Unknown,Nursery Furniture,028-1,0.0,7.0,...,45,0.0,0.000000,0.0,Small,Regular,0.0,0.0,0.000000,0.0
3,7532,30,1973,Normal,Duplo,Unknown,Building Set,Unknown,0.0,29.0,...,51,0.0,0.000000,0.0,Small,Regular,0.0,0.0,0.000000,0.0
4,1382,32,1979,Normal,Duplo,Unknown,Living Room Furniture,032-1,0.0,14.0,...,45,0.0,0.000000,0.0,Small,Regular,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7710,22656,WISHINGWELL,2013,Other,Friends,Promotional,Wishing Well,WISHINGWELL-1,0.0,28.0,...,11,0.0,0.000000,0.0,Small,Regular,0.0,0.0,0.000000,0.0
7711,30208,XWING,2019,Other,Star Wars,Promotional,X-wing Trench Run,XWING-2,0.0,52.0,...,5,,0.694707,,Small,Exclusive,0.0,,2.940000,0.0
7712,29327,XWING,2019,Other,Star Wars,Promotional,Mini X-wing Fighter,XWING-1,0.0,60.0,...,5,0.0,0.000000,0.0,Small,Exclusive,0.0,10.0,0.000000,0.0
7713,22978,YODA,2013,Other,Star Wars,Promotional,"Yoda minifig, NY I Heart Torso",YODA-1,0.0,3.0,...,11,,0.000000,,Small,Exclusive,0.0,,256.936364,0.0


In [83]:
# Keep only the rows whose YearsSinceExit value is exactly 1 (sets retired in the last year)
df_recent_retirados_last_year = df_lego_retirados.loc[
    df_lego_retirados["YearsSinceExit"].eq(1)
]

In [84]:
# Themes to keep when filtering the retired sets (order preserved from the original list)
selected_themes = [
    "Speed Champions",
    "Architecture",
    "BrickHeadz",
    "Star Wars",
    "Ideas",
    "Collectable Minifigures",
    "Technic",
    "Minecraft",
    "Harry Potter",
    "Icons",
    "Ninjago",
    "Education",
    "Jurassic World",
    "DC Comics Super Heroes",
    "Marvel Super Heroes",
    "Creator",
    "City",
    "Classic",
    "Disney",
    "Creator Expert",
]


In [85]:
# Sets that belong to one of the selected themes AND have YearsSinceExit equal to 1
df_filtered = df_lego_retirados.loc[
    df_lego_retirados["Theme"].isin(selected_themes)
    & df_lego_retirados["YearsSinceExit"].eq(1)
]

In [86]:
# Maximum number of sets written to each partition file
num_sets_por_archivo = 95
# Ceiling division: a final, partially filled file is counted as one more batch
num_batches = -(-len(df_filtered) // num_sets_por_archivo)


In [None]:
# Write df_filtered to disk as consecutive partitions of num_sets_por_archivo rows each,
# collecting the path of every file written in arch_part.
arch_part = []
for batch_number in range(1, num_batches + 1):
    # Row window covered by this partition (iloc clips the end of the last one)
    end_idx = batch_number * num_sets_por_archivo
    start_idx = end_idx - num_sets_por_archivo
    df_part = df_filtered.iloc[start_idx:end_idx]

    # Files are numbered starting at 1
    nombre_particion = f"../04_Extra/API_Brickeconomy/lego_scraping_brickeco_{batch_number}.csv"
    df_part.to_csv(nombre_particion, index=False)
    arch_part.append(nombre_particion)

In [None]:
# Display the list of partition file paths collected while writing the CSVs.
arch_part

['../04_Extra/API_Brickeconomy/lego_scraping_brickeco_1.csv',
 '../04_Extra/API_Brickeconomy/lego_scraping_brickeco_2.csv',
 '../04_Extra/API_Brickeconomy/lego_scraping_brickeco_3.csv',
 '../04_Extra/API_Brickeconomy/lego_scraping_brickeco_4.csv',
 '../04_Extra/API_Brickeconomy/lego_scraping_brickeco_5.csv']

In [None]:
import requests
import pandas as pd
import time
import os
from dotenv import load_dotenv


# Environment setup: load_dotenv() runs before the API key is read from the environment
load_dotenv()
API_KEY = os.environ.get("BRICKECONOMY_API_KEY")

# Where the batch CSV files live and which endpoint is queried for each set
BATCH_FOLDER = "../04_Extra/API_Brickeconomy/"
BASE_URL = "https://www.brickeconomy.com/api/v1/set/"

REQUEST_DELAY = 10  # Pause passed to time.sleep between requests, to avoid being blocked
MAX_REQUESTS_PER_DAY = 95  # Daily request limit; caps how many sets are processed per run

# Headers sent with every API request
HEADERS = {
    "Accept": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "x-apikey": API_KEY,
}

def get_next_batch(folder=None):
    """Return the file name of the next batch CSV to process, or None if there is none.

    Candidate files are those in ``folder`` whose name starts with
    ``lego_scraping_brickeco_`` and ends with ``.csv``. They are ordered by the
    integer between prefix and suffix, so ``..._2.csv`` is picked before
    ``..._10.csv`` (a plain string sort would put ``_10`` first). Names without a
    purely numeric middle part are considered last, in alphabetical order.

    Args:
        folder: Directory to scan. Defaults to the module-level BATCH_FOLDER.

    Returns:
        The bare file name (not the full path) of the first pending batch, or None.

    Raises:
        OSError: if ``folder`` cannot be listed (e.g. it does not exist).
    """
    if folder is None:
        folder = BATCH_FOLDER

    prefix, suffix = "lego_scraping_brickeco_", ".csv"
    batch_files = [
        name for name in os.listdir(folder)
        if name.startswith(prefix) and name.endswith(suffix)
    ]
    if not batch_files:
        return None

    def batch_order(name):
        # Sort key: numeric batches first (by number), anything else afterwards by name
        number = name[len(prefix):-len(suffix)]
        if number.isdecimal():
            return (0, int(number), name)
        return (1, 0, name)

    return min(batch_files, key=batch_order)

# Scrape one batch: pick the next pending batch file, query the API once per set,
# store one output row per historical price event, then retire the batch file.
batch_filename = get_next_batch()
if not batch_filename:
    print("No hay más archivos de scraping disponibles para descargar de la API.")
    # Raise SystemExit explicitly instead of calling the interactive `exit()` helper:
    # the helper is not meant for program flow and, inside a Jupyter kernel, it
    # requests a kernel shutdown rather than reliably stopping the cell here.
    raise SystemExit(0)

file_path = os.path.join(BATCH_FOLDER, batch_filename)
batch_df = pd.read_csv(file_path)

# Only the first MAX_REQUESTS_PER_DAY sets are requested in this run
df = batch_df.head(MAX_REQUESTS_PER_DAY)

# Seconds to wait for the server before giving up on a single request;
# without a timeout a stalled connection would block the run indefinitely.
REQUEST_TIMEOUT = 30


def _price_rows(data, url, price_type, events):
    """Build one output row per price event of a set.

    Args:
        data: The "data" object of the API response for one set.
        url: The URL that was requested for the set.
        price_type: "New" or "Used"; selects the CurrentValue<type> column and the
            matching current_value_<type> field.
        events: Iterable of price events, each with "date" and "value" keys.

    Returns:
        A list of dicts, one per event, ready to be collected into a DataFrame.

    Raises:
        KeyError: if an event lacks "date" or "value".
    """
    return [
        {
            "Number": data.get("set_number", "N/A"),
            "SetName": data.get("name", "N/A"),
            "Theme": data.get("theme", "N/A"),
            "Year": data.get("year", "N/A"),
            "Pieces": data.get("pieces_count", "N/A"),
            "Minifigs": data.get("minifigs_count", "N/A"),
            "RetailPriceUSD": data.get("retail_price_us", "N/A"),
            f"CurrentValue{price_type}": data.get(f"current_value_{price_type.lower()}", "N/A"),
            "ForecastValueNew2Y": data.get("forecast_value_new_2_years", "N/A"),
            "ForecastValueNew5Y": data.get("forecast_value_new_5_years", "N/A"),
            "RollingGrowthLastYear": data.get("rolling_growth_lastyear", "N/A"),
            "RollingGrowth12M": data.get("rolling_growth_12months", "N/A"),
            "PriceType": price_type,
            "PriceDate": event["date"],
            "PriceValue": event["value"],
            "Currency": data.get("currency", "N/A"),
            "URL": url,
        }
        for event in events
    ]


# Accumulates the rows of every successfully scraped set
scraped_data = []
# Index labels (in df) of the sets whose request or payload failed
failed_index = []

for index, row in df.iterrows():
    set_number = row["Number"]
    url = f"{BASE_URL}{set_number}"

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # `or {}` also covers a response whose "data" field is null
        data = response.json().get("data") or {}

        # Build all rows of the set first so a malformed event cannot leave
        # a half-written set in scraped_data
        set_rows = _price_rows(data, url, "New", data.get("price_events_new") or [])
        set_rows += _price_rows(data, url, "Used", data.get("price_events_used") or [])
        scraped_data.extend(set_rows)

        print(f"Scraped {set_number}: {data.get('current_value_new', 'N/A')} USD, {data.get('forecast_value_new_5_years', 'N/A')} USD")

    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
        # Network/HTTP errors and malformed payloads skip this set only;
        # the set is remembered so it is not lost when the batch file is removed
        failed_index.append(index)
        print(f"Error scraping {set_number}: {e}")

    finally:
        # Respect the delay after failures too, so errors do not trigger rapid-fire requests
        time.sleep(REQUEST_DELAY)

# Save the scraped rows to a CSV next to the batch files
output_file = os.path.join(BATCH_FOLDER, f"scraped_{batch_filename}")
pd.DataFrame(scraped_data).to_csv(output_file, index=False)

# Rows that were not scraped (failed, or beyond the daily limit) are written to a
# separate "pending_" file before the batch is deleted. That prefix is not picked up
# by get_next_batch, so retrying requires renaming the file back to a batch name.
pending_labels = failed_index + list(batch_df.index[len(df):])
if pending_labels:
    pending_file = os.path.join(BATCH_FOLDER, f"pending_{batch_filename}")
    batch_df.loc[pending_labels].to_csv(pending_file, index=False)
    print(f"{len(pending_labels)} sets pendientes guardados en {pending_file} para reintentar.")

# Remove the processed batch file so it is not scraped again
os.remove(file_path)
print(f"Scraping completado. Datos guardados en {output_file}. Eliminado {batch_filename} para evitar reuso.")


Scraped 122327: 3.02 USD, 3.65 USD
Scraped 122328: 3.52 USD, 3.72 USD
Scraped 122329: 2.98 USD, 4.27 USD
Scraped 122330: 2.82 USD, 3.98 USD
Scraped 122331: 3.16 USD, 3.34 USD
Scraped 122332: 3.18 USD, 3.84 USD
Scraped 122333: 3.36 USD, 4.06 USD
Scraped 122334: 3.93 USD, 5.52 USD
Scraped 212325: 2.6 USD, 3.18 USD
Scraped 212326: 3.46 USD, 4.0 USD
Scraped 212327: 4.22 USD, 5.71 USD
Scraped 212328: 3.5 USD, 4.74 USD
Scraped 212329: 3.34 USD, 4.85 USD
Scraped 212330: 3.91 USD, 4.91 USD
Scraped 242316: 3.59 USD, 4.86 USD
Scraped 242317: 3.98 USD, 5.59 USD
Scraped 242318: 4.47 USD, 5.4 USD
Scraped 242319: 4.65 USD, 6.07 USD
Error scraping 242320: HTTPSConnectionPool(host='www.brickeconomy.com', port=443): Max retries exceeded with url: /api/v1/set/242320 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
Scraped 242321: 5.49 USD, 6.79 USD
Scraped 662302: 4.69 USD, 6.12 USD
Scraped 662303: 4.47 USD, 5.17 USD
Scraped 662304: 4.44 USD, 6.45 USD
Scraped 6

In [98]:
# Merge all per-batch API result files into a single DataFrame.
# Files to merge, in batch order:
file_paths = [
    "../04_Extra/API_Brickeconomy/scraped_lego_scraping_brickeco_1.csv",
    "../04_Extra/API_Brickeconomy/scraped_lego_scraping_brickeco_2.csv",
    "../04_Extra/API_Brickeconomy/scraped_lego_scraping_brickeco_3.csv",
    "../04_Extra/API_Brickeconomy/scraped_lego_scraping_brickeco_4.csv",
    "../04_Extra/API_Brickeconomy/scraped_lego_scraping_brickeco_5.csv"
]

# Read each file, then stack the frames with a fresh 0..n-1 index
df_list = list(map(pd.read_csv, file_paths))
df_concat = pd.concat(df_list, ignore_index=True)

# Persist the merged DataFrame
url_salida = "../04_Extra/APP/data/scraped_lego_data.csv"
df_concat.to_csv(url_salida, index=False)