In [1]:
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import os

# Load environment variables from the .env file (MONGO_URI is read from the
# environment by the functions below).
load_dotenv()

def procesar_y_guardar_lego(csv_file, db_name="dbo_lego", collection_name="lego_work"):
    """
    Load a LEGO sets CSV, keep only the selected themes, drop the unused
    columns and insert the remaining rows into a MongoDB collection.

    Args:
        csv_file: Path (or any other source accepted by ``pd.read_csv``) of the CSV.
        db_name: Name of the target MongoDB database.
        collection_name: Name of the target collection inside ``db_name``.

    Raises:
        ValueError: If the ``MONGO_URI`` environment variable is not set.

    Note:
        Rows are appended with ``insert_many``; existing documents in the
        collection are not removed first.
    """

    # The connection string comes from the environment (populated from .env)
    mongo_uri = os.getenv("MONGO_URI")
    if not mongo_uri:
        raise ValueError("La variable MONGO_URI no está definida en el archivo .env")

    # Load the raw CSV
    df_lego_inicial = pd.read_csv(csv_file)

    # Themes kept for the analysis
    selected_themes = [
        'Animal Crossing', 'Architecture', 'Art',
        'Avatar: The Last Airbender', 'Avatar',
        'The LEGO Batman Movie', 'Batman', 'Botanicals',
        'BrickHeadz', 'World City', 'City', 'Classic',
        'Creator', 'Creator Expert', 'DC Super Hero Girls', 'DC Comics Super Heroes',
        'Despicable Me', 'Disney', 'Dots', 'Dreamzzz', 'DREAMZzz',
        'Duplo', 'Education', 'Friends', 'Fortnite', "Gabby's Dollhouse",
        'Harry Potter', 'Icons', 'Ideas', 'Indiana Jones', 'Jurassic World',
        'The Lord of the Rings', 'Marvel Super Heroes', 'Minecraft',
        'Collectable Minifigures', 'Monkie Kid', 'The LEGO Ninjago Movie', 'Ninjago',
        'Powered Up', 'Serious Play', 'Sonic the Hedgehog', 'Speed Champions',
        'Spider-Man', 'Star Wars', 'Super Mario', 'Technic', 'The Legend of Zelda',
        'Wednesday', 'Wicked',
    ]

    # Keep only the rows whose theme is in the selection
    df_lego_work = df_lego_inicial[df_lego_inicial['Theme'].isin(selected_themes)]

    # Columns that are not needed downstream; errors='ignore' tolerates CSVs
    # that do not contain every one of them.
    columns_to_drop = ['Own', 'Want', 'Unnamed: 49', 'Flag2', 'Flag3', 'Flag4', 'Flag5', 'Flag6', 'Flag7', 'Flag8', 'UserNotes',
                       "Variant", "ThemeGroup", "EAN", "UPC", "UKRetailPrice", 'QtyOwned', 'QtyOwnedNew', 'QtyOwnedUsed',
                       'QtyWanted', 'WantedPriority', "CARetailPrice", "DERetailPrice", "AdditionalImageCount", "InstructionsCount",
                       "USDateAdded", "USDateRemoved", "Designers", "Image", "USItemNumber", "EUItemNumber"]
    df_lego_work = df_lego_work.drop(columns=columns_to_drop, errors='ignore')

    # One dict per row, ready for insert_many
    data = df_lego_work.to_dict(orient="records")
    if not data:
        # Nothing to store, so no connection is opened at all
        print("No hay datos para insertar.")
        return

    client = MongoClient(mongo_uri)
    try:
        client[db_name][collection_name].insert_many(data)
        print(f"Datos insertados en MongoDB ({len(data)} registros).")
    finally:
        # Always release the connection, even if the insert fails
        client.close()


In [None]:
# Apply the first function: load the CSV, filter it and store it in MongoDB.
# NOTE(review): the file name looks like a typo of "Lanzados" — confirm against
# the actual file on disk before renaming anything.
csv_file = "Set_Lanzanzados.csv"
procesar_y_guardar_lego(csv_file)

  df_lego_inicial = pd.read_csv(csv_file)


Datos insertados en MongoDB (9205 registros).


In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
from pymongo import MongoClient

def clean_lego_data(df_lego):
    """
    Clean the raw LEGO sets frame and derive launch/exit year and month columns.

    The frame is modified in place and the same object is returned.

    Steps:
      1. Fill missing ``Subtheme`` / ``ImageFilename`` with ``'Unknown'`` and
         missing values of the numeric columns listed below with ``0``.
      2. Parse ``LaunchDate`` / ``ExitDate``; unparseable values become ``NaT``.
      3. Fill a missing ``ExitDate`` with ``LaunchDate`` plus the median
         duration of the set's theme.
      4. Fill a missing ``LaunchDate`` with January 1st of ``YearFrom``.
      5. Replace the date columns by ``LaunchYear``, ``LaunchMonth``,
         ``ExitYear`` and ``ExitMonth``.
      6. Estimate a still-missing ``ExitYear`` as ``LaunchYear`` plus the
         rounded mean duration of the theme, falling back to the mean duration
         of the launch year; ``ExitMonth`` is set to 12 for those rows.
      7. Normalise ``PackagingType`` / ``Availability`` labels and merge the
         ``'Creator Expert'`` theme into ``'Icons'``.

    Args:
        df_lego: DataFrame with at least the columns ``Theme``, ``Subtheme``,
            ``ImageFilename``, ``LaunchDate``, ``ExitDate``, ``YearFrom``,
            ``PackagingType`` and ``Availability``.

    Returns:
        The same DataFrame, cleaned.
    """
    # --- 1. Simple missing-value fills -------------------------------------
    df_lego['Subtheme'] = df_lego['Subtheme'].fillna('Unknown')
    columns_zero = ['Pieces', 'BrickLinkSoldPriceNew', 'BrickLinkSoldPriceNewUS', 'USRetailPrice',
                    'BrickLinkSoldPriceUsed', 'Depth', 'Height', 'Width', 'Weight', 'Minifigs', 'AgeMin', 'AgeMax']
    for col in columns_zero:
        if col in df_lego.columns:
            df_lego[col] = df_lego[col].fillna(0)
    df_lego['ImageFilename'] = df_lego['ImageFilename'].fillna('Unknown')

    # --- 2. Parse dates ----------------------------------------------------
    df_lego['LaunchDate'] = pd.to_datetime(df_lego['LaunchDate'], errors='coerce')
    df_lego['ExitDate'] = pd.to_datetime(df_lego['ExitDate'], errors='coerce')

    # --- 3. Missing ExitDate -> LaunchDate + median theme duration ----------
    # Duration is expressed in years here (365.25 days per year).
    df_lego['Duration'] = (df_lego['ExitDate'] - df_lego['LaunchDate']).dt.days / 365.25
    theme_median_duration = df_lego.groupby('Theme')['Duration'].median()
    for theme, median_duration in theme_median_duration.items():
        if pd.isna(median_duration):
            # No known duration for this theme: rows stay NaT and are handled in step 6
            continue
        mask = (df_lego['Theme'] == theme) & df_lego['ExitDate'].isna() & df_lego['LaunchDate'].notna()
        if not mask.any():
            continue
        df_lego.loc[mask, 'ExitDate'] = df_lego.loc[mask, 'LaunchDate'] + pd.to_timedelta(median_duration * 365.25, unit='D')

    # --- 4. Missing LaunchDate -> January 1st of YearFrom -------------------
    # Done after step 3 on purpose: these rows get their exit year in step 6.
    mask_launch = df_lego['LaunchDate'].isna() & df_lego['YearFrom'].notna()
    if mask_launch.any():
        df_lego.loc[mask_launch, 'LaunchDate'] = pd.to_datetime(
            df_lego.loc[mask_launch, 'YearFrom'].astype(int).astype(str) + '-01-01')

    # --- 5. Keep only year / month -----------------------------------------
    df_lego['LaunchYear'] = df_lego['LaunchDate'].dt.year
    df_lego['LaunchMonth'] = df_lego['LaunchDate'].dt.month
    df_lego['ExitYear'] = df_lego['ExitDate'].dt.year
    df_lego['ExitMonth'] = df_lego['ExitDate'].dt.month
    df_lego.drop(columns=['LaunchDate', 'ExitDate', 'Duration'], inplace=True)

    # --- 6. Estimate the remaining missing exit years -----------------------
    # Duration is now a whole number of years.
    df_lego['Duration'] = df_lego['ExitYear'] - df_lego['LaunchYear']
    theme_avg_duration = df_lego.groupby('Theme')['Duration'].mean()
    year_avg_duration = df_lego.groupby('LaunchYear')['Duration'].mean()
    # Theme average first; when it is unavailable (theme missing, or no set of
    # the theme has a known duration so its mean is NaN) fall back to the
    # average of the launch year.
    estimated_duration = df_lego['Theme'].map(theme_avg_duration).fillna(
        df_lego['LaunchYear'].map(year_avg_duration))
    fillable = df_lego['ExitYear'].isna() & df_lego['LaunchYear'].notna() & estimated_duration.notna()
    if fillable.any():
        df_lego.loc[fillable, 'ExitYear'] = df_lego.loc[fillable, 'LaunchYear'] + estimated_duration[fillable].round()
        df_lego.loc[fillable, 'ExitMonth'] = 12
    df_lego.drop(columns=['Duration'], inplace=True)

    # --- 7. Normalise categorical labels -----------------------------------
    df_lego['PackagingType'] = df_lego['PackagingType'].replace({
        '{Not specified}': 'Unknown', 'Plastic canister': 'Canister', 'Plastic box': 'Box',
        'Metal canister': 'Canister', 'Box with handle': 'Box', 'Box with backing card': 'Box',
        'None (loose parts)': 'None'})
    df_lego['Availability'] = df_lego['Availability'].replace({
        '{Not specified}': 'Unknown', 'Promotional (Airline)': 'Promotional'})
    df_lego.loc[df_lego['Theme'] == 'Creator Expert', 'Theme'] = 'Icons'
    return df_lego

def process_lego_data(df_lego):
    """
    Add the derived investment features to the cleaned LEGO sets frame.

    The frame is modified in place and the same object is returned.

    Columns added:
        YearsSinceExit: current year minus ``ExitYear`` (0 when ``ExitYear`` is missing).
        PriceChange: percentage difference between ``BrickLinkSoldPriceNew``
            and ``USRetailPrice``; 0 when it cannot be computed (missing values
            or a retail price of 0).
        ResaleDemand: new/used BrickLink price ratio; 0 when the used price is not positive.
        AppreciationTrend: ``PriceChange`` per year since exit; 0 when
            ``YearsSinceExit`` is not positive.
        SizeCategory: ``Small`` (0-249 pieces), ``Medium`` (250-1000) or ``Large`` (more than 1000).
        Exclusivity: ``'Exclusive'`` for the themes listed below, otherwise ``'Regular'``.
        ThemePopularity: mean ``PriceChange`` of the set's theme.
        InvestmentScore: ``0.4 * PriceChange + 0.3 * AppreciationTrend +
            0.2 * ThemePopularity`` plus 10 for exclusive themes.

    Args:
        df_lego: DataFrame with the columns ``Theme``, ``ExitYear``, ``Pieces``,
            ``USRetailPrice``, ``BrickLinkSoldPriceNew`` and ``BrickLinkSoldPriceUsed``.

    Returns:
        The same DataFrame with the columns above added.
    """
    current_year = datetime.now().year
    df_lego['YearsSinceExit'] = (current_year - df_lego['ExitYear']).fillna(0).astype(int)

    retail_price = df_lego['USRetailPrice']
    sold_new = df_lego['BrickLinkSoldPriceNew']
    sold_used = df_lego['BrickLinkSoldPriceUsed']

    # A retail price of 0 (the placeholder for "unknown") makes the division
    # yield +/-inf; treat that like any other non-computable value so it does
    # not leak into the trend, the theme average and the final score.
    price_change = ((sold_new - retail_price) / retail_price) * 100
    df_lego['PriceChange'] = price_change.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Ratios are only meaningful for a strictly positive denominator
    df_lego['ResaleDemand'] = (sold_new / sold_used).where(sold_used > 0, 0)
    years_since_exit = df_lego['YearsSinceExit']
    df_lego['AppreciationTrend'] = (df_lego['PriceChange'] / years_since_exit).where(years_since_exit > 0, 0)

    size_labels = ['Small', 'Medium', 'Large']
    df_lego['SizeCategory'] = pd.cut(df_lego['Pieces'], bins=[0, 249, 1000, float('inf')], labels=size_labels, include_lowest=True)

    exclusive_themes = ['Star Wars', 'Modular Buildings', 'Ideas', 'Creator Expert', 'Harry Potter',
                        'Marvel Super Heroes', 'Ghostbusters', 'Icons', 'The Lord of the Rings',
                        'Pirates of the Caribbean', 'Pirates', 'Trains', 'Architecture']
    is_exclusive = df_lego['Theme'].isin(exclusive_themes)
    df_lego['Exclusivity'] = np.where(is_exclusive, 'Exclusive', 'Regular')

    theme_popularity = df_lego.groupby('Theme')['PriceChange'].mean().replace([np.inf, -np.inf], np.nan)
    df_lego['ThemePopularity'] = df_lego['Theme'].map(theme_popularity).fillna(0)

    df_lego['InvestmentScore'] = (
        (df_lego['PriceChange'] * 0.4)
        + (df_lego['AppreciationTrend'] * 0.3)
        + (df_lego['ThemePopularity'] * 0.2)
        + np.where(is_exclusive, 10, 0)
    )
    return df_lego

def main():
    """
    Read the ``lego_work`` collection, clean and enrich it, and store the
    result in the ``lego_work_clean`` collection of the ``dbo_lego`` database.

    Relies on ``os`` being imported and on ``clean_lego_data`` /
    ``process_lego_data`` being defined in the running session.

    Raises:
        ValueError: If the ``MONGO_URI`` environment variable is not set.

    Note:
        Documents are appended with ``insert_many``; existing documents in
        ``lego_work_clean`` are not removed first.
    """
    # The connection string comes from the environment (populated from .env).
    # Fail early instead of letting MongoClient(None) pick its own default.
    mongo_uri = os.getenv("MONGO_URI")
    if not mongo_uri:
        raise ValueError("La variable MONGO_URI no está definida en el archivo .env")

    client = MongoClient(mongo_uri)
    try:
        db = client['dbo_lego']  # Replace with the name of your database
        data = pd.DataFrame(list(db['lego_work'].find()))
        if data.empty:
            # An empty source has no columns to clean and insert_many rejects
            # an empty document list, so stop here.
            print("No hay datos para insertar.")
            return
        # Drop the source _id so the target collection generates its own ids
        if '_id' in data.columns:
            data = data.drop(columns=['_id'])
        data_cleaned = clean_lego_data(data)
        data_processed = process_lego_data(data_cleaned)
        db['lego_work_clean'].insert_many(data_processed.to_dict(orient='records'))
        print("Datos limpiados y guardados en lego_work_clean")
    finally:
        # Always release the connection, even if a step above fails
        client.close()

# Run the cleaning pipeline when this code is executed directly (as a script
# or notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

  df_lego['ExitDate'] = pd.to_datetime(df_lego['ExitDate'], errors='coerce')


Datos limpiados y guardados en lego_work_clean
