In [20]:
import pandas as pd

In [21]:
df_lego_mongo = pd.read_csv('../TRABAJO/dbo_lego.lego_final_venta_20250309.csv')
df_lego_venta = pd.read_csv('../01_Data_Cleaning/df_lego_final_venta.csv')

In [22]:
# Hay columnas con diferencias de datos. Todas son int64, mientras que en el archivo de venta son float64
# Se convierten a float64
cols_to_convert = ['ThemePopularity', 'AppreciationTrend', 'Pieces', 'AgeMax', 'Minifigs', 'AgeMin']
df_lego_mongo[cols_to_convert] = df_lego_mongo[cols_to_convert].astype(float)

In [23]:
# Borramos las columnas con diferencias numéricas para volver a crearlas por si hay error en la creación
cols_to_drop = ['AnnualPriceIncrease', 'AnnualPercentageIncrease', 'YearsSinceExit', 'AppreciationTrend']

df_lego_mongo.drop(columns=cols_to_drop, inplace=True)

***APLICO LA NORMALIZACIÓN DE 01***

In [None]:
from datetime import datetime
import numpy as np

# Obtengo el año actual
current_year = datetime.now().year

# Calculo los años desde la retirada del set
df_lego_mongo['YearsSinceExit'] = current_year - df_lego_mongo['ExitYear']

# Reemplazo valores negativos con 0
df_lego_mongo['YearsSinceExit'] = df_lego_mongo['YearsSinceExit'].apply(lambda x: max(x, 0))

# Reemplazo valores NaN por 0 antes de la conversión
df_lego_mongo['YearsSinceExit'] = df_lego_mongo['YearsSinceExit'].fillna(0).astype(int)

# Creo la columna 'PriceChange' para calcular porcentaje del cambio de precio entre el precio de venta en BrickLink y el precio de venta el la web de Lego en EEUU
df_lego_mongo['PriceChange'] = ((df_lego_mongo['BrickLinkSoldPriceNew'] - df_lego_mongo['USRetailPrice']) / df_lego_mongo['USRetailPrice']) * 100
df_lego_mongo['PriceChange'] = df_lego_mongo['PriceChange'].fillna(0)  # Reemplazar nulos por 0 porque hay precios que no tenemos disponibles

df_lego_mongo['ResaleDemand'] = df_lego_mongo.apply(lambda row: row['BrickLinkSoldPriceNew'] / row['BrickLinkSoldPriceUsed'] if row['BrickLinkSoldPriceUsed'] > 0 else 0, axis=1)
df_lego_mongo['AppreciationTrend'] = df_lego_mongo.apply(lambda row: row['PriceChange'] / row['YearsSinceExit'] if row['YearsSinceExit'] > 0 else 0, axis=1)
df_lego_mongo["AppreciationTrend"] = df_lego_mongo["AppreciationTrend"].astype(float)

#Creo una columna para indicar si el tamaño del set es pequeño, mediano o grande
size_conditions = [
    (df_lego_mongo['Pieces'] < 250),
    (df_lego_mongo['Pieces'].between(250, 1000)),
    (df_lego_mongo['Pieces'] > 1000)
]
size_labels = ['Small', 'Medium', 'Large']
df_lego_mongo['SizeCategory'] = pd.cut(df_lego_mongo['Pieces'], bins=[0, 249, 1000, float('inf')], labels=size_labels, include_lowest=True)

# Definimos sets exclusivos según categorías que mejor revalorización tienen
exclusive_themes = ['Star Wars', 'Modular Buildings', 'Icons', 'Ideas', 'Creator Expert', 'Harry Potter', 'Marvel Super Heroes','Ghostbusters','Icons','The Lord of the Rings','Pirates of the Caribbean','Pirates','Trains','Architecture']
df_lego_mongo['Exclusivity'] = df_lego_mongo['Theme'].apply(lambda x: 'Exclusive' if x in exclusive_themes else 'Regular')

# Calculo ThemePopularity evitando divisiones por cero o valores infinitos
theme_popularity = df_lego_mongo.groupby('Theme')['PriceChange'].mean().replace([np.inf, -np.inf], np.nan)
df_lego_mongo['ThemePopularity'] = df_lego_mongo['Theme'].map(theme_popularity).fillna(0)

# Calculamos InvestmentScore asegurando que no haya valores inf o NaN originados por PriceChange
df_lego_mongo['InvestmentScore'] = df_lego_mongo.apply(lambda row: (row['PriceChange'] * 0.4) +
                                                     (row['AppreciationTrend'] * 0.3) +
                                                     (row['ThemePopularity'] * 0.2) +
                                                     (10 if row['Exclusivity'] == 'Exclusive' else 0), axis=1)

# Calculamos el incremento de precio anual desde que el set fue retirado
df_lego_mongo['AnnualPriceIncrease'] = (df_lego_mongo['BrickLinkSoldPriceNew'] - df_lego_mongo['USRetailPrice']) / df_lego_mongo['YearsSinceExit']

# Reemplazamos valores infinitos o NaN (por si hay sets con YearsSinceExit = 0)
df_lego_mongo.replace([np.inf, -np.inf], np.nan, inplace=True)
df_lego_mongo['AnnualPriceIncrease'].fillna(0)

# Calculamos el porcentaje de aumento anual del precio desde que el set fue retirado
df_lego_mongo['AnnualPercentageIncrease'] = ((df_lego_mongo['BrickLinkSoldPriceNew'] - df_lego_mongo['USRetailPrice']) /
                                       (df_lego_mongo['USRetailPrice'] * df_lego_mongo['YearsSinceExit'])) * 100

# Reemplazamos valores infinitos o NaN (por si hay YearsSinceExit o USRetailPrice en 0)
df_lego_mongo.replace([np.inf, -np.inf], np.nan, inplace=True)
df_lego_mongo['AnnualPercentageIncrease'].fillna(0)

# Selecciono los temas definitivos a estudiar en el trabajo
selected_themes = [
    "Speed Champions", "Architecture", "BrickHeadz", "Star Wars", "Ideas", "Collectable Minifigures",
    "Technic", "Minecraft", "Harry Potter", "Icons", "Ninjago", "Education", "Jurassic World",
    "Duplo", "DC Comics Super Heroes", "Marvel Super Heroes", "Creator", "City", "Friends",
    "Classic", "Disney"
]

# Filtro el dataframe para que solo contenga los temas seleccionados
df_lego_mongo= df_lego_mongo[df_lego_mongo['Theme'].isin(selected_themes)].copy()


In [25]:
df_lego_mongo.to_csv("df_lego_mongo.csv", index=False)


In [26]:
df_lego_mongo

Unnamed: 0,_id,SetID,Number,YearFrom,Category,Theme,Subtheme,SetName,ImageFilename,USRetailPrice,...,PriceChange,ResaleDemand,SizeCategory,Exclusivity,ThemePopularity,InvestmentScore,YearsSinceExit,AppreciationTrend,AnnualPriceIncrease,AnnualPercentageIncrease
0,67cdbf4341e4c15d56707700,31025,10280,2021,Normal,Icons,Botanical Collection,Flower Bouquet,10280-1,59.99,...,-36.739457,1.267535,Medium,Exclusive,0.0,-4.695783,0,0.0,,
1,67cdbf4341e4c15d56707701,30970,10281,2021,Normal,Icons,Botanical Collection,Bonsai Tree,10281-1,49.99,...,-28.685737,1.272305,Medium,Exclusive,0.0,-1.474295,0,0.0,,
2,67cdbf4341e4c15d5670770b,31845,10294,2021,Normal,Icons,Miscellaneous,Titanic,10294-1,679.99,...,-23.810644,1.047579,Large,Exclusive,0.0,0.475742,0,0.0,,
3,67cdbf4341e4c15d5670770c,31389,10295,2021,Normal,Icons,Vehicles,Porsche 911,10295-1,169.99,...,-24.236720,1.288931,Large,Exclusive,0.0,0.305312,0,0.0,,
4,67cdbf4341e4c15d5670770d,32116,10297,2022,Normal,Icons,Modular Buildings Collection,Boutique Hotel,10297-1,229.99,...,-24.953259,1.155056,Large,Exclusive,0.0,0.018696,0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
797,67cdbf4341e4c15d56708e3c,50380,6535736,2024,Extended,Technic,Promotional,McLaren P1 Logo,6535736-1,0.00,...,0.000000,0.000000,Small,Regular,0.0,0.000000,0,0.0,,
798,67cdbf4341e4c15d56708e3d,50392,6541140,2024,Extended,Harry Potter,Promotional,Platform 9 3/4,6541140-1,0.00,...,,0.000000,Small,Exclusive,0.0,,0,0.0,,
799,67cdbf4341e4c15d56708e3e,50408,6545695,2024,Extended,Harry Potter,Promotional,Hogwarts Express,6545695-1,0.00,...,0.000000,0.000000,Small,Exclusive,0.0,10.000000,0,0.0,,
800,67cdbf4341e4c15d56708e3f,50970,6562113,2025,Extended,City,Promotional,Soap Box Racer M&T,LBR2502-1,0.00,...,,0.000000,Small,Regular,0.0,,0,0.0,,
