In [1]:
import json
from pathlib import Path

import polars as pl
from feast import FeatureStore
from tqdm import tqdm

# Configuration: every path the notebook reads from or writes to.
REPO_PATH = "./feature_repo"  # passed to FeatureStore as repo_path
DATA_PATH = Path("./data")    # directory holding the input and output files
TRAIN_DATA_PATH = DATA_PATH.joinpath("training_dataset.parquet")  # final training set

# Feature store handle used by the historical-feature retrieval cell below.
store = FeatureStore(repo_path=REPO_PATH)

In [2]:
print("üöÄ 1. Carregando hist√≥rico de transa√ß√µes (Positivos)...")
# Observed purchases (the positives) serve as the base entity rows for the
# feature lookup: one row per (customer_id, article_id, event_timestamp).

transactions_path = DATA_PATH / "transactions_train.parquet"
n_recent_rows = 300000  # how many trailing rows of the file to keep

entity_df = (
    pl.read_parquet(transactions_path, columns=['t_dat', 'customer_id', 'article_id'])
    # The purchase date, cast to a datetime, becomes the event_timestamp column.
    .with_columns(pl.col("t_dat").cast(pl.Datetime).alias("event_timestamp"))
    .drop("t_dat")
    # tail() keeps the LAST rows of the file. The original author relies on
    # these being the most recent (2020) transactions, which line up with the
    # feature data. NOTE(review): that only holds if the parquet file is
    # stored in chronological order - confirm.
    .tail(n_recent_rows)
)

print(f"üì¶ Entidades base carregadas: {entity_df.shape}")

üöÄ 1. Carregando hist√≥rico de transa√ß√µes (Positivos)...
üì¶ Entidades base carregadas: (300000, 3)


In [3]:
print("‚è≥ 2. Invocando Feast para Time-Travel Join...")
# Point-in-time ("time-travel") join: for every entity row, Feast is asked for
# the feature values as they were at that row's event_timestamp (e.g. the
# user's avg_spend on the purchase date), not the values as of today.
feature_refs = [
    "user_stats:avg_spend",
    "user_stats:purchase_count",
    "item_stats:popularity_score",
    "item_stats:avg_price",
]

# The entity frame is a polars DataFrame; it is converted to pandas before
# being handed to Feast.
retrieval_job = store.get_historical_features(
    entity_df=entity_df.to_pandas(),
    features=feature_refs,
)
training_df = retrieval_job.to_df()

# Bare last expression: the notebook renders the first rows as the cell output.
training_df.head()

‚è≥ 2. Invocando Feast para Time-Travel Join...


Unnamed: 0,customer_id,article_id,event_timestamp,avg_spend,purchase_count,popularity_score,avg_price
0,f882d43eb7aab25667917f0ddbc18e994bd31538aa10e2...,825509004,2020-09-13 00:00:00+00:00,0.029065,49,42,0.012804
1,ec6d1328d7b8091dddd9b769323c2f55c4f560053d583c...,901575001,2020-09-13 00:00:00+00:00,0.022451,77,20,0.029129
2,f1e428069ea3a08ad75c607edd556552c6fc7c3609d121...,695325020,2020-09-13 00:00:00+00:00,0.024136,7,192,0.02341
3,f04f22807493a3a0d57dec80326396765ed5b3f80f7f7e...,872600009,2020-09-13 00:00:00+00:00,0.02128,23,202,0.026835
4,f18cef960f8901b371958dd413e372ee21d221574d006a...,870957002,2020-09-13 00:00:00+00:00,0.031993,76,40,0.024872


In [4]:
print("üßπ 3. Limpeza e Preenchimento de Nulos...")
# The feature-store join leaves nulls where no feature value existed yet
# (per the original author: e.g. the user was new at purchase time).
# Every null, in every column, is replaced with 0. No mean imputation is
# applied, even though the original comment mentioned it as an option.
# NOTE(review): 0 is a natural default for counts, but for avg_spend /
# avg_price it reads as "free" - confirm this is the intended default.
#
# Rebinding the name instead of using inplace=True keeps the cell free of
# cross-cell in-place mutation; the resulting values are the same.
training_df = training_df.fillna(0)

üßπ 3. Limpeza e Preenchimento de Nulos...


In [5]:
# Encode the string/int IDs as dense integer codes (the original author needs
# them as indices for PyTorch embeddings).
# In production these encoders would be persisted as artifacts (Pickle/JSON);
# here the ID -> index mapping only survives through the saved dataset itself.
print("üî¢ 4. Codificando IDs (Label Encoding)...")
user_categories = training_df['customer_id'].astype('category')
item_categories = training_df['article_id'].astype('category')
training_df['user_index'] = user_categories.cat.codes
training_df['item_index'] = item_categories.cat.codes

print(f"üíæ Salvando Dataset de Treino pronto: {TRAIN_DATA_PATH}")
training_df.to_parquet(TRAIN_DATA_PATH)

# Save metadata so the model knows how large its embedding tables must be.
# The sizes come from the number of categories rather than `max(code) + 1`:
# the two agree whenever the frame has rows, but max() is NaN on an empty
# frame and int(NaN) raises, whereas the category count is simply 0.
meta = {
    'num_users': len(user_categories.cat.categories),
    'num_items': len(item_categories.cat.categories),
}
# `json` is imported in the notebook's top import cell.
with open(DATA_PATH / "model_metadata.json", "w") as f:
    json.dump(meta, f)

print("‚úÖ Conclu√≠do! Metadados:", meta)

üî¢ 4. Codificando IDs (Label Encoding)...
üíæ Salvando Dataset de Treino pronto: data/training_dataset.parquet
‚úÖ Conclu√≠do! Metadados: {'num_users': 14761, 'num_items': 8451}
