# `setup()` — data preparation pipeline

In [2]:
# Install the notebook's dependencies using the pip module of the interpreter
# that runs this kernel (sys.executable), so the packages are installed for
# the same Python that executes the cells below.
import sys

!{sys.executable} -m pip install pandas numpy scikit-learn joblib

Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting joblib
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.0 MB ? eta -:--:--
   ----- ---------------------------------- 1.0/8.0 MB 2.8 MB/s eta 0:00:03
   ------- -------------------------------- 1.6/8.0 MB 2.7 MB/s eta 0:00:03
   ---------- ----------------------------- 2.1/8.0 MB 2.8 MB/s eta 0:00:03
   -------------- ------------------------- 2.9/8.0 MB 2.8 MB/s eta 0:00:02
   ----------------- ---------------------- 3.4/8.0 MB 2.7 MB/s eta 0:00:02
 


[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
from pathlib import Path


def setup(
    csv_path,
    target_col="Close",
    horizon=1,
    train_ratio=0.7,
    val_ratio=0.15,
    save_artifacts=True,
    artifacts_path="../../data/artifacts/",
    processed_path="../../data/processed/"
):
    
    # -------------------------
    # 1. Load
    # -------------------------
    df = pd.read_csv(csv_path)
    
    # -------------------------
    # 2. Type correction
    # -------------------------
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    
    if "Change %" in df.columns:
        df["Change %"] = (
            df["Change %"]
            .str.replace("%", "", regex=False)
            .astype(float) / 100
        )
    
    # -------------------------
    # 3. Sort chronologically
    # -------------------------
    df = df.sort_values("Date").reset_index(drop=True)
    
    # -------------------------
    # 4. Target engineering
    # -------------------------
    df["target_return"] = df[target_col].pct_change(horizon).shift(-horizon)
    df["target_direction"] = (df["target_return"] > 0).astype(int)
    
    # -------------------------
    # 5. Drop structural NaNs
    # -------------------------
    df = df.dropna().reset_index(drop=True)
    
    # -------------------------
    # 6. Feature / Target split
    # -------------------------
    feature_cols = [col for col in df.columns if col not in [
        "Date",
        "target_return",
        "target_direction"
    ]]
    
    X = df[feature_cols]
    y = df["target_return"]   # pode trocar para target_direction
    
    # -------------------------
    # 7. Temporal split
    # -------------------------
    n = len(df)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))
    
    X_train, X_val, X_test = (
        X.iloc[:train_end],
        X.iloc[train_end:val_end],
        X.iloc[val_end:]
    )
    
    y_train, y_val, y_test = (
        y.iloc[:train_end],
        y.iloc[train_end:val_end],
        y.iloc[val_end:]
    )
    
    # -------------------------
    # 8. Scaling (fit only on train)
    # -------------------------
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # -------------------------
    # 9. Save artifacts
    # -------------------------
    if save_artifacts:
        artifacts_dir = Path(artifacts_path)
        artifacts_dir.mkdir(parents=True, exist_ok=True)
        
        joblib.dump(scaler, artifacts_dir / "feature_scaler.pkl")
        
        processed_dir = Path(processed_path)
        processed_dir.mkdir(parents=True, exist_ok=True)
        
        df.to_csv(processed_dir / "dataset_processed.csv", index=False)
    
    return {
        "X_train": X_train_scaled,
        "X_val": X_val_scaled,
        "X_test": X_test_scaled,
        "y_train": y_train.values,
        "y_val": y_val.values,
        "y_test": y_test.values,
        "feature_columns": feature_cols
    }