# Подготовка данных

In [1]:
!git clone https://github.com/Orange-Hack/finam-x-hse-trade-ai-hack-forecast.git


Cloning into 'finam-x-hse-trade-ai-hack-forecast'...
remote: Enumerating objects: 102, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 102 (delta 2), reused 2 (delta 2), pack-reused 93 (from 1)[K
Receiving objects: 100% (102/102), 26.05 MiB | 3.03 MiB/s, done.
Resolving deltas: 100% (23/23), done.
Updating files: 100% (16/16), done.


In [2]:
# https://drive.google.com/drive/folders/1RKnaPlsKuF-zou0RcCreIBeWm2hG9i6w?usp=sharing
!gdown --folder --id 1RKnaPlsKuF-zou0RcCreIBeWm2hG9i6w -O forecast_data

Retrieving folder contents
Processing file 1faXvPkQOHybpYZV48FSzEtdmA1KgGxfS candles_2.csv
Processing file 1-YxO86oYLhgM77QPr4UFRzoYlkkuUil- candles.csv
Processing file 1p7fdkaJl-kwhAbUph6ntHa9D70dnHXBf news_2.csv
Processing file 1Ozc9E-ZtQzqhLqJDfpimQoPt7dz4Ap8n news.csv
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1faXvPkQOHybpYZV48FSzEtdmA1KgGxfS
To: /content/forecast_data/candles_2.csv
100% 89.0k/89.0k [00:00<00:00, 83.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-YxO86oYLhgM77QPr4UFRzoYlkkuUil-
To: /content/forecast_data/candles.csv
100% 1.25M/1.25M [00:00<00:00, 184MB/s]
Downloading...
From: https://drive.google.com/uc?id=1p7fdkaJl-kwhAbUph6ntHa9D70dnHXBf
To: /content/forecast_data/news_2.csv
100% 8.73M/8.73M [00:00<00:00, 32.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Ozc9E-ZtQzqhLqJDfpimQoPt7dz4Ap8n
To: /content/forecast_data/news.csv
10

In [3]:
import pandas as pd
import numpy as np


In [4]:
# Пути к файлам
path1 = '/content/forecast_data/candles.csv'
path2 = '/content/forecast_data/candles_2.csv'

# Загружаем оба файла
df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)

# Объединяем и удаляем дубликаты
train_candles = pd.concat([df1, df2], ignore_index=True).drop_duplicates()

# Сохраняем результат
train_candles.to_csv('/content/forecast_data/candles_merged.csv', index=False)

print(f'Объединено: {len(train_candles)} строк (после удаления дубликатов)')


Объединено: 25621 строк (после удаления дубликатов)


In [5]:
len(train_candles['ticker'].unique())

19

In [6]:
train_candles['ticker'].value_counts()

Unnamed: 0_level_0,count
ticker,Unnamed: 1_level_1
AFLT,1351
ALRS,1351
CHMF,1351
GAZP,1351
LKOH,1351
MAGN,1351
MGNT,1351
MOEX,1351
ROSN,1351
SBER,1351


# Feature-инжиниринг

Добавленные фичи

1. Доходности
- **log_return** — логарифмическая доходность  
  $$ \ln\left(\frac{Close_t}{Close_{t-1}}\right) $$

- **log_return_lag1** — лаг доходности (на 1 шаг назад)  
  $$ \ln\left(\frac{Close_{t-1}}{Close_{t-2}}\right) $$

- **log_return_lag2** — лаг доходности (на 2 шага назад)  
  $$ \ln\left(\frac{Close_{t-2}}{Close_{t-3}}\right) $$

---

2. Скользящие средние и MACD
- **close_over_ema20** — нормализованная цена закрытия  
  $$ \frac{Close_t}{EMA_{20}(Close_t)} $$

- **macd** — индикатор MACD (разница EMA12 и EMA26)  
  $$ MACD_t = EMA_{12}(Close_t) - EMA_{26}(Close_t) $$

- **macd_signal** — сигнальная линия MACD (EMA9 от MACD)  
  $$ Signal_t = EMA_{9}(MACD_t) $$

---

3. Волатильность
- **rolling_vol_10** — скользящее стандартное отклонение лог-доходностей (10 периодов)  
  $$ \sigma_{10}(r_t) = \sqrt{\frac{1}{10}\sum_{i=0}^{9}(r_{t-i}-\bar{r})^2} $$

- **atr_14** — Average True Range (14 периодов)  
  $$ TR_t = \max \{High_t - Low_t,\ |High_t - Close_{t-1}|,\ |Low_t - Close_{t-1}|\} $$
  
  $$ ATR_t^{(14)} = \frac{1}{14}\sum_{i=0}^{13} TR_{t-i} $$

---

4. Форма свечи
- **candle_direction** — направление свечи  
  $$ \begin{cases}
  1, & \text{если } Close_t > Open_t \\
  0, & \text{иначе}
  \end{cases} $$

---

5. Объём
- **volume_ratio** — отношение текущего объёма к среднему  
  $$ \frac{Volume_t}{volume\_ma\_{10}} $$

---

6. Календарные фичи
- **dow_sin**, **dow_cos** — синус/косинус дня недели (циклическое кодирование)  
  $$ dow\_sin = \sin\left(\frac{2\pi \cdot dayofweek}{7}\right) $$
  
  $$ dow\_cos = \cos\left(\frac{2\pi \cdot dayofweek}{7}\right) $$


In [7]:
def add_technical_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Добавляет только информативные и некоррелированные технические фичи.
    Вход: df с колонками ['open', 'close', 'high', 'low', 'volume', 'begin', 'ticker']
    Выход: df с колонками: [
      'open', 'close', 'high', 'low', 'volume', 'begin', 'ticker',
      'log_return', 'log_return_lag1', 'log_return_lag2',
      'close_over_ema20', 'macd', 'macd_signal',
      'rolling_vol_10', 'atr_14',
      'candle_direction',
      'volume_ratio',
      'dow_sin', 'dow_cos'
    ]
    """
    df = df.copy()
    df['begin'] = pd.to_datetime(df['begin'])
    df = df.sort_values(['ticker', 'begin']).reset_index(drop=True)

    def compute_features(group):
        g = group.copy().reset_index(drop=True)

        # --- 1. Лог-доходности и лаги ---
        g['log_return'] = np.log(g['close'] / g['close'].shift(1))
        g['log_return_lag1'] = g['log_return'].shift(1)
        g['log_return_lag2'] = g['log_return'].shift(2)
        g['log_return_lag3'] = g['log_return'].shift(3)
        g['log_return_lag4'] = g['log_return'].shift(4)
        g['log_return_lag5'] = g['log_return'].shift(5)

        # --- 2. Доходности на окнах ---
        # 5-дневная доходность от t-10 до t-5
        g['log_return_window_5'] = np.log(g['close'].shift(5)) - np.log(g['close'].shift(10))

        # 10-дневная доходность от t-20 до t-10
        g['log_return_window_10'] = np.log(g['close'].shift(10)) - np.log(g['close'].shift(20))


        # --- 2. Нормированное отклонение от тренда ---
        ema_20 = g['close'].ewm(span=20, adjust=False).mean()
        g['close_over_ema20'] = g['close'] / ema_20

        # --- 3. MACD и сигнальная линия ---
        ema_12 = g['close'].ewm(span=12, adjust=False).mean()
        ema_26 = g['close'].ewm(span=26, adjust=False).mean()
        g['macd'] = ema_12 - ema_26
        g['macd_signal'] = g['macd'].ewm(span=9, adjust=False).mean()

        # --- 4. Волатильность (скользящая std лог-доходности) ---
        g['rolling_vol_10'] = g['log_return'].rolling(window=10, min_periods=2).std()

        # --- 5. ATR (Average True Range) ---
        tr1 = g['high'] - g['low']
        tr2 = abs(g['high'] - g['close'].shift(1))
        tr3 = abs(g['low'] - g['close'].shift(1))
        tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
        g['atr_14'] = tr.rolling(window=14, min_periods=1).mean()

        # --- 6. Направление свечи (бинарное) ---
        g['candle_direction'] = (g['close'] > g['open']).astype(int)

        # --- 7. Относительный объём (аномалия) ---
        volume_ma_10 = g['volume'].rolling(window=10, min_periods=1).mean()
        g['volume_ratio'] = g['volume'] / volume_ma_10

        # --- 8. Циклическое кодирование дня недели ---
        dow = g['begin'].dt.dayofweek
        g['dow_sin'] = np.sin(2 * np.pi * dow / 7)
        g['dow_cos'] = np.cos(2 * np.pi * dow / 7)

        return g

    df_out = df.groupby('ticker', group_keys=False).apply(compute_features).reset_index(drop=True)
    return df_out

In [8]:
train_candles = add_technical_features(train_candles)

  df_out = df.groupby('ticker', group_keys=False).apply(compute_features).reset_index(drop=True)


In [9]:
train_candles.head()

Unnamed: 0,open,close,high,low,volume,begin,ticker,log_return,log_return_lag1,log_return_lag2,...,log_return_window_10,close_over_ema20,macd,macd_signal,rolling_vol_10,atr_14,candle_direction,volume_ratio,dow_sin,dow_cos
0,81.5,81.7,83.2,81.16,29755530,2020-06-19,AFLT,,,,...,,1.0,0.0,0.0,,2.04,1,1.0,-0.433884,-0.900969
1,81.72,82.1,83.98,80.26,18502950,2020-06-22,AFLT,0.004884,,,...,,1.004428,0.031909,0.006382,,2.88,1,0.766827,0.0,1.0
2,82.04,81.2,82.48,80.4,16848930,2020-06-23,AFLT,-0.011023,0.004884,,...,,0.99404,-0.01525,0.002055,0.011248,2.613333,0,0.77636,0.781831,0.62349
3,79.78,80.58,80.8,78.22,21559860,2020-06-25,AFLT,-0.007665,-0.011023,0.004884,...,,0.987725,-0.101482,-0.018652,0.008384,2.705,1,0.995064,0.433884,-0.900969
4,80.5,79.38,81.44,78.76,14677280,2020-06-26,AFLT,-0.015004,-0.007665,-0.011023,...,,0.975523,-0.263614,-0.067644,0.008598,2.7,0,0.724128,-0.433884,-0.900969


# Обучение модели

#### Модель Ridge
для дневной доходности следующего дня

In [10]:
# ==============================================
# FINAL: Ridge (multi-output) для r_{t+1..t+20}
# Без утечек + OHE тикера + временной декай
# ==============================================
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from pandas.tseries.offsets import BDay

# -------------------
# Конфигурация
# -------------------
SEED = 0
np.random.seed(SEED)

TARGET_DAY = pd.Timestamp('2025-09-08')  # дата t0 (последний день в данных)
H = 20          # горизонт будущих дней
K = 20          # длина окна прошлого
VAL_Q = 0.8     # доля вал внутри "allowed" окон для подбора alpha
HALF_LIFE_BD = 60  # полупериод временного декай (в рабочих днях)
PER_H_ALPHA = True  # True = отдельный alpha и модель для каждого горизонта

# -------------------
# 0) ДАННЫЕ
# -------------------
df = train_candles.copy()
df['begin'] = pd.to_datetime(df['begin'])
df = df.sort_values(['ticker','begin']).reset_index(drop=True)

# цена для доходности
price_col = 'adj_close' if 'adj_close' in df.columns else 'close'
df[price_col] = pd.to_numeric(df[price_col], errors='coerce')
df.loc[df[price_col] == 0, price_col] = np.nan
df[price_col] = df.groupby('ticker', group_keys=False)[price_col].apply(lambda s: s.ffill().bfill())

# однодневная форвард-доходность r_{t+1} = P_{t+1}/P_t - 1
df['target_return_1d'] = df.groupby('ticker')[price_col].shift(-1) / df[price_col] - 1

# признаки (подстройте под свои реальные колонки)
BASE_FEATURES = [
    'open','close','high','low','volume',
    'log_return','log_return_lag1','log_return_lag2','log_return_lag3','log_return_lag4','log_return_lag5',
    'log_return_window_5','log_return_window_10',
    'close_over_ema20','macd','macd_signal',
    'rolling_vol_10','atr_14','candle_direction','volume_ratio',
    'dow_sin','dow_cos',
]
RET_COL = 'target_return_1d'
for c in BASE_FEATURES:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# контроль: последняя дата по каждому тикеру
last_by_tic = df.groupby('ticker')['begin'].max()
if not (last_by_tic == TARGET_DAY).all():
    bad = last_by_tic[last_by_tic != TARGET_DAY]
    raise ValueError(f"Не у всех тикеров последняя дата = {TARGET_DAY.date()}. Несовпадения:\n{bad}")

# -------------------
# 1) ЭМБАРГО И «РАЗРЕШЁННОЕ» ПРОШЛОЕ
# -------------------
final_cutoff_t_end = TARGET_DAY - BDay(H)   # окна только с t_end ≤ t0 − H

# -------------------
# 2) ПРЕПРОЦЕССОР (fit только на прошлом: begin < t0 − H)
# -------------------
pre_fit_mask = df['begin'] < final_cutoff_t_end
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])
num_pipe.fit(df.loc[pre_fit_mask, BASE_FEATURES])

X_scaled_all = pd.DataFrame(
    num_pipe.transform(df[BASE_FEATURES]),
    columns=BASE_FEATURES, index=df.index
)

# -------------------
# 3) ОКНА (K) + ТАРГЕТЫ (H)
# -------------------
def build_windowed_dataset(df_sorted, X_scaled, feature_cols, ret_col, K=20, H=20):
    X_list, Y_list, meta = [], [], []
    F = len(feature_cols)
    for tic, dft in df_sorted.groupby('ticker', sort=False):
        idx = dft.index.to_numpy()
        X_tic = X_scaled.loc[idx, feature_cols].to_numpy()
        r1_tic = df_sorted.loc[idx, ret_col].to_numpy()
        for tpos in range(K-1, len(idx)-H):
            X_win = X_tic[tpos-K+1:tpos+1, :]      # [K,F] (t-19..t)
            x_vec = X_win.reshape(K*F)
            y_vec = r1_tic[tpos+1:tpos+1+H]        # r_{t+1..t+20}
            if not (np.isfinite(x_vec).all() and np.isfinite(y_vec).all()):
                continue
            X_list.append(x_vec.astype(np.float32))
            Y_list.append(y_vec.astype(np.float32))
            meta.append((tic, df_sorted.loc[idx[tpos], 'begin']))
    X = np.vstack(X_list)
    Y = np.vstack(Y_list)
    meta_df = pd.DataFrame(meta, columns=['ticker','t_end'])
    return X, Y, meta_df

X_all, Y_all, meta_all = build_windowed_dataset(df, X_scaled_all, BASE_FEATURES, RET_COL, K=K, H=H)

# ----- оставляем ТОЛЬКО «разрешённые» окна: t_end ≤ t0 − H -----
allowed_mask = meta_all['t_end'] <= final_cutoff_t_end
X_allowed, Y_allowed = X_all[allowed_mask], Y_all[allowed_mask]
meta_allowed = meta_all.loc[allowed_mask].reset_index(drop=True)

print("Allowed windows:",
      "X_allowed", X_allowed.shape, "Y_allowed", Y_allowed.shape)

# -------------------
# 4) One-Hot тикера
# -------------------
all_tickers = sorted(df['ticker'].unique().tolist())
tic2pos = {t:i for i,t in enumerate(all_tickers)}

def ohe_from_series(ticker_series: pd.Series) -> np.ndarray:
    M = np.zeros((len(ticker_series), len(all_tickers)), dtype=np.float32)
    ts = ticker_series.to_numpy()
    for i, t in enumerate(ts):
        M[i, tic2pos[t]] = 1.0
    return M

OHE_allowed = ohe_from_series(meta_allowed['ticker'])
X_allowed_ext = np.hstack([X_allowed, OHE_allowed])  # окно + OHE

# -------------------
# 5) Временной декай (sample weights) — без утечки
# -------------------
t_cut = final_cutoff_t_end.normalize()
delta_bd = np.array(
    [np.busday_count(d.date(), t_cut.date()) for d in meta_allowed['t_end']],
    dtype=np.int32
)
w_allowed = (0.5) ** (delta_bd / HALF_LIFE_BD)

# -------------------
# 6) Внутри allowed: train/val по времени для подбора alpha
# -------------------
val_start_allowed = meta_allowed['t_end'].quantile(VAL_Q)
train_mask = meta_allowed['t_end'] <  val_start_allowed
val_mask   = meta_allowed['t_end'] >= val_start_allowed

X_tr_ext,  Y_tr  = X_allowed_ext[train_mask], Y_allowed[train_mask]
X_val_ext, Y_val = X_allowed_ext[val_mask],   Y_allowed[val_mask]
w_tr, w_val      = w_allowed[train_mask],     w_allowed[val_mask]

print("Train:", X_tr_ext.shape, "Val:", X_val_ext.shape)

def multioutput_mae(y_true, y_pred):
    return float(np.mean(np.abs(y_true - y_pred)))

# -------------------
# 7) Подбор alpha и ОБУЧЕНИЕ
# -------------------
if not PER_H_ALPHA:
    # Вариант 1: один alpha для всех горизонтов (быстро)
    def try_alpha(a):
        mdl = Ridge(alpha=a, fit_intercept=True, random_state=SEED)
        mdl.fit(X_tr_ext, Y_tr, sample_weight=w_tr)
        pred = mdl.predict(X_val_ext)
        return multioutput_mae(Y_val, pred)

    best_alpha, best_mae = None, np.inf
    for a in np.logspace(-3, 2, 20):
        mae = try_alpha(a)
        if mae < best_mae:
            best_mae, best_alpha = mae, float(a)
    lo, hi = max(best_alpha/3, 1e-6), best_alpha*3
    for a in np.logspace(np.log10(lo), np.log10(hi), 25):
        mae = try_alpha(a)
        if mae < best_mae:
            best_mae, best_alpha = mae, float(a)
    print(f"[ONE-ALPHA] Best alpha: {best_alpha:.6g} | MAE_val={best_mae:.6f}")

    final_model = Ridge(alpha=best_alpha, fit_intercept=True, random_state=SEED)
    final_model.fit(X_allowed_ext, Y_allowed, sample_weight=w_allowed)

else:
    # Вариант 2: отдельный alpha и модель для каждого горизонта (лучше, всё ещё быстро)
    alphas_grid = np.logspace(-3, 2, 20)
    models_h, best_alpha_h = [], []
    for h in range(Y_tr.shape[1]):  # 0..19 (t+1..t+20)
        ytr_h, yvl_h = Y_tr[:, h], Y_val[:, h]
        best_a, best_m = None, np.inf
        # coarse
        for a in alphas_grid:
            mdl = Ridge(alpha=a, fit_intercept=True, random_state=SEED)
            mdl.fit(X_tr_ext, ytr_h, sample_weight=w_tr)
            pred = mdl.predict(X_val_ext)
            m = float(np.mean(np.abs(yvl_h - pred)))
            if m < best_m:
                best_m, best_a = m, float(a)
        # refine
        lo, hi = max(best_a/3, 1e-6), best_a*3
        for a in np.logspace(np.log10(lo), np.log10(hi), 25):
            mdl = Ridge(alpha=a, fit_intercept=True, random_state=SEED)
            mdl.fit(X_tr_ext, ytr_h, sample_weight=w_tr)
            pred = mdl.predict(X_val_ext)
            m = float(np.mean(np.abs(yvl_h - pred)))
            if m < best_m:
                best_m, best_a = m, float(a)
        best_alpha_h.append(best_a)

        # финальное обучение на всех allowed окна с весами
        mdl = Ridge(alpha=best_a, fit_intercept=True, random_state=SEED)
        mdl.fit(X_allowed_ext, Y_allowed[:, h], sample_weight=w_allowed)
        models_h.append(mdl)

    print("[PER-H] Подобраны alphas по горизонтам (первые 5):", [round(a,6) for a in best_alpha_h[:5]])

# -------------------
# 8) SUBMISSION на t0: окно K, оканчивающееся t0, для каждого тикера
# -------------------
def make_window_for_date_ext(df_all, X_scaled_all, feature_cols, ticker, t_date, K=20):
    dft = df_all[df_all['ticker']==ticker].sort_values('begin')
    if dft['begin'].iloc[-1] != t_date:
        raise ValueError(f"{ticker}: последняя дата {dft['begin'].iloc[-1].date()} != {t_date.date()}")
    idx = dft.index.to_numpy()
    last_idx = idx[-K:]
    if len(last_idx) < K:
        raise ValueError(f"{ticker}: недостаточно строк для окна K={K}")
    X_win = X_scaled_all.loc[last_idx, feature_cols].to_numpy()
    x_vec = X_win.reshape(K*len(feature_cols)).astype(np.float32)
    # OHE тикера
    ohe = np.zeros(len(all_tickers), dtype=np.float32)
    ohe[tic2pos[ticker]] = 1.0
    return np.concatenate([x_vec, ohe], axis=0)  # (K*F + n_tickers,)

rows = []
for tic in all_tickers:
    x_vec_ext = make_window_for_date_ext(df, X_scaled_all, BASE_FEATURES, tic, TARGET_DAY, K=K)
    if PER_H_ALPHA:
        y_hat = np.array([models_h[h].predict(x_vec_ext.reshape(1,-1))[0] for h in range(H)], dtype=float)
    else:
        y_hat = final_model.predict(x_vec_ext.reshape(1,-1))[0]
    rows.append({"ticker": tic, **{f"p{i+1}": float(y_hat[i]) for i in range(H)}})

submission = pd.DataFrame(rows).sort_values('ticker').reset_index(drop=True)
# контроль качества файла
assert submission.shape[1] == (1 + H), f"submission columns mismatch: {submission.shape}"
assert submission.isna().sum().sum() == 0, "NaN в submission!"

def transform_row(row):
    # копируем, чтобы не портить исходные данные
    new_row = row.copy()
    for j in range(2, 21):
        new_row.iloc[j] = (new_row.iloc[j - 1] + 1) * (new_row.iloc[j] + 1) - 1
    return new_row


submission = submission.apply(transform_row, axis=1)

submission.to_csv("submission.csv", index=False)
print("submission.csv создан. Форма:", submission.shape)
print(submission.head())


Allowed windows: X_allowed (24728, 440) Y_allowed (24728, 20)
Train: (19781, 459) Val: (4947, 459)


  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(*arrays, *other_args, **kwargs)
  return f(

[PER-H] Подобраны alphas по горизонтам (первые 5): [300.0, 300.0, 300.0, 300.0, 300.0]
submission.csv создан. Форма: (19, 21)
  ticker        p1        p2        p3        p4        p5        p6  \
0   AFLT  0.003094 -0.005721 -0.005024 -0.000554  0.001993 -0.002291   
1   ALRS  0.004726 -0.002787 -0.004065  0.000994  0.002771 -0.004142   
2   CHMF  0.000948 -0.004642 -0.006675 -0.001939 -0.000098 -0.001472   
3   GAZP  0.003260  0.001522  0.003570  0.008301  0.011327  0.014672   
4   GMKN -0.000823 -0.002076 -0.003455  0.001680 -0.003049 -0.001375   

         p7        p8        p9  ...       p11       p12       p13       p14  \
0 -0.000801  0.002000  0.000254  ... -0.005787 -0.005857 -0.000167  0.001050   
1 -0.009703 -0.004991 -0.007173  ... -0.008785 -0.017194 -0.005223 -0.004482   
2 -0.004910 -0.004147 -0.005828  ... -0.012242 -0.013966 -0.011308 -0.008939   
3  0.015103  0.018226  0.014386  ...  0.011874  0.012979  0.013730  0.012795   
4 -0.002678  0.005245 -0.002193  ... -0.0

In [11]:
import os, joblib
os.makedirs('/content', exist_ok=True)

# Сохраняем препроцессор всегда
joblib.dump(num_pipe, '/content/preprocessor.joblib')

if PER_H_ALPHA:
    # режим с 20 моделями — сохраняем их и метаданные
    joblib.dump(
        {
            "models_h": models_h,            # список из 20 Ridge-моделей
            "best_alpha_h": best_alpha_h,    # список из 20 alpha
            "all_tickers": all_tickers,
            "tic2pos": tic2pos,
            "BASE_FEATURES": BASE_FEATURES,
            "K": K, "H": H
        },
        '/content/final_ridge_models_h.joblib'
    )
    print("Сохранено: /content/final_ridge_models_h.joblib и /content/preprocessor.joblib")
else:
    # режим с одной моделью — сохраняем final_model и метаданные
    joblib.dump(
        {
            "model": final_model,            # единая Ridge-модель
            "best_alpha": best_alpha,
            "all_tickers": all_tickers,
            "tic2pos": tic2pos,
            "BASE_FEATURES": BASE_FEATURES,
            "K": K, "H": H
        },
        '/content/final_ridge_model.joblib'
    )
    print("Сохранено: /content/final_ridge_model.joblib и /content/preprocessor.joblib")


Сохранено: /content/final_ridge_models_h.joblib и /content/preprocessor.joblib


# Рассчет метрик

In [12]:
# === ЕДИНЫЕ МЕТРИКИ (объединённая версия) ===
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
import numpy as np
import pandas as pd

def cum_return(r):  # r shape (n,H) или (H,)
    r = np.asarray(r)
    if r.ndim == 1:
        return np.prod(1.0 + r) - 1.0
    return np.prod(1.0 + r, axis=1) - 1.0

def _horizon_metrics_single(y_tr_true, y_tr_pred, y_val_true, y_val_pred):
    # MAE и нормализация относительно "нулевого" бейзлайна (всегда 0)
    mae = float(np.mean(np.abs(y_val_true - y_val_pred)))
    mae_base = 0.018
    mae_norm = 1.0 - mae / mae_base

    # Направление (DA)
    da = float(np.mean(np.sign(y_val_pred) == np.sign(y_val_true)))

    # Brier: калибруем P(y>0 | y_pred) логистикой по train-предсказаниям
    y_tr_bin  = (y_tr_true  > 0).astype(int)
    y_val_bin = (y_val_true > 0).astype(int)
    p_base = float(y_tr_bin.mean())

    # устойчивый fallback на случай одного класса
    if (p_base == 0.0) or (p_base == 1.0):
        p_val = np.full_like(y_val_true, p_base, dtype=float)
    else:
        lr = LogisticRegression(max_iter=2000, solver='lbfgs', C=1.0, random_state=0)
        lr.fit(y_tr_pred.reshape(-1, 1), y_tr_bin)
        p_val = lr.predict_proba(y_val_pred.reshape(-1, 1))[:, 1]

    brier = float(brier_score_loss(y_val_bin, p_val))
    brier_base = 0.25
    brier_norm = 1.0 - brier / brier_base

    # NB: если нужно нормировать на сумму весов = 1, можно поделить на 1.1
    score = 0.7 * mae_norm + 0.3 * brier_norm + 0.1 * (1/da)
    return {"MAE": mae, "MAE_base": mae_base, "MAE_norm": mae_norm,
            "Brier": brier, "Brier_base": brier_base, "Brier_norm": brier_norm,
            "DA": da, "Score": score}

def eval_all_metrics(Y_tr, Y_tr_pred, Y_val, Y_val_pred):
    """
    Входы: матрицы формы (n_samples, H=20)
    Возвращает:
      per_h_df — метрики по каждому горизонту h=1..20,
      m_h1     — метрики для H=1 (это просто per_h_df с h=1),
      m_h20    — метрики для кумулятивной доходности H=20
    """
    H = Y_tr.shape[1]
    per_h = []
    for h in range(H):  # 0..H-1 => t+1..t+H
        m_h = _horizon_metrics_single(
            Y_tr[:, h], Y_tr_pred[:, h],
            Y_val[:, h], Y_val_pred[:, h]
        )
        m_h["h"] = h + 1
        per_h.append(m_h)
    per_h_df = pd.DataFrame(per_h)

    # H=1 (как в "втором" блоке)
    m_h1 = per_h_df.loc[per_h_df['h'] == 1].iloc[0].to_dict()

    # Кумулятивная доходность H=20 (как в "втором" блоке)
    R_tr_true  = cum_return(Y_tr[:, :H])
    R_tr_pred  = cum_return(Y_tr_pred[:, :H])
    R_val_true = cum_return(Y_val[:, :H])
    R_val_pred = cum_return(Y_val_pred[:, :H])
    m_h20 = _horizon_metrics_single(R_tr_true, R_tr_pred, R_val_true, R_val_pred)
    m_h20["h"] = "cum20"

    # Печать короткой сводки
    avg = per_h_df[["MAE_norm","Brier_norm","DA","Score"]].mean().to_dict()
    print("Средние метрики по h=1..20:",
          {k: round(v, 6) for k, v in avg.items()})
    print("H=1:",
          {k: round(v, 6) for k, v in m_h1.items() if k in ["MAE_norm","Brier_norm","DA","Score"]})
    print("Cum H=20:",
          {k: round(v, 6) for k, v in m_h20.items() if k in ["MAE_norm","Brier_norm","DA","Score"]})
    print("Топ-5 горизонтов по Score:")
    print(per_h_df.sort_values("Score", ascending=False).head(5)[["h","Score","MAE_norm","Brier_norm","DA"]])

    return per_h_df, m_h1, m_h20


In [13]:
# если у тебя X_tr_ext / X_val_ext (c OHE) — используй их
Y_tr_pred  = np.column_stack([models_h[h].predict(X_tr_ext)  for h in range(Y_tr.shape[1])])
Y_val_pred = np.column_stack([models_h[h].predict(X_val_ext) for h in range(Y_val.shape[1])])



In [14]:
per_h_df, m1, m20 = eval_all_metrics(Y_tr, Y_tr_pred, Y_val, Y_val_pred)


Средние метрики по h=1..20: {'MAE_norm': 0.184431, 'Brier_norm': -0.000229, 'DA': 0.600061, 'Score': 0.295712}
H=1: {'MAE_norm': 0.189741, 'Brier_norm': -1.8e-05, 'DA': 0.596321, 'Score': 0.300508}
Cum H=20: {'MAE_norm': -2.820202, 'Brier_norm': 0.00179, 'DA': 0.611684, 'Score': -1.810121}
Топ-5 горизонтов по Score:
     h     Score  MAE_norm  Brier_norm        DA
0    1  0.300508  0.189741   -0.000018  0.596321
1    2  0.298687  0.189824   -0.000099  0.602992
19  20  0.297759  0.188072   -0.000406  0.601577
18  19  0.297375  0.189307   -0.000501  0.606024
17  18  0.296879  0.185491   -0.000308  0.598342
