In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory that holds the individual submission files.
SUB_DIR = Path("./")

# Submissions to blend; comment entries in or out to change the ensemble.
SUB_FILES = [
    "submission_1.csv",
    "submission_2.csv",
    # "submission_3.csv",
]

# Name of the row-identifier column shared by every submission.
ID_COL = "id"

# None -> use every column except ID_COL; otherwise an explicit list, e.g. ["target"].
TARGET_COLS = None

# Blended submissions are written here; the directory is created up front.
OUTPUT_DIR = Path("./ensembled_submissions")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

print("Config OK")

In [None]:
# Read every listed submission and tag its rows with the file it came from.
subs = [
    pd.read_csv(SUB_DIR / fname).assign(_source=fname)
    for fname in SUB_FILES
]
len(subs)

In [None]:
from functools import reduce

# Fail early with a clear message instead of an opaque IndexError further down.
assert subs, "SUB_FILES is empty: nothing to ensemble"

for i, df in enumerate(subs):
    assert ID_COL in df.columns, f"{SUB_FILES[i]}: ID_COL={ID_COL} not in columns"

# Keep only the columns present in every submission, in the order of the first
# submission. A plain set intersection would yield a hash-dependent order that
# can change between kernel runs and leak into the written CSV files.
other_column_sets = [set(df.columns) for df in subs[1:]]
common_cols = [
    c
    for c in subs[0].columns
    if c != "_source" and all(c in cols for cols in other_column_sets)
]

print("Общие колонки:", len(common_cols))
print(common_cols)

# Stable sort: rows sharing the same id keep their file order, so they are
# paired identically across submissions.
subs = [
    df[common_cols].sort_values(ID_COL, kind="mergesort").reset_index(drop=True)
    for df in subs
]

# After sorting, every submission must list exactly the same ids row by row.
for i in range(1, len(subs)):
    assert subs[i][ID_COL].equals(subs[0][ID_COL]), "ID-строки не совпадают между сабмитами"

# Template frame providing the id column for the ensembled outputs.
base = subs[0].copy()

In [None]:
# Resolve the list of prediction columns to ensemble.
if TARGET_COLS is None:
    # Default: every shared column except the identifier.
    TARGET_COLS = [c for c in base.columns if c != ID_COL]
elif isinstance(TARGET_COLS, str):
    # A bare column name would select a 1-D Series downstream and be iterated
    # character by character; wrap it so the stacking stays 3-D.
    TARGET_COLS = [TARGET_COLS]
else:
    # Accept any iterable (e.g. a tuple) but always work with a list.
    TARGET_COLS = list(TARGET_COLS)

# Surface configuration mistakes here rather than as a KeyError in a later cell.
missing_targets = [c for c in TARGET_COLS if c not in base.columns]
assert not missing_targets, f"TARGET_COLS not found in the common columns: {missing_targets}"
assert TARGET_COLS, "No target columns to ensemble"

print("TARGET_COLS:", TARGET_COLS)

In [None]:
# One prediction matrix per submission, stacked along a new leading model axis.
per_model_preds = [sub[TARGET_COLS].values for sub in subs]
pred_stack = np.stack(per_model_preds, axis=0)
pred_stack.shape  # (n_models, n_rows, n_targets)

In [None]:
# Aggregate across the model axis (axis 0 of pred_stack).
ens_mean = np.mean(pred_stack, axis=0)
ens_median = np.median(pred_stack, axis=0)

# Both outputs start from the id column of the base submission.
sub_mean = base[[ID_COL]].copy()
sub_median = base[[ID_COL]].copy()

# Fill each output frame with its aggregated prediction columns.
for frame, aggregated in ((sub_mean, ens_mean), (sub_median, ens_median)):
    for pos, col in enumerate(TARGET_COLS):
        frame[col] = aggregated[:, pos]

path_mean = OUTPUT_DIR / "submission_ens_mean.csv"
path_median = OUTPUT_DIR / "submission_ens_median.csv"

# Write both ensembles without the pandas index column.
for frame, out_path in ((sub_mean, path_mean), (sub_median, path_median)):
    frame.to_csv(out_path, index=False)

path_mean, path_median

In [None]:
def _average_ranks(values):
    """Return 1-based ascending ranks of a 1-D array, averaging ranks over ties.

    Equal values receive the mean of the rank positions they occupy, so the
    result does not depend on row order or on the sort implementation.
    NaN compares unequal to itself, so each NaN forms its own group and keeps
    a distinct rank at the end of the ordering.
    """
    values = np.asarray(values)
    n = values.shape[0]
    ranks = np.empty(n, dtype=float)
    if n == 0:
        return ranks

    order = np.argsort(values)
    ordered = values[order]
    # True wherever a new run of equal values begins in the sorted sequence.
    starts_group = np.concatenate(([True], ordered[1:] != ordered[:-1]))
    group_id = np.cumsum(starts_group) - 1
    group_size = np.bincount(group_id)
    group_last = np.cumsum(group_size)  # 1-based rank of the last member of each run
    # Mean of the consecutive ranks (last - size + 1 .. last) within each run.
    ranks[order] = (group_last - (group_size - 1) / 2.0)[group_id]
    return ranks


n_models, n_rows, n_targets = pred_stack.shape
rank_stack = np.zeros_like(pred_stack, dtype=float)

# Rank every model's predictions independently for each target column.
for m in range(n_models):
    for j in range(n_targets):
        rank_stack[m, :, j] = _average_ranks(pred_stack[m, :, j])

# Average the per-model ranks; the result stays on the rank scale (1..n_rows),
# not on the scale of the original predictions.
rank_mean = rank_stack.mean(axis=0)

sub_rank = base[[ID_COL]].copy()
for j, c in enumerate(TARGET_COLS):
    sub_rank[c] = rank_mean[:, j]

path_rank = OUTPUT_DIR / "submission_ens_rank.csv"
sub_rank.to_csv(path_rank, index=False)

path_rank