In [1]:
!pip -q install -U kaggle lightgbm pyarrow

import os
from getpass import getpass

token = getpass("Paste Kaggle Access Token (KGAT_...): ").strip()

os.makedirs("/root/.kaggle", exist_ok=True)
with open("/root/.kaggle/access_token", "w") as f:
    f.write(token)
os.chmod("/root/.kaggle/access_token", 0o600)

os.environ["KAGGLE_API_TOKEN"] = token

if os.path.exists("/root/.kaggle/kaggle.json"):
    os.rename("/root/.kaggle/kaggle.json", "/root/.kaggle/kaggle.json.bak")

print("✅ Token saved. Verifying Kaggle API...")
!kaggle -v
!kaggle competitions list | head -n 5


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.4/86.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.4/256.4 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.3/159.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.0/189.0 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Token saved. Verifying Kaggle API...
Kaggle API 1.8.2
ref                                                            

In [None]:
import os, time, subprocess

# Kaggle competition slug passed to the CLI's `-c` option.
COMP = "drw-crypto-market-prediction"
# Directory that receives the downloaded archive.
OUT_DIR = "/content/drw_data"
# Where the archive is expected to land: <OUT_DIR>/<COMP>.zip
ZIP_PATH = OUT_DIR + "/" + COMP + ".zip"

# Make sure the download directory exists (no error if it already does).
os.makedirs(name=OUT_DIR, exist_ok=True)

def file_size_mb(path):
    """Return the size of ``path`` in MiB (bytes / 1024**2), or 0 if it does not exist."""
    if not os.path.exists(path):
        return 0
    size_in_bytes = os.path.getsize(path)
    return size_in_bytes / (1024**2)

print("Starting download (this is large ~6GB). If it disconnects, rerun this cell.")

# Upper bound on download attempts before giving up.
MAX_ATTEMPTS = 50
# Seconds to wait between attempts.
RETRY_DELAY_S = 10
# Only run the (slow) integrity test once the archive has reached this size in MB.
MIN_ZIP_MB_FOR_CHECK = 100

# Call the download repeatedly until the zip passes `unzip -t`.
# (Original author's note: Kaggle resumes a partial download automatically —
# not verified here.)
for attempt in range(1, MAX_ATTEMPTS + 1):
    print(f"\n=== Attempt {attempt} ===")
    # Run the download; a failed download is tolerated and simply retried.
    try:
        download = subprocess.run(
            ["kaggle", "competitions", "download", "-c", COMP, "-p", OUT_DIR],
            check=False
        )
        if download.returncode != 0:
            print("kaggle exited with code", download.returncode)
    except FileNotFoundError as e:
        # The executable itself is missing: retrying cannot help.
        raise RuntimeError(
            "The `kaggle` command was not found; run the install cell first."
        ) from e
    except OSError as e:
        print("Download call error:", e)

    sz = file_size_mb(ZIP_PATH)
    print(f"Current zip size: {sz:.1f} MB")

    # Test integrity once the file has some size; this only passes when the
    # download is complete. (file_size_mb returns 0 for a missing file, so the
    # size test also covers existence.)
    if sz > MIN_ZIP_MB_FOR_CHECK:
        zip_check = subprocess.run(["unzip", "-t", ZIP_PATH], capture_output=True, text=True)
        if zip_check.returncode == 0:
            print(" Zip integrity OK (unzip -t passed).")
            break
        print("Zip not complete yet (unzip -t failed). Continue downloading...")

    # No point in waiting after the final attempt.
    if attempt < MAX_ATTEMPTS:
        time.sleep(RETRY_DELAY_S)
else:
    # Loop ended without `break`: the archive never passed the integrity test.
    # Fail loudly instead of reporting "Done" for an unusable file.
    raise RuntimeError(
        f"Download did not complete after {MAX_ATTEMPTS} attempts "
        f"(zip size {file_size_mb(ZIP_PATH):.1f} MB). Rerun this cell."
    )

print("\nDone. Zip path:", ZIP_PATH, "size(MB):", file_size_mb(ZIP_PATH))

Starting download (this is large ~6GB). If it disconnects, rerun this cell.

=== Attempt 1 ===


In [3]:
!unzip -q /content/drw_data/drw-crypto-market-prediction.zip -d /content/drw_data
!ls -lh /content/drw_data | head -n 30


total 13G
-rw-r--r-- 1 root root 6.0G Jul  9 23:28 drw-crypto-market-prediction.zip
-rw-r--r-- 1 root root  14M Jul  9 23:18 sample_submission.csv
-rw-r--r-- 1 root root 3.2G Jul  9 23:18 test.parquet
-rw-r--r-- 1 root root 3.1G Jul  9 23:22 train.parquet


In [2]:
import os

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer

# Directory holding the extracted competition files.
DATA_DIR = "/content/drw_data"
# One model is trained per seed and the predictions are averaged.
SEEDS = [42, 202, 3407]
# Name of the target column in the training frame.
LABEL_COL = "label"

# Fail fast with an actionable message when the inputs are not there yet
# (e.g. this cell was run before the download/unzip cells).
required_inputs = [
    f"{DATA_DIR}/train.parquet",
    f"{DATA_DIR}/test.parquet",
    f"{DATA_DIR}/sample_submission.csv",
]
missing_inputs = [p for p in required_inputs if not os.path.exists(p)]
if missing_inputs:
    raise FileNotFoundError(
        "Missing input file(s): " + ", ".join(missing_inputs)
        + ". Run the download and unzip cells first."
    )

train = pd.read_parquet(f"{DATA_DIR}/train.parquet")
test  = pd.read_parquet(f"{DATA_DIR}/test.parquet")
sub   = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# The second column of the sample submission is the one to fill with predictions.
pred_col = sub.columns[1]
print("train:", train.shape, "test:", test.shape, "sub:", sub.shape)
print("submission columns:", sub.columns.tolist())

def add_row_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with six per-row ratio features appended.

    Reads the columns ``bid_qty``, ``ask_qty``, ``buy_qty``, ``sell_qty`` and
    ``volume`` and adds, in this order: ``ba_imbalance``, ``trade_imbalance``,
    ``volume_over_depth``, ``buy_ratio``, ``sell_ratio``, ``vol_over_trades``.
    A small constant is added to every denominator to avoid division by zero.

    Four of the new columns (``volume_over_depth``, ``vol_over_trades``,
    ``buy_ratio``, ``sell_ratio``) are then clipped to the 1st-99th percentile
    range computed from the frame that was passed in. The input frame itself
    is not modified.
    """
    tiny = 1e-12
    out = df.copy()

    # Shared denominators: quoted depth and traded quantity.
    depth = out["bid_qty"] + out["ask_qty"] + tiny
    traded = out["buy_qty"] + out["sell_qty"] + tiny
    vol = out["volume"]

    out["ba_imbalance"] = (out["bid_qty"] - out["ask_qty"]) / depth
    out["trade_imbalance"] = (out["buy_qty"] - out["sell_qty"]) / traded
    out["volume_over_depth"] = vol / depth
    out["buy_ratio"] = out["buy_qty"] / (vol + tiny)
    out["sell_ratio"] = out["sell_qty"] / (vol + tiny)
    out["vol_over_trades"] = vol / traded

    # Tame the tails of the unbounded ratios.
    to_clip = ("volume_over_depth", "vol_over_trades", "buy_ratio", "sell_ratio")
    for name in to_clip:
        column = out[name]
        lower = column.quantile(0.01)
        upper = column.quantile(0.99)
        out[name] = column.clip(lower, upper)
    return out

def zscore(a):
    """Standardise ``a``: subtract its mean and divide by its standard deviation.

    The input is converted with ``np.asarray``; a tiny constant is added to the
    standard deviation so a constant input yields zeros instead of a division
    by zero.
    """
    values = np.asarray(a)
    centred = values - values.mean()
    spread = values.std() + 1e-12
    return centred / spread

# Engineer the same row-level features on both frames.
train_fe = add_row_features(train)
test_fe  = add_row_features(test)

# Every column except the target and the timestamp is used as a feature.
feature_cols = [c for c in train_fe.columns if c not in [LABEL_COL, "timestamp"]]
X = train_fe[feature_cols]
y = train_fe[LABEL_COL]
X_test = test_fe[feature_cols]

# Fit the median imputer on train only and reuse it for test.
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)
X_test_imp = imp.transform(X_test)

# Train one model per seed and collect standardised test predictions.
preds = []
for seed in SEEDS:
    model = LGBMRegressor(
        n_estimators=3500,
        learning_rate=0.02,
        num_leaves=64,
        subsample=0.8,
        # LightGBM ignores `subsample` (bagging_fraction) unless the bagging
        # frequency is > 0; without this the row subsampling is silently off.
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=seed,
    )
    model.fit(X_imp, y)
    # Standardise per model so every seed contributes on the same scale.
    preds.append(zscore(model.predict(X_test_imp)))

# Average across seeds, then standardise the blend once more.
pred_ens = zscore(np.mean(preds, axis=0))

# Predictions are written positionally, so the row counts must agree.
if len(pred_ens) != len(sub):
    raise ValueError(
        f"Prediction count ({len(pred_ens)}) does not match the sample "
        f"submission row count ({len(sub)})."
    )

sub_out = sub.copy()
sub_out[pred_col] = pred_ens
out_path = "/content/submission_ensemble.csv"
sub_out.to_csv(out_path, index=False)

print("Saved:", out_path)
sub_out.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drw_data/train.parquet'