In [1]:
# ============================================================
# LG Aimers — Colab A100 Optimized Flat-path Submission Pipeline
#  + ROOM 0-day 보정(평일/주말 분리) 포함
#  + [마지막 전처리] 윈도우 내 양수 최댓값을 '둘째 큰 양수'로 클램프(주중/주말 분리)
#     - 양수가 없으면 0으로 전처리
#     - [주중 규칙 변경] 최댓값→둘째, 둘째→셋째로 클램프
#  - 입력(루트): train.csv, TRAIN_weather.csv, TRAIN_room.csv
#                TEST_00~09.csv, TEST_weather_00~09.csv, TEST_room_00~09.csv
#                sample_submission.csv
#  - 출력: submission_roomroom_1stclamp.csv
#  - 최적화:
#     * LightGBM GPU 자동 사용(가능 시) / 불가 시 CPU 폴백
#     * num_threads=전체 코어, dtype float32 다운캐스팅
#     * GPU 시 MultiOutputRegressor 병렬=1 (안정/안정성)
# ============================================================

import os, re, glob, unicodedata, warnings, math
import numpy as np, pandas as pd
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb

warnings.filterwarnings("ignore")
pd.set_option("mode.chained_assignment", None)

# -----------------------------
# 고정 설정
# -----------------------------
SEED_BASE = 42
np.random.seed(SEED_BASE)

PREDICT, INPUT_WINDOW_DAYS = 7, 35
DATE, KEY, TARGET = "영업일자","영업장명_메뉴명","매출수량"

N_ESTIMATORS_FULL = 1200
ANCHOR_K_FULL     = 7

NON_SELLING_MIN_DAYS        = 14
APPLY_METRIC_AWARE_WSMAPE   = True
APPLY_METRIC_AWARE_WK_HOLOV = True
FORCE_ACTIVE_MIN1           = True
FORCE_GLOBAL_MIN1           = True

CATEGORICAL_COLS = ["key_encoded","month","dayofweek"]

# Tweedie p 앙상블 (CV 생략 → 균등 가중치)
P_LIST = [1.1, 1.2, 1.3, 1.4]
P_WEIGHTS = {p: 1.0/len(P_LIST) for p in P_LIST}

# 제출 후처리 라운딩 임계치
BEST_THR = 0.130

NUM_THREADS = os.cpu_count() or 8  # 코랩 VM에 맞게 자동
USE_GPU_AUTO = True                # GPU 자동 감지 시도

# -----------------------------
# 유틸/헬퍼
# -----------------------------
def to_datetime_norm(s):
	return pd.to_datetime(s, errors="coerce").dt.normalize()

def _read_csv_robust(p):
	try:
		return pd.read_csv(p)
	except UnicodeDecodeError:
		return pd.read_csv(p, encoding="cp949")

def round_nonneg(pred, thr):
	pred = np.maximum(pred, 0.0)
	frac = pred - np.floor(pred)
	out  = np.where(frac >= thr, np.floor(pred)+1, np.floor(pred))
	return np.maximum(out, 0.0)

def downcast_float32(df, exclude_cols=()):
	x = df.copy()
	for c in x.columns:
		if c in exclude_cols:
			continue
		if pd.api.types.is_float_dtype(x[c]):
			x[c] = x[c].astype(np.float32)
		elif pd.api.types.is_integer_dtype(x[c]) and x[c].isna().sum()==0:
			pass
	return x

def lightgbm_gpu_available():
	if not USE_GPU_AUTO:
		return False
	try:
		X = np.random.RandomState(0).randn(128, 8).astype(np.float32)
		y = (np.random.RandomState(1).rand(128) * 10).astype(np.float32)
		mdl = lgb.LGBMRegressor(
			n_estimators=5, objective="tweedie", tweedie_variance_power=1.2,
			device="gpu", gpu_platform_id=0, gpu_device_id=0,
			num_threads=min(4, NUM_THREADS), verbose=-1
		)
		mdl.fit(X, y)
		_ = mdl.predict(X[:4])
		return True
	except Exception:
		return False

# -----------------------------
# 디바이스 결정
# -----------------------------
GPU_OK = lightgbm_gpu_available()
if GPU_OK:
	LGB_DEVICE_KW = dict(device="gpu", gpu_platform_id=0, gpu_device_id=0)
	MOR_N_JOBS = 1     # GPU일 때 타깃 병렬을 1로(안정성/메모리)
	print("[LightGBM] GPU enabled")
else:
	LGB_DEVICE_KW = dict()  # CPU
	MOR_N_JOBS = -1         # CPU 최대 병렬
	print("[LightGBM] GPU unavailable → CPU fallback")

# -----------------------------
# 0) ROOM 보정 유틸
# -----------------------------
def _canon(s):
	return re.sub(r"\s+","", unicodedata.normalize("NFKC", str(s).replace("\ufeff","").replace("\xa0"," ").strip()).lower()).replace("_","").replace("-","")

def load_room_flags(path_room):
	"""
	room CSV → 날짜별 'room_all_zero' 플래그(DataFrame[DATE, room_all_zero]).
	판정 기준: 그 날짜의 '모든 객실 컬럼 합계' == 0  → 1, else 0 (엄격)
	"""
	if not os.path.exists(path_room):
		return pd.DataFrame({DATE: pd.to_datetime([]), "room_all_zero": []})
	df = _read_csv_robust(path_room)
	cmap = {c:_canon(c) for c in df.columns}
	date_col = next((c for c in df.columns if cmap[c] in {_canon("영업일자"), _canon("일자"), _canon("날짜"), _canon("date")}), None)
	if date_col != DATE:
		df = df.rename(columns={date_col: DATE})
	df[DATE] = to_datetime_norm(df[DATE])

	num_cols = [c for c in df.columns if c != DATE]
	for c in num_cols:
		if not pd.api.types.is_numeric_dtype(df[c]):
			df[c] = pd.to_numeric(df[c], errors="coerce")
	day_sum = df.groupby(DATE)[num_cols].sum(min_count=1)
	flag = (day_sum.fillna(0).sum(axis=1) == 0).astype(int).rename("room_all_zero").reset_index()
	return flag

def apply_room_zero_fix(df_in, room_flags, window=28, min_obs=2):
	"""
	ROOM 전처리:
	  - room_flags의 room_all_zero==1 인 날짜에 한하여,
		해당 날짜의 모든 메뉴 매출을 '평상시 값'으로 대체
	  - 평상시 값 = (같은 주중/주말) & room_all_zero==0 & 과거(<t) 관측의 최근 window개 중앙값
		* 대체값 산출 불가 시 → 과거 전체(room_all_zero==0) 중앙값
		* 그래도 없으면 → 원값 유지
	"""
	if room_flags is None or room_flags.empty:
		return df_in.copy()

	df = df_in.merge(room_flags, on=DATE, how="left")
	df["room_all_zero"] = df["room_all_zero"].fillna(0).astype(int)
	df["is_weekend"] = (df[DATE].dt.dayofweek >= 5).astype(int)

	outs = []
	for k, g in df.groupby(KEY, sort=False):
		g = g.sort_values(DATE).copy()
		idxs = g.index[g["room_all_zero"]==1].tolist()
		for idx in idxs:
			d = g.at[idx, DATE]
			w = int(g.at[idx, "is_weekend"])
			past = g[(g[DATE] < d) & (g["is_weekend"]==w) & (g["room_all_zero"]==0)]
			cand = past[TARGET].dropna().astype(float).values
			if cand.size < min_obs:
				past_any = g[(g[DATE] < d) & (g["room_all_zero"]==0)]
				cand = past_any[TARGET].dropna().astype(float).values
			if cand.size >= min_obs:
				val = float(np.median(cand[-window:]))
				g.at[idx, TARGET] = max(0.0, val)
		outs.append(g.drop(columns=["is_weekend","room_all_zero"]))
	out = pd.concat(outs, axis=0).sort_values([KEY, DATE]).reset_index(drop=True)
	return out

# -----------------------------
# (NEW) 마지막 단계 전처리: 최댓값→둘째 큰 양수로 클램프(주중/주말 분리)
#        - 윈도우(df_in)에 양수가 없으면 0으로 처리
#        - [주중] 최댓값→둘째, 둘째→셋째로 추가 클램프
# -----------------------------
def clamp_max_to_second_by_weekpart(df_in, date_col=DATE, key_col=KEY, target_col=TARGET):
	df = df_in.copy()
	df["_is_weekend_"] = (df[date_col].dt.dayofweek >= 5).astype(int)

	outs = []
	for (k, we), g in df.groupby([key_col, "_is_weekend_"], sort=False):
		vals = g[target_col].astype(float).values
		pos_unique = np.unique(vals[vals > 0])

		if pos_unique.size == 0:
			g[target_col] = 0.0
		else:
			if we == 1:
				# 주말: 기존 로직 유지 — 최댓값을 둘째 값으로 클램프
				if pos_unique.size >= 2:
					largest, second = pos_unique[-1], pos_unique[-2]
					mask1 = (g[target_col] == largest)
					if mask1.any():
						g.loc[mask1, target_col] = np.minimum(g.loc[mask1, target_col].astype(float), float(second))
			else:
				# 주중: 최댓값→둘째, 그리고 둘째→셋째
				if pos_unique.size >= 2:
					largest, second = pos_unique[-1], pos_unique[-2]
					mask1 = (g[target_col] == largest)
					if mask1.any():
						g.loc[mask1, target_col] = np.minimum(g.loc[mask1, target_col].astype(float), float(second))
				if pos_unique.size >= 3:
					third = pos_unique[-3]
					mask2 = (g[target_col] == second)
					if mask2.any():
						g.loc[mask2, target_col] = np.minimum(g.loc[mask2, target_col].astype(float), float(third))

		outs.append(g.drop(columns=["_is_weekend_"]))
	out = pd.concat(outs, axis=0).sort_values([key_col, date_col]).reset_index(drop=True)
	return out


[LightGBM] GPU unavailable → CPU fallback


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


In [2]:

# -----------------------------
# 1) train 로드 & 표준화
# -----------------------------
train = _read_csv_robust("train.csv")

orig = list(train.columns)
cmap = {c:_canon(c) for c in orig}
date_alias   = {_canon(x) for x in ["영업일자","일자","날짜","date"]}
key_single   = {_canon(x) for x in ["영업장명_메뉴명","영업장명메뉴명","key","메뉴키"]}
store_alias  = {_canon(x) for x in ["영업장명","매장명","지점명","점포명","매장","영업장"]}
menu_alias   = {_canon(x) for x in ["메뉴명","상품명","제품명","메뉴"]}
targ_alias   = {_canon(x) for x in ["매출수량","판매수량","수량","qty","판매량"]}

date_col   = next((c for c in orig if cmap[c] in date_alias), None)
target_col = next((c for c in orig if cmap[c] in targ_alias), None)
key_col    = next((c for c in orig if cmap[c] in key_single), None)

if key_col is None:
	store_col = next((c for c in orig if cmap[c] in store_alias), None)
	menu_col  = next((c for c in orig if cmap[c] in menu_alias ), None)
	train[KEY] = train[store_col].astype(str).str.strip() + "_" + train[menu_col].astype(str).str.strip()
else:
	if key_col != KEY: train = train.rename(columns={key_col:KEY})
if date_col != DATE:     train = train.rename(columns={date_col:DATE})
if target_col != TARGET: train = train.rename(columns={target_col:TARGET})

train[DATE]   = to_datetime_norm(train[DATE])
train[TARGET] = pd.to_numeric(train[TARGET], errors="coerce").clip(lower=0)

print("[train]", train.shape, "keys=", train[KEY].nunique(), "기간:", train[DATE].min().date(), "→", train[DATE].max().date())

# -----------------------------
# 2) 날씨(학습) 로드 & 일단위 변환
# -----------------------------
weather_m = _read_csv_robust("TRAIN_weather.csv")
w = weather_m.rename(columns={"일시": DATE}).copy()
w[DATE] = to_datetime_norm(w[DATE])
col_tmax = next(c for c in w.columns if ("최고" in c and "기온" in c))
col_tavg = next(c for c in w.columns if ("평균" in c and "기온" in c))
w_day = (w[[DATE, col_tmax, col_tavg]]
		 .groupby(DATE, as_index=False).mean(numeric_only=True)
		 .rename(columns={col_tmax:"wx_tmax", col_tavg:"wx_tavg"}))

# -----------------------------
# 3) 테스트 파일/범위, 공휴일 세트
# -----------------------------
test_files = sorted([p for p in glob.glob("test/TEST_*.csv")
					 if re.fullmatch(r"TEST_\d{2}\.csv", os.path.basename(p))])

test_list = []
for p in test_files:
	t = _read_csv_robust(p)
	if DATE in t.columns:
		t[DATE] = to_datetime_norm(t[DATE])
	test_list.append(t)

dmin = min([train[DATE].min()] + [t[DATE].min() for t in test_list if DATE in t.columns])
dmax = max([train[DATE].max()] + [t[DATE].max() for t in test_list if DATE in t.columns]) + pd.Timedelta(days=PREDICT+14)

manual_days = pd.to_datetime([
	"2023-01-23","2023-01-24","2023-03-01","2023-05-05","2023-05-29","2023-06-06","2023-08-15",
	"2023-09-28","2023-09-29","2023-10-02","2023-10-03","2023-10-09","2023-12-25",
	"2024-01-01","2024-02-09","2024-02-12","2024-03-01","2024-04-10","2024-05-06","2024-05-15",
	"2024-06-06","2024-08-15","2024-09-16","2024-09-17","2024-09-18","2024-10-01","2024-10-03","2024-10-09","2024-12-25",
	"2025-01-01","2025-01-27","2025-01-28","2025-01-29","2025-01-30","2025-03-03",
	"2025-05-05","2025-05-06","2025-06-03","2025-06-06","2025-08-15","2025-10-03","2025-10-09","2025-12-25",
	"2026-01-01"
]).normalize()

try:
	import holidays as _hol
	years = sorted({d.year for d in pd.date_range(dmin, dmax, freq="D")})
	kr = _hol.KR(years=years)
	hol = pd.DatetimeIndex([pd.Timestamp(k) for k in kr]).normalize()
	hol = pd.DatetimeIndex(sorted(set(hol) | set(manual_days)))
except Exception:
	hol = pd.DatetimeIndex(sorted(set(manual_days)))
hol = hol[(hol>=dmin) & (hol<=dmax)]
HOL_WEEKDAY_SET = set(hol[hol.dayofweek<5])
print("HOL_WEEKDAY_SET size:", len(HOL_WEEKDAY_SET))

[train] (88844, 3) keys= 167 기간: 2023-01-01 → 2024-06-15
HOL_WEEKDAY_SET size: 39


In [3]:
# --- Hwadam meta (by DATE) ---
hwadam = _read_csv_robust("TRAIN_hwadam.csv").rename(columns={"영업일자": DATE})
hwadam[DATE] = to_datetime_norm(hwadam[DATE])
for c in ["화담숲","화담채","모노레일"]:
	if c in hwadam.columns:
		hwadam[c] = pd.to_numeric(hwadam[c], errors="coerce").fillna(0.0)

In [4]:
TARGET_STORES = {"화담숲주막","화담숲카페"}   # 조건 적용 대상 업장

# --- hwadam 공통 merge ---
hwadam = _read_csv_robust("TRAIN_hwadam.csv").rename(columns={"영업일자": DATE})
hwadam[DATE] = to_datetime_norm(hwadam[DATE])
for c in ["화담숲","화담채","모노레일"]:
    if c in hwadam.columns:
        hwadam[c] = pd.to_numeric(hwadam[c], errors="coerce").fillna(0.0)

TARGET_STORES = {"화담숲주막","화담숲카페"}

# ✅ 이것만 남기고,
def add_hwadam_feats(df_in, hw_day):
    df = df_in.merge(hw_day[[DATE,"화담숲","화담채","모노레일"]], on=DATE, how="left")
    store = df[KEY].astype(str).str.rsplit("_", n=1).str[0]
    mask  = store.isin(TARGET_STORES)
    df["hw_forest"] = 0.0; df["hw_chae"] = 0.0; df["hw_mono"] = 0.0
    df.loc[mask,"hw_forest"] = pd.to_numeric(df.loc[mask,"화담숲"],  errors="coerce").fillna(0.0)
    df.loc[mask,"hw_chae"]   = pd.to_numeric(df.loc[mask,"화담채"],  errors="coerce").fillna(0.0)
    df.loc[mask,"hw_mono"]   = pd.to_numeric(df.loc[mask,"모노레일"], errors="coerce").fillna(0.0)
    return df.drop(columns=[c for c in ["화담숲","화담채","모노레일"] if c in df.columns])

In [5]:
# -----------------------------
# 4) 학습측 안전 전처리(평일 공휴일 급등 완화)
# -----------------------------
x = train.copy().sort_values([KEY, DATE]).reset_index(drop=True)
x["dow"] = x[DATE].dt.dayofweek
x["is_nonhol_weekday"] = ((x["dow"]<5) & (~x[DATE].isin(HOL_WEEKDAY_SET))).astype(int)
x["is_hol_wk"]         = (x[DATE].isin(HOL_WEEKDAY_SET)).astype(int)
x["__val__"] = x[TARGET].astype(float)

outs = []
for k, g in x.groupby(KEY, sort=False):
	g = g.copy()
	g["med_wd_dow"] = np.nan
	for d in range(5):
		idx = (g["dow"]==d) & (g["is_nonhol_weekday"]==1)
		s = g.loc[idx,"__val__"]
		med = s.shift(1).rolling(window=4, min_periods=2).median()
		g.loc[idx,"med_wd_dow"] = med.values
	thr_up = (1.0+0.25)*g["med_wd_dow"]
	adj = ((g["is_hol_wk"]==1)&np.isfinite(g["med_wd_dow"])&(g["__val__"]>thr_up))
	g.loc[adj,"__val__"] = np.maximum(0.0, np.round((1.0+0.10)*g.loc[adj,"med_wd_dow"]))
	g = g.drop(columns=["med_wd_dow"])
	outs.append(g)

train_pre = pd.concat(outs, axis=0).drop(columns=["dow","is_nonhol_weekday","is_hol_wk"])
train_pre[TARGET] = train_pre["__val__"].astype(float).clip(lower=0)
train_pre = train_pre.drop(columns="__val__").sort_values([KEY, DATE]).reset_index(drop=True)

print("[train_pre]", train_pre.shape, "기간:", train_pre[DATE].min().date(), "→", train_pre[DATE].max().date())

# 학습 데이터에 날씨 결합
train_pre = train_pre.merge(w_day, on=DATE, how="left")
train_pre = add_hwadam_feats(train_pre, hwadam)

# -----------------------------
# 4.5) ROOM 0-day 보정 (TRAIN)
# -----------------------------
room_train_flags = load_room_flags("TRAIN_room.csv")
train_pre = apply_room_zero_fix(train_pre, room_train_flags, window=28, min_obs=2)

# -----------------------------
# 4.9) [마지막 전처리] 최댓값→둘째 큰 양수 클램프(주중/주말 분리, 양수 없으면 0)
#     (주중: 최댓값→둘째, 둘째→셋째)
# -----------------------------
train_pre = clamp_max_to_second_by_weekpart(train_pre, date_col=DATE, key_col=KEY, target_col=TARGET)

[train_pre] (88844, 3) 기간: 2023-01-01 → 2024-06-15


In [6]:
# -----------------------------
# 5) 학습용 피처 생성
# -----------------------------
df = train_pre.copy().sort_values([KEY, DATE]).reset_index(drop=True)
df["month"] = df[DATE].dt.month
df["dayofweek"] = df[DATE].dt.dayofweek
df["day"] = df[DATE].dt.day
df["dayofyear"] = df[DATE].dt.dayofyear
df["is_month_end"] = df[DATE].dt.is_month_end.astype(int)
df["is_quarter_end"] = df[DATE].dt.is_quarter_end.astype(int)
df["is_weekend"] = (df["dayofweek"]>=5).astype(int)

# 미래 캘린더 피처
for h in range(1, PREDICT+1):
	dth = df[DATE] + pd.to_timedelta(h, unit="D")
	df[f"dow_h{h}"]        = dth.dt.dayofweek
	df[f"is_weekend_h{h}"] = (df[f"dow_h{h}"]>=5).astype(int)
	df[f"is_wkhol_h{h}"]   = dth.isin(pd.DatetimeIndex(HOL_WEEKDAY_SET)).astype(int)

# 랙/롤링
for lag in [7,14,21]:
	df[f"lag_{lag}"] = df.groupby(KEY)[TARGET].shift(lag)
for lag in [1,2,3,4,5,6]:
	df[f"lag_{lag}"] = df.groupby(KEY)[TARGET].shift(lag)

for wwin in [7,14]:
	df[f"rolling_mean_{wwin}"]   = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(wwin).mean())
	df[f"rolling_std_{wwin}"]    = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(wwin).std())
	df[f"rolling_median_{wwin}"] = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(wwin).median())

df["rolling_mean_21"] = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(21).mean())
df["ratio_mean7_21"]  = df["rolling_mean_7"]/df["rolling_mean_21"]
df["ratio_mean14_21"] = df["rolling_mean_14"]/df["rolling_mean_21"]

def _slope_raw(v):
	m = np.isfinite(v); v = v[m]; n=v.size
	if n<2: return np.nan
	x = np.arange(n, dtype=float); xm, ym = x.mean(), v.mean()
	denom = ((x-xm)**2).sum()
	if denom==0: return 0.0
	return float(((x-xm)*(v-ym)).sum()/denom)

df["slope_7"]  = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(7).apply(_slope_raw, raw=True))
df["slope_14"] = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(14).apply(_slope_raw, raw=True))

# 장기 무판매 플래그
def _flag_long_zero_block(g, min_days):
	is_zero = (g[TARGET]==0).astype(int)
	block_id = (is_zero != is_zero.shift()).cumsum()
	block_len = is_zero.groupby(block_id).transform("sum")
	return ((is_zero==1)&(block_len>=min_days)).astype(int)
df["is_long_zero_block"] = df.groupby(KEY, sort=False).apply(
	lambda g: _flag_long_zero_block(g, NON_SELLING_MIN_DAYS)
).reset_index(level=0, drop=True).astype(int)

# EWM
df["ewm_mean_7"] = df.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).ewm(span=7, adjust=False, min_periods=2).mean())

# 요일 Fourier
x_dow = df["dayofweek"].astype(float)
df["dow_sin"] = np.sin(2*np.pi*x_dow/7.0); df["dow_cos"] = np.cos(2*np.pi*x_dow/7.0)

# KEY 인코딩
cats = pd.Index(sorted(df[KEY].astype(str).unique()))
key2id = {k:i for i,k in enumerate(cats)}
df["key_encoded"] = df[KEY].astype(str).map(key2id).astype("category")

# 타깃 시프팅
for i in range(1, PREDICT+1):
	df[f"target_{i}"] = df.groupby(KEY)[TARGET].shift(-i)

df_trainF = df.dropna().reset_index(drop=True)

feat_cols = [
	"month","dayofweek","key_encoded","day","dayofyear","is_month_end","is_quarter_end","is_weekend",
	"rolling_mean_7","rolling_std_7","rolling_median_7",
	"rolling_mean_14","rolling_std_14","rolling_median_14",
	"rolling_mean_21","ratio_mean7_21","ratio_mean14_21",
	"slope_7","slope_14","ewm_mean_7","dow_sin","dow_cos","is_long_zero_block","wx_tmax","wx_tavg","hw_forest","hw_chae","hw_mono",
] + [c for c in df.columns if re.match(r"^lag_\d+$", c)] + [c for c in df.columns if re.match(r"^(dow_h\d|is_weekend_h\d|is_wkhol_h\d)$", c)]

# float32 다운캐스팅(카테고리 제외)
df_trainF = downcast_float32(df_trainF, exclude_cols=CATEGORICAL_COLS + [DATE, KEY])

print("[TrainF]", df_trainF.shape, "features:", len(feat_cols))

[TrainF] (68781, 68) features: 58


In [7]:
# -----------------------------
# 6) 모델 학습 (p-앙상블)
# -----------------------------
def fit_multiout_with_p(X, y, p, seed=SEED_BASE, n_estim=N_ESTIMATORS_FULL):
	base = lgb.LGBMRegressor(
		n_estimators=n_estim, learning_rate=0.04, num_leaves=63,
		feature_fraction=0.8, bagging_fraction=0.8, bagging_freq=1,
		lambda_l1=0.1, lambda_l2=0.1, random_state=seed,
		objective="tweedie", metric="mae", tweedie_variance_power=float(p),
		num_threads=NUM_THREADS, verbose=-1,
		**LGB_DEVICE_KW
	)
	m = MultiOutputRegressor(base, n_jobs=MOR_N_JOBS)
	X_ = X.copy()
	for c in ("key_encoded","month","dayofweek"):
		if c in X_.columns: X_[c] = X_[c].astype("category")
	X_ = downcast_float32(X_, exclude_cols=CATEGORICAL_COLS)
	y_ = y.astype(np.float32)
	m.fit(X_, y_, categorical_feature=[c for c in ("key_encoded","month","dayofweek") if c in X_.columns])
	return m

models_full = {}
X_full = df_trainF[feat_cols].copy()
for c in ("key_encoded","month","dayofweek"):
	if c in X_full.columns: X_full[c] = X_full[c].astype("category")
X_full = downcast_float32(X_full, exclude_cols=CATEGORICAL_COLS)
y_full = df_trainF[[f"target_{i}" for i in range(1, PREDICT+1)]].astype(np.float32).copy()

for p in P_LIST:
	models_full[p] = fit_multiout_with_p(X_full, y_full, p=float(p), seed=SEED_BASE, n_estim=N_ESTIMATORS_FULL)

print("[fit] done. p-list:", P_LIST)

[fit] done. p-list: [1.1, 1.2, 1.3, 1.4]


In [None]:
# -----------------------------
# 7) 테스트 날씨/ROOM/Hwadam 준비
# -----------------------------
test_wday = {}
for wp in sorted([p for p in glob.glob("test/meta/TEST_weather_*.csv")
                  if re.fullmatch(r"TEST_weather_\d{2}\.csv", os.path.basename(p))]):
    wid = os.path.splitext(os.path.basename(wp))[0].split("_")[-1]
    fid = f"TEST_{wid}"
    w = _read_csv_robust(wp).rename(columns={"일시": DATE})
    w[DATE] = to_datetime_norm(w[DATE])
    col_tmax = next(c for c in w.columns if ("최고" in c and "기온" in c))
    col_tavg = next(c for c in w.columns if ("평균" in c and "기온" in c))
    w_day_test = (w[[DATE, col_tmax, col_tavg]]
                  .groupby(DATE, as_index=False).mean(numeric_only=True)
                  .rename(columns={col_tmax:"wx_tmax", col_tavg:"wx_tavg"}))
    test_wday[fid] = w_day_test

# ROOM flags (TEST)
room_flags_by_fid = {}
# 교정
for rp in sorted([p for p in glob.glob("test/meta/TEST_room_*.csv")
                  if re.fullmatch(r"TEST_room_\d{2}\.csv", os.path.basename(p))]):
    wid = os.path.splitext(os.path.basename(rp))[0].split("_")[-1]
    fid = f"TEST_{wid}"
    room_flags_by_fid[fid] = load_room_flags(rp)

_w_fallback = pd.DataFrame({DATE: pd.to_datetime([]), "wx_tmax": [], "wx_tavg": []})

def add_weather_feats(df_in, wday):
    return df_in.merge(wday, on=DATE, how="left")

# --- hwadam 공통 merge ---
hwadam = _read_csv_robust("TRAIN_hwadam.csv").rename(columns={"영업일자": DATE})
hwadam[DATE] = to_datetime_norm(hwadam[DATE])
for c in ["화담숲","화담채","모노레일"]:
    if c in hwadam.columns:
        hwadam[c] = pd.to_numeric(hwadam[c], errors="coerce").fillna(0.0)

TARGET_STORES = {"화담숲주막","화담숲카페"}

In [11]:

# -----------------------------
# 8) 안전 전처리 함수(테스트에도 동일 적용)
# -----------------------------
def weekday_holiday_impute(df_in, hol_set):
	xx = df_in.copy()
	xx["dow"] = xx[DATE].dt.dayofweek
	xx["is_weekday"] = (xx["dow"] < 5).astype(int)
	xx["is_weekday_hol"] = xx[DATE].isin(hol_set).astype(int)

	outs = []
	for k, g in xx.groupby(KEY, sort=False):
		g = g.sort_values(DATE).copy()
		end_date = g[DATE].max()
		cand = g[(g["is_weekday"]==1) & (g["is_weekday_hol"]==0)].copy()
		all_weekday_zero = (cand[TARGET].sum()==0)

		def _wmean(vals, dates, end_date, decay=0.90, lastweek_window=7, boost=0.50):
			if len(vals)==0: return np.nan
			vals = np.asarray(vals,float); dates = pd.to_datetime(dates)
			delta = (end_date - dates).days.astype(float)
			w = decay ** delta
			w = np.where(dates >= end_date - pd.Timedelta(days=lastweek_window-1), w*(1+boost), w)
			w = np.clip(w, 1e-6, None)
			return float(np.sum(w*vals)/np.sum(w))

		for idx, row in g.loc[(g["is_weekday"]==1) & (g["is_weekday_hol"]==1)].iterrows():
			if all_weekday_zero:
				g.at[idx, TARGET] = 0; continue
			pool = cand[cand["dow"]==int(row["dow"])]
			base = _wmean(pool[TARGET].values, pool[DATE].values, end_date) if len(pool)>0 else \
				   _wmean(cand[TARGET].values, cand[DATE].values, end_date)   # (오타 fix)
			t7 = row[DATE] - pd.Timedelta(days=7)
			prev = g[(g[DATE]==t7) & (g["is_weekday"]==1) & (g["is_weekday_hol"]==0)]
			val_t7 = float(prev[TARGET].iloc[0]) if len(prev)==1 else np.nan
			if np.isfinite(base):
				y_hat = base if not np.isfinite(val_t7) else 0.65*base + 0.35*val_t7
				g.at[idx, TARGET] = int(max(0, round(y_hat)))
			else:
				g.at[idx, TARGET] = 0
		outs.append(g)
	return pd.concat(outs, axis=0).drop(columns=["dow","is_weekday","is_weekday_hol"])



def spike_clamp(df_in, hol_set):
	x2 = df_in.copy()
	x2["dow"] = x2[DATE].dt.dayofweek
	x2["is_nonhol_weekday"] = ((x2["dow"]<5) & (~x2[DATE].isin(hol_set))).astype(int)
	x2["is_hol_wk"] = x2[DATE].isin(hol_set).astype(int)

	outs = []
	for k, g in x2.groupby(KEY, sort=False):
		g = g.copy()
		g["med_wd_dow"] = np.nan
		for d in range(5):
			idx = (g["dow"]==d) & (g["is_nonhol_weekday"]==1)
			s = g.loc[idx, TARGET].astype(float)
			med = s.shift(1).rolling(window=4, min_periods=2).median()
			g.loc[idx, "med_wd_dow"] = med.values
		thr_up = 1.25 * g["med_wd_dow"]
		adj = (g["is_hol_wk"]==1) & np.isfinite(g["med_wd_dow"]) & (g[TARGET] > thr_up)
		g.loc[adj, TARGET] = np.maximum(0.0, np.round(1.10 * g.loc[adj, "med_wd_dow"]))
		outs.append(g.drop(columns=["med_wd_dow"]))
	return pd.concat(outs, axis=0).drop(columns=["dow","is_nonhol_weekday","is_hol_wk"])

# -----------------------------
# 9) 예측 (앵커 앙상블 + p 앙상블)
# -----------------------------
all_preds = []

def build_feats(df_in):
	f = df_in.copy().sort_values([KEY, DATE])
	f["month"] = f[DATE].dt.month
	f["dayofweek"] = f[DATE].dt.dayofweek
	f["day"] = f[DATE].dt.day
	f["dayofyear"] = f[DATE].dt.dayofyear
	f["is_month_end"]   = f[DATE].dt.is_month_end.astype(int)
	f["is_quarter_end"] = f[DATE].dt.is_quarter_end.astype(int)
	f["is_weekend"]     = (f["dayofweek"]>=5).astype(int)

	for h in range(1, PREDICT+1):
		dth = f[DATE] + pd.to_timedelta(h, unit="D")
		f[f"dow_h{h}"]        = dth.dt.dayofweek
		f[f"is_weekend_h{h}"] = (f[f"dow_h{h}"]>=5).astype(int)
		f[f"is_wkhol_h{h}"]   = dth.isin(pd.DatetimeIndex(HOL_WEEKDAY_SET)).astype(int)

	for lag in [7,14,21]: f[f"lag_{lag}"] = f.groupby(KEY)[TARGET].shift(lag)
	for lag in [1,2,3,4,5,6]: f[f"lag_{lag}"] = f.groupby(KEY)[TARGET].shift(lag)

	for wwin in [7,14]:
		f[f"rolling_mean_{wwin}"]   = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(wwin).mean())
		f[f"rolling_std_{wwin}"]    = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(wwin).std())
		f[f"rolling_median_{wwin}"] = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(wwin).median())

	f["rolling_mean_21"] = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(21).mean())
	f["ratio_mean7_21"]  = f["rolling_mean_7"]/f["rolling_mean_21"]
	f["ratio_mean14_21"] = f["rolling_mean_14"]/f["rolling_mean_21"]

	def _slope(v):
		v = v[np.isfinite(v)]
		if v.size < 2: return np.nan
		x = np.arange(v.size, dtype=float)
		xm, ym = x.mean(), v.mean()
		denom = ((x-xm)**2).sum()
		return 0.0 if denom==0 else float(((x-xm)*(v-ym)).sum()/denom)

	f["slope_7"]  = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(7).apply(_slope, raw=True))
	f["slope_14"] = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).rolling(14).apply(_slope, raw=True))
	f["ewm_mean_7"] = f.groupby(KEY)[TARGET].transform(lambda x: x.shift(1).ewm(span=7, adjust=False, min_periods=2).mean())

	x_dow = f["dayofweek"].astype(float)
	f["dow_sin"] = np.sin(2*np.pi*x_dow/7.0); f["dow_cos"] = np.cos(2*np.pi*x_dow/7.0)

	f["key_encoded"] = f[KEY].astype(str).map(key2id).astype("category")

	def _flag_lzb_local(g):
		iz = (g[TARGET]==0).astype(int)
		bid = (iz != iz.shift()).cumsum()
		blen = iz.groupby(bid).transform("sum")
		return ((iz==1) & (blen >= NON_SELLING_MIN_DAYS)).astype(int)
	f["is_long_zero_block"] = f.groupby(KEY, sort=False).apply(_flag_lzb_local).reset_index(level=0, drop=True).astype(int)

	f = downcast_float32(f, exclude_cols=CATEGORICAL_COLS + [DATE, KEY])
	return f


In [12]:

for path in test_files:
	fid = os.path.splitext(os.path.basename(path))[0]  # TEST_xx
	tdf = _read_csv_robust(path)
	tdf[DATE] = to_datetime_norm(tdf[DATE])
	tdf = tdf.sort_values([KEY, DATE]).reset_index(drop=True)

	# (A) 평일 공휴일 imputing → (B) 급등 클램프 → (C) ROOM 보정 → (D) 날씨 결합
	tdf_imp  = weekday_holiday_impute(tdf, HOL_WEEKDAY_SET)
	tdf_safe = spike_clamp(tdf_imp, HOL_WEEKDAY_SET)
	tdf_room = apply_room_zero_fix(tdf_safe, room_flags_by_fid.get(fid, pd.DataFrame()), window=28, min_obs=2)
	tdf_room = add_weather_feats(tdf_room, test_wday.get(fid, _w_fallback))
	tdf_room = add_hwadam_feats(tdf_room, hwadam)   # 날짜가 없으면 0으로 남음


	# (E) [마지막 전처리] 주중: 최댓값→둘째, 둘째→셋째 / 주말: 최댓값→둘째
	tdf_room = clamp_max_to_second_by_weekpart(tdf_room, date_col=DATE, key_col=KEY, target_col=TARGET)

	his = tdf_room.sort_values([KEY, DATE])
	last_date = his[DATE].max()
	date_set = set(his[DATE])
	anchors = sorted(d for d in (last_date - pd.to_timedelta(np.arange(ANCHOR_K_FULL), "D")) if d in date_set)

	bucket = {}
	for ad in anchors:
		sub = his[his[DATE] <= ad].copy()
		Feat = build_feats(sub)
		rowA = Feat[Feat[DATE]==ad].copy()
		if rowA.empty:
			continue
		Xrow = rowA[feat_cols].copy()
		for c in ("key_encoded","month","dayofweek"):
			if c in Xrow.columns: Xrow[c] = Xrow[c].astype("category")
		Xrow = downcast_float32(Xrow, exclude_cols=CATEGORICAL_COLS)

		# p-가중 앙상블
		Y_mix = None
		for p, mdl in models_full.items():
			Yp = mdl.predict(Xrow)  # (n_keys, 7)
			w  = P_WEIGHTS.get(p, 0.0)
			Y_mix = (w*Yp) if Y_mix is None else (Y_mix + w*Yp)

		meta = rowA[[KEY,DATE]].reset_index(drop=True)
		for i in range(len(meta)):
			keyi = meta.at[i, KEY]
			for h in range(PREDICT):
				dtp = (ad + pd.Timedelta(days=h+1)).normalize()
				bucket.setdefault((dtp, keyi), []).append(float(Y_mix[i, h]))

	for (dtp, keyi), vals in bucket.items():
		all_preds.append({DATE: dtp, KEY: keyi, TARGET: float(np.mean(vals))})

sub_long = pd.DataFrame(all_preds).copy()
sub_long[DATE] = to_datetime_norm(sub_long[DATE])


In [13]:

# -----------------------------
# 10) 블록 통계(후처리용) — 예측과 동일 기준(ROOM 포함)
# -----------------------------
date_to_file, block_pos_values, block_lengths, weekend_means = {}, {}, {}, {}

for p in test_files:
	fid = os.path.basename(p).split(".")[0]
	dfb = _read_csv_robust(p)
	dfb[DATE] = to_datetime_norm(dfb[DATE])
	dfb = dfb.sort_values([KEY, DATE]).reset_index(drop=True)

	# (A) 평일 공휴일 imputing → (B) 급등 클램프 → (C) ROOM 보정
	tdf_imp_stat  = weekday_holiday_impute(dfb, HOL_WEEKDAY_SET)
	tdf_safe_stat = spike_clamp(tdf_imp_stat, HOL_WEEKDAY_SET)
	tdf_safe_stat = apply_room_zero_fix(tdf_safe_stat, room_flags_by_fid.get(fid, pd.DataFrame()), window=28, min_obs=2)

	# (D) [마지막 전처리] 주중: 최댓값→둘째, 둘째→셋째 / 주말: 최댓값→둘째
	tdf_safe_stat = clamp_max_to_second_by_weekpart(tdf_safe_stat, date_col=DATE, key_col=KEY, target_col=TARGET)

	tdf_safe_stat = tdf_safe_stat.sort_values([KEY, DATE])

	last = tdf_safe_stat[DATE].max()
	for h in range(1, PREDICT+1):
		date_to_file[(last + pd.Timedelta(days=h)).strftime("%Y-%m-%d")] = fid
	for k_i, g in tdf_safe_stat.groupby(KEY):
		g = g.sort_values(DATE)
		pos = g.loc[g[TARGET] > 0, TARGET].astype(float).values
		block_pos_values[(fid, k_i)] = pos
		block_lengths[(fid, k_i)] = int(g.shape[0])
		wknd = g[g[DATE].dt.dayofweek >= 5][TARGET].astype(float)
		weekend_means[(fid, k_i)] = float(wknd.mean()) if wknd.size > 0 else np.nan


In [14]:

# -----------------------------
# 11) 후처리
# -----------------------------
def is_metric_target_post(k):
	s = str(k)
	if "공깃밥" in s: return False
	if "단체" in s: return True
	return False

# (A) 평일 공휴일이면 주말 평균으로 대체
if APPLY_METRIC_AWARE_WK_HOLOV:
	rows = []
	for _, r in sub_long.iterrows():
		dt = r[DATE]
		if (dt.weekday() < 5) and (dt in HOL_WEEKDAY_SET):
			fid = date_to_file.get(dt.strftime("%Y-%m-%d"))
			wk = weekend_means.get((fid, r[KEY])) if fid is not None else np.nan
			val = float(wk) if np.isfinite(wk) else float(r[TARGET])
		else:
			val = float(r[TARGET])
		rows.append(val)
	sub_long[TARGET] = np.array(rows, dtype=float)

# (B) weighted-SMAPE 기반 블렌딩
def _weighted_smape_constant(pos_vals, n_days, decay=0.92, clamp_k=0.50, beta_soft=0.50):
	pos = np.asarray(pos_vals, float)
	if pos.size == 0: return 1, 0.0
	pos = np.sort(pos); q1,q2,q3 = np.percentile(pos,[25,50,75]); iqr = q3-q1; p95 = np.percentile(pos,95)
	w = np.array([decay**(pos.size-1-i) for i in range(pos.size)], float); w /= w.sum()
	base_cands = np.unique(np.round(np.r_[pos,q1,q2,q3,(q1+q3)/2,p95]).astype(int))
	lo = max(1, int(np.floor(max(1, q1 - clamp_k*iqr)))); hi = int(np.ceil(min(max(base_cands.max(),2), p95*1.10)))
	cands = np.unique(np.r_[base_cands, np.arange(lo, max(lo+1, hi+1))])
	def _wsmape(c): c=float(c); return np.sum(w*(2.0*np.abs(c-pos)/(np.abs(c)+pos)))
	scores = np.array([_wsmape(c) for c in cands]); c_star = int(cands[np.argmin(scores)])
	clamp_lo = max(1, int(np.floor(q1 - clamp_k*iqr))); clamp_hi = int(np.ceil(q3 + clamp_k*iqr))
	c_clip = int(np.clip(c_star, clamp_lo, clamp_hi))
	c_final = int(np.round((1 - beta_soft)*c_star + beta_soft*c_clip)); c_final = max(1, c_final)
	zero_rate = 1.0 - (pos.size / max(1, n_days))
	iqr_ratio = (iqr / (q2 + 1e-9)) if q2 > 0 else 0.0; iqr_ratio = float(np.clip(iqr_ratio, 0.0, 2.0))
	alpha_raw = 0.6*zero_rate + 0.4*(iqr_ratio / (1.0 + iqr_ratio))
	alpha = float(np.clip(alpha_raw, 0.0, 0.5))
	return c_final, alpha

if APPLY_METRIC_AWARE_WSMAPE:
	new_vals = []
	for _, r in sub_long.iterrows():
		k = r[KEY]
		if not is_metric_target_post(k):
			new_vals.append(float(r[TARGET])); continue
		fid = date_to_file.get(r[DATE].strftime("%Y-%m-%d"))
		if fid is None:
			new_vals.append(float(r[TARGET])); continue
		pos = block_pos_values.get((fid, k), np.array([]))
		nd  = block_lengths.get((fid, k), INPUT_WINDOW_DAYS)
		c, a = _weighted_smape_constant(pos, nd)
		y = float(r[TARGET]); y_new = (1.0 - a)*y + a*float(c)
		new_vals.append(y_new)
	sub_long[TARGET] = np.array(new_vals, dtype=float)

# 라운딩 & 최소치 보정
sub_long[TARGET] = round_nonneg(sub_long[TARGET].values, BEST_THR).astype(int)

if FORCE_ACTIVE_MIN1 or FORCE_GLOBAL_MIN1:
	if FORCE_ACTIVE_MIN1:
		upd = []
		for _, r in sub_long.iterrows():
			y = int(r[TARGET])
			fid = date_to_file.get(r[DATE].strftime("%Y-%m-%d"))
			if (y == 0) and (fid is not None) and is_metric_target_post(r[KEY]):
				pos = block_pos_values.get((fid, r[KEY]), np.array([]))
				if pos.size > 0: y = 1
			upd.append(y)
		sub_long[TARGET] = np.array(upd, dtype=int)
	if FORCE_GLOBAL_MIN1:
		sub_long.loc[sub_long[TARGET] == 0, TARGET] = 1

print("[postprocess] rows:", len(sub_long))


[postprocess] rows: 21710


In [15]:

# -----------------------------
# 12) sample 형식으로 pivot → 저장
# -----------------------------
OUT_PATH = "submission_roomroom_hwadam추가.csv"

sample = _read_csv_robust("sample_submission.csv")
label_col = sample.columns[0]

# date_label 매핑: "TEST_xx+N일"
date_label = {}
for fp in test_files:
	tdf = _read_csv_robust(fp)
	tdf[DATE] = to_datetime_norm(tdf[DATE])
	fid = os.path.basename(fp).split(".")[0]  # TEST_xx
	last = tdf[DATE].max()
	for i in range(PREDICT):
		date_label[(last + pd.Timedelta(days=i+1)).strftime("%Y-%m-%d")] = f"{fid}+{i+1}일"

sub_long = sub_long.copy()
sub_long["date_label"] = sub_long[DATE].dt.strftime("%Y-%m-%d").map(date_label)

pivot = sub_long.pivot_table(index="date_label", columns=KEY, values=TARGET, aggfunc="first")
final = sample.set_index(label_col)
common = [c for c in final.columns if c in pivot.columns]
if len(common)>0:
	final[common] = final[common].combine_first(pivot[common])
	final.update(pivot[common])

final = final.fillna(1).astype(np.int64).reset_index()
final.columns = sample.columns

final.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("[SAVE] ->", OUT_PATH)
print(final.head(2))


[SAVE] -> submission_roomroom_hwadam추가.csv
         영업일자  느티나무 셀프BBQ_1인 수저세트  느티나무 셀프BBQ_BBQ55(단체)  \
0  TEST_00+1일                   7                    56   
1  TEST_00+2일                   2                    59   

   느티나무 셀프BBQ_대여료 30,000원  느티나무 셀프BBQ_대여료 60,000원  느티나무 셀프BBQ_대여료 90,000원  \
0                       6                       3                       1   
1                       1                       1                       1   

   느티나무 셀프BBQ_스프라이트 (단체)  느티나무 셀프BBQ_신라면  느티나무 셀프BBQ_쌈장  느티나무 셀프BBQ_육개장 사발면  \
0                     17               2              1                   1   
1                     19               1              1                   1   

   ...  화담숲주막_스프라이트  화담숲주막_참살이 막걸리  화담숲주막_찹쌀식혜  화담숲주막_콜라  화담숲주막_해물파전  \
0  ...            4             12          12         4          43   
1  ...            1              1           1         1           2   

   화담숲카페_메밀미숫가루  화담숲카페_아메리카노 HOT  화담숲카페_아메리카노 ICE  화담숲카페_카페라떼 ICE  \
0            19        