In [8]:
# Purpose: pin the project root and make sure the working folders exist
# I/O: creates the project folders (data/raw, data/processed, reports, ...) under ROOT
# Note: to adapt this to another environment, only ROOT needs to change
from pathlib import Path

ROOT = Path(r"/workspace/DataM").resolve()  # <- absolute path of your choice

# Data folders
RAW = ROOT.joinpath("data", "raw")
PROC = ROOT.joinpath("data", "processed")

# Reporting folders (figures live under reports)
RPT = ROOT.joinpath("reports")
FIG = RPT.joinpath("figures")

# Experiment outputs and logs
EXP = ROOT.joinpath("experiments")
LOGS = ROOT.joinpath("logs")

# parents=True creates missing ancestors; exist_ok=True makes re-running the cell harmless
for p in (RAW, PROC, RPT, FIG, EXP, LOGS):
    p.mkdir(parents=True, exist_ok=True)

print("ROOT =", ROOT)


ROOT = /workspace/DataM


In [9]:
# Purpose: make sure the minimal packages needed for the MRE are installed
# I/O: none (pip install log), then prints the imported versions
# Note: check that the notebook kernel is the conda env (cnidium-ssp)
import sys, subprocess
pkgs = ["numpy", "pandas", "scikit-learn", "matplotlib"]
# sys.executable targets the interpreter of the running kernel; "-q" keeps pip quiet
pip_result = subprocess.run([sys.executable, "-m", "pip", "install", *pkgs, "-q"])
if pip_result.returncode != 0:
    # Best-effort install: do not abort (the packages may already be present),
    # but make the failure visible instead of ignoring the exit status.
    print(
        f"WARNING: pip install exited with code {pip_result.returncode}; "
        "continuing with whatever versions are already installed.",
        file=sys.stderr,
    )
import numpy as np, pandas as pd, sklearn, matplotlib
print("NumPy", np.__version__, "Pandas", pd.__version__, "Sklearn", sklearn.__version__, "Matplotlib", matplotlib.__version__)

[0m

NumPy 1.26.3 Pandas 2.2.3 Sklearn 1.6.1 Matplotlib 3.10.6


In [13]:
# Purpose: run 5-fold CV for Leaf_TPC (results are saved to metrics.json further down)
# I/O: first CSV in RAW -> experiments/exp_20250924_mre/metrics.json
# Note: preprocessing (month -> sin/cos, scaling) lives inside the Pipeline. seed=42

import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold, cross_validate
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet

SEED = 42

# Path guard: fall back to cwd-based folders when RAW/EXP are not defined in this kernel
try:
    RAW, EXP
except NameError:
    ROOT = Path.cwd().resolve()
    RAW  = ROOT / "data" / "raw"
    EXP  = ROOT / "experiments"

# sorted() makes "the first CSV" deterministic
csvs = sorted(RAW.glob("*.csv"))
assert csvs, f"No CSV in {RAW}"

# Try UTF-8 (BOM tolerated) first, then fall back to cp949
try:
    df = pd.read_csv(csvs[0], encoding="utf-8-sig")
except UnicodeDecodeError:
    df = pd.read_csv(csvs[0], encoding="cp949")

# Choose the target BEFORE any synthetic column is appended: otherwise the
# "last column" fallback would pick the constant "month" column added below.
target = "Leaf_TPC" if "Leaf_TPC" in df.columns else df.columns[-1]
assert target != "month", "Fallback target is 'month', which is also a feature; add a real target column"

num_cand = [
    "CO2ppm","Temp","Humid","VPD","Chl_a","Chl_b","TChl","Car",
    "ABS-RC","Dio-RC","Tro-RC","Eto-RC","PI_abs","DF_abs","SFI_abs","Fv-Fm",
    "Leaf_ExtractionYield","Root_ExtractionYield"
]
# Keep only candidates present in the file, and never use the target as a feature
# (the fallback target may itself be one of the candidate columns).
num_cols = [c for c in num_cand if c in df.columns and c != target]

# No month information in the file: use a constant placeholder month
if "month" not in df.columns:
    df["month"] = 1

X = df[["month"] + num_cols].copy()
y = df[target].copy()


def _month_to_cycle(month_block):
    """Map the first column (month number) to its sin/cos encoding with period 12.

    Accepts a DataFrame or an array; returns an array with two columns (sin, cos).
    """
    month = np.asarray(month_block)[:, 0]
    angle = 2 * np.pi * (month / 12.0)
    return np.c_[np.sin(angle), np.cos(angle)]


# A named function (instead of a lambda) keeps the fitted pipeline picklable
month_cycle = FunctionTransformer(_month_to_cycle)

trans = [("month_cycle", month_cycle, ["month"])]
if num_cols:
    trans.append(("scale", StandardScaler(), num_cols))

prep = ColumnTransformer(trans, remainder="drop")
pipe = Pipeline([("prep", prep), ("model", ElasticNet(random_state=SEED))])

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
scoring = {"MAE": "neg_mean_absolute_error", "MSE": "neg_mean_squared_error", "R2": "r2"}
res = cross_validate(pipe, X, y, cv=cv, scoring=scoring, return_train_score=False)

def mstd(a):
    """Return ``(mean, std)`` of ``a`` as plain Python floats.

    ``a`` is converted with ``np.asarray``; the standard deviation is NumPy's
    default (population, ddof=0).
    """
    values = np.asarray(a)
    mean_value = float(np.mean(values))
    std_value = float(np.std(values))
    return mean_value, std_value

# Aggregate per-fold scores. MAE/MSE were requested as "neg_*" scorers,
# so the sign is flipped back to report positive errors.
mae_m, mae_s = mstd(-res["test_MAE"])
mse_m, mse_s = mstd(-res["test_MSE"])
r2_m,  r2_s  = mstd( res["test_R2"])

out = {
    "target": target,
    "n_samples": int(len(df)),
    "n_features": int(X.shape[1]),
    # Built from the actual splitter so the record cannot drift from the settings used
    "cv": f"KFold(n_splits={cv.n_splits}, shuffle={cv.shuffle}, random_state={cv.random_state})",
    "metrics": {
        "MAE_mean": mae_m, "MAE_std": mae_s,
        "MSE_mean": mse_m, "MSE_std": mse_s,
        "R2_mean":  r2_m,  "R2_std":  r2_s
    }
}

exp_dir = EXP / "exp_20250924_mre"
exp_dir.mkdir(parents=True, exist_ok=True)
metrics_path = exp_dir / "metrics.json"
# Plain UTF-8 without a BOM: a leading BOM is not valid in JSON text and makes
# json.loads() fail on the file when it is read back as UTF-8.
metrics_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

print(out)
print("Saved ->", metrics_path)


{'target': 'Leaf_TPC', 'n_samples': 135, 'n_features': 19, 'cv': 'KFold(n_splits=5, shuffle=True, random_state=42)', 'metrics': {'MAE_mean': 0.5573793397507052, 'MAE_std': 0.04497281463008568, 'MSE_mean': 0.38196725768321516, 'MSE_std': 0.058693079020913703, 'R2_mean': 0.20440509362505335, 'R2_std': 0.04596065559560122}}
Saved -> /workspace/DataM/experiments/exp_20250924_mre/metrics.json


In [14]:
# Purpose: record today's environment (for reproducibility)
from pathlib import Path
import sys, platform, json

ROOT = Path("/workspace/DataM").resolve()
LOGS = ROOT / "logs"
LOGS.mkdir(parents=True, exist_ok=True)

# Interpreter version, OS/platform string and the seed used by the experiment
meta = {"python": sys.version, "platform": platform.platform(), "seed": 42}
meta_json = json.dumps(meta, ensure_ascii=False, indent=2)

# Markdown note: a heading followed by the metadata inside a fenced block
header = "# 환경 기록 (2025-09-24)\n\n```\n"
footer = "\n```\n"

p = LOGS / "setup_20250924.md"
p.write_text(header + meta_json + footer, encoding="utf-8-sig")
print("Wrote:", p)


Wrote: /workspace/DataM/logs/setup_20250924.md


In [None]:
# Install git with conda (conda-forge channel; -y answers the confirmation prompt)
# NOTE(review): `!conda` runs whichever conda is on the shell PATH — confirm it targets the kernel's env
!conda install -y -c conda-forge git
# Set the git user identity (one-time); --global writes to the user-level git config
# NOTE(review): "Your Name" / "you@example.com" are placeholders — replace before running
!git config --global user.name "Your Name"
!git config --global user.email "you@example.com"
# Initialise the repository and create the first commit
%cd /workspace/DataM
!git init
# NOTE(review): `git add .` stages everything under the current directory;
# presumably that includes data/ and logs/ — confirm a .gitignore is in place first
!git add .
!git commit -m "chore: env setup + MRE pipeline (seed=42)"


done
Solving environment: / 