In [17]:
pip install numpy pandas scikit-learn opencv-python tqdm

Note: you may need to restart the kernel to use updated packages.




In [35]:
import os, cv2, random, joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# ===== Paths & settings =====
# The dataset root defaults to the original local folder. Set the CIFAR10_DIR
# environment variable to run the notebook elsewhere without editing the code.
DATA_ROOT      = os.environ.get("CIFAR10_DIR", r"C:/Users/82106/Downloads/cifar-10").rstrip("/\\")
TRAIN_IMG_PATH = f"{DATA_ROOT}/train/train"
TRAIN_LABELS   = f"{DATA_ROOT}/trainLabels.csv"
RANDOM_SEED    = 42
# Seed both the stdlib and the NumPy global generators.
random.seed(RANDOM_SEED); np.random.seed(RANDOM_SEED)

def read_flatten(path):
    """Read the image at ``path`` and return it as a 1-D float32 vector.

    Pixel values are divided by 255. Returns ``None`` when ``cv2.imread``
    cannot load the file.
    """
    pixels = cv2.imread(path)
    if pixels is None:
        return None
    scaled = pixels.astype(np.float32) / 255.0
    return scaled.flatten()

# ===== 1) Load data =====
labels_df = pd.read_csv(TRAIN_LABELS)
# Convert ids to integers. errors='coerce' would only defer a bad id to an opaque
# NaN-cast failure in astype(int); the default errors='raise' names the bad value.
labels_df['id'] = pd.to_numeric(labels_df['id']).astype(int)

X, y = [], []
n_skipped = 0
print(f"[INFO] {len(labels_df):,}장 로딩...")
for img_id, label in tqdm(labels_df[['id','label']].itertuples(index=False), total=len(labels_df)):
    v = read_flatten(os.path.join(TRAIN_IMG_PATH, f"{img_id}.png"))
    if v is None:
        # Unreadable or missing image: keep going, but count it so the drop is visible.
        n_skipped += 1
        continue
    X.append(v); y.append(label)

if not X:
    # Nothing could be read (typically a wrong TRAIN_IMG_PATH); fail here with a
    # clear message instead of later inside train_test_split / sklearn.
    raise RuntimeError(f"이미지를 한 장도 읽지 못했습니다. TRAIN_IMG_PATH를 확인하세요: {TRAIN_IMG_PATH}")
if n_skipped:
    print(f"[WARN] 읽지 못한 이미지 {n_skipped:,}장을 건너뛰었습니다.")

X = np.array(X, dtype=np.float32)
y = np.array(y)

# ===== 2) 80/20 train/validation split, stratified on the labels =====
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

# ===== 3) Hyperparameter search =====
pca_dims  = [64, 128, 256, 384]
metrics   = ['euclidean', 'cosine']
k_vals    = [1, 3, 5, 7, 9]

best = {"acc": -1, "pipeline": None, "cfg": None}

# The scaler does not depend on any searched hyperparameter and the PCA depends
# only on its dimensionality, so they are fitted once / once per dimension instead
# of once per (dim, metric, k) combination. fit_transform is used on the training
# data because that is what Pipeline.fit applies to its intermediate steps, so the
# KNN sees the same features a freshly fitted pipeline would produce.
scaler  = StandardScaler(with_mean=True, with_std=True)
X_tr_s  = scaler.fit_transform(X_tr)
X_val_s = scaler.transform(X_val)

for n in pca_dims:
    pca   = PCA(n_components=n, whiten=True, random_state=RANDOM_SEED)
    Z_tr  = pca.fit_transform(X_tr_s)
    Z_val = pca.transform(X_val_s)
    for metric in metrics:
        for k in k_vals:
            knn  = KNeighborsClassifier(n_neighbors=k, metric=metric, n_jobs=-1)
            knn.fit(Z_tr, y_tr)
            pred = knn.predict(Z_val)
            acc  = accuracy_score(y_val, pred)
            print(f"[INFO] PCA={n:>3}, metric={metric:<9}, k={k}: acc={acc:.4f}")
            if acc > best["acc"]:
                # Reassemble the already-fitted steps so the stored object is still
                # a complete raw-pixels -> label pipeline.
                pipe = Pipeline([("scaler", scaler), ("pca", pca), ("knn", knn)])
                best = {"acc": acc, "pipeline": pipe,
                        "cfg": {"pca": n, "metric": metric, "k": k}}

# The standardized copies are only needed during the search; release them.
del X_tr_s, X_val_s

print(f"\n=== Best: acc={best['acc']:.4f} | cfg={best['cfg']} ===")

# ===== 4) Persist the best model and its configuration =====
joblib.dump(best["pipeline"], "best_knn_pca_pipeline.pkl")
with open("best_knn_pca_cfg.txt", mode="w", encoding="utf-8") as cfg_file:
    # First line: the winning hyperparameters; second line: validation accuracy.
    cfg_file.write(f"{best['cfg']}\nacc={best['acc']:.4f}")
print("[INFO] best_knn_pca_pipeline.pkl / best_knn_pca_cfg.txt 저장 완료")


[INFO] 50,000장 로딩...


100%|██████████| 50000/50000 [00:55<00:00, 905.21it/s] 


[INFO] PCA= 64, metric=euclidean, k=1: acc=0.3658
[INFO] PCA= 64, metric=euclidean, k=3: acc=0.3389
[INFO] PCA= 64, metric=euclidean, k=5: acc=0.3486
[INFO] PCA= 64, metric=euclidean, k=7: acc=0.3495
[INFO] PCA= 64, metric=euclidean, k=9: acc=0.3504
[INFO] PCA= 64, metric=cosine   , k=1: acc=0.4396
[INFO] PCA= 64, metric=cosine   , k=3: acc=0.4287
[INFO] PCA= 64, metric=cosine   , k=5: acc=0.4586
[INFO] PCA= 64, metric=cosine   , k=7: acc=0.4755
[INFO] PCA= 64, metric=cosine   , k=9: acc=0.4836
[INFO] PCA=128, metric=euclidean, k=1: acc=0.3075
[INFO] PCA=128, metric=euclidean, k=3: acc=0.2730
[INFO] PCA=128, metric=euclidean, k=5: acc=0.2746
[INFO] PCA=128, metric=euclidean, k=7: acc=0.2716
[INFO] PCA=128, metric=euclidean, k=9: acc=0.2675
[INFO] PCA=128, metric=cosine   , k=1: acc=0.4465
[INFO] PCA=128, metric=cosine   , k=3: acc=0.4329
[INFO] PCA=128, metric=cosine   , k=5: acc=0.4577
[INFO] PCA=128, metric=cosine   , k=7: acc=0.4726
[INFO] PCA=128, metric=cosine   , k=9: acc=0.4837


In [36]:
import os, cv2, random, joblib, time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# ===== Paths & settings =====
# The dataset root defaults to the original local folder. Set the CIFAR10_DIR
# environment variable to run the notebook elsewhere without editing the code.
DATA_ROOT      = os.environ.get("CIFAR10_DIR", r"C:/Users/82106/Downloads/cifar-10").rstrip("/\\")
TRAIN_IMG_PATH = f"{DATA_ROOT}/train/train"
TEST_IMG_PATH  = f"{DATA_ROOT}/test/test"
TRAIN_LABELS   = f"{DATA_ROOT}/trainLabels.csv"

RANDOM_SEED = 42       # keep as is
TEST_SAMPLE = 10_000   # keep as is (number of test images sampled for prediction)

# Fixed hyperparameters
PCA_DIM   = 256
METRIC    = "cosine"
K         = 9

# Seed both the stdlib and the NumPy global generators.
random.seed(RANDOM_SEED); np.random.seed(RANDOM_SEED)

def read_flatten(path):
    """Load an image with OpenCV and flatten it to a float32 vector scaled by 1/255.

    Returns ``None`` if ``cv2.imread`` fails to load ``path``.
    """
    raw = cv2.imread(path)
    return None if raw is None else (raw.astype(np.float32) / 255.0).flatten()

# ===== 1) Load the training set (50k images) =====
labels_df = pd.read_csv(TRAIN_LABELS)   # id,label (1..50000)
# Convert ids to integers. errors='coerce' would only defer a bad id to an opaque
# NaN-cast failure in astype(int); the default errors='raise' names the bad value.
labels_df['id'] = pd.to_numeric(labels_df['id']).astype(int)

X, y = [], []
n_skipped = 0
print(f"[INFO] Train {len(labels_df):,}장 로딩...")
for img_id, label in tqdm(labels_df[['id','label']].itertuples(index=False), total=len(labels_df)):
    v = read_flatten(os.path.join(TRAIN_IMG_PATH, f"{img_id}.png"))
    if v is None:
        # Unreadable or missing image: keep going, but count it so the drop is visible.
        n_skipped += 1
        continue
    X.append(v); y.append(label)

if not X:
    # Nothing could be read (typically a wrong TRAIN_IMG_PATH); fail here with a
    # clear message instead of later inside train_test_split / sklearn.
    raise RuntimeError(f"이미지를 한 장도 읽지 못했습니다. TRAIN_IMG_PATH를 확인하세요: {TRAIN_IMG_PATH}")
if n_skipped:
    print(f"[WARN] 읽지 못한 이미지 {n_skipped:,}장을 건너뛰었습니다.")

X = np.array(X, dtype=np.float32); y = np.array(y)
print("[INFO] X:", X.shape, "y:", y.shape)

# ===== 2) Quick 80/20 validation-accuracy check (optional) =====
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED
)

# Same three stages as the final model: standardize -> whitened PCA -> KNN.
val_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=PCA_DIM, whiten=True, random_state=RANDOM_SEED)),
    ("knn", KNeighborsClassifier(n_neighbors=K, metric=METRIC, n_jobs=-1)),
]
val_pipe = Pipeline(val_steps)
val_pipe.fit(X_tr, y_tr)
val_pred = val_pipe.predict(X_val)
val_acc  = accuracy_score(y_val, val_pred)
print(f"\n=== Validation acc (PCA={PCA_DIM}, metric={METRIC}, k={K}): {val_acc:.4f} ===\n")

# ===== 3) Refit on all 50k training images with the fixed settings =====
final_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=PCA_DIM, whiten=True, random_state=RANDOM_SEED)),
    ("knn", KNeighborsClassifier(n_neighbors=K, metric=METRIC, n_jobs=-1)),
]
final_pipe = Pipeline(final_steps)
print("[INFO] 전체 5만장으로 최종 모델 재학습...")
final_pipe.fit(X, y)

# Persist the fitted pipeline together with the settings and the validation score.
joblib.dump(final_pipe, "final_knn_pca_pipeline.pkl")
with open("final_cfg.txt", mode="w", encoding="utf-8") as cfg_file:
    cfg_file.write(f"pca={PCA_DIM}, metric={METRIC}, k={K}\nval_acc={val_acc:.4f}")
print("[INFO] 최종 모델 저장 완료 -> final_knn_pca_pipeline.pkl, final_cfg.txt")

# ===== 4) Predict a random sample of 10k test images (out of 300k) =====
test_files = [f for f in os.listdir(TEST_IMG_PATH) if f.lower().endswith(".png")]
# Sort by numeric id so the seeded sample does not depend on os.listdir order.
test_files = sorted(test_files, key=lambda x: int(os.path.splitext(x)[0]))
# The message follows TEST_SAMPLE instead of hard-coding the count.
assert len(test_files) >= TEST_SAMPLE, f"테스트 이미지가 {TEST_SAMPLE:,}장보다 적습니다."

sample_files = random.sample(test_files, TEST_SAMPLE)

X_test, test_ids = [], []
n_skipped = 0
print(f"\n[INFO] Test {TEST_SAMPLE:,}장 로딩/전처리...")
for f in tqdm(sample_files):
    iid = int(os.path.splitext(f)[0])
    v = read_flatten(os.path.join(TEST_IMG_PATH, f))
    if v is None:
        # Unreadable image: skip it, but count it so the shortfall is reported.
        n_skipped += 1
        continue
    X_test.append(v); test_ids.append(iid)

if not X_test:
    # Fail with a clear message rather than inside final_pipe.predict.
    raise RuntimeError(f"테스트 이미지를 한 장도 읽지 못했습니다. TEST_IMG_PATH를 확인하세요: {TEST_IMG_PATH}")
if n_skipped:
    print(f"[WARN] 읽지 못한 테스트 이미지 {n_skipped:,}장을 건너뛰었습니다.")

X_test = np.array(X_test, dtype=np.float32)
print("[INFO] X_test:", X_test.shape)

print("[INFO] 최종 모델로 예측 중...")
test_pred = final_pipe.predict(X_test)

# ===== 5) Write the submission file (id,label) =====
submission = pd.DataFrame({"id": test_ids, "label": test_pred})
submission = submission.sort_values(by="id")   # order rows by image id
submission.to_csv("submission.csv", index=False)
print("\n[INFO] submission.csv 저장 완료 (테스트 1만장 예측)")
print(submission.head())


[INFO] Train 50,000장 로딩...


100%|██████████| 50000/50000 [00:56<00:00, 889.31it/s] 


[INFO] X: (50000, 3072) y: (50000,)

=== Validation acc (PCA=256, metric=cosine, k=9): 0.4888 ===

[INFO] 전체 5만장으로 최종 모델 재학습...
[INFO] 최종 모델 저장 완료 -> final_knn_pca_pipeline.pkl, final_cfg.txt

[INFO] Test 10,000장 로딩/전처리...


100%|██████████| 10000/10000 [01:59<00:00, 83.42it/s]


[INFO] X_test: (10000, 3072)
[INFO] 최종 모델로 예측 중...

[INFO] submission.csv 저장 완료 (테스트 1만장 예측)
       id     label
9310   45     truck
4907   49      bird
3981   89      bird
5908   97       cat
5196  103  airplane
