In [57]:
#no need to modify
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [58]:
#no need to modify
import numpy as np
import os
import glob
import cv2

In [59]:
#have to modify
def load_dataset(data_dir):
    """
    Load all grayscale .jpg images from a directory and flatten them into column vectors.

    Args:
        data_dir (str): Path to the folder containing .jpg images.

    Returns:
        X (np.ndarray): 2D array of shape (D, N), where D = H * W (flattened image size)
                        and N = number of images.
        labels (List[int]): List of length N with integer labels parsed from the
                            first two characters of each filename.
    """
    img_files = glob.glob(os.path.join(data_dir, '*.jpg'))
    #이미지 벡터 저장할 리스트
    images = []
    #레이블 저장할 리스트
    labels = []
    for img_path in img_files:
      # 이미지 회색조로 불러오기
      img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
      # 이미지를 1차원 벡터로 펼치기
      img_flattened = img.flatten()
      #이미지 벡터를 리스트에 저장
      images.append(img_flattened)
      #파일 이름에서 레이블 파싱 후 첫 두글자를 숫자로 변환
      filename = os.path.basename(img_path)
      label_str = filename[:2]
      try:
        label = int(label_str)
        labels.append(label)
      except ValueError:
        print(f"Skipping invalid label in filename: {filename}")
        continue

    # 리스트에 저장된 이미지 벡터들을 쌓아 2D NumPy 배열 생성
    # 각 벡터가 열이 되도록 전치. (D, N) 형태
    X = np.array(images).T
    return X,labels
    raise NotImplementedError("This is your homework.")

In [60]:
#no need to modify
def compute_pairwise_distances(X, Y, metric='l2'):
    """
    Compute pairwise distances between two sets of vectors using the specified metric.

    Args:
        X (np.ndarray): Array of shape (D, M), representing M query vectors of dimensionality D.
        Y (np.ndarray): Array of shape (D, N), representing N reference vectors of dimensionality D.
        metric (str): Distance metric to use; either 'l2' for Euclidean distance or 'l1' for Manhattan distance.

    Returns:
        np.ndarray: Distance matrix of shape (M, N), where entry (i, j) is the distance between X[:, i] and Y[:, j].

    Raises:
        ValueError: If an unsupported metric string is provided.
    """
    if metric == 'l2':
        # Use Euclidean (L2) distance
        return calc_l2(X, Y)
    elif metric == 'l1':
        # Use Manhattan (L1) distance
        return calc_l1(X, Y)
    else:
        # Reject unknown metrics
        raise ValueError(f"Unknown metric: {metric}")

#have to modify
def calc_l2(X, Y):

    """
    Compute pairwise L2 (Euclidean) distances between two sets of vectors.

    Args:
        X (np.ndarray): Array of shape (D, M), M query vectors.
        Y (np.ndarray): Array of shape (D, N), N reference vectors.

    Returns:
        np.ndarray: Matrix of shape (M, N) where entry (i, j) is
                    ||X[:, i] - Y[:, j]||_2.
    """
    #차원 확장 후 broadcasting 연산 수행하여 diff_matrix 구하기
    X_expanded = X.T[:, np.newaxis, :] #(M,1,D)
    Y_expanded = Y.T[np.newaxis, :, :] #(1,N,D)
    diff_matrix = X_expanded - Y_expanded #(M,N,D)
    #diff_matrix 제곱
    squared_diff = diff_matrix**2
    #squared_diff의 세번째 차원을 합하여 차원 축소(3D -> 2D)
    sum_squared_diff = np.sum(squared_diff, axis=2)
    #sum_squared_diff에 루트 씌워 distance_matrix정의
    distance_matrix = np.sqrt(sum_squared_diff)
    return distance_matrix
    raise NotImplementedError("This is your homework.")

#have to modify
def calc_l1(X, Y):
    """
    Compute pairwise L1 (Manhattan) distances between two sets of vectors.

    Args:
        X (np.ndarray): Array of shape (D, M), M query vectors.
        Y (np.ndarray): Array of shape (D, N), N reference vectors.

    Returns:
        np.ndarray: Matrix of shape (M, N) where entry (i, j) is
                    ||X[:, i] - Y[:, j]||_1.
    """
    #차원 확장 후 broadcasting 연산 수행하여 diff_matrix 구하기
    X_expanded = X.T[:, np.newaxis, :] #(M,1,D)
    Y_expanded = Y.T[np.newaxis, :, :] #(1,N,D)
    diff_matrix = X_expanded - Y_expanded #(M,N,D)
    #diff_matrix의 모든 원소에 절대값 취함.
    abs_diff = np.abs(diff_matrix)
    #abs_diff의 세번째 차원을 합하여 차원 축소(3D -> 2D)
    sum_abs_diff = np.sum(abs_diff, axis=2)
    distance_matrix = sum_abs_diff
    return distance_matrix
    raise NotImplementedError("This is your homework.")

In [61]:
#have to modify
def match_accuracy(X_train, y_train, X_test, y_test, metric='l2'):
    """
    Perform nearest-neighbor classification and compute accuracy.

    For each test vector, find the closest training vector using the specified
    distance metric, then compare its label to the true test label.

    Args:
        X_train (np.ndarray): Training data of shape (D, N_train).
        y_train (List[int]): Labels for training data, length N_train.
        X_test (np.ndarray): Test data of shape (D, N_test).
        y_test (List[int]): True labels for test data, length N_test.
        metric (str): Distance metric to use; either 'l1' or 'l2'.

    Returns:
        float: Classification accuracy (between 0 and 1).
    """
    #X_test: query vector, X_train: refefence vector
    #define distance_matrix
    distance_matrix = compute_pairwise_distances(X_test, X_train, metric = metric)
    #Initialize hit_count(정답 개수)
    hit_count = 0
    #distance matrix row별로 접근((1,N)vector)
    for i in range(distance_matrix.shape[0]):
      #각 row에서 distance의 최솟값 구하고, 해당 값의 인덱스 반환
      closest_index = np.argmin(distance_matrix[i,:])
      #가장 높은 확률을 가진 값의 레이블 반환
      predicted_label = y_train[closest_index]
      #실제값과 비교
      true_label = y_test[i]
      if predicted_label == true_label:
        hit_count += 1
    accuracy = hit_count / len(y_test)
    return accuracy

    raise NotImplementedError("This is your homework.")

In [70]:
def proj1(data_root):
    # train: 50 subjects with 7 images each; test/val: 50 subjects with 1 image each
    X_tr, y_tr = load_dataset(os.path.join(data_root, 'train'))
    X_te, y_te = load_dataset(os.path.join(data_root, 'test2'))

    for metric in ('l1', 'l2'):
        acc = match_accuracy(X_tr, y_tr, X_te, y_te, metric)
        print(f"proj1 – {metric.upper()} accuracy: {acc * 100:.2f}%")

In [71]:
#have to modify
data_root = '/content/drive/MyDrive/git/CSE4130-Intro_to_ML/data'
print("===== Project 1 =====")
proj1(data_root)

===== Project 1 =====
proj1 – L1 accuracy: 10.00%
proj1 – L2 accuracy: 86.00%
