In [5]:
#step 3 and step 4
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skimage.io import imread
from skimage.transform import resize
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Read CSV file
data_dir = './data/data/'  # Replace with your image directory path
csv_file = './chinese_mnist.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_file)

# Load image data
images = []
labels = df['character'].values  # Get labels and convert to NumPy array

for _, row in df.iterrows():
    # Construct filename based on suite_id, sample_id, and code
    filename = f"input_{row['suite_id']}_{row['sample_id']}_{row['code']}.jpg"
    img_path = os.path.join(data_dir, filename)

    if os.path.exists(img_path):
        image = imread(img_path)
        # step 4 Read and resize image to 64x64 pixels
        image = resize(image, (64, 64))
        images.append(image)
    else:
        print(f"Image {img_path} not found.")

images = np.array(images)
labels = np.array(labels)
label_map = {char: idx for idx, char in enumerate(np.unique(labels))}
labels = np.array([label_map[char] for char in labels])
unique_labels, counts = np.unique(labels, return_counts=True)
print(f"Unique labels: {unique_labels}")
print(f"Counts: {counts}")

# Flatten image data to fit classifier input
n_samples, height, width = images.shape
X = images.reshape((n_samples, height * width))

# Use train_test_split for stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
    X, labels,
    train_size=5000,
    test_size=1000,
    stratify=labels,
    random_state=42
)

# Validate number of each class
train_counts = np.bincount(y_train)
test_counts = np.bincount(y_test)
print(f"Training set class distribution: {train_counts}")
print(f"Test set class distribution: {test_counts}")

Unique labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Counts: [1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000
 1000]
Training set class distribution: [334 333 333 333 333 334 333 333 333 334 333 334 333 334 333]
Test set class distribution: [66 67 67 67 67 66 67 67 67 66 67 66 67 66 67]


In [6]:
# step 5
# Initialize KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3)

# Initialize Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Initialize SGD classifier
sgd_classifier = SGDClassifier(max_iter=250)

# Print classifier objects to ensure parameters are set correctly
print("KNN Classifier:", knn_classifier)
print("Decision Tree Classifier:", dt_classifier)
print("SGD Classifier:", sgd_classifier)

KNN Classifier: KNeighborsClassifier(n_neighbors=3)
Decision Tree Classifier: DecisionTreeClassifier()
SGD Classifier: SGDClassifier(max_iter=250)


In [7]:
# step 6
# Fit KNN classifier to training data
knn_classifier.fit(X_train, y_train)

# Fit Decision Tree classifier to training data
dt_classifier.fit(X_train, y_train)

# Fit SGD classifier to training data
sgd_classifier.fit(X_train, y_train)

In [8]:
# step 7
# Define function to evaluate model performance
def evaluate_model(classifier, X_test, y_test):
    # Predict test data
    y_pred = classifier.predict(X_test)

    # Calculate various evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    return accuracy, precision, recall, f1, cm

# Evaluate KNN classifier
knn_accuracy, knn_precision, knn_recall, knn_f1, knn_cm = evaluate_model(knn_classifier, X_test, y_test)

# Evaluate Decision Tree classifier
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_cm = evaluate_model(dt_classifier, X_test, y_test)

# Evaluate SGD classifier
sgd_accuracy, sgd_precision, sgd_recall, sgd_f1, sgd_cm = evaluate_model(sgd_classifier, X_test, y_test)

In [9]:
# step 8
# Print evaluation results
print("KNN Classifier Performance:")
print("Accuracy:", knn_accuracy)
print("Precision:", knn_precision)
print("Recall:", knn_recall)
print("F1 Score:", knn_f1)
print("Confusion Matrix:\n", knn_cm)
print("\n")

print("Decision Tree Classifier Performance:")
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1)
print("Confusion Matrix:\n", dt_cm)
print("\n")

print("SGD Classifier Performance:")
print("Accuracy:", sgd_accuracy)
print("Precision:", sgd_precision)
print("Recall:", sgd_recall)
print("F1 Score:", sgd_f1)
print("Confusion Matrix:\n", sgd_cm)

KNN Classifier Performance:
Accuracy: 0.351
Precision: 0.5442695128527733
Recall: 0.351
F1 Score: 0.3642068406411239
Confusion Matrix:
 [[61  0  0  0  0  4  0  0  0  1  0  0  0  0  0]
 [36 11  0  0  3  7  0  1  2  2  4  1  0  0  0]
 [30  0 25  0  1  2  2  0  1  4  1  0  0  1  0]
 [32  0  0 17  0 18  0  0  0  0  0  0  0  0  0]
 [23  6  4  2 19  1  1  4  0  3  2  2  0  0  0]
 [35  0  0  4  0 27  0  0  0  0  0  0  0  0  0]
 [15  5  7 15  3 16  6  0  0  0  0  0  0  0  0]
 [35  7  4  1  2  5  1 11  0  0  1  0  0  0  0]
 [13  0  0  0  1  0  1  0 50  1  1  0  0  0  0]
 [47  1  1  1  1  3  0  0  2 10  0  0  0  0  0]
 [24  0  1  0  0  1  1  0  0  0 38  2  0  0  0]
 [24  1  1  2  2  4  0  0  0  1 16 15  0  0  0]
 [26  1  3  9  0 12  0  0  0  2  0  1 12  1  0]
 [16  1  9  6  2  6  3  0  0  1  6  1  0 15  0]
 [10  2  8  0  1  0  0  1  1  5  0  4  1  0 34]]


Decision Tree Classifier Performance:
Accuracy: 0.257
Precision: 0.262919855077149
Recall: 0.257
F1 Score: 0.25828155240867784
Confusion Matr