In [None]:
# ----------------  Read and load data ----------------
from google.colab import drive
from skimage import io, transform
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import glob
import os
import numpy as np

# Attach Google Drive so its files are reachable under /content/gdrive
drive.mount('/content/gdrive')

# Dataset roots; the loading code below treats every sub-folder name as a class label
train_folder, test_folder = (
    '/content/gdrive/My Drive/train',
    '/content/gdrive/My Drive/test',
)

# Collect one example image per defect class (sub-folder name = class label).
# No preprocessing happens here; the images are kept as read from disk.
# os.listdir and glob return entries in arbitrary, filesystem-dependent order, so
# both listings are sorted to make the chosen sample and the panel order reproducible.
sample_images = []

for defect_type in sorted(os.listdir(train_folder)):
    folder_path = os.path.join(train_folder, defect_type)

    # Skip non-directory entries
    if not os.path.isdir(folder_path):
        continue

    image_paths = sorted(glob.glob(os.path.join(folder_path, '*.bmp')))

    # Folders without any .bmp file contribute no sample
    if image_paths:
        first_image_path = image_paths[0]
        image = io.imread(first_image_path)
        sample_images.append((defect_type, image))

# Show the collected samples side by side, one panel per defect class
fig = plt.figure(figsize=(15, 5))
n_panels = len(sample_images)

for position, (label, sample) in enumerate(sample_images, start=1):
    ax = fig.add_subplot(1, n_panels, position)
    ax.imshow(sample, cmap='gray')
    ax.set_title(label)
    ax.axis('off')

plt.show()


In [None]:
# ---------------- Data Preprocessing -------------------
# Build the training matrix: one row per image (pixels flattened to 1-D), labelled
# with the name of the sub-folder the image was found in.
X_train = []
y_train = []

# sorted(): os.listdir/glob return entries in arbitrary order; a fixed row order is
# needed for the seeded split and cross-validation further down to be reproducible.
for defect_type in sorted(os.listdir(train_folder)):
    folder_path = os.path.join(train_folder, defect_type)

    # Skip non-directory entries
    if not os.path.isdir(folder_path):
        continue

    # Load images from the current folder
    image_paths = sorted(glob.glob(os.path.join(folder_path, '*.bmp')))

    for path in image_paths:
        image = io.imread(path)
        flattened_image = image.flatten()

        X_train.append(flattened_image)
        y_train.append(defect_type)

# Fail early with a clear message instead of a confusing error inside the scaler
if not X_train:
    raise FileNotFoundError(f"No .bmp images found under {train_folder!r}")

# Rows can only be stacked into a 2-D matrix if every image has the same pixel count
feature_lengths = {len(row) for row in X_train}
if len(feature_lengths) != 1:
    raise ValueError(
        f"Training images have differing flattened sizes {sorted(feature_lengths)}; "
        "resize them to a common shape before flattening"
    )

X_train = np.array(X_train)
y_train = np.array(y_train)

# Standardize every pixel column; the fitted scaler is reused for the test data
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)


# Fit a PCA that keeps every component and plot the cumulative share of variance
# explained (presumably the basis for the component count chosen in the next cell).
pca = PCA()
pca.fit(X_train_normalized)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot against 1..k: entry i of cumulative_variance is the variance explained by
# i + 1 components, so the default 0-based x-axis would read one component too low.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
# Project the standardized training data onto the leading principal components
num_components = 45
# random_state pins the result when scikit-learn selects a randomized SVD solver
# (its 'auto' policy can do so for large inputs with few requested components);
# 42 matches the seed used by the other estimators in this notebook.
pca = PCA(n_components=num_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_normalized)


# Load the test images the same way as the training images: flattened pixels as
# features, sub-folder name as label.
X_test = []
y_test = []

# sorted() fixes the row order, which os.listdir/glob otherwise leave arbitrary
for fault_type in sorted(os.listdir(test_folder)):
    folder_path = os.path.join(test_folder, fault_type)

    # Skip non-directory entries
    if not os.path.isdir(folder_path):
        continue

    # Load images from the current folder
    image_paths = sorted(glob.glob(os.path.join(folder_path, '*.bmp')))

    for path in image_paths:
        image = io.imread(path)
        flattened_image = image.flatten()

        X_test.append(flattened_image)
        y_test.append(fault_type)  # The label is the folder name

# Fail early with a clear message instead of a shape error inside the scaler
if not X_test:
    raise FileNotFoundError(f"No .bmp images found under {test_folder!r}")

X_test = np.array(X_test)
y_test = np.array(y_test)

# Apply the scaler and PCA already fitted on the training data (transform only,
# no refitting on the test set)
X_test_normalized = scaler.transform(X_test)

X_test_pca = pca.transform(X_test_normalized)

In [None]:
# ---------------- Build training model -------------------
# Hold out 20% of the PCA features.
# NOTE(review): the four split arrays are not used by any cell visible here —
# confirm whether a later cell needs them.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_pca, y_train, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Generate learning curves: 5-fold CV at 10 training-set sizes from 10% to 100%.
# shuffle=True is required here: the rows are grouped by class in loading order and
# learning_curve trains on *prefixes* of each fold's training indices, so without
# shuffling the smaller training sizes would contain only some of the classes.
# It also makes random_state take effect (it is ignored while shuffle is False).
# NOTE(review): the scaler and PCA were fitted on the full training set before this
# cross-validation, so the validation folds are not completely unseen — TODO consider
# wrapping scaler + PCA + classifier in a Pipeline.
train_sizes, train_scores, val_scores = learning_curve(
    clf,
    X_train_pca,
    y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    shuffle=True,
    random_state=42,
)

# Collapse the per-fold scores (axis 1) into a mean and a spread per training-set size
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

# Draw each curve with a shaded band of one standard deviation around it
plt.figure(figsize=(10, 6))
for curve_mean, curve_std, curve_label in (
    (train_mean, train_std, 'Training Score'),
    (val_mean, val_std, 'Validation Score'),
):
    plt.plot(train_sizes, curve_mean, label=curve_label, marker='o')
    plt.fill_between(
        train_sizes, curve_mean - curve_std, curve_mean + curve_std, alpha=0.15
    )

plt.title('Learning Curve')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Train the classifier on the full training set (all PCA-projected training rows,
# not the 80% split created above); this fitted model is what the test cells use.
clf.fit(X_train_pca, y_train)

In [None]:
# ---------------- Predict on testing data ------------------
# Classify the PCA-projected test images and print the per-class report
y_test_pred = clf.predict(X_test_pca)
test_report = classification_report(y_test, y_test_pred)
print("Classification Report on Test Data:", test_report, sep="\n")

In [None]:
# Overall accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Accuracy on Test Data:", accuracy)