In [2]:
# import dependences
%pip install seaborn

from PIL import Image
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

Note: you may need to restart the kernel to use updated packages.


In [3]:
# load dataset
def load_dataset(fname=None, new_size=(10, 10)):
    if fname is None:
        base_dir = os.getcwd()
        fname = os.path.join(base_dir, '../elpv-dataset/labels.csv')

    data = np.genfromtxt(fname, dtype=['|S19', '<f8', '|S4'], names=[
                         'path', 'probability', 'type'])
    image_fnames = np.char.decode(data['path'])
    probs = data['probability']
    types = np.char.decode(data['type'])

    def load_cell_image(fname):
        with Image.open(fname) as image:
            image = image.convert('L')  # Convert to grayscale
            image = image.resize(new_size, Image.Resampling.LANCZOS)  # Resize the image
            return np.asarray(image)

    dir = os.path.dirname(fname)

    images = np.array([load_cell_image(os.path.join(dir, fn))
                       for fn in image_fnames])

    return images, probs, types

images, proba, types = load_dataset(new_size=(10, 10))  

# # randomly show 16 x 16 images
# num_images = 16 * 16
# selected_indices = np.random.choice(len(images), num_images, replace=False)
# selected_images = images[selected_indices]
# selected_proba = proba[selected_indices]
# selected_types = types[selected_indices]

# fig, axes = plt.subplots(16, 16, figsize=(20, 20))
# for i, ax in enumerate(axes.flat):
#     ax.imshow(selected_images[i], cmap='gray')
#     ax.axis('off')
#     label = f'{selected_proba[i]:.2f}, {selected_types[i]}'
#     ax.text(0.5, -0.1, label, fontsize=10, ha='center', transform=ax.transAxes)

# plt.tight_layout()
# plt.show()

In [4]:

X = images.reshape((images.shape[0], -1))
y = (proba > 0.5).astype(int)  

[[ 51  72  67 ...  71  74  50]
 [ 65  85  92 ...  74  69  45]
 [ 56  76  76 ...  85  88  58]
 ...
 [149 169 164 ... 146 149 136]
 [107 135 150 ... 152 160 144]
 [134 142 139 ... 142 146 130]]
[1 1 1 ... 0 0 0]


In [6]:

type_encoder = LabelEncoder()
types_encoded = type_encoder.fit_transform(types)
# Split the data
X_train, X_test, y_train, y_test, types_train, types_test = train_test_split(X, y, types_encoded, test_size=0.25, random_state=42)
print(X_train)
print(y_train)
print(types_train)

# Filter out 'mono' and 'poly' samples for separate processing
X_train_mono = X_train[types_train == 0]
y_train_mono = y_train[types_train == 0]
X_test_mono = X_test[types_test == 0]
y_test_mono = y_test[types_test == 0]

X_train_poly = X_train[types_train == 1]
y_train_poly = y_train[types_train == 1]
X_test_poly = X_test[types_test == 1]
y_test_poly = y_test[types_test == 1]

[[170 200 194 ... 154 167 163]
 [139 140 131 ... 144 151 149]
 [ 98 148 145 ... 151 153 101]
 ...
 [ 94 163 163 ... 173 184 116]
 [130 162 158 ... 146 149 122]
 [ 82 106  99 ...  90  87  69]]
[0 1 0 ... 0 0 1]
[1 1 0 ... 0 0 1]


In [7]:
# Create a pipeline with PCA and a classifier
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=150)),  # Adjust the number of components
    ('classifier', RandomForestClassifier(random_state=42))
])

# Parameters for grid search
param_grid = {
    'pca__n_components': [50, 100, 150],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, None]
}

# Create and train a pipeline for 'mono' samples
pipe_mono = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipe_mono.fit(X_train_mono, y_train_mono)

# Predictions and evaluation for 'mono'
y_pred_mono = pipe_mono.predict(X_test_mono)

# Create and train a pipeline for 'poly' samples
pipe_poly = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipe_poly.fit(X_train_poly, y_train_poly)

# Predictions and evaluation for 'poly'
y_pred_poly = pipe_poly.predict(X_test_poly)

In [9]:
# Grid search with cross-validation
grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

# accounting accuracy
accuracy = accuracy_score(y_test, y_pred)


45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from tabulate import tabulate



# 生成混淆矩阵数据
cm = confusion_matrix(y_test, y_pred)
cm_sum = np.sum(cm, axis=1, keepdims=True)
cm_perc = cm / cm_sum.astype(float) * 100
annot = np.empty_like(cm).astype(str)

# 格式化矩阵数据为百分比和实际值
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        c = cm[i, j]
        p = cm_perc[i, j]
        annot[i, j] = f'{p:.1f}%\n({c})'

# 自定义类别名称
true_class_names = ['0% defective', '100% defective']  # 真实标签的类别名称
predicted_class_names = ['0% defective by Pred', '100% defective by Pred']  # 模型预测的类别名称

# 绘制混淆矩阵
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=annot, fmt='', cmap='Blues', cbar=True, linewidths=0.5, linecolor='gray')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# 设置y轴（True Label）的类别名称
plt.yticks(np.arange(len(true_class_names)) + 0.5, true_class_names, rotation=0)

# 设置x轴（Predicted Label）的类别名称
plt.xticks(np.arange(len(predicted_class_names)) + 0.5, predicted_class_names)

# 显示图形
plt.show()

# 打印准确度
print(f"Accuracy on Test Set: {accuracy:.2f}")

# # Plotting confusion matrix
# plt.figure(figsize=(10, 7))
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
# plt.title('Confusion Matrix')
# plt.ylabel('True Label')
# plt.xlabel('Predicted Label')
# plt.show()

# print(f"Accuracy on Test Set: {accuracy:.2f}")

# print("Classification Report for Mono:")
# print(classification_report(y_test_mono, y_pred_mono))
# print("Confusion Matrix for Mono:")
# print(confusion_matrix(y_test_mono, y_pred_mono))
# print("Accuracy for Mono: {:.2f}%".format(100 * accuracy_score(y_test_mono, y_pred_mono)))


# print("\nClassification Report for Poly:")
# print(classification_report(y_test_poly, y_pred_poly))
# print("Confusion Matrix for Poly:")
# print(confusion_matrix(y_test_poly, y_pred_poly))
# print("Accuracy for Poly: {:.2f}%".format(100 * accuracy_score(y_test_poly, y_pred_poly)))

NameError: name 'confusion_matrix' is not defined