In [15]:
%pip install seaborn

from PIL import Image
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Note: you may need to restart the kernel to use updated packages.


In [16]:
def load_dataset(fname=None, new_size=(32, 32)):
    if fname is None:
        base_dir = os.getcwd()
        fname = os.path.join(base_dir, '../elpv-dataset/labels.csv')

    data = np.genfromtxt(fname, dtype=[('path', 'S19'), ('probability', 'f8'), ('type', 'S4')], names=['path', 'probability', 'type'])
    image_fnames = np.char.decode(data['path'])
    probs = data['probability']
    types = np.char.decode(data['type'])

    def load_cell_image(fname):
        with Image.open(fname) as image:
            image = image.convert('L')
            image = image.resize(new_size, Image.Resampling.LANCZOS)
            return np.asarray(image)

    dir = os.path.dirname(fname)
    images = np.array([load_cell_image(os.path.join(dir, fn)) for fn in image_fnames])

    return images, probs, types

images, proba, types = load_dataset(new_size=(32, 32))

In [17]:
# 将类型编码为数字
type_encoder = LabelEncoder()
types_encoded = type_encoder.fit_transform(types)

# 将概率值转换为分类
proba_classes = np.array([0 if p == 0 else 1 if p == 0.3333333333333333 else 2 if p == 0.6666666666666666 else 3 for p in proba])

# Flatten the images for ML algorithms
X = images.reshape((images.shape[0], -1))
y = proba_classes  # 使用分类后的概率值



In [18]:
# Split the data
X_train, X_test, y_train, y_test, types_train, types_test = train_test_split(X, y, types_encoded, test_size=0.25, random_state=42)

# Filter out 'mono' and 'poly' samples for separate processing
# 假设 'mono' 编码为 0，'poly' 编码为 1
X_train_mono = X_train[types_train == 0]
y_train_mono = y_train[types_train == 0]
X_test_mono = X_test[types_test == 0]
y_test_mono = y_test[types_test == 0]

X_train_poly = X_train[types_train == 1]
y_train_poly = y_train[types_train == 1]
X_test_poly = X_test[types_test == 1]
y_test_poly = y_test[types_test == 1]

In [19]:
# Create and train a pipeline for 'mono' samples
pipe_mono = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipe_mono.fit(X_train_mono, y_train_mono)

# Predictions and evaluation for 'mono'
y_pred_mono = pipe_mono.predict(X_test_mono)

# Create and train a pipeline for 'poly' samples
pipe_poly = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipe_poly.fit(X_train_poly, y_train_poly)

# Predictions and evaluation for 'poly'
y_pred_poly = pipe_poly.predict(X_test_poly)

print("Classification Report for Mono:")
print(classification_report(y_test_mono, y_pred_mono))
print("Confusion Matrix for Mono:")
print(confusion_matrix(y_test_mono, y_pred_mono))
print("Accuracy for Mono: {:.2f}%".format(100 * accuracy_score(y_test_mono, y_pred_mono)))


print("\nClassification Report for Poly:")
print(classification_report(y_test_poly, y_pred_poly))
print("Confusion Matrix for Poly:")
print(confusion_matrix(y_test_poly, y_pred_poly))
print("Accuracy for Poly: {:.2f}%".format(100 * accuracy_score(y_test_poly, y_pred_poly)))

Classification Report for Mono:
              precision    recall  f1-score   support

           0       0.73      0.88      0.80       147
           1       0.56      0.19      0.29        26
           2       0.00      0.00      0.00        18
           3       0.75      0.78      0.77        82

    accuracy                           0.73       273
   macro avg       0.51      0.46      0.46       273
weighted avg       0.67      0.73      0.69       273

Confusion Matrix for Mono:
[[130   2   0  15]
 [ 17   5   0   4]
 [ 16   0   0   2]
 [ 16   2   0  64]]
Accuracy for Mono: 72.89%

Classification Report for Poly:
              precision    recall  f1-score   support

           0       0.71      0.96      0.81       225
           1       0.85      0.23      0.37        47
           2       0.00      0.00      0.00        13
           3       0.77      0.52      0.62        98

    accuracy                           0.72       383
   macro avg       0.58      0.43      0.45 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
