In [16]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


X, y = fetch_openml('Fashion-MNIST', version=1, return_X_y=True)
# Розділення на навчальний та тестовий набори
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
class_names = [
    'T-shirt',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot'
]

class_names[int(y_train[0])]

'Ankle boot'

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

In [19]:
import numpy as np


# Розділення навчального набору на два: з мітками та без міток
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(x_train, y_train, test_size=0.5, random_state=42)

X_labeled, X_unlabeled, y_labeled, y_unlabeled

(array([[-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        ...,
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068]]),
 array([[-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        ...,
        [-0.00796844, -0.02016575, -0.02805079, ..., -

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


base_classifier = SVC(C=1.0, kernel='rbf', gamma='scale', probability=True)
base_classifier.fit(X_labeled, y_labeled)

# Оцінка точності базового класифікатора
y_pred_labeled = base_classifier.predict(x_test)

In [6]:
accu = accuracy_score(y_test, y_pred_labeled)
crep = classification_report(y_test, y_pred_labeled, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.8812857142857143
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.83      0.84      0.83      1394
     Trouser       1.00      0.97      0.98      1402
    Pullover       0.82      0.82      0.82      1407
       Dress       0.87      0.90      0.89      1449
        Coat       0.79      0.83      0.81      1357
      Sandal       0.96      0.95      0.96      1449
       Shirt       0.72      0.64      0.68      1407
     Sneaker       0.92      0.95      0.94      1359
         Bag       0.93      0.98      0.96      1342
  Ankle boot       0.96      0.94      0.95      1434

    accuracy                           0.88     14000
   macro avg       0.88      0.88      0.88     14000
weighted avg       0.88      0.88      0.88     14000



In [7]:
from sklearn.semi_supervised import SelfTrainingClassifier


base_classifier = SVC(C=1.0, kernel='rbf', gamma='scale', probability=True)
# Класифікатор самонавчання
self_training_classifier = SelfTrainingClassifier(base_classifier)
self_training_classifier.fit(X_labeled, y_labeled)

y_pred_self_training = self_training_classifier.predict(x_test)



In [8]:
accu = accuracy_score(y_test, y_pred_self_training)
crep = classification_report(y_test, y_pred_self_training, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.8812857142857143
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.83      0.84      0.83      1394
     Trouser       1.00      0.97      0.98      1402
    Pullover       0.82      0.82      0.82      1407
       Dress       0.87      0.90      0.89      1449
        Coat       0.79      0.83      0.81      1357
      Sandal       0.96      0.95      0.96      1449
       Shirt       0.72      0.64      0.68      1407
     Sneaker       0.92      0.95      0.94      1359
         Bag       0.93      0.98      0.96      1342
  Ankle boot       0.96      0.94      0.95      1434

    accuracy                           0.88     14000
   macro avg       0.88      0.88      0.88     14000
weighted avg       0.88      0.88      0.88     14000



In [9]:
from sklearn.semi_supervised import LabelPropagation


# Класифікатор поширення міток
label_propagation_classifier = LabelPropagation(kernel='rbf', gamma=0.1, n_jobs=-1)
label_propagation_classifier.fit(X_labeled, y_labeled)

y_pred_label_propagation = label_propagation_classifier.predict(x_test)

  probabilities /= normalizer


In [10]:
accu = accuracy_score(y_test, y_pred_label_propagation)
crep = classification_report(y_test, y_pred_label_propagation, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.845
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.76      0.83      0.79      1394
     Trouser       0.99      0.97      0.98      1402
    Pullover       0.76      0.75      0.75      1407
       Dress       0.90      0.85      0.87      1449
        Coat       0.74      0.75      0.75      1357
      Sandal       0.99      0.85      0.91      1449
       Shirt       0.58      0.62      0.60      1407
     Sneaker       0.87      0.96      0.91      1359
         Bag       0.98      0.93      0.95      1342
  Ankle boot       0.91      0.96      0.93      1434

    accuracy                           0.84     14000
   macro avg       0.85      0.85      0.85     14000
weighted avg       0.85      0.84      0.85     14000



In [11]:
from sklearn.semi_supervised import LabelSpreading
# ALWAYS ERROR BECAUSE OF MEMORY
# Link: https://stackoverflow.com/questions/57507832/unable-to-allocate-array-with-shape-and-data-type

# Класифікатор розповсюдження міток
# label_spreading_classifier = LabelSpreading(kernel='rbf', gamma=0.1, n_jobs=-1)
# label_spreading_classifier.fit(x_train, y_train)

# y_pred_label_spreading = label_spreading_classifier.predict(x_test)

In [12]:
# accu = accuracy_score(y_test, y_pred_label_spreading)
# crep = classification_report(y_test, y_pred_label_spreading, target_names=class_names)

# print(f"Accuracy: {accu}\nClassification report:\n{crep}")

In [None]:
from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation, LabelSpreading
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


base_classifier = SVC(C=1.0, kernel='rbf', gamma='scale', probability=True)
self_training_classifier = SelfTrainingClassifier(base_classifier)
label_propagation_classifier = LabelPropagation(kernel='rbf', gamma=0.1, n_jobs=-1)
label_spreading_classifier = LabelSpreading(kernel='rbf', gamma=0.1, n_jobs=-1)

models = [self_training_classifier, label_propagation_classifier, label_spreading_classifier]
percentages = [70, 60, 50, 40, 30]

y_unlabeled = y_unlabeled.astype('category')  # Convert to categorical
y_unlabeled = y_unlabeled.cat.add_categories([-1])  # Add -1 as a category

for model in models:
    for perc in percentages:
        num_unlabeled = int(len(y_unlabeled) * perc / 100)
        random_unlabeled_indices = np.random.choice(len(y_unlabeled), num_unlabeled, replace=False)
        y_unlabeled.iloc[random_unlabeled_indices] = -1
        
        model.fit(X_unlabeled, y_unlabeled)

        pred = model.predict(x_test)
        accu = accuracy_score(y_test, pred)
        crep = classification_report(y_test, pred, target_names=class_names)
        
        print(f"Model: {model}, percentage of unlabeled data: {perc}, accuracy: {accu}\n"
              f"Classification report:\n{crep}")

In [21]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, classification_report


label_propagation_classifier = LabelPropagation(kernel='rbf', gamma=0.1, n_jobs=-1)
percentages = [40, 30] # 70, 60, 50, 
models = [label_propagation_classifier]

if '-1' not in y_unlabeled.cat.categories:
    y_unlabeled = y_unlabeled.astype('category').cat.add_categories(['-1'])


for model in models:
    for perc in percentages:
        num_unlabeled = int(len(y_unlabeled) * perc / 100)
        random_unlabeled_indices = np.random.choice(len(y_unlabeled), num_unlabeled, replace=False)
        y_unlabeled_temp = y_unlabeled.copy()
        y_unlabeled_temp.iloc[random_unlabeled_indices] = '-1'
        print(y_unlabeled_temp)
                
        model.fit(X_unlabeled, y_unlabeled_temp)
        
        pred = model.predict(x_test)
        accu = accuracy_score(y_test, pred)
        crep = classification_report(y_test, pred)
                
        print(f"Model: {model}, unlabeled data: {perc}%, accuracy: {accu}\n"
              f"Classification report:\n{crep}\n")

10234    -1
22304     2
2945     -1
45481    -1
38536    -1
         ..
3019     -1
45324     7
16512     9
30895    -1
5980      6
Name: class, Length: 28000, dtype: category
Categories (11, object): ['0', '1', '2', '3', ..., '7', '8', '9', '-1']


  probabilities /= normalizer
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: LabelPropagation(gamma=1, n_jobs=-1), unlabeled data: 40%, accuracy: 0.4905
Classification report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.79      0.50      0.61      1394
           1       0.98      0.57      0.72      1402
           2       0.73      0.44      0.55      1407
           3       0.88      0.49      0.63      1449
           4       0.72      0.44      0.54      1357
           5       0.98      0.48      0.65      1449
           6       0.61      0.35      0.45      1407
           7       0.86      0.59      0.70      1359
           8       0.98      0.51      0.67      1342
           9       0.87      0.53      0.66      1434

    accuracy                           0.49     14000
   macro avg       0.76      0.45      0.56     14000
weighted avg       0.84      0.49      0.62     14000


10234     5
22304     2
2945     -1
45481     1
38536     6
         ..
3019    

  probabilities /= normalizer
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: LabelPropagation(gamma=1, n_jobs=-1), unlabeled data: 30%, accuracy: 0.5775714285714286
Classification report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.76      0.58      0.66      1394
           1       0.98      0.68      0.80      1402
           2       0.76      0.53      0.63      1407
           3       0.89      0.57      0.69      1449
           4       0.71      0.49      0.58      1357
           5       0.99      0.57      0.72      1449
           6       0.62      0.45      0.52      1407
           7       0.86      0.67      0.75      1359
           8       0.98      0.59      0.73      1342
           9       0.90      0.65      0.76      1434

    accuracy                           0.58     14000
   macro avg       0.77      0.53      0.62     14000
weighted avg       0.85      0.58      0.69     14000




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
