In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


X, y = fetch_openml('Fashion-MNIST', version=1, return_X_y=True)
X /= 255.0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
class_names = [
    'T-shirt',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot'
]

class_names[int(y_train[0])]

'Ankle boot'

In [3]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

In [4]:
import numpy as np


# Розділення навчального набору на два: з мітками та без міток
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(x_train, y_train, test_size=0.5, random_state=42)
X_labeled, X_unlabeled, y_labeled, y_unlabeled

(array([[-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        ...,
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068]]),
 array([[-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        [-0.00796844, -0.02016575, -0.02805079, ..., -0.15864275,
         -0.09123214, -0.03342068],
        ...,
        [-0.00796844, -0.02016575, -0.02805079, ..., -

In [5]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


l_svc = LinearSVC(loss='hinge', dual=True, C=1e-2, multi_class='ovr')
l_svc.fit(X_labeled, y_labeled)

y_pred = l_svc.predict(x_test)



In [6]:
accu = accuracy_score(y_test, y_pred)
crep = classification_report(y_test, y_pred, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.8403571428571428
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.77      0.80      0.79      1394
     Trouser       0.96      0.97      0.96      1402
    Pullover       0.77      0.74      0.76      1407
       Dress       0.84      0.86      0.85      1449
        Coat       0.72      0.81      0.76      1357
      Sandal       0.91      0.90      0.91      1449
       Shirt       0.68      0.52      0.59      1407
     Sneaker       0.87      0.93      0.90      1359
         Bag       0.92      0.94      0.93      1342
  Ankle boot       0.94      0.94      0.94      1434

    accuracy                           0.84     14000
   macro avg       0.84      0.84      0.84     14000
weighted avg       0.84      0.84      0.84     14000



In [7]:
from sklearn.semi_supervised import SelfTrainingClassifier


# Класифікатор самонавчання
self_training_classifier = SelfTrainingClassifier(l_svc)
self_training_classifier.fit(X_labeled, y_labeled)

y_pred_self_training = self_training_classifier.predict(x_test)



In [8]:
accu = accuracy_score(y_test, y_pred_self_training)
crep = classification_report(y_test, y_pred_self_training, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.8406428571428571
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.77      0.80      0.79      1394
     Trouser       0.96      0.97      0.96      1402
    Pullover       0.77      0.74      0.76      1407
       Dress       0.84      0.86      0.85      1449
        Coat       0.72      0.81      0.76      1357
      Sandal       0.91      0.90      0.91      1449
       Shirt       0.68      0.52      0.59      1407
     Sneaker       0.87      0.93      0.90      1359
         Bag       0.92      0.94      0.93      1342
  Ankle boot       0.94      0.94      0.94      1434

    accuracy                           0.84     14000
   macro avg       0.84      0.84      0.84     14000
weighted avg       0.84      0.84      0.84     14000



In [9]:
from sklearn.semi_supervised import LabelPropagation


# Класифікатор поширення міток
label_propagation_classifier = LabelPropagation(kernel='rbf', gamma=1e-1, n_jobs=-1)
label_propagation_classifier.fit(X_labeled, y_labeled)

y_pred_label_propagation = label_propagation_classifier.predict(x_test)

  probabilities /= normalizer


In [10]:
accu = accuracy_score(y_test, y_pred_label_propagation)
crep = classification_report(y_test, y_pred_label_propagation, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.845
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.76      0.83      0.79      1394
     Trouser       0.99      0.97      0.98      1402
    Pullover       0.76      0.75      0.75      1407
       Dress       0.90      0.85      0.87      1449
        Coat       0.74      0.75      0.75      1357
      Sandal       0.99      0.85      0.91      1449
       Shirt       0.58      0.62      0.60      1407
     Sneaker       0.87      0.96      0.91      1359
         Bag       0.98      0.93      0.95      1342
  Ankle boot       0.91      0.96      0.93      1434

    accuracy                           0.84     14000
   macro avg       0.85      0.85      0.85     14000
weighted avg       0.85      0.84      0.85     14000



In [11]:
from sklearn.semi_supervised import LabelSpreading
# ALWAYS ERROR (with rbf kernel) BECAUSE OF MEMORY
# Link: https://stackoverflow.com/questions/57507832/unable-to-allocate-array-with-shape-and-data-type

# Класифікатор розповсюдження міток
label_spreading_classifier = LabelSpreading(kernel='knn', n_neighbors=3, n_jobs=-1)
label_spreading_classifier.fit(X_labeled, y_labeled)

y_pred_label_spreading = label_spreading_classifier.predict(x_test)

In [12]:
accu = accuracy_score(y_test, y_pred_label_spreading)
crep = classification_report(y_test, y_pred_label_spreading, target_names=class_names)

print(f"Accuracy: {accu}\nClassification report:\n{crep}")

Accuracy: 0.8427142857142857
Classification report:
              precision    recall  f1-score   support

     T-shirt       0.77      0.84      0.80      1394
     Trouser       0.99      0.96      0.98      1402
    Pullover       0.76      0.76      0.76      1407
       Dress       0.90      0.84      0.87      1449
        Coat       0.74      0.77      0.75      1357
      Sandal       0.99      0.82      0.90      1449
       Shirt       0.60      0.59      0.59      1407
     Sneaker       0.86      0.96      0.90      1359
         Bag       0.97      0.93      0.95      1342
  Ankle boot       0.90      0.95      0.92      1434

    accuracy                           0.84     14000
   macro avg       0.85      0.84      0.84     14000
weighted avg       0.85      0.84      0.84     14000



In [13]:
class_names = [
    'Nothing (-1)',
    'T-shirt',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot'
]

class_names[int(y_train[0])]

'Bag'

In [14]:
from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation, LabelSpreading
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
import warnings


# base_classifier = SVC(C=1.0, kernel='rbf', gamma='scale', probability=True)
base_classifier = LinearSVC(loss='hinge', dual=True, C=1e-2, multi_class='ovr')
self_training_classifier = SelfTrainingClassifier(base_classifier)
label_propagation_classifier = LabelPropagation(kernel='rbf', gamma=1e-1, n_jobs=-1)
label_spreading_classifier = LabelSpreading(kernel='knn', n_neighbors=3, n_jobs=-1)

models = [self_training_classifier, label_propagation_classifier, label_spreading_classifier]
percentages = [30, 40, 50, 60, 70]

if '-1' not in y_unlabeled.cat.categories:
    y_unlabeled = y_unlabeled.astype('category').cat.add_categories(['-1'])


for perc in percentages:
    for model in models:
        num_unlabeled = int(len(y_unlabeled) * perc / 100)
        random_unlabeled_indices = np.random.choice(len(y_unlabeled), num_unlabeled, replace=False)
        y_unlabeled_temp = y_unlabeled.copy()
        y_unlabeled_temp.iloc[random_unlabeled_indices] = '-1'

        warnings.filterwarnings("ignore")
        model.fit(X_unlabeled, y_unlabeled_temp)
        
        pred = model.predict(x_test)
        accu = accuracy_score(y_test, pred)
        crep = classification_report(y_test, pred, target_names=class_names)

        print(f"Model: {model}, unlabeled data: {perc}%, accuracy: {accu}\n"
              f"Classification report:\n{crep}\n")

Model: SelfTrainingClassifier(base_estimator=LinearSVC(C=0.01, dual=True,
                                                loss='hinge')), unlabeled data: 30%, accuracy: 0.805
Classification report:
              precision    recall  f1-score   support

Nothing (-1)       0.00      0.00      0.00         0
     T-shirt       0.75      0.80      0.77      1394
     Trouser       0.95      0.96      0.95      1402
    Pullover       0.75      0.71      0.73      1407
       Dress       0.80      0.86      0.83      1449
        Coat       0.69      0.77      0.73      1357
      Sandal       0.87      0.88      0.88      1449
       Shirt       0.75      0.32      0.44      1407
     Sneaker       0.87      0.90      0.89      1359
         Bag       0.90      0.93      0.91      1342
  Ankle boot       0.92      0.93      0.92      1434

    accuracy                           0.81     14000
   macro avg       0.75      0.73      0.73     14000
weighted avg       0.83      0.81      0.81 

In [15]:
""" TEST ZONE """

# from sklearn.semi_supervised import LabelPropagation
# from sklearn.metrics import accuracy_score, classification_report
# import warnings


# label_propagation_classifier = LabelPropagation(kernel='rbf', gamma=0.1, n_jobs=-1)
# percentages = [40, 30] # 70, 60, 50, 
# models = [label_propagation_classifier]

# if '-1' not in y_unlabeled.cat.categories:
#     y_unlabeled = y_unlabeled.astype('category').cat.add_categories(['-1'])


# for model in models:
#     for perc in percentages:
#         num_unlabeled = int(len(y_unlabeled) * perc / 100)
#         random_unlabeled_indices = np.random.choice(len(y_unlabeled), num_unlabeled, replace=False)
#         y_unlabeled_temp = y_unlabeled.copy()
#         y_unlabeled_temp.iloc[random_unlabeled_indices] = '-1'

#         warnings.filterwarnings("ignore")
#         model.fit(X_unlabeled, y_unlabeled_temp)
        
#         pred = model.predict(x_test)
#         accu = accuracy_score(y_test, pred)
#         crep = classification_report(y_test, pred, target_names=class_names)

#         print(f"Model: {model}, unlabeled data: {perc}%, accuracy: {accu}\n"
#               f"Classification report:\n{crep}\n")

' TEST ZONE '