# LabelSpreading 測試

## 載入相關套件

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelSpreading

## 載入資料集

In [2]:
# Generate a synthetic classification dataset: 1000 samples with two
# features, both informative and none redundant. The fixed seed keeps the
# generated data identical across runs.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=1,
)

## 資料分割

In [3]:
# Hold out half of the samples for evaluation. Stratifying on y keeps the
# class proportions the same in both halves; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=1,
)

## 設定 50% 訓練資料為沒有標註(-1)

In [4]:
# Split the training half once more: one part keeps its labels, the other
# is treated as unlabeled. (Despite the "test" in their names, X_test_unlab
# and y_test_unlab come from the training data, not from X_test.)
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(
    X_train, y_train, test_size=0.5, random_state=1, stratify=y_train
)

# Labeled rows first, unlabeled rows after them; the label vector below is
# built in the same order so features and labels stay aligned.
X_train_mixed = np.concatenate((X_train_lab, X_test_unlab))

# -1 marks a sample as unlabeled. Build the filler directly as an array
# instead of looping in Python.
nolabel = np.full(len(y_test_unlab), -1)
y_train_mixed = np.concatenate((y_train_lab, nolabel))

# Every feature row must have exactly one (possibly -1) label.
assert len(X_train_mixed) == len(y_train_mixed)
y_train_mixed.shape

(500,)

## LabelSpreading 模型訓練與評估

In [5]:
# Fit LabelSpreading on the mixed training set (labeled rows plus the rows
# marked -1), then report its score on the held-out test half.
clf = LabelSpreading().fit(X_train_mixed, y_train_mixed)
clf.score(X_test, y_test)

0.854

## LogisticRegression 模型訓練與評估

In [6]:
# Baseline for comparison: logistic regression trained only on the labeled
# part of the training data, scored on the same held-out test half.
clf2 = LogisticRegression()
clf2.fit(X_train_lab, y_train_lab)
clf2.score(X_test, y_test)

0.848

## 取得 LabelSpreading 傳播後的訓練資料標註

In [7]:
# Labels the fitted LabelSpreading model holds for the rows it was trained
# on (X_train_mixed); displaying the shape confirms there is one label per
# training row.
tran_labels = clf.transduction_
tran_labels.shape

(500,)

## 再依LabelSpreading傳播結果進行模型訓練與評估

In [8]:
# Retrain logistic regression on the full mixed training set, using the
# labels produced by LabelSpreading in place of the original partial labels,
# and score it on the same held-out test half as the other models.
clf3 = LogisticRegression().fit(X_train_mixed, tran_labels)
clf3.score(X_test, y_test)

0.858