In [22]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook
from sklearn.semi_supervised import label_propagation
from sklearn.metrics import confusion_matrix, classification_report
from scipy import stats
from sklearn.decomposition import PCA

import time

### 1 加载数据

In [2]:
dataSet = pd.read_csv('../Cs137data/re_bp_SetData.csv')
dataSet.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,4991,4992,4993,4994,4995,4996,4997,4998,4999,5000
0,0,97.0,101.0,77.0,74.0,90.0,88.0,30.0,29.0,241.0,...,37.0,33.0,195.0,196.0,34.0,31.0,42.0,44.0,77.0,0.0
1,1,79.0,74.0,141.0,143.0,235.0,237.0,32.0,31.0,68.0,...,90.0,88.0,31.0,31.0,58.0,60.0,28.0,27.0,26.0,0.0
2,2,82.0,85.0,51.0,51.0,52.0,53.0,119.0,119.0,152.0,...,38.0,67.0,66.0,134.0,134.0,32.0,33.0,98.0,100.0,0.0
3,3,46.0,49.0,30.0,29.0,85.0,85.0,62.0,62.0,245.0,...,34.0,103.0,101.0,52.0,52.0,82.0,78.0,46.0,46.0,0.0
4,4,50.0,50.0,232.0,232.0,30.0,36.0,180.0,181.0,88.0,...,53.0,58.0,45.0,43.0,67.0,67.0,64.0,63.0,28.0,0.0


In [3]:
X = dataSet.values[:, 1: 5001]
Y = dataSet.values[:, -1]

# 删除本底数据，得到新的训练数据
X1 = np.delete(X, np.where(Y == 0)[0], axis=0)
Y1 = np.delete(Y, np.where(Y == 0)[0], axis=0)
X1.shape, Y1.shape

((7317, 5000), (7317,))

In [4]:
rng = np.random.RandomState(2)
indices = np.arange(X1.shape[0])
rng.shuffle(indices)
indices.max()

7316

In [23]:
def data_pca(x_train, COMPONENT_NUM):
    start = time.time()
    print("====>>===>>===>> PCA ")
    pca = PCA(n_components=COMPONENT_NUM, copy=True, whiten=False)  # 创建一个 PCA 对象
    pca.fit(x_train)    # 构建 PCA 模型
    pcaXTrain = pca.transform(x_train)
    # pcaXTest = pca.transform(x_test)
    print('pca time is %0.3fs' % (time.time() - start))
    return pcaXTrain

In [24]:
x_train = X1[indices[ :5000]]
y = Y1[indices[ :5000]]
x_train = data_pca(x_train, 0.95)

x_train.shape, x_train[:5, :], y[:5]

====>>===>>===>> PCA 
pca time is 83.312s


((5000, 2570), array([[-1017.29013269,   250.38499221,    56.31505813, ...,
           -14.60496093,    15.21905425,    27.70739763],
        [ -795.19157289,    81.55740213,   -55.37583812, ...,
            11.20707372,    52.44707993,   -28.67307639],
        [   40.70541809,   226.6525751 ,   -32.78683521, ...,
            25.78342479,    78.78625651,    -7.85024934],
        [   20.1505545 ,   -82.70954004,   -52.74263687, ...,
           -16.29999199,   -23.12175141,    28.35322013],
        [-1027.294783  ,   183.75991871,  -413.5661098 , ...,
            15.12700849,   -10.2510332 ,    25.90691698]]), array([3., 3., 2., 2., 3.]))

In [25]:
n_total_samples = y.shape[0]
n_labeled_points = 1000
max_iterations = 10

# 后面 2500 个数据未被标签
unlabeled_indices = np.arange(n_total_samples)[n_labeled_points: ]

f = plt.figure()

unlabeled_indices.shape

(4000,)

<Figure size 432x288 with 0 Axes>

In [26]:
for i in range(max_iterations):
    if len(unlabeled_indices) == 0:
        print("No unlabeled items left to label.")
        break
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=20)
    lp_model.fit(x_train, y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)

    print("Iteration %i %s" % (i, 70 * "_"))
    print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
          % (n_labeled_points, n_total_samples - n_labeled_points,
             n_total_samples))
    print(classification_report(true_labels, predicted_labels))
    print("Confusion matrix")
    print(cm)

    # compute the entropies of transduced label distributions
    pred_entropies = stats.distributions.entropy(
        lp_model.label_distributions_.T)

    # select up to 5 digit examples that the classifier is most uncertain about
    uncertainty_index = np.argsort(pred_entropies)[::-1]
    uncertainty_index = uncertainty_index[
        np.in1d(uncertainty_index, unlabeled_indices)][:300]

    # keep track of indices that we get labels for
    delete_indices = np.array([], dtype=int)

    # for more than 5 iterations, visualize the gain only on the first 5
    if i < 5:
        f.text(.05, (1 - (i + 1) * .183),
               "model %d\n\nfit with\n%d labels" %
               ((i + 1), i * 5 + 10), size=10)
    for index in enumerate(uncertainty_index):
        # for more than 5 iterations, visualize the gain only on the first 5
        # labeling 5 points, remote from labeled set
        delete_index, = np.where(unlabeled_indices == index)
        delete_indices = np.concatenate((delete_indices, delete_index))

    unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
    n_labeled_points += len(uncertainty_index)

f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
           "uncertain labels to learn with the next model.", y=1.15)
plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2,
                    hspace=0.85)
plt.show()

  self.label_distributions_ /= normalizer


Iteration 0 ______________________________________________________________________
Label Spreading model: 1000 labeled & 4000 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 1 ______________________________________________________________________
Label Spreading model: 1300 labeled & 3700 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 2 ______________________________________________________________________
Label Spreading model: 1600 labeled & 3400 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 3 ______________________________________________________________________
Label Spreading model: 1900 labeled & 3100 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 4 ______________________________________________________________________
Label Spreading model: 2200 labeled & 2800 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 5 ______________________________________________________________________
Label Spreading model: 2500 labeled & 2500 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 6 ______________________________________________________________________
Label Spreading model: 2800 labeled & 2200 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 7 ______________________________________________________________________
Label Spreading model: 3100 labeled & 1900 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 8 ______________________________________________________________________
Label Spreading model: 3400 labeled & 1600 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]


  self.label_distributions_ /= normalizer


Iteration 9 ______________________________________________________________________
Label Spreading model: 3700 labeled & 1300 unlabeled (5000 total)


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.31      1.00      0.47      1243
         2.0       0.00      0.00      0.00      1187
         3.0       0.00      0.00      0.00      1496
         4.0       0.00      0.00      0.00        74

    accuracy                           0.31      4000
   macro avg       0.08      0.25      0.12      4000
weighted avg       0.10      0.31      0.15      4000

Confusion matrix
[[1243    0    0    0]
 [1187    0    0    0]
 [1496    0    0    0]
 [  74    0    0    0]]




<Figure size 432x288 with 0 Axes>