In [1]:
import numpy as np
from keras.datasets import mnist
from selfTrainingModules import selfTrainImageModel, selfTrainTextModel, coTrainImageModel
from selfTrainingModules import supervisedSeqImage, supervisedSeqText
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from keras.datasets import imdb

Using TensorFlow backend.


In [None]:
def selfTrainMNIST(pThresh, selectionType, labelSize, induceError, errorDenom, layerSize,
                   dopoutPct, regLambda):
    """Measure MNIST test error of a self-trained and a supervised model per label budget.

    For every entry of ``labelSize`` one model is built with ``selfTrainImageModel``
    and one with ``supervisedSeqImage`` (both with the 'SEQ' model-type flag). Each
    model is scored on the MNIST test split and its error rate (1 - accuracy, read
    off the confusion matrix) is recorded.

    Args:
        pThresh: Forwarded to ``selfTrainImageModel``; presumably the confidence
            threshold for accepting pseudo-labels -- confirm in selfTrainingModules.
        selectionType: Forwarded to both model builders (this notebook passes "Group").
        labelSize: Iterable of label budgets; one pair of models is trained per entry.
        induceError: Forwarded to ``selfTrainImageModel``.
        errorDenom: Forwarded to ``selfTrainImageModel`` and echoed in the progress print.
        layerSize: Hidden-layer size, forwarded to both builders.
        dopoutPct: Dropout setting, forwarded to both builders.
        regLambda: Regularisation setting, forwarded to both builders.

    Returns:
        ``[selfTrainErr, supervisedErr]`` -- two 1-D numpy arrays aligned with
        ``labelSize``.
    """
    (xTrain, yTrain), (xTest, yTest) = mnist.load_data()
    # Flatten every image to a single row (28x28 -> 784 for MNIST) and rescale by 1/255.
    xTrain = xTrain.reshape(xTrain.shape[0], -1) / 255.0
    xTest = xTest.reshape(xTest.shape[0], -1) / 255.0

    # layerSize, dopoutPct and regLambda are used exactly as passed in, so callers
    # can compare different configurations against each other.
    batchSize = 32
    maxIter = 5

    def testError(model):
        """Return 1 - accuracy of ``model`` on the test split."""
        yHat = model.predict_classes(xTest, batch_size = batchSize)
        cm = confusion_matrix(yTest, yHat)
        # Diagonal = correctly classified samples, full sum = all test samples.
        return 1 - np.trace(cm) / np.sum(cm)

    selfTrainErr = []
    supervisedErr = []
    for size in labelSize:
        output = selfTrainImageModel(xTrain, yTrain, batchSize, maxIter, pThresh, size,
                                     selectionType, induceError, errorDenom, layerSize,
                                     dopoutPct, regLambda, 'SEQ')
        # output[0] is the trained model; output[1] is only reported in the progress line.
        wrongLabels = output[1]
        selfTrainErr.append(testError(output[0]))
        print('Denom ', errorDenom, 'Label Size ', size, wrongLabels)

        model = supervisedSeqImage(xTrain, yTrain, batchSize, layerSize, dopoutPct, regLambda,
                                   size, selectionType, 'SEQ')
        supervisedErr.append(testError(model))
    return([np.array(selfTrainErr), np.array(supervisedErr)])


In [None]:
def plotBaseModels(pThresh, selectionType, labelSize, induceError, errorDenom, pltName):
    """Plot self-training vs. supervised test error for a dropout and an L2 configuration.

    ``selfTrainMNIST`` is called twice with a hidden-layer size of 100: once with
    (dropout 0.25, lambda 0) and once with (dropout 0, lambda 0.01). The four
    resulting error curves are drawn on a new figure and saved to
    ``pltName + '_Self_train_MNIST.png'``.

    Args:
        pThresh, selectionType, induceError, errorDenom: Forwarded to ``selfTrainMNIST``.
        labelSize: Sequence of label budgets; also used as the x-axis tick labels.
        pltName: Prefix of the PNG file that is written.

    Returns:
        ``[dropoutErrors, l2Errors]``, each being the ``[selfTrainErr, supervisedErr]``
        pair returned by ``selfTrainMNIST`` for that configuration.
    """
    layerSize = 100
    # Configuration A: 25% dropout, no L2 penalty.
    dropoutErrors = selfTrainMNIST(pThresh, selectionType, labelSize, induceError, errorDenom,
                                   layerSize, 0.25, 0)
    # Configuration B: no dropout, L2 penalty with lambda = 0.01.
    l2Errors = selfTrainMNIST(pThresh, selectionType, labelSize, induceError, errorDenom,
                              layerSize, 0, 0.01)

    # The label budgets are plotted at evenly spaced positions and labelled by value.
    xIdx = list(range(len(labelSize)))
    # Start a fresh figure so curves from an earlier plot in the same cell are not
    # carried into this image.
    plt.figure()
    plt.plot(xIdx, dropoutErrors[0], label = "Dropout - 25%")
    plt.plot(xIdx, dropoutErrors[1], label = "Supervised Learning - Dropout")
    plt.plot(xIdx, l2Errors[0], label = "L2 Reg - Lambda 0.01")
    plt.plot(xIdx, l2Errors[1], label = "Supervised Learning - L2 Reg")
    plt.xticks(xIdx, labelSize)
    plt.xlabel("Number of Labeled Samples")
    plt.ylabel("Test Error")
    plt.title('Performance of Self-training on MNIST Data')
    plt.legend()
    plt.savefig(pltName + '_Self_train_MNIST.png')
    return([dropoutErrors, l2Errors])

In [None]:
def plotBaseModelsSeqVSCNN(pThresh, selectionType, labelSize, induceError, errorDenom, pltName,
                           dopoutPct=0.25, regLambda=0, maxIter=5):
    """Compare supervised and self-trained models for the 'SEQ' and 'CNN' model types.

    For each model type and each label budget a supervised model
    (``supervisedSeqImage``) and a self-trained model (``selfTrainImageModel``) are
    trained and scored on the MNIST test split. 'SEQ' models receive flattened
    images, 'CNN' models receive images with a trailing channel axis. The four error
    curves are drawn on a new figure and saved to ``pltName + '_Self_train_MNIST.png'``.

    Args:
        pThresh, induceError, errorDenom: Forwarded to ``selfTrainImageModel``.
        selectionType: Forwarded to ``selfTrainImageModel``.
        labelSize: Sequence of label budgets; also used as the x-axis tick labels.
        pltName: Prefix of the PNG file that is written.
        dopoutPct: Dropout setting used for every model (default 0.25).
        regLambda: Regularisation setting used for every model (default 0).
        maxIter: Iteration cap forwarded to ``selfTrainImageModel`` (default 5, the
            value ``selfTrainMNIST`` uses).

    Returns:
        ``[seqErrors, seqSelfTrainErrors, cnnErrors, cnnSelfTrainErrors]`` as 1-D
        numpy arrays aligned with ``labelSize``.
    """
    layerSize = 100
    batchSize = 32

    (xTrainRaw, yTrain), (xTestRaw, yTest) = mnist.load_data()
    xTrainRaw = xTrainRaw.astype('float32') / 255
    xTestRaw = xTestRaw.astype('float32') / 255

    def testError(model, xEval):
        """Return 1 - accuracy of ``model`` on ``xEval`` against the test labels."""
        yHat = model.predict_classes(xEval, batch_size = batchSize)
        cm = confusion_matrix(yTest, yHat)
        return 1 - np.trace(cm) / np.sum(cm)

    def runModelType(modelType, xTr, xTe):
        """Train the supervised and self-trained model of one type for every budget."""
        supervised = []
        selfTrained = []
        for size in labelSize:
            # NOTE(review): the supervised baseline always uses 'Group' selection,
            # independent of selectionType -- confirm this is intended.
            model = supervisedSeqImage(xTr, yTrain, batchSize, layerSize, dopoutPct,
                                       regLambda, size, 'Group', modelType)
            supervised.append(testError(model, xTe))

            output = selfTrainImageModel(xTr, yTrain, batchSize, maxIter, pThresh, size,
                                         selectionType, induceError, errorDenom, layerSize,
                                         dopoutPct, regLambda, modelType)
            selfTrained.append(testError(output[0], xTe))
            print('Denom ', errorDenom, 'Label Size ', size, output[1])
        return np.array(supervised), np.array(selfTrained)

    # 'SEQ' models: one flat row per image (28x28 -> 784 for MNIST).
    seqErrors, seqSelfTrainErrors = runModelType(
        'SEQ',
        xTrainRaw.reshape(xTrainRaw.shape[0], -1),
        xTestRaw.reshape(xTestRaw.shape[0], -1))
    # 'CNN' models: keep the 2-D image and append a channel axis of length 1.
    cnnErrors, cnnSelfTrainErrors = runModelType(
        'CNN',
        xTrainRaw.reshape(xTrainRaw.shape + (1,)),
        xTestRaw.reshape(xTestRaw.shape + (1,)))

    xIdx = list(range(len(labelSize)))
    # Start a fresh figure so curves from an earlier plot in the same cell are not
    # carried into this image.
    plt.figure()
    plt.plot(xIdx, seqErrors, label = "Dense Model - Supervised")
    plt.plot(xIdx, seqSelfTrainErrors, label = "Dense Model - Self-training")
    plt.plot(xIdx, cnnErrors, label = "CNN Model - Supervised")
    plt.plot(xIdx, cnnSelfTrainErrors, label = "CNN Model - Self-training")
    plt.xticks(xIdx, labelSize)
    plt.xlabel("Number of Labeled Samples")
    plt.ylabel("Test Error")
    plt.title('Performance of Self-training on MNIST Data')
    plt.legend()
    plt.savefig(pltName + '_Self_train_MNIST.png')
    return([seqErrors, seqSelfTrainErrors, cnnErrors, cnnSelfTrainErrors])

In [None]:
def plotCoTrainModels(pThresh, selectionType, labelSize, induceError, errorDenom, pltName,
                      maxIter=5):
    """Compare the two co-trained models with their supervised counterparts on MNIST.

    For every label budget ``coTrainImageModel`` is run on the flattened training
    images. Its first returned model is scored on flattened test images and its
    second on test images with a trailing channel axis. A supervised 'SEQ' and a
    supervised 'CNN' model (``supervisedSeqImage``) are trained for the same budget
    as baselines. The four error curves are drawn on a new figure and saved to
    ``pltName + '_Self_train_MNIST.png'``.

    Args:
        pThresh, selectionType, induceError, errorDenom: Forwarded to
            ``coTrainImageModel``.
        labelSize: Sequence of label budgets; also used as the x-axis tick labels.
        pltName: Prefix of the PNG file that is written.
        maxIter: Iteration cap forwarded to ``coTrainImageModel`` (default 5, the
            value ``selfTrainMNIST`` uses).

    Returns:
        ``[seqErrors, m1Errors, cnnErrors, m2Errors]`` as 1-D numpy arrays aligned
        with ``labelSize``.
    """
    layerSize = 100
    batchSize = 32
    dropout = 0.25
    regLambda = 0

    (xTrain, yTrain), (xTest, yTest) = mnist.load_data()
    xTrain = xTrain.astype('float32') / 255
    xTest = xTest.astype('float32') / 255
    # Two layouts of the same data: flat rows (28x28 -> 784 for MNIST) and images
    # with a channel axis of length 1.
    xTrainFlat = xTrain.reshape(xTrain.shape[0], -1)
    xTestFlat = xTest.reshape(xTest.shape[0], -1)
    xTrainImg = xTrain.reshape(xTrain.shape + (1,))
    xTestImg = xTest.reshape(xTest.shape + (1,))

    def testError(model, xEval):
        """Return 1 - accuracy of ``model`` on ``xEval`` against the test labels."""
        yHat = model.predict_classes(xEval, batch_size = batchSize)
        cm = confusion_matrix(yTest, yHat)
        return 1 - np.trace(cm) / np.sum(cm)

    m1Errors = []
    m2Errors = []
    seqErrors = []
    cnnErrors = []
    for size in labelSize:
        output = coTrainImageModel(xTrainFlat, yTrain, batchSize, maxIter, pThresh, size,
                                   selectionType, induceError, errorDenom, layerSize, dropout,
                                   regLambda)
        m1Errors.append(testError(output[0], xTestFlat))

        model = supervisedSeqImage(xTrainFlat, yTrain, batchSize, layerSize, dropout,
                                   regLambda, size, 'Group', 'SEQ')
        seqErrors.append(testError(model, xTestFlat))

        m2Errors.append(testError(output[1], xTestImg))

        model = supervisedSeqImage(xTrainImg, yTrain, batchSize, layerSize, dropout,
                                   regLambda, size, 'Group', 'CNN')
        cnnErrors.append(testError(model, xTestImg))

        # Drop the references to this budget's models before training the next ones.
        del output, model

    seqErrors = np.array(seqErrors)
    m1Errors = np.array(m1Errors)
    cnnErrors = np.array(cnnErrors)
    m2Errors = np.array(m2Errors)

    xIdx = list(range(len(labelSize)))
    # Start a fresh figure so curves from an earlier plot in the same cell are not
    # carried into this image.
    plt.figure()
    plt.plot(xIdx, seqErrors, label = "Model1 - Supervised")
    plt.plot(xIdx, m1Errors, label = "Model1 - Co-training")
    plt.plot(xIdx, cnnErrors, label = "Model2 - Supervised")
    plt.plot(xIdx, m2Errors, label = "Model2 - Co-training")
    plt.xticks(xIdx, labelSize)
    plt.xlabel("Number of Labeled Samples")
    plt.ylabel("Test Error")
    plt.title('Performance of Co-training on MNIST Data')
    plt.legend()
    # File name kept unchanged so existing outputs keep the same path.
    plt.savefig(pltName + '_Self_train_MNIST.png')
    return([seqErrors, m1Errors, cnnErrors, m2Errors])

In [None]:
def plotWrongLabelModels(pThresh, selectionType, labelSize, induceError, errorDenom, pltName):
    """Plot self-training test error for several rates of deliberately wrong labels.

    ``selfTrainMNIST`` is always run once without induced errors. When
    ``induceError`` is true it is run again for every entry of ``errorDenom``. All
    self-training curves plus the supervised curve of the last run are drawn on a
    new figure and saved to ``pltName + '_Self_train_MNIST.png'``.

    Args:
        pThresh, selectionType: Forwarded to ``selfTrainMNIST``.
        labelSize: Sequence of label budgets; also used as the x-axis tick labels.
        induceError: Whether to run the additional wrong-label experiments.
        errorDenom: Denominator(s) of the wrong-label rate; a run with denominator
            ``d`` is labelled ``int(100 / d)`` percent. May be a sequence or a single
            number.
        pltName: Prefix of the PNG file that is written.

    Returns:
        ``[wrongLabelErrors, supervisedErr]``. ``wrongLabelErrors[k][0]`` is the list
        of self-training errors of run ``k`` (run 0 has no induced errors) and
        ``supervisedErr`` is the supervised error curve of the last run.
    """
    layerSize = 100
    # Accept a single denominator as well as a sequence of them.
    denoms = [errorDenom] if np.isscalar(errorDenom) else list(errorDenom)

    errors = selfTrainMNIST(pThresh, selectionType, labelSize, False, 0, layerSize, 0.25, 0)
    # Each run is stored as a one-element list to keep the established return layout.
    wrongLabelErrors = [[np.asarray(errors[0]).tolist()]]
    if induceError:
        for denom in denoms:
            errors = selfTrainMNIST(pThresh, selectionType, labelSize, induceError, denom,
                                    layerSize, 0.25, 0)
            wrongLabelErrors.append([np.asarray(errors[0]).tolist()])
    print(wrongLabelErrors)
    print(errors[1])

    xIdx = list(range(len(labelSize)))
    # Start a fresh figure so curves from an earlier plot in the same cell are not
    # carried into this image.
    plt.figure()
    plt.plot(xIdx, wrongLabelErrors[0][0], label = "0% wrong labels")
    if induceError:
        for runErrors, denom in zip(wrongLabelErrors[1:], denoms):
            label = str(int(100/denom))
            plt.plot(xIdx, runErrors[0], label = label + "% wrong labels")
    plt.plot(xIdx, errors[1], label = "Supervised Learning")
    plt.xticks(xIdx, labelSize)
    plt.xlabel("Number of Labeled Samples")
    plt.ylabel("Test Error")
    plt.title('Performance of Self-training on MNIST Data')
    plt.legend()
    plt.savefig(pltName + '_Self_train_MNIST.png')
    return([wrongLabelErrors, errors[1]])


In [None]:
# Settings shared by every MNIST experiment in this cell.
selectionType = "Group"
labelSize = [100, 500, 1000, 2000, 5000, 10000, 25000]
# Self-training / co-training iteration cap (same value selfTrainMNIST uses); defined
# before the experiments so that no later cell has to be run first.
maxIter = 5

# Experiment 1: dropout vs. L2 regularisation.
pThresh = 0.9
induceError = False
errorDenom = 10
pltName = 'RegvsDropout'
plotBaseModels(pThresh, selectionType, labelSize, induceError, errorDenom, pltName)

# Experiment 2: dense ('SEQ') vs. CNN models. The function also requires dopoutPct and
# regLambda; they are set to the dropout configuration used in experiment 1.
pThresh = 0.9
induceError = False
errorDenom = 10
pltName = 'SelfTrain_SeqVSCNN'
plotBaseModelsSeqVSCNN(pThresh, selectionType, labelSize, induceError, errorDenom, pltName,
                       dopoutPct = 0.25, regLambda = 0)

# Experiment 3: effect of deliberately wrong labels; one run per denominator.
pThresh = 0.9
induceError = True
errorDenom = [20, 10, 4, 2]
pltName = 'Wrong_Labels_CNN'
wrongLabelErrors = plotWrongLabelModels(pThresh, selectionType, labelSize, induceError,
                                        errorDenom, pltName)

# Experiment 4: co-training, with a stricter probability threshold.
pThresh = 0.95
induceError = False
errorDenom = 10
pltName = 'CoTrain'
output = plotCoTrainModels(pThresh, selectionType, labelSize, induceError, errorDenom, pltName)

In [None]:
# ---- IMDB: self-training vs. supervised learning on text ----

# Data settings: only the max_features most common words are kept and every review
# is cut/padded to maxlen words.
max_features = 20000
maxlen = 80
batch_size = 32

# Training settings for the experiment loop below.
batchSize = 32
maxIter = 5
pThresh = 0.9
selectionType = "Group"
dataType = 'text'
maxFeatures = 20000
dropout = 0.25
regLambda = 0
labelSize = [500, 1000, 5000, 10000, 20000]

print('Loading data...')
(xTrain, yTrain), (xTest, yTest) = imdb.load_data(num_words = max_features)
print(len(xTrain), 'train sequences')
print(len(xTest), 'test sequences')

print('Pad sequences (samples x time)')
xTrain = sequence.pad_sequences(xTrain, maxlen = maxlen)
xTest = sequence.pad_sequences(xTest, maxlen = maxlen)
print('xTrain shape:', xTrain.shape)
print('xTest shape:', xTest.shape)

# One self-trained and one supervised model per label budget; the test error is
# 1 - (confusion-matrix diagonal / total number of test samples).
selfTrainErr = []
supervisedErr = []
for size in labelSize:
    print('selfTrainErr : ', selfTrainErr)
    output = selfTrainTextModel(
        xTrain, yTrain, batchSize, maxIter, pThresh, size, selectionType, dropout, regLambda)
    wrongLabels = output[1]
    yHat = output[0].predict_classes(xTest, batch_size = batchSize)
    cm = confusion_matrix(yTest, yHat)
    selfTrainErr = np.append(selfTrainErr, 1 - np.trace(cm) / cm.sum())
    del output
    print(wrongLabels)

    model = supervisedSeqText(
        xTrain, yTrain, batchSize, dropout, regLambda, size, selectionType)
    yHat = model.predict_classes(xTest, batch_size = batchSize)
    cm = confusion_matrix(yTest, yHat)
    supervisedErr = np.append(supervisedErr, 1 - np.trace(cm) / cm.sum())
    del model
    print('Size : ', size)


# Budgets are drawn at evenly spaced x positions and labelled with their value.
xIdx = list(range(len(labelSize)))
plt.plot(xIdx, selfTrainErr, color = 'red', label = "Self Training")
plt.plot(xIdx, supervisedErr, color = "blue", label = "Supervised Learning")
plt.xticks(xIdx, labelSize)
plt.xlabel("Number of Labeled Samples")
plt.ylabel("Test Error")
plt.title('Self-training on IMDB Data')
plt.legend()
plt.savefig('Self-train-IMDB.png')
