In [None]:
import numpy as np
from classifiers import seqDenseNN, seqDenseLSTM, seqCNN
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

In [None]:
def vectorize(sequences, dimension = 10000):
    """Multi-hot encode a collection of index sequences.

    Args:
        sequences: Sized iterable; each element holds the column indices to
            switch on for the corresponding output row (anything NumPy
            accepts as an index along axis 1).
        dimension: Number of columns in the output matrix.

    Returns:
        Array of shape (len(sequences), dimension) created with np.zeros,
        holding 1 at every (row, index) position named by the input and 0
        everywhere else. Repeated indices within a sequence have no extra
        effect.
    """
    encoded = np.zeros((len(sequences), dimension))
    row = 0
    for activeColumns in sequences:
        # Fancy-index assignment turns on all listed columns of this row at once.
        encoded[row, activeColumns] = 1
        row += 1
    return encoded

def splitData(xTrain, yTrain, labeledSize, selectionType):
    """Randomly partition a training set into labeled and unlabeled subsets.

    Args:
        xTrain: NumPy array of samples, indexed along axis 0.
        yTrain: NumPy array of labels aligned with xTrain along axis 0.
        labeledSize: Number of samples requested for the labeled subset.
        selectionType: "Group" draws int(labeledSize / numberOfLabels)
            samples from every distinct label, so the labeled subset can be
            smaller than labeledSize when the division is not exact. Any
            other value draws labeledSize samples uniformly from the whole
            set.

    Returns:
        List [labeledXTrain, labeledYTrain, unlabeledXTrain, unlabeledYTrain].
        Labeled rows are in random order; unlabeled rows keep their original
        relative order.

    Raises:
        ValueError: Propagated from np.random.choice when more samples are
            requested than are available (sampling is without replacement).
    """
    if (selectionType == "Group"):
        labels = np.unique(yTrain)
        size = int(labeledSize/len(labels))
        # Draw the same number of row indices from every class.
        perClassIdx = [np.random.choice(np.where(yTrain == label)[0], size, replace = False)
                       for label in labels]
        idx = np.concatenate(perClassIdx)
        # One shuffle after all classes are gathered mixes the classes together.
        np.random.shuffle(idx)
    else:
        idx = np.random.choice(yTrain.shape[0], labeledSize, replace = False)
    idx = idx.astype(int)
    labeledXTrain = xTrain[idx]
    labeledYTrain = yTrain[idx]

    # A boolean mask selects the complement in one pass instead of testing
    # every row number for membership in idx.
    isLabeled = np.zeros(xTrain.shape[0], dtype = bool)
    isLabeled[idx] = True
    unlabeledXTrain = xTrain[~isLabeled]
    unlabeledYTrain = yTrain[~isLabeled]
    return([labeledXTrain, labeledYTrain, unlabeledXTrain, unlabeledYTrain])

In [None]:
def selfTrainImageModel(xTrain, yTrain, batchSize, maxIter, pThresh, labeledSize, selectionType, 
                        induceError, errorDenom, layerSize, dropout, regLambda, model):
    """Self-train a 10-class image classifier by iterative pseudo-labeling.

    The data is split with splitData into a labeled and an unlabeled part.
    Each round fits the network on the labeled part (5 epochs,
    validation_split 0.1), predicts class probabilities for the unlabeled
    part, and moves every sample that has a class probability above pThresh
    into the labeled part using the predicted class as its label.

    Args:
        xTrain: Training samples, in whatever shape the selected network
            expects.
        yTrain: Integer class labels (one-hot encoded here into 10 classes).
        batchSize: Batch size used for fit and predict.
        maxIter: Round limit. The loop condition is count <= maxIter, so up
            to maxIter + 1 rounds are run.
        pThresh: Probability a class must exceed for a sample to be
            pseudo-labeled.
        labeledSize: Passed to splitData (size of the initial labeled set).
        selectionType: Passed to splitData ("Group" or anything else).
        induceError: When truthy, the first int(k / errorDenom) of the k
            pseudo-labels added in each round are shuffled among themselves
            before being stored, deliberately mislabeling samples.
        errorDenom: Divisor that sets how many pseudo-labels are shuffled;
            only read when induceError is truthy.
        layerSize, dropout, regLambda: Passed to the network builders.
        model: 'CNN' builds the network with seqCNN; any other value builds
            it with seqDenseNN.

    Returns:
        List [fittedModel, wrongLabePct]. wrongLabePct holds, per round, the
        fraction of newly added samples whose predicted class differs from
        the true label (measured before any induced shuffling); 0 for a
        round that added nothing.
    """
    # 1. Randomly classify data as labeled and unlabeled according to label size
    labeledXTrain, labeledYTrain, unlabeledXTrain, unlabeledYTrain = \
        splitData(xTrain, yTrain, labeledSize, selectionType)

    # Build only the requested network instead of building a dense one and
    # throwing it away when a CNN is asked for.
    if model == 'CNN':
        seqDenseModel = seqCNN(xTrain, layerSize, dropout)
    else:
        seqDenseModel = seqDenseNN(labeledXTrain, layerSize, dropout, regLambda)
    labeledYTrain = to_categorical(labeledYTrain, 10)
    unlabeledYTrain = to_categorical(unlabeledYTrain, 10)

    count = 0
    wrongLabePct = []
    # Check if there are any more unlabeled datapoints left or if the number of iterations are done
    while((count <= maxIter) and (len(unlabeledYTrain) != 0)):
        print(count)
        print(len(unlabeledYTrain))
        # 2. Fit on the labeled data and get predicted labels and probabilities
        seqDenseModel.fit(labeledXTrain, labeledYTrain, batch_size = batchSize, epochs = 5,  
                     validation_split = 0.1, verbose = 0)
        predProbs = seqDenseModel.predict(unlabeledXTrain, batch_size = batchSize)
        # The predicted class is the arg-max of the probabilities already
        # computed; no second forward pass through predict_classes is needed.
        unlabeledYHat = np.argmax(predProbs, axis = -1)

        # 3. Choose datapoints with high label probabilities and add them to labeled set
        # np.unique keeps each row once even if several classes exceed a low pThresh.
        highProbIdx = np.unique(np.where(predProbs > pThresh)[0])
        predictedLabels = unlabeledYHat[highProbIdx]
        labeledXTrain = np.vstack((labeledXTrain, unlabeledXTrain[highProbIdx]))

        # Shuffle a copy so the induced noise only affects the stored labels,
        # not the row indices or the error measurement below.
        pseudoLabels = predictedLabels.copy()
        if induceError:
            np.random.shuffle(pseudoLabels[0:int(len(highProbIdx)/errorDenom)])
        labeledYTrain = np.vstack((labeledYTrain, to_categorical(pseudoLabels, 10)))

        # Check how much of the unlabeled data that was added to labeled data was wrongly classified
        print('Prob Idx Size ', highProbIdx.shape)
        errorPct = 0
        if highProbIdx.size > 0:
            trueLabels = np.argmax(unlabeledYTrain[highProbIdx, :], axis = -1)
            cMat = confusion_matrix(trueLabels, predictedLabels)
            errorPct = 1 - sum(np.diag(cMat))/sum(sum(cMat))
        wrongLabePct = np.append(wrongLabePct, errorPct)

        # Remove the newly labeled samples from the unlabeled data
        stillUnlabeled = np.ones(unlabeledXTrain.shape[0], dtype = bool)
        stillUnlabeled[highProbIdx] = False
        unlabeledXTrain = unlabeledXTrain[stillUnlabeled]
        unlabeledYTrain = unlabeledYTrain[stillUnlabeled]
        count += 1
    return([seqDenseModel, wrongLabePct])

In [None]:
def coTrainImageModel(xTrain, yTrain, batchSize, maxIter, pThresh, labeledSize, selectionType, 
                        induceError, errorDenom, layerSize, dropout, regLambda):
    """Co-train a dense network and a CNN on 10-class image data.

    The data is split with splitData into a labeled and an unlabeled part.
    Each round fits both networks on the labeled part (5 epochs,
    validation_split 0.1), predicts class probabilities for the unlabeled
    part with each network, and moves every sample that either network
    classifies with probability above pThresh into the labeled part. When
    both networks are confident about a sample, model1's label is used.

    Samples are reshaped to (n, 784) for model1 and to (n, 28, 28, 1) for
    model2, so every sample must hold exactly 784 values.

    Args:
        xTrain: Training samples (784 values per sample).
        yTrain: Integer class labels (one-hot encoded here into 10 classes).
        batchSize: Batch size used for fit and predict.
        maxIter: Round limit. The loop condition is count <= maxIter, so up
            to maxIter + 1 rounds are run.
        pThresh: Probability a class must exceed for a sample to be
            pseudo-labeled.
        labeledSize: Passed to splitData (size of the initial labeled set).
        selectionType: Passed to splitData ("Group" or anything else).
        induceError: When truthy, the first int(k / errorDenom) of the k
            pseudo-labels added in each round are shuffled among themselves
            before being stored, deliberately mislabeling samples.
        errorDenom: Divisor that sets how many pseudo-labels are shuffled;
            only read when induceError is truthy.
        layerSize, dropout, regLambda: Passed to the network builders.

    Returns:
        List [model1, model2, wrongLabePct] where model1 comes from
        seqDenseNN and model2 from seqCNN. wrongLabePct holds, per round,
        the fraction of newly added samples whose pseudo-label differs from
        the true label (measured before any induced shuffling); 0 for a
        round that added nothing.
    """
    # 1. Randomly classify data as labeled and unlabeled according to label size
    labeledXTrain, labeledYTrain, unlabeledXTrain, unlabeledYTrain = \
        splitData(xTrain, yTrain, labeledSize, selectionType)

    model1 = seqDenseNN(labeledXTrain, layerSize, dropout, regLambda)
    model2 = seqCNN(xTrain, layerSize, dropout)

    labeledYTrain = to_categorical(labeledYTrain, 10)
    unlabeledYTrain = to_categorical(unlabeledYTrain, 10)

    count = 0
    wrongLabePct = []
    # Check if there are any more unlabeled datapoints left or if the number of iterations are done
    while((count <= maxIter) and (len(unlabeledYTrain) != 0)):
        print(count)
        print(len(unlabeledYTrain))
        # 2a. Fit the dense network on flattened images and get class probabilities
        labeledXTrain = labeledXTrain.reshape(labeledXTrain.shape[0], 784)
        unlabeledXTrain = unlabeledXTrain.reshape(unlabeledXTrain.shape[0], 784)
        model1.fit(labeledXTrain, labeledYTrain, batch_size = batchSize, epochs = 5,  
                     validation_split = 0.1, verbose = 0)
        predProbs1 = model1.predict(unlabeledXTrain, batch_size = batchSize)

        # 2b. Fit the CNN on 28x28x1 images and get class probabilities
        labeledXTrain = labeledXTrain.reshape(labeledXTrain.shape[0], 28, 28, 1)
        unlabeledXTrain = unlabeledXTrain.reshape(unlabeledXTrain.shape[0], 28, 28, 1)
        model2.fit(labeledXTrain, labeledYTrain, batch_size = batchSize, epochs = 5,  
                     validation_split = 0.1, verbose = 0)
        predProbs2 = model2.predict(unlabeledXTrain, batch_size = batchSize)

        # 3. Choose datapoints with high label probabilities and add them to labeled set
        # A row is confident when its best class clears the threshold; model2
        # only contributes rows that model1 was not already confident about.
        confident1 = np.max(predProbs1, axis = 1) > pThresh
        confident2 = (np.max(predProbs2, axis = 1) > pThresh) & ~confident1
        rows1 = np.where(confident1)[0]
        rows2 = np.where(confident2)[0]
        highProbIdx = np.concatenate((rows1, rows2))
        # Each selected row gets exactly one label: the arg-max class of the
        # network that selected it.
        unlabeledYHat = np.concatenate((np.argmax(predProbs1, axis = 1)[rows1],
                                        np.argmax(predProbs2, axis = 1)[rows2]))

        labeledXTrain = labeledXTrain.reshape(labeledXTrain.shape[0], 784)
        unlabeledXTrain = unlabeledXTrain.reshape(unlabeledXTrain.shape[0], 784)
        labeledXTrain = np.vstack((labeledXTrain, unlabeledXTrain[highProbIdx, :]))

        # Check how much of the unlabeled data that was added to labeled data was wrongly classified
        # (done before the optional shuffle so it reflects the networks' own mistakes)
        errorPct = 0
        if highProbIdx.size > 0:
            trueLabels = np.argmax(unlabeledYTrain[highProbIdx, :], axis = -1)
            cMat = confusion_matrix(trueLabels, unlabeledYHat)
            errorPct = 1 - sum(np.diag(cMat))/sum(sum(cMat))
        if induceError:
            # In-place shuffle of a leading slice permutes those pseudo-labels.
            np.random.shuffle(unlabeledYHat[0:int(len(highProbIdx)/errorDenom)])
        labeledYTrain = np.vstack((labeledYTrain, to_categorical(unlabeledYHat, 10)))
        print('Prob Idx Size ', highProbIdx.shape)
        wrongLabePct = np.append(wrongLabePct, errorPct)

        # Remove the newly labeled samples from the unlabeled data
        stillUnlabeled = np.ones(unlabeledXTrain.shape[0], dtype = bool)
        stillUnlabeled[highProbIdx] = False
        unlabeledXTrain = unlabeledXTrain[stillUnlabeled]
        unlabeledYTrain = unlabeledYTrain[stillUnlabeled]
        count += 1
    return([model1, model2, wrongLabePct])

In [None]:
def supervisedSeqImage(xTrain, yTrain, batchSize, layerSize, dropout, regLambda, labeledSize,
                       selectionType, model):
    """Fit a purely supervised 10-class image model on a labeled subset.

    Only the labeled part returned by splitData is used; the unlabeled part
    is discarded.

    Args:
        xTrain: Training samples, in whatever shape the selected network
            expects.
        yTrain: Integer class labels (one-hot encoded here into 10 classes).
        batchSize: Batch size used for fit.
        layerSize, dropout, regLambda: Passed to the network builders
            (regLambda is only used by seqDenseNN).
        labeledSize: Passed to splitData (size of the labeled subset).
        selectionType: Passed to splitData ("Group" or anything else).
        model: 'CNN' builds the network with seqCNN; any other value builds
            it with seqDenseNN.

    Returns:
        The network after fitting for 5 epochs with validation_split 0.1.
    """
    labeledX, labeledY = splitData(xTrain, yTrain, labeledSize, selectionType)[:2]

    # Build only the requested network instead of building a dense one and
    # throwing it away when a CNN is asked for.
    if model == 'CNN':
        seqDenseModel = seqCNN(labeledX, layerSize, dropout)
    else:
        seqDenseModel = seqDenseNN(labeledX, layerSize, dropout, regLambda)

    oneHotY = to_categorical(labeledY, 10)
    seqDenseModel.fit(labeledX, oneHotY, batch_size = batchSize, epochs = 5, 
                      validation_split = 0.1, verbose = 0)
    return(seqDenseModel)

In [None]:
def selfTrainTextModel(xTrain, yTrain, batchSize, maxIter, pThresh, labeledSize, selectionType,
                       dropout, regLambda, maxFeatures = 20000):
    """Self-train a text classifier by iterative pseudo-labeling.

    The data is split with splitData into a labeled and an unlabeled part.
    Each round builds a fresh seqDenseLSTM model, fits it on the labeled
    part (5 epochs, validation_split 0.1), predicts probabilities for the
    unlabeled part, and moves every sample whose probability is above
    pThresh or below 1 - pThresh into the labeled part using the predicted
    class as its label.

    Args:
        xTrain: Training samples (2-D; rows are stacked with np.vstack).
        yTrain: 1-D class labels (extended with np.hstack).
        batchSize: Batch size used for fit and predict.
        maxIter: Round limit. The loop condition is count <= maxIter, so up
            to maxIter + 1 rounds are run.
        pThresh: Confidence threshold described above.
        labeledSize: Passed to splitData (size of the initial labeled set).
        selectionType: Passed to splitData ("Group" or anything else).
        dropout, regLambda: Passed to seqDenseLSTM.
        maxFeatures: First argument passed to seqDenseLSTM; defaults to the
            previously hard-coded 20000.

    Returns:
        List [model, wrongLabePct]. model is the network fitted in the last
        round, or None when no round ran (nothing was unlabeled or maxIter
        is negative). wrongLabePct holds, per round, the fraction of newly
        added samples whose predicted class differs from the true label; 0
        for a round that added nothing.
    """
    # 1. Randomly classify data as labeled and unlabeled according to label size
    labeledXTrain, labeledYTrain, unlabeledXTrain, unlabeledYTrain = \
        splitData(xTrain, yTrain, labeledSize, selectionType)

    count = 0
    wrongLabePct = []
    # Defined up front so the return statement works even if no round runs.
    seqDenseModel = None

    # Check if there are any more unlabeled datapoints left or if the number of iterations are done
    while((count <= maxIter) and (len(unlabeledYTrain) != 0)):
        print(count)
        print(len(unlabeledYTrain))
        # 2. Fit a fresh model on the labeled data and get predicted labels and probabilities
        seqDenseModel = seqDenseLSTM(maxFeatures, dropout, regLambda)
        seqDenseModel.fit(labeledXTrain, labeledYTrain, batch_size = batchSize, epochs = 5,
                          validation_split = 0.1, verbose = 0)
        predProbs = seqDenseModel.predict(unlabeledXTrain, batch_size = batchSize)
        # Derive classes from the probabilities already computed, using the
        # same rule Keras' predict_classes applied: arg-max for several
        # outputs, a 0.5 cut-off for a single output.
        if predProbs.shape[-1] > 1:
            unlabeledYHat = np.argmax(predProbs, axis = -1)
        else:
            unlabeledYHat = (predProbs > 0.5).astype('int32')

        # 3. Choose datapoints with high label probabilities and add them to labeled set
        highProbIdx = np.where((predProbs > pThresh) | (predProbs < 1- pThresh))[0]
        # reshape(-1) always yields a 1-D array; np.squeeze would collapse a
        # single selected prediction to a 0-d scalar array.
        predictedLabels = np.asarray(unlabeledYHat[highProbIdx]).reshape(-1)
        labeledXTrain = np.vstack((labeledXTrain, unlabeledXTrain[highProbIdx, :]))
        labeledYTrain = np.hstack((labeledYTrain, predictedLabels))

        # Check how much of the unlabeled data that was added to labeled data was wrongly classified
        errorPct = 0
        if highProbIdx.size > 0:
            cMat = confusion_matrix(unlabeledYTrain[highProbIdx], predictedLabels)
            errorPct = 1 - sum(np.diag(cMat))/sum(sum(cMat))
        wrongLabePct = np.append(wrongLabePct, errorPct)

        # Remove the newly labeled samples from the unlabeled data
        stillUnlabeled = np.ones(unlabeledXTrain.shape[0], dtype = bool)
        stillUnlabeled[highProbIdx] = False
        unlabeledXTrain = unlabeledXTrain[stillUnlabeled]
        unlabeledYTrain = unlabeledYTrain[stillUnlabeled]
        count += 1
    return([seqDenseModel, wrongLabePct])

In [None]:
def supervisedSeqText(xTrain, yTrain, batchSize, dropout, regLambda, labeledSize,
                       selectionType, maxFeatures = 20000):
    """Fit a purely supervised text model on a labeled subset.

    Only the labeled part returned by splitData is used; the unlabeled part
    is discarded.

    Args:
        xTrain: Training samples.
        yTrain: Class labels aligned with xTrain.
        batchSize: Batch size used for fit.
        dropout, regLambda: Passed to seqDenseLSTM.
        labeledSize: Passed to splitData (size of the labeled subset).
        selectionType: Passed to splitData ("Group" or anything else).
        maxFeatures: First argument passed to seqDenseLSTM; defaults to the
            previously hard-coded 20000.

    Returns:
        The seqDenseLSTM model after fitting for 5 epochs with
        validation_split 0.1.
    """
    labeledX, labeledY = splitData(xTrain, yTrain, labeledSize, selectionType)[:2]

    seqDenseModel = seqDenseLSTM(maxFeatures, dropout, regLambda)
    seqDenseModel.fit(labeledX, labeledY, batch_size = batchSize, epochs = 5, 
                      validation_split = 0.1, verbose = 0)
    return(seqDenseModel)