In [1]:
from __future__ import print_function
import keras
import pandas as pd
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from time import time
from sklearn.metrics import roc_auc_score as rocauc
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import train_test_split
from __future__ import print_function  
import seaborn as sns
# Plot styling, plus NumPy printing with 10 decimals and no scientific notation.
sns.set_style("whitegrid")
np.set_printoptions(precision=10, suppress=True)
# Shape of a single input sample; both model classes pass it to their first layer.
input_shape = (28, 28, 1)

Using TensorFlow backend.


In [2]:
def D1_problem_to_C1_problem(arr, fractions):
    """Rescale scores of the D0-vs-D1 problem into scores for the C0-vs-C1 problem.

    With ``alpha`` the share of C0 samples inside D0 and ``beta`` the share of
    C0 samples inside D1, the scores are mapped affinely so that
    ``beta / (alpha + beta)`` goes to 0 and ``(1 - beta) / (2 - alpha - beta)``
    goes to 1, and the result is clipped to ``[0, 1]``.
    NOTE(review): these two anchors look like the expected D1-score of a pure
    C0 sample and of a pure C1 sample when D0 and D1 have equal size — confirm.

    Parameters
    ----------
    arr : array-like
        Predicted D1 scores (any shape). Lists are accepted as well as arrays.
    fractions : sequence of 4 numbers
        Sample counts in the order
        (C0 in D0, C0 in D1, C1 in D0, C1 in D1).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``arr`` with values in ``[0, 1]``.

    Raises
    ------
    ValueError
        If D0 or D1 is empty, or if ``alpha == beta``: in that case the D
        labels carry no information about C and the mapping is undefined
        (previously this surfaced as an opaque ZeroDivisionError).
    """
    # float() keeps the ratios below true divisions even for integer counts.
    number_of_C0_in_D0, number_of_C0_in_D1, number_of_C1_in_D0, number_of_C1_in_D1 = (
        float(count) for count in fractions)

    size_of_D0 = number_of_C0_in_D0 + number_of_C1_in_D0
    size_of_D1 = number_of_C0_in_D1 + number_of_C1_in_D1
    if size_of_D0 == 0 or size_of_D1 == 0:
        raise ValueError("D0 and D1 must each contain at least one sample, "
                         "got fractions=%r" % (list(fractions),))

    alpha = number_of_C0_in_D0 / size_of_D0
    beta = number_of_C0_in_D1 / size_of_D1
    # alpha == beta also covers alpha + beta == 0 and alpha + beta == 2,
    # i.e. every case in which one of the divisions below would fail.
    if alpha == beta:
        raise ValueError("alpha == beta (%r): D0 and D1 have the same class mix, "
                         "so D1 scores cannot be converted to C1 scores" % (alpha,))

    offset = beta / (alpha + beta)
    slope = (1 - beta) / (2 - alpha - beta) - offset
    if slope == 0:
        raise ValueError("alpha (%r) and beta (%r) are too close to invert the mapping"
                         % (alpha, beta))

    arr = (np.asarray(arr) - offset) * (slope ** -1)
    # An under-trained classifier from the previous stage may produce values
    # that land outside [0, 1] after rescaling, so clamp them.
    arr = np.clip(arr, 0, 1)
    return arr

In [3]:
def _select_sample_indices(y_C1, fractions):
    """Pick the samples that enter the dataset and assign each one to D0 or D1.

    Within each class the samples are consumed in their original order: the
    first quota of a class goes to D0, the following quota goes to D1.

    Returns ``(indices, y_D1)``, both ordered by original sample position.
    Raises ValueError for negative quotas or when a class has too few samples.
    """
    n_C0_D0, n_C0_D1, n_C1_D0, n_C1_D1 = (int(count) for count in fractions)
    if min(n_C0_D0, n_C0_D1, n_C1_D0, n_C1_D1) < 0:
        raise ValueError("sample counts must be non-negative, got %r" % (list(fractions),))

    chosen, groups = [], []
    for class_label, n_D0, n_D1 in ((0, n_C0_D0, n_C0_D1), (1, n_C1_D0, n_C1_D1)):
        candidates = np.flatnonzero(y_C1 == class_label)
        needed = n_D0 + n_D1
        if len(candidates) < needed:
            raise ValueError("requested %d samples of class C%d but only %d are available"
                             % (needed, class_label, len(candidates)))
        chosen.append(candidates[:needed])
        groups.append(np.repeat([0, 1], [n_D0, n_D1]))

    indices = np.concatenate(chosen)
    y_D1 = np.concatenate(groups)
    # Restore the original sample order across the two classes.
    order = np.argsort(indices)
    return indices[order], y_D1[order]


def generate_dataset(C0_values, fractions, test_size=0.4, random_state=None):
    """Build an MNIST dataset carrying both a class label (C) and a group label (D).

    Train and test parts of MNIST are concatenated. A digit belongs to class
    C0 when it is listed in ``C0_values`` and to C1 otherwise. Samples are then
    drawn, in dataset order, until the requested counts per (class, group)
    combination are met.

    Parameters
    ----------
    C0_values : container of int
        Digits that form class C0.
    fractions : sequence of 4 ints
        Sample counts in the order (C0 in D0, C0 in D1, C1 in D0, C1 in D1).
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to the previous fixed 0.4.
    random_state : optional
        Forwarded to ``train_test_split``; pass a seed for a reproducible split.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    list
        ``[X_train, X_test, Y_C1_train, Y_C1_test, Y_D1_train, Y_D1_test]``;
        images have shape (n, 28, 28, 1) and labels have shape (n, 1).

    Raises
    ------
    ValueError
        If a count is negative or exceeds the number of available samples of
        that class (previously an IndexError from running past the data).
    """
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x = np.append(x_train, x_test, axis=0)
    y = np.append(y_train, y_test)
    x = x.reshape(x.shape[0], 28, 28, 1)
    y_C1 = np.array([int(elem not in C0_values) for elem in y])

    indices, Y_D1 = _select_sample_indices(y_C1, fractions)
    X = x[indices]
    Y_C1 = y_C1[indices]

    return train_test_split(X, np.expand_dims(Y_C1, axis=1), np.expand_dims(Y_D1, axis=1),
                            test_size=test_size, random_state=random_state)

In [4]:
class LogReg(Sequential):
    """Logistic-regression model: flattened input feeding one sigmoid unit.

    It is trained on the D1 (group) labels; ``rocauc`` converts its scores to
    the C1 problem and evaluates them against the C1 labels.
    """

    def __init__(self):
        Sequential.__init__(self)
        self.add(Flatten(input_shape=input_shape))
        self.add(Dense(1, activation='sigmoid'))
        self.compile(loss=keras.losses.binary_crossentropy,
                     optimizer=keras.optimizers.Adam())

    def fit(self, dataset):
        """Train on the D1 labels and keep the weights with the lowest val_loss.

        Parameters
        ----------
        dataset : sequence
            ``(X_train, X_test, Y_C1_train, Y_C1_test, Y_D1_train, Y_D1_test)``.

        Returns
        -------
        The Keras History object produced by training.
        """
        X_train, X_test, _, _, Y_D1_train, Y_D1_test = dataset

        # Cnn.fit reads the auxiliary classifier's weights from this same path.
        checkpoint_path = '/tmp/logreg.hdf5'
        saver_logreg = keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', verbose=0, save_best_only=True)
        # NOTE(review): newer Keras releases rename `epsilon` to `min_delta` — confirm
        # against the installed version before changing it.
        lr_decreaser_logreg = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.1, epsilon=0.001, verbose=0)

        history = Sequential.fit(self, X_train, Y_D1_train, validation_data=(X_test, Y_D1_test),
                                 verbose=0, batch_size=5000, epochs=200,
                                 callbacks=[saver_logreg, lr_decreaser_logreg])

        # Restore the best checkpoint so rocauc() scores the same weights that
        # are later loaded from this file to produce auxiliary labels.
        self.load_weights(checkpoint_path)
        return history

    def rocauc(self, dataset, fractions):
        """Return the ROC AUC on the test split, measured against the C1 labels.

        The predicted D1 scores are first mapped to the C1 problem with
        ``D1_problem_to_C1_problem`` using ``fractions``.
        """
        _, X_test, _, Y_C1_test, _, _ = dataset

        c1_scores = D1_problem_to_C1_problem(self.predict(X_test), fractions)
        return rocauc(Y_C1_test, c1_scores.ravel())

In [5]:
class Cnn(Sequential):
    """Convolutional binary classifier: two conv/conv/pool stages, then dense layers.

    Ends in a single sigmoid unit and is compiled with binary cross-entropy
    and the Adam optimizer.
    """

    def __init__(self):
        Sequential.__init__(self)
        stack = [
            Conv2D(8, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
            Conv2D(8, (3, 3), activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),
            Conv2D(16, (3, 3), activation='relu'),
            Conv2D(16, (3, 3), activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),
            Flatten(),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid'),
        ]
        for layer in stack:
            self.add(layer)
        self.compile(optimizer=keras.optimizers.Adam(),
                     loss=keras.losses.binary_crossentropy)

    def fit(self, dataset, epochs, auxiliary_classifier=None):
        """Train for ``epochs`` epochs, then reload the checkpoint with the best val_loss.

        Without ``auxiliary_classifier`` the training targets are the D1 labels
        from ``dataset``. With one, its weights are loaded from
        '/tmp/logreg.hdf5' and its predictions on the training images are used
        as targets instead. Validation always uses the D1 test labels.
        """
        X_train, X_test, _, _, given_targets, Y_D1_test = dataset

        if auxiliary_classifier is not None:
            auxiliary_classifier.load_weights('/tmp/logreg.hdf5')
            train_targets = auxiliary_classifier.predict(X_train)
            name = 'cnn_aux'
        else:
            train_targets = given_targets
            name = 'cnn_pure'

        checkpoint_path = '/tmp/' + name
        training_callbacks = [
            keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss',
                                            verbose=0, save_best_only=True),
            keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5,
                                              epsilon=0.001, verbose=0),
        ]

        Sequential.fit(self, X_train, train_targets,
                       validation_data=(X_test, Y_D1_test),
                       verbose=1, epochs=epochs,
                       callbacks=training_callbacks)
        self.load_weights(checkpoint_path)

    def rocauc(self, dataset, fractions):
        """Return the test-split ROC AUC against the C1 labels.

        Predicted D1 scores are converted with ``D1_problem_to_C1_problem``
        before scoring.
        """
        _, X_test, _, Y_C1_test, _, _ = dataset

        d1_scores = self.predict(X_test)
        c1_scores = D1_problem_to_C1_problem(d1_scores, fractions)
        return rocauc(Y_C1_test, c1_scores.flat)

In [6]:
def experiment(C0_values, fractions):
    """Run one full comparison and return its log as a dict.

    A dataset is generated for ``C0_values`` / ``fractions``. Then three models
    are trained in turn: the logistic regression, a CNN trained for 1 epoch on
    the logistic regression's predictions, and a CNN trained for 2 epochs on
    the D1 labels. For each model the wall-clock training time and the ROC AUC
    returned by its ``rocauc`` method are recorded, next to the configuration
    and the derived ``alpha`` / ``beta`` ratios.
    """
    n_C0_D0, n_C0_D1, n_C1_D0, n_C1_D1 = fractions

    experiment_logs = {
        'C0 values': C0_values,
        'number of C0 in D0': n_C0_D0,
        'number of C0 in D1': n_C0_D1,
        'number of C1 in D0': n_C1_D0,
        'number of C1 in D1': n_C1_D1,
        # Share of C0 samples inside D0 and inside D1 respectively.
        'alpha': n_C0_D0 / (n_C0_D0 + n_C1_D0),
        'beta': n_C0_D1 / (n_C0_D1 + n_C1_D1),
    }

    dataset = generate_dataset(C0_values, fractions)
    logreg, cnn_aux, cnn_pure = LogReg(), Cnn(), Cnn()

    # (time key, score key, model, training call) — executed in this order,
    # because the auxiliary CNN relies on the already trained logistic regression.
    stages = (
        ('logreg time', 'logreg rocauc', logreg,
         lambda: logreg.fit(dataset)),
        ('cnn_aux time', 'cnn_aux rocauc', cnn_aux,
         lambda: cnn_aux.fit(dataset, 1, auxiliary_classifier=logreg)),
        ('cnn_pure time', 'cnn_pure rocauc', cnn_pure,
         lambda: cnn_pure.fit(dataset, 2)),
    )
    for time_key, score_key, model, train in stages:
        started = time()
        train()
        experiment_logs[time_key] = time() - started
        experiment_logs[score_key] = model.rocauc(dataset, fractions)

    return experiment_logs

In [7]:
# Each row holds sample counts as [C0 in D0, C0 in D1, C1 in D0, C1 in D1];
# only the number of C0 samples placed in D1 changes from row to row.
fractions = [[30, n_C0_in_D1, 31000, 31000]
             for n_C0_in_D1 in (30, 60, 120, 210, 450, 600, 1200, 3000)]

In [15]:
# Run the same configuration (C0 = digit 0, counts 30/450/31000/31000) three
# times and keep one log dict per run.
logs = []
for i in range(3):
    print('Current experiment: ', i)
    logs.append(experiment([0], [30, 450, 31000, 31000]))

Current experiment:  0
Train on 31240 samples, validate on 31240 samples
Epoch 1/1
Train on 31240 samples, validate on 31240 samples
Epoch 1/2
Epoch 2/2
Current experiment:  1
Train on 31240 samples, validate on 31240 samples
Epoch 1/1
Train on 31240 samples, validate on 31240 samples
Epoch 1/2
Epoch 2/2
Current experiment:  2
Train on 31240 samples, validate on 31240 samples
Epoch 1/1
Train on 31240 samples, validate on 31240 samples
Epoch 1/2
Epoch 2/2


In [13]:
# For every run, print the three ROC AUC scores on one line, then blank lines.
for log in logs:
    print(*(log[key] for key in ('logreg rocauc', 'cnn_aux rocauc', 'cnn_pure rocauc')))
    print('\n')

0.510976834978 0.555161925787 0.453089963815


0.515278688681 0.500231757739 0.483535643504


0.528043881058 0.371186898281 0.680179695872




In [16]:
# Show the collected logs as a table: one row per run, one column per logged key.
pd.DataFrame(logs)

Unnamed: 0,C0 values,alpha,beta,cnn_aux rocauc,cnn_aux time,cnn_pure rocauc,cnn_pure time,logreg rocauc,logreg time,number of C0 in D0,number of C0 in D1,number of C1 in D0,number of C1 in D1
0,[0],0.000967,0.014308,0.769242,88.544512,0.869259,143.332973,0.803856,67.252089,30,450,31000,31000
1,[0],0.000967,0.014308,0.894355,77.51842,0.746319,129.98728,0.818521,61.789985,30,450,31000,31000
2,[0],0.000967,0.014308,0.882604,63.863151,0.608896,129.555264,0.806306,61.116509,30,450,31000,31000
