In [None]:
# !pip3 -qq install python_speech_features
# !pip3 -qq install modal

In [None]:
seed=42  # shared random seed: passed to np.random.seed(...) and to random_state=... arguments in later cells
import plotly
def plotlyC(x, d):
    """Return Plotly default color number ``x`` as an RGB tuple of floats in [0, 1].

    Parameters
    ----------
    x : int
        Index into ``plotly.colors.DEFAULT_PLOTLY_COLORS`` (strings of the
        form ``'rgb(r, g, b)'``).
    d : int or float
        Brightness shift. ``d * 8`` is added to every 0-255 channel, so a
        positive value lightens the color and a negative value darkens it.

    Returns
    -------
    tuple of float
        ``(r, g, b)`` with every component scaled to the range [0, 1].
    """
    # Strip the leading 'rgb(' and trailing ')' and split into the three channels.
    channels = plotly.colors.DEFAULT_PLOTLY_COLORS[x][4:-1].split(',')
    # Clamp on both sides: without the lower bound a negative ``d`` would
    # produce negative components, which are not valid color values.
    return tuple(max(0, min(int(channel) + d*8, 255)) / 255 for channel in channels)
import plotly.graph_objs as go
from plotly.graph_objs import Layout
from time import time
import numpy as np
import pandas as pd
import matplotlib
from skimage import io
from scipy.stats import entropy
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, recall_score
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from IPython.display import clear_output
from matplotlib import rc, cm
#import seaborn as sns
import tensorflow as tf
# Matplotlib: use a 12 pt default font for all figures.
rc('font', size=12)
from math import sqrt
# Color applied to the visible spines and tick marks by format_axes() below.
SPINE_COLOR = 'gray'

from plotly.offline import init_notebook_mode
# Initialise Plotly's notebook rendering.
# NOTE(review): connected=False presumably bundles plotly.js into the notebook
# instead of loading it from a CDN — confirm against the Plotly docs.
init_notebook_mode(connected=False)

def format_axes(ax):
    """Give a Matplotlib axes a minimal look and return the same axes.

    The top and right spines are hidden; the left and bottom spines are drawn
    thin (0.5) in ``SPINE_COLOR``; ticks are placed on the bottom/left only,
    point outwards and use ``SPINE_COLOR`` as well.
    """
    spines = ax.spines

    # Drop the box edges that carry no axis.
    for hidden in ('top', 'right'):
        spines[hidden].set_visible(False)

    # De-emphasise the two remaining edges.
    for kept in ('left', 'bottom'):
        edge = spines[kept]
        edge.set_color(SPINE_COLOR)
        edge.set_linewidth(0.5)

    # Ticks only on the sides that still have a spine, pointing outwards.
    for axis, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
        axis.set_ticks_position(side)
        axis.set_tick_params(direction='out', color=SPINE_COLOR)

    return ax
#clear_output()

In [None]:
## Specify color scheme
def hexify(r, g, b, a):
    """Format four 0-255 integer channels as a ``'#rrggbbaa'`` color string.

    Every channel is written as exactly two lowercase hex digits. Zero-padding
    matters: without it a channel below 16 contributes a single digit and the
    result is no longer a valid 8-digit color (e.g. ``(0, 5, 10, 255)`` must
    give ``'#00050aff'``, not ``'#05aff'``).
    """
    return '#' + ''.join('{:02x}'.format(channel) for channel in (r, g, b, a))

# Named palette used by the figures: short key -> '#rrggbbaa' string built
# with hexify() from fully opaque (alpha = 255) RGBA components.
_palette_rgba = (
    ('l_b', (147, 205, 221, 255)),
    ('b',   (81, 151, 213, 255)),
    ('eq',  (224, 224, 224, 255)),
    ('y',   (253, 215, 42, 255)),
    ('l_r', (248, 177, 99, 255)),
)
my_clr = {key: hexify(*rgba) for key, rgba in _palette_rgba}

<h1><center>Active Learning: A Visual Tour</center></h1>
<h4><center><a style="text-decoration:none" href="https://patel-zeel.github.io/">Zeel B Patel</a>, IIT Gandhinagar, <a style="text-decoration:none" href="mailto:patel_zeel@iitgn.ac.in">patel_zeel@iitgn.ac.in</a>
     <br><br>
            <a style="text-decoration:none" href="https://nipunbatra.github.io/">Nipun Batra</a>, IIT Gandhinagar, <a style="text-decoration:none" href="mailto:nipun.batra@iitgn.ac.in">nipun.batra@iitgn.ac.in</a> </center></h4>

# Rise of Supervised Learning

Today, machine learning (ML) is applied to numerous fields, including, but not limited to Natural Language Processing (<a style="text-decoration:none" href="https://en.wikipedia.org/wiki/Natural_language_processing">NLP</a>), <a style="text-decoration:none" href="https://en.wikipedia.org/wiki/Computer-aided_diagnosis">Computer-aided diagnosis</a>, <a style="text-decoration:none" href="https://en.wikipedia.org/wiki/Mathematical_optimization">Optimization</a>, and <a style="text-decoration:none" href="https://books.google.co.in/books/about/Bioinformatics.html?id=pxSM7R1sdeQC&redir_esc=y">Bioinformatics</a>. A significant proportion of this success is due to a subset of ML called supervised learning. There are three main reasons behind the success of supervised learning (and machine learning, generally): 1) availability of massive data; 2) better algorithms; and 3) powerful computational infrastructure jointly called the <a style="text-decoration:none" href="https://simons.berkeley.edu/events/openlectures2018-fall-1#:~:text=AI%20at%20scale%20requires%20a,data%2C%20algorithms%20and%20cloud%20infrastructure.&text=By%20building%20intelligence%20into%20data,to%20train%20models%20at%20scale.">AI Trinity</a>. Supervised learning techniques require *labeled* data. As the data turns into 'Big data,' the effort required to label it becomes more laborious. In this article, we will talk about active learning, a suite of techniques for intelligent and data-driven annotations.

In [None]:
from IPython.display import HTML

# Inject a CSS rule that centers PNG outputs in the rendered notebook.
# The HTML object is the last expression of the cell, so it is displayed
# (and the <style> block takes effect) when the cell runs. It is built
# directly: wrapping a constant string in eval() adds nothing.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")
#clear_output()

# Data Annotation is Expensive

Labeled data is the primary need of supervised learning. Annotating the data is expensive because it may require (i) excessive time and manual effort, or (ii) costly sensors. Let us get an overview of how expensive labeling is with a few examples.

## Speech Recognition

Let us say we need to convert a speech or audio into text for an application such as subtitle generation (speech-to-text task). We have to annotate multiple audio segments that correspond to the words and phrases in the audio. The following example audio of 6 seconds may require around a minute to annotate manually. Thus, annotating hours of publicly available audio data is an impracticable task.

In [None]:
from IPython.display import Audio, display

In [None]:
# Alternative (disabled): stream the same clip straight from GitHub.
# aud1 = Audio(url='https://raw.githubusercontent.com/patel-zeel/OpenActiveLearning/master/Audio/demo.wav')
# Build the audio player from a local file; the path is relative to the
# directory the notebook is run from.
aud1 = Audio(filename='../audio/demo.wav')
# Disabled helper to download the clip:
#!wget https://raw.githubusercontent.com/patel-zeel/OpenActiveLearning/master/Audio/demo.wav

In [None]:
# Render the audio player for the demo clip created in the previous cell.
display(aud1)

Similarly, researchers have made efforts to detect COVID-19 from the sound of the human cough [<a style="text-decoration:none" href="https://www.sciencedirect.com/science/article/pii/S2352914820303026">10</a>]. Collecting labels for thousands of human cough samples is a difficult task.

## Human Activity Recognition

Let us say we want to classify human activities into different categories (shown in Figure 1). We need expensive sensors to monitor the alignment or motion of the human body parts for such tasks. In the end, we need to map the sensor data to the various activities with substantial manual effort.

In [None]:
# Load the four activity illustrations (walking, running, sitting, climbing)
# with skimage's io.imread, in that order.
imgw, imgr, imgs, imgc = map(io.imread, (
    '../images/walking.png',
    '../images/Running.png',
    '../images/Sitting.png',
    '../images/Climbing.png',
))

In [None]:
from PIL import Image
# Figure 1: one row of four image-only subplots, one per human activity.
panel_titles = ['Walking', 'Running', 'Sitting', 'Climbing']
panel_paths = ['../images/walking.png', '../images/Running.png',
               '../images/Sitting.png', '../images/Climbing.png']

fig = make_subplots(1, 4, subplot_titles=panel_titles)

# The panels only show pictures, so hide grid, zero line and tick labels.
bare_axis = dict(automargin=True, showgrid=False, zeroline=False, showticklabels=False)
fig.update_xaxes(**bare_axis)
fig.update_yaxes(**bare_axis)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(
    title_text='<b>Figure 1:</b> Various human activities',
    title_x=0.5,
    height=370,
    hovermode=False,
)

# Placement shared by all four pictures: stretched over x in [-1, 6] and
# y in [-1, 4], drawn below the (empty) plot area.
image_box = dict(xref="x", yref="y", x=-1, y=4, sizex=7, sizey=5,
                 opacity=1, layer="below", sizing="stretch")
for column, path in enumerate(panel_paths, start=1):
    fig.add_layout_image(source=Image.open(path), row=1, col=column, **image_box)

fig.show()

# All the Samples are Not Equally Important

We know that increasing the amount of training (labeled) data generally improves model performance. However, not all samples contribute equally to improving the model. Let us understand this with a few examples.

## SVC Says: Closer is Better

We will use synthetic two-class data generated from a bivariate normal distribution for this experiment. Now, we will train a Support Vector Classifier (SVC) [<a style="text-decoration:none" href="https://link.springer.com/article/10.1007%2FBF00994018">8</a>] model on a subset of this dataset (5 data points) and visualize the decision boundary.

(All Figures in this article are interactive. Hover over plots to know more. Click on legend items to hide/show elements)

In [None]:
from sklearn.svm import SVC

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
# ---------------------------------------------------------------------------
# Synthetic two-class data
# ---------------------------------------------------------------------------
np.random.seed(seed)

# NOTE(review): `iris` is not referenced again in this cell; kept in case
# another cell reads it.
iris = datasets.load_iris()

# One bivariate normal per class, centred at (2, 0) and (-2, 0), covariance 2*I.
X1, X2 = (np.random.multivariate_normal(center, np.eye(2)*2, size=100)
          for center in ([2, 0], [-2, 0]))
X = np.concatenate([X1, X2])
y = np.repeat([0.0, 1.0], 100)  # first 100 rows are class 0, last 100 are class 1

# Re-seed, then draw the 5 labeled (train) indices; every other index is
# treated as unlabeled.
np.random.seed(seed)
train_ind = np.random.choice(range(len(X)), size=5, replace=False)
test_ind = list(set(range(len(X)))-set(train_ind))

# ---------------------------------------------------------------------------
# Linear SVC on the 5 labeled points
# ---------------------------------------------------------------------------
h = .02  # step size in the mesh; NOTE(review): not used in this cell
C = 1.0
clf = svm.LinearSVC(C=C)
clf.fit(X[train_ind], y[train_ind])

# Separating line w0*x + w1*y + b = 0, rewritten as y = a*x - b/w1.
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-6, 4)
yy = a * xx - (clf.intercept_[0]) / w[1]

# The margin lines are parallel to the boundary at perpendicular distance
# 1/||w||; in 2-d that corresponds to a vertical shift of sqrt(1 + a^2) * margin.
margin = 1 / np.sqrt(np.sum(clf.coef_ ** 2))
vertical_shift = np.sqrt(1 + a ** 2) * margin
yy_down = yy - vertical_shift
yy_up = yy + vertical_shift

# ---------------------------------------------------------------------------
# Figure 2
# ---------------------------------------------------------------------------
layout = go.Layout(paper_bgcolor='rgb(255,255,255)',
                   plot_bgcolor='rgb(255,255,255)')
fig = go.Figure(layout=layout)

hover_xy = '(%{x:.2f},%{y:.2f})'
class_colors = px.colors.DEFAULT_PLOTLY_COLORS

# Decision boundary and the two margin lines: (legend name, y values, opacity, line style).
dashed_gray = dict(width=4, color='gray', dash='dashdot')
line_specs = [
    ('Decision boundary', yy, 1, dict(width=4, color='gray')),
    ('Upper margin', yy_up, 0.8, dashed_gray),
    ('Lower margin', yy_down, 0.8, dashed_gray),
]
for legend_name, line_y, alpha, line_style in line_specs:
    fig.add_trace(go.Scatter(x=xx, y=line_y, mode='lines', opacity=alpha,
                             name=legend_name, line=line_style,
                             hovertemplate=hover_xy))

# Unlabeled and labeled points, one trace per (subset, class):
# (legend name, row indices, class label, opacity, marker style without color).
small_marker = dict(size=6)
outlined_marker = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))
point_specs = [
    ('Unlabeled datapoints (Class 1)', test_ind, 0, 0.6, small_marker),
    ('Unlabeled datapoints (Class 2)', test_ind, 1, 0.6, small_marker),
    ('Train datapoints (Class 1)', train_ind, 0, 1, outlined_marker),
    ('Train datapoints (Class 2)', train_ind, 1, 1, outlined_marker),
]
for legend_name, subset, label, alpha, marker_style in point_specs:
    members = X[subset][y[subset] == label]
    fig.add_trace(go.Scatter(x=members[:, 0], y=members[:, 1],
                             mode='markers', opacity=alpha, name=legend_name,
                             marker=dict(marker_style, color=class_colors[label]),
                             hovertemplate=hover_xy))

# Four hand-picked unlabeled points, annotated A-D. Entries alternate between
# a class-1 point and a class-2 point, hence the color index `position % 2`.
far_hint = 'Far away point from confusion area'
near_hint = 'Closer point to confusion area'
highlighted = [(1, 'A', far_hint), (191, 'C', far_hint),
               (19, 'B', near_hint), (186, 'D', near_hint)]
for position, (row, letter, hint) in enumerate(highlighted):
    fig.add_annotation(x=X[row, 0], y=X[row, 1],
                       text='<b>'+letter+'</b>',
                       hovertext=hint,
                       font=dict(size=24, color='green'),
                       ax=-45, ay=-70)
    fig.add_trace(go.Scatter(x=X[[row], 0], y=X[[row], 1],
                             mode='markers', opacity=1, showlegend=False,
                             marker=dict(size=9, color=class_colors[position % 2]),
                             hovertemplate=hover_xy))

# Draw an arrow from every label to its point.
fig.update_annotations(showarrow=True, arrowhead=7)

############# Common
faint_grid = dict(automargin=True,
                  gridcolor='rgba(128,128,128,0.2)', gridwidth=1,
                  zerolinecolor='rgba(128,128,128,0.2)', zerolinewidth=1)
fig.update_yaxes(range=[-4, 6], **faint_grid)
fig.update_xaxes(**faint_grid)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(
    title_text='<b>Figure 2:</b> Decision boundary and margin for Support Vector Classifier model trained on 5 random samples',
    title_x=0.5,
    xaxis_title="X",
    yaxis_title="Y",
)
fig.layout.xaxis.update(side='bottom')
fig.show()

Support vector points help the SVC model to distinguish between various classes. The model fit in the above diagram is not accurate because we have used a small number of train datapoints. Consider some potential train points A, B, C, and D from unlabeled datapoints. Datapoints B and D are closer to the confusion area than A and C. Thus, B and D are more informative in improving the model if added to the train points. When the SVC model says, "closer is better," it means closer to the confusion area.

## Confusion in Digit Classification

Let us consider the MNIST dataset (a well-known public dataset with labeled images of digits $0$ to $9$) for the classification task. We have shown a few examples here.

In [None]:
from PIL import Image

np.random.seed(seed)
(train_pool_X, train_pool_y), (test_X, test_y) = tf.keras.datasets.mnist.load_data()
# 50 training images; the remainder of the training set is the pool.
train_X, Pool_X, train_y, Pool_y = train_test_split(
    train_pool_X, train_pool_y, train_size=50, random_state=seed)

# Build a 5 x 5 mosaic: each row is 5 randomly chosen images placed side by
# side, and the 5 rows are then stacked vertically.
stack = [
    np.hstack(train_pool_X[np.random.choice(np.arange(len(train_pool_X)), size=5)])
    for _ in range(5)
]
final = np.vstack(stack)

# Flatten every image to a single feature vector (one row per sample).
train_pool_X, train_X, test_X, Pool_X = (
    images.reshape(images.shape[0], -1)
    for images in (train_pool_X, train_X, test_X, Pool_X)
)

fig = go.Figure()

############# Common
# Image-only figure: hide grid, zero line and tick labels.
bare_axis = dict(automargin=True, showgrid=False, zeroline=False, showticklabels=False)
fig.update_xaxes(**bare_axis)
fig.update_yaxes(**bare_axis)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(
    title_text='<b>Figure 3:</b> Few random samples from the MNIST dataset',
    title_x=0.5,
    height=400,
    hovermode=False,
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
)

fig.add_layout_image(
    source=Image.fromarray(final),
    xref="x", yref="y",
    x=1.5, y=4,
    sizex=5, sizey=5,
    opacity=1,
)

fig.show()

We will train the Support Vector Classifier (SVC) model on a few random samples of the MNIST dataset. Let us see what our model learns with a set of $50$ data points ($5$ samples for each class). We show the normalized confusion matrix over the test set having $10000$ samples (Figure $4$).

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
import pandas as pd

import tensorflow as tf
# Reload MNIST with its original (n, 28, 28) image shape.
(train_pool_X, train_pool_y), (test_X, test_y) = tf.keras.datasets.mnist.load_data()

# Cutting the 60000 training images into 1210 stratified folds makes each
# held-out fold about 50 images with roughly balanced classes. The held-out
# indices of the first fold are used as the small training set.
splitter = StratifiedKFold(1210)
_, train_ind = next(splitter.split(train_pool_X, train_pool_y))
train_X = train_pool_X[train_ind]
train_y = train_pool_y[train_ind]

In [None]:
# Fit an SVC on the flattened training images and predict the test set.
# (Disabled alternatives: RandomForestClassifier(random_state=seed),
#  LogisticRegression(random_state=seed).)
model = SVC(random_state=seed).fit(train_X.reshape(len(train_X), -1), train_y)
pred_y = model.predict(test_X.reshape(len(test_X), -1))

# Despite its name, `acc` holds F1-scores: average=None requests one score
# per digit class 0-9 rather than a single aggregate.
acc = f1_score(test_y, pred_y, labels=list(range(10)), average=None)

In [None]:
# Confusion matrix of the SVC predictions on the test set. Following the
# scikit-learn convention, row j is the actual digit j and column i is the
# predicted digit i.
CM = confusion_matrix(test_y, pred_y)
# NOTE(review): `my_cmap` is not used by the Plotly heatmap below; kept in
# case another cell relies on it.
my_cmap = matplotlib.colors.LinearSegmentedColormap.from_list('',[my_clr['l_b'],
                                                                  my_clr['y'],my_clr['l_r']])

# Row-normalise so every row (actual digit) sums to ~1.
Norm_CM = np.round(CM.astype('float')/CM.sum(axis=1)[:, np.newaxis],3)

# Hover text for every cell. The matrix is deliberately NOT reversed before
# this point, so index j really is the actual digit and matches the y tick
# labels passed to the heatmap.
n_classes = len(CM)
hover = [["Predicted digit: "+str(i)+
          '<br>Actual digit: '+str(j)+
          '<br>Number of samples: '+str(CM[j, i])+
          '<br>Normalized number of samples: '+str(Norm_CM[j, i])
          for i in range(n_classes)] for j in range(n_classes)]

# Cell color follows the raw counts; the printed annotation is the normalized value.
fig = ff.create_annotated_heatmap(CM,colorscale=px.colors.sequential.Viridis,
                                  x=list(range(10)),y=list(range(10)),
                                  text=hover, hoverinfo='text',
                                  annotation_text=Norm_CM)
fig.update_layout(title_text='<b>Figure 4:</b> Confusion matrix over 10000 test samples after fitting an SVC model on 50 random samples',
                  title_x=0.5,
                  xaxis_title="Predicted digit",
                  yaxis_title="Actual digit",
                  )
############# Common
# Reverse the y-axis (instead of reversing the matrix rows) so that digit 0 is
# the top row and the diagonal runs top-left to bottom-right, while tick
# labels and hover text stay attached to the correct rows.
fig.update_yaxes(automargin=True, autorange='reversed')
fig.update_xaxes(automargin=True)
fig.update(layout_coloraxis_showscale=False)
fig['layout']['xaxis'].update(side='bottom')
fig.show()

In [None]:
# Figure 5: three test images shown side by side (the figure title identifies
# them as a 4, a 7 and a 9).
digit_examples = [19, 17, 9]  # indices into test_X

fig = make_subplots(1, 3)

############# Common
# Image-only panels: hide grid, zero line and tick labels.
bare_axis = dict(automargin=True, showgrid=False, zeroline=False, showticklabels=False)
fig.update_xaxes(**bare_axis)
fig.update_yaxes(**bare_axis)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(
    title_text='<b>Figure 5:</b> Digits 4, 7 and 9 have similar structure',
    title_x=0.5,
    height=350,
    hovermode=False,
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
)

for column, sample in enumerate(digit_examples, start=1):
    fig.add_layout_image(
        source=Image.fromarray(test_X[sample]),
        row=1, col=column,
        xref="x", yref="y",
        x=0, y=4,
        sizex=5, sizey=5,
        opacity=1,
        layer="below",
    )

fig.show()

We can see from the confusion matrix that a few digits have more confusion than others. For example, the digit '8' has the least confusion; digit '9' is confused with '7' and '4'. Some digits are difficult to distinguish from the model's perspective. Thus, we may need more training examples for the aforementioned digits to learn them correctly. Now, we will see a regression-based example.

## GP Needs 'Good' Data Points

We will consider sine-curve data with added noise. We take a few samples (8 samples) as the train points, a few as the potential train points, and the rest as the test points.

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel, WhiteKernel
import plotly.graph_objs as go
from plotly.graph_objs import Layout

# Noisy sine data: 500 evenly spaced inputs in [-1, 1], targets
# sin(10 x) plus Gaussian noise scaled by 1/10.
np.random.seed(seed)
whole_X = np.linspace(-1,1,500).reshape(-1,1)
whole_y = np.sin(whole_X*10) + np.random.normal(size=whole_X.shape[0]).reshape(-1,1)/10

# 8 randomly chosen train points and 20 evenly spaced candidate
# ("potential train") points.
train_ind = np.random.choice(np.arange(500), size=8,replace=False)
pool_ind = np.linspace(0,499,20).astype(int)
pool_X = whole_X[pool_ind]
pool_y = whole_y[pool_ind]
train_X = whole_X[train_ind]
train_y = whole_y[train_ind]

# No model is fitted here: this cell only visualises the data. The GP
# regressor is fitted (and its predictions computed) at the start of the
# next code cell, so fitting one here as well would be discarded work.

# Create traces: (x values, y values, remaining Scatter options), in legend order.
layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)
hover_xy = '(%{x:.2f},%{y:.2f})'
point_sets = [
    (train_X, train_y, dict(name='Train points',
                            marker=dict(size=12, color='black'))),
    (pool_X, pool_y, dict(name='Potential train points',
                          marker=dict(size=12, color='rgb(240,0,0)'))),
    (whole_X, whole_y, dict(name='Test points', opacity=0.4,
                            marker=dict(size=6, color=px.colors.DEFAULT_PLOTLY_COLORS[0]))),
]
for xs, ys, options in point_sets:
    fig.add_trace(go.Scatter(x=xs.squeeze(), y=ys.squeeze(), mode='markers',
                             hovertemplate=hover_xy, **options))

############# Common
faint_grid = dict(automargin=True,
                  gridcolor='rgba(128,128,128,0.2)', gridwidth=1,
                  zerolinecolor='rgba(128,128,128,0.2)', zerolinewidth=1)
fig.update_yaxes(**faint_grid)
fig.update_xaxes(**faint_grid)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(title_text='<b>Figure 6:</b> Noisy sine curve dataset with train, test and potential train points',
                  title_x=0.5,
                  xaxis_title="X",
                  yaxis_title="Y",
                  )
fig['layout']['xaxis'].update(side='bottom')
fig.show()

We will fit a GPR (Gaussian Process Regressor) [11] model with a <a style="text-decoration:none" href="https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.Matern.html">Matern</a> kernel to our dataset. GPR models additionally provide the uncertainty of their predictions. The predictive variance is a measure of the model's uncertainty about its predictions (the predictive mean).

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel, WhiteKernel
from plotly.graph_objs import Layout

# --- Data: noisy sine curve --------------------------------------------------
# 500 evenly spaced inputs in [-1, 1]; targets are sin(10x) plus Gaussian noise
# scaled by 1/10.
np.random.seed(seed)
whole_X = np.linspace(-1,1,500).reshape(-1,1)
whole_y = np.sin(whole_X*10) + np.random.normal(size=whole_X.shape[0]).reshape(-1,1)/10
# 8 randomly drawn points form the initial train set ...
train_ind = np.random.choice(np.arange(500), size=8,replace=False)
# (re-seeding is kept so the global RNG state left behind by this cell is unchanged)
np.random.seed(seed)
# ... and 20 evenly spaced points form the pool of potential train points.
pool_ind = np.linspace(0,499,20).astype(int)
pool_X = whole_X[pool_ind]
pool_y = whole_y[pool_ind]
train_X = whole_X[train_ind]
train_y = whole_y[train_ind]

# --- Model: GP regressor with a scaled Matern kernel -------------------------
model = GaussianProcessRegressor(kernel=ConstantKernel(0.1)*(Matern(length_scale=0.1)))
model.fit(train_X, train_y)
# return_cov=True yields the full predictive covariance; its diagonal is the
# per-point predictive variance used for the shaded band below.
pred_y, var_y = model.predict(whole_X, return_cov=True)
rmsold = np.round(np.sqrt(np.mean((pred_y.squeeze()- whole_y.squeeze())**2)),3)

# --- Figure 7 -----------------------------------------------------------------
layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=whole_X.squeeze(), y=whole_y.squeeze(),
                    mode='markers',opacity=0.4,
                    name='Test points',marker=dict(size=6, color=px.colors.DEFAULT_PLOTLY_COLORS[0]), hovertemplate='(%{x:.2f},%{y:.2f})'))
fig.add_trace(go.Scatter(x=pool_X.squeeze(), y=pool_y.squeeze(),
                    mode='markers',
                    name='Potential train points',marker=dict(size=12, color='rgb(240,0,0)'),
                         hovertemplate='(%{x:.2f},%{y:.2f})'))
fig.add_trace(go.Scatter(x=train_X.squeeze(), y= train_y.squeeze(),
                    mode='markers',
                    name='Train points', marker=dict(size=12, color='black'),
                         hovertemplate='(%{x:.2f},%{y:.2f})'))

# Shaded band: predictive mean -/+ predictive variance (lower edge first, then
# the upper edge; only the upper edge appears in the legend).
fig.add_trace(go.Scatter(x=whole_X.squeeze(), y=pred_y.squeeze()-var_y.diagonal(), fill='tonexty',
                         fillcolor='rgba(128,128,128,0.2)',showlegend=False,name='Predictive variance',
                         hovertemplate='(%{x:.2f},%{y:.2f})',
                    mode='none' # override default markers+lines
                    ))
fig.add_trace(go.Scatter(x=whole_X.squeeze(), y=pred_y.squeeze()+var_y.diagonal(), fill='tonexty',
                         fillcolor='rgba(128,128,128,0.2)',
                         hovertemplate='(%{x:.2f},%{y:.2f})',
                    mode= 'none', name='Predictive variance'))
fig.add_trace(go.Scatter(x=whole_X.squeeze(), y= pred_y.squeeze(),
                    mode='lines',opacity=0.8,
                    name='Predictive mean', line=dict(width=4, color='gray',dash='dashdot'),
                    hovertemplate='(%{x:.2f},%{y:.2f})'))

# Label the four candidate pool points exactly once each.
# (label, index into the pool, font colour, vertical arrow offset)
for label, pool_idx, colour, arrow_dy in [('A', 5, 'green', -50),
                                          ('B', 6, 'orange', 50),
                                          ('C', 7, 'orange', 50),
                                          ('D', -3, 'green', -50)]:
    fig.add_annotation(hovertext='Is this a good point?',
                x=pool_X[pool_idx][0],
                y=pool_y[pool_idx][0],
                text='<b>'+label+'</b>',font = dict(size = 24, color=colour),ay=arrow_dy
                )
fig.update_annotations(dict(
            showarrow=True,
            arrowhead=7,
            ax=-5,
))
############# Common
fig.update_yaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1)
fig.update_xaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(title_text='<b>Figure 7:</b> Trained GP model on 8 randomly selected datapoints<br>RMSE:'+str(np.round(rmsold,3)),
                  title_x=0.5,
                 xaxis_title="X",
                 yaxis_title="Y"
                 )
fig['layout']['xaxis'].update(side='bottom')
fig.show()

We can observe that the uncertainty (predictive variance) is higher at datapoints that are far from the train points. Let us consider a set of datapoints A, B, C, and D to see whether they are equally informative to the model.

In [None]:
# fig, ax = plt.subplots(1,2, figsize=(12,4))

# --- Data and baseline model (same construction as for Figure 7) -------------
np.random.seed(seed)
whole_X = np.linspace(-1,1,500).reshape(-1,1)
whole_y = np.sin(whole_X*10) + np.random.normal(size=whole_X.shape[0]).reshape(-1,1)/10
train_ind = np.random.choice(np.arange(500), size=8,replace=False)
# (re-seeding is kept so the global RNG state left behind by this cell is unchanged)
np.random.seed(seed)
pool_ind = np.linspace(0,499,20).astype(int)
pool_X = whole_X[pool_ind]
pool_y = whole_y[pool_ind]
train_X = whole_X[train_ind]
train_y = whole_y[train_ind]
model = GaussianProcessRegressor(kernel=ConstantKernel(0.1)*(Matern(length_scale=0.1)))
model.fit(train_X, train_y)
pred_y, var_y = model.predict(whole_X, return_cov=True)
rmsold = np.round(np.sqrt(np.mean((pred_y.squeeze()- whole_y.squeeze())**2)),3)


def _add_gp_traces(fig, train_X, train_y, pred_y, var_y, visible):
    """Add the six traces that describe one fitted GP model to ``fig``.

    Trace order: test points, potential train points, train points, lower band
    edge, upper band edge, predictive mean. The update-menu buttons further
    down toggle visibility in groups of six, so every model must contribute
    exactly these six traces.

    Parameters
    ----------
    fig : plotly figure the traces are appended to.
    train_X, train_y : train points the model was fitted on.
    pred_y, var_y : predictive mean and covariance on ``whole_X``; only the
        diagonal of ``var_y`` is used.
    visible : initial visibility applied to all six traces.
    """
    hover = '(%{x:.2f},%{y:.2f})'
    grid_x = whole_X.squeeze()
    mean = pred_y.squeeze()
    variance = var_y.diagonal()
    fig.add_trace(go.Scatter(x=grid_x, y=whole_y.squeeze(),
                             mode='markers', opacity=0.4, name='Test points',
                             marker=dict(size=6, color=px.colors.DEFAULT_PLOTLY_COLORS[0]),
                             hovertemplate=hover, visible=visible))
    fig.add_trace(go.Scatter(x=pool_X.squeeze(), y=pool_y.squeeze(),
                             mode='markers', name='Potential train points',
                             marker=dict(size=12, color='rgb(240,0,0)'),
                             hovertemplate=hover, visible=visible))
    fig.add_trace(go.Scatter(x=train_X.squeeze(), y=train_y.squeeze(),
                             mode='markers', name='Train points',
                             marker=dict(size=12, color='black'),
                             hovertemplate=hover, visible=visible))
    # Shaded band: mean -/+ predictive variance; only the upper edge is listed
    # in the legend.
    fig.add_trace(go.Scatter(x=grid_x, y=mean-variance, fill='tonexty',
                             fillcolor='rgba(128,128,128,0.2)', showlegend=False,
                             name='Predictive variance', hovertemplate=hover,
                             mode='none',  # override default markers+lines
                             visible=visible))
    fig.add_trace(go.Scatter(x=grid_x, y=mean+variance, fill='tonexty',
                             fillcolor='rgba(128,128,128,0.2)',
                             name='Predictive variance', hovertemplate=hover,
                             mode='none', visible=visible))
    fig.add_trace(go.Scatter(x=grid_x, y=mean,
                             mode='lines', opacity=0.8, name='Predictive mean',
                             line=dict(width=4, color='gray', dash='dashdot'),
                             hovertemplate=hover, visible=visible))


layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)

# Traces 0-5: the original model (shown initially).
_add_gp_traces(fig, train_X, train_y, pred_y, var_y, visible=True)

# Traces 6-11: original train set + pool points {B, C};
# traces 12-17: original train set + pool points {A, D}. Both start hidden.
rms = []
for n1, n2 in [[6,7], [5,-3]]:
    model = GaussianProcessRegressor(kernel=ConstantKernel(0.1)*(Matern(length_scale=0.1)))
    new_train_X = np.array(train_X.tolist() + [pool_X[n1], pool_X[n2]])
    new_train_y = np.array(train_y.tolist() + [pool_y[n1], pool_y[n2]])
    model.fit(new_train_X, new_train_y)
    pred_y, var_y = model.predict(whole_X, return_cov=True)
    rms.append(np.round(np.sqrt(np.mean((pred_y.squeeze()- whole_y.squeeze())**2)),3))
    _add_gp_traces(fig, new_train_X, new_train_y, pred_y, var_y, visible=False)

############# Common
fig.update_yaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                 zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1)
fig.update_xaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                 zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1,title_text="X", side='bottom')
fig.update(layout_coloraxis_showscale=False)

fig.update_layout(title_text='<b>Figure 8:</b> Trained GP model on 8 randomly selected datapoints<br>RMSE:'+str(np.round(rmsold,3)),
                  title_x=0.5,title_y=0.95,
                  xaxis_title="X",
                  yaxis_title="Y", paper_bgcolor='rgb(255,255,255)',
                  plot_bgcolor='rgb(255,255,255)'
                  )

# Label the four candidate pool points exactly once each.
# (label, index into the pool, hover text, font colour, vertical arrow offset)
for label, pool_idx, verdict, colour, arrow_dy in [('A', 5, 'Good point', 'green', -50),
                                                   ('B', 6, 'Not so good point', 'orange', 50),
                                                   ('C', 7, 'Not so good point', 'orange', 50),
                                                   ('D', -3, 'Good point', 'green', -50)]:
    fig.add_annotation(hovertext=verdict,
                x=pool_X[pool_idx][0],
                y=pool_y[pool_idx][0],
                text='<b>'+label+'</b>',font = dict(size = 24, color=colour),ay=arrow_dy
                )
fig.update_annotations(dict(
            showarrow=True,
            arrowhead=7,
            ax=-5,
))

# One button per model; each shows its own group of six traces and retitles the
# figure with that model's RMSE (the figure number stays "Figure 8").
updatemenus=[
        dict(
            type = "buttons",
            direction = "left",
            buttons=list([
                dict(
                    args = [{'visible': [True]*6+[False]*6+[False]*6},
                         {'title': '<b>Figure 8:</b> Trained GP model on 8 randomly selected datapoints<br>RMSE:'+str(np.round(rmsold,3))}],
                    label="Original model",
                    method="update"
                ),
                dict(
                    args = [{'visible': [False]*6+[True]*6+[False]*6},
                         {'title': '<b>Figure 8:</b> Effect of adding various train points to the model<br>RMSE:'+str(np.round(rms[0],3))}],
                    label="<b>Click</b> to add points B and C<br>to the train points of the Original model",
                    method="update"
                ),
                dict(
                    args = [{'visible': [False]*6+[False]*6+[True]*6},
                         {'title': '<b>Figure 8:</b> Effect of adding various train points to the model<br>RMSE:'+str(np.round(rms[1],3))}],
                    label="<b>Click</b> to add points A and D<br>to the train points of the Original model",
                    method="update"
                )
            ]),
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.12,
            yanchor="top"
        ),
    ]

fig['layout']['updatemenus'] = updatemenus
fig.show()

We can say from RMSE and predictive variance that datapoints A and D are more informative to the model than B and C. Note that adding points to the train set is equivalent to annotating unlabeled data and using them for training. We can either have an intelligent way to choose these 'good' points or randomly choose some datapoints and label them. Active learning techniques can help us determine the 'good' datapoints, which are likely to improve our model. Now, we will discuss active learning techniques in detail.

# The Basics of Active Learning

Wikipedia quotes the definition of active learning as the following, 
* *'Active learning is a special case of machine learning in which a learning algorithm can interactively query a user (or some other information source) to label new data points with the desired outputs.'*

The below diagram illustrates the general flow of active learning.

In [None]:
# Figure 9: show the active-learning flow diagram as a layout image placed on
# an otherwise empty Plotly figure.
from PIL import Image

fig = go.Figure()

# Blank canvas: no grid lines, zero lines or tick labels on either axis.
fig.update_xaxes(automargin=True, showgrid=False, zeroline=False, showticklabels=False)
fig.update_yaxes(automargin=True, showgrid=False, zeroline=False, showticklabels=False)
fig.update(layout_coloraxis_showscale=False)

# Position and size of the image are given in data coordinates.
fig.add_layout_image(dict(
    source=Image.open('../images/AL basics.png'),
    xref="x", yref="y",
    x=0, y=4,
    sizex=5, sizey=6,
    opacity=1,
))

fig.update_layout(
    title_text='<b>Figure 9:</b> General flow of active learning',
    title_x=0.5,
    hovermode=False,
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
)
fig.show()

As shown in the flow diagram, an ML model gives a few samples from an unlabeled pool or distribution to the oracle (a human annotator or another data source) for labeling. These samples are chosen intelligently according to some criteria. For this reason, active learning is also called optimal experimental design [[link](http://eprints.sics.se/3600/)].

### Random Baseline

An ML model can randomly sample datapoints and send them to the oracle for labeling. Random sampling will also eventually result in capturing the global distribution of the dataset in the train datapoints. However, active learning aims to improve the model by intelligently selecting the datapoints for labeling. Thus, Random sampling is an appropriate baseline to compare with active learning.

# Different Scenarios for Active Learning

We have mainly three different scenarios of active learning:
1. **Membership Query Synthesis** [12]: In this scenario, the model has an underlying distribution of data points from where it can generate the samples. The generated samples are sent to the oracle for labeling.
1. **Stream-Based Selective Sampling** [13]: We have a live stream of online data samples, and for each incoming sample the model can choose to query it or discard it based on some criterion. One possible criterion is an information measure or a query strategy evaluated on the incoming sample [<a style="text-decoration:none" href="http://burrsettles.com/pub/settles.activelearning.pdf">2</a>, <a style="text-decoration:none" href="https://dl.acm.org/doi/10.5555/3091622.3091641">3</a>]. Another way is to define several hypotheses that agree on the labeled dataset (the region they define is called the *version space* [<a style="text-decoration:none" href="http://burrsettles.com/pub/settles.activelearning.pdf">2</a>, <a style="text-decoration:none" href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.121.5764&rep=rep1&type=pdf">4</a>]) but disagree on some unlabeled samples. Calculating the exact region is expensive; thus, approximations and other methods are used in practice [<a style="text-decoration:none" href="http://burrsettles.com/pub/settles.activelearning.pdf">2</a>, <a style="text-decoration:none" href="https://dl.acm.org/doi/10.1145/130385.130417">5</a>, <a style="text-decoration:none" href="https://link.springer.com/article/10.1007/BF00993277">6</a>, <a style="text-decoration:none" href="https://papers.nips.cc/paper/3325-a-general-agnostic-active-learning-algorithm">7</a>].
1. **Pool-Based Sampling** [8]: In this case, we already have a pool of unlabeled samples (We called them potential train points in the prior discussion). Based on some criteria, model queries for a few samples. 

The pool-based sampling scenario is suitable for most of the real-world applications. Thus, we restrict our article to pool-based sampling only.

# Pool-Based Sampling

We can query the datapoints from an unlabeled pool with the following methods:
1. **Uncertainty Sampling** [8]: We query the samples based on the model's uncertainty about the predictions. 
1. **Query by Committee** [5]: In this approach, we create a committee of two or more models.  The Committee queries for the samples where predictions disagree the most among themselves.

We will demonstrate each of the above strategies with examples in the subsequent sections.

## Uncertainty Sampling

There are different approaches for the Classification and Regression tasks in uncertainty sampling. We will go through them one by one with examples here.

### Digit Classification with MNIST Dataset

In [None]:
from sklearn.metrics import mean_squared_error
def individual_acc(pred_yy):
    """Return the per-class F1 score of ``pred_yy`` against the global ``test_y``.

    The classes are taken to be the integers ``0 .. k-1`` where ``k`` is the
    number of distinct values in ``test_y``.

    Parameters
    ----------
    pred_yy : predicted labels, aligned with ``test_y``.

    Returns
    -------
    numpy.ndarray
        F1 score of class ``c`` at position ``c`` (squeezed, so a single class
        yields a 0-d array).
    """
    n_classes = len(set(test_y))
    if n_classes == 0:
        # Nothing to score; avoid calling f1_score with an empty label list.
        return np.array([])
    # One call with average=None yields every class's score at once. This
    # replaces one f1_score call per class whose 1-element array result was
    # stored into a scalar slot (deprecated in recent NumPy releases).
    per_class = f1_score(test_y, pred_yy, labels=list(range(n_classes)), average=None)
    return np.asarray(per_class, dtype=float).squeeze()

def overall_rmse(pred_y):
    """Return the RMSE against the global ``test_y`` for each prediction set.

    Parameters
    ----------
    pred_y : iterable of prediction arrays, each aligned with ``test_y``.

    Returns
    -------
    list
        One RMSE value per element of ``pred_y``, in the same order.
    """
    rmse = []
    for preds in pred_y:
        # `mean_squared_error(..., squared=False)` is deprecated and removed in
        # recent scikit-learn releases. Taking the root of the per-output MSE
        # and averaging reproduces it (including for multi-output targets).
        per_output_mse = mean_squared_error(test_y, preds, multioutput='raw_values')
        rmse.append(np.mean(np.sqrt(per_output_mse)))
    return rmse

def overall_acc(pred_y):
    """Return the macro-averaged F1 score against the global ``test_y`` for
    each prediction set in ``pred_y`` (one score per element, same order)."""
    return [f1_score(test_y, preds, average='macro') for preds in pred_y]


def plot_samples(X, ax):
    """Draw the samples in ``X`` on ``ax`` as a two-row mosaic of 28x28 tiles.

    The first five samples form the top row and the remaining ones the bottom
    row. The axes frame is hidden and the title is set to 'Queried samples'.

    Parameters
    ----------
    X : sequence of arrays, each reshapeable to (28, 28).
    ax : axes-like object to draw on.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    def _tile_row(samples):
        # Place the 28x28 images side by side.
        return np.hstack([sample.reshape(28, 28) for sample in samples])

    ax.set_axis_off()
    mosaic = np.vstack([_tile_row(X[:5]), _tile_row(X[5:])])
    ax.imshow(mosaic)
    ax.set_title('Queried samples')
    return ax

We will fit a Random Forest Classifier model [10] (an ensemble model consisting of multiple Decision Tree Classifiers) on a few random samples (50 samples) of MNIST dataset and visualize the predictions. We will explain different ways to perform uncertainty sampling using the predictions.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load MNIST and split the training portion into a 50-sample initial train set
# and a pool holding the remaining samples.
np.random.seed(seed)
(train_pool_X, train_pool_y), (test_X, test_y) = tf.keras.datasets.mnist.load_data()
train_X, Pool_X, train_y,  Pool_y = train_test_split(train_pool_X, train_pool_y, train_size=50, random_state=seed)

# Flatten every image into a single feature vector per sample.
train_pool_X = train_pool_X.reshape(train_pool_X.shape[0], -1)
train_X = train_X.reshape(train_X.shape[0], -1)
test_X = test_X.reshape(test_X.shape[0], -1)
Pool_X = Pool_X.reshape(Pool_X.shape[0], -1)

# Fit the classifier on the initial train set (alternative models kept below
# for reference).
model = RandomForestClassifier(random_state=seed)
#model=SVC(kernel='linear', random_state=seed, probability=True)
#model = LogisticRegression(random_state=seed, max_iter=200)
model.fit(train_X, train_y)
proba = model.predict_proba(test_X)
# Test-sample indices ranked in descending order of three per-sample scores:
#   p - entropy of the predicted class probabilities
#   q - highest predicted class probability
#   r - margin between the two highest predicted class probabilities
p = np.argsort([entropy(p) for p in proba])[::-1]
q = np.argsort([max(p) for p in proba])[::-1]
r = np.argsort([sorted(p)[-1]-sorted(p)[-2] for p in proba])[::-1]

# Take the sample at the same rank position from each ranking; these three
# samples are the ones visualised in the figure below.
ii = 5000
i_list = [p[ii], q[ii], r[ii]]
# fig, ax = plt.subplots(3, 3, figsize=(16,7))
# plt.subplots_adjust(wspace=0.,hspace=0.4)
# for i,img_i in enumerate(i_list):
#     ax[0, i].imshow(test_X[img_i].reshape(28, 28))
#     ax[0, i].set_yticks(());ax[0, i].set_xticks(())
#     ax[0, i].set_title('Ground Truth\nSample '+str(i+1))
#     ax[1, i].bar(range(10), proba[img_i])
#     ax[1, i].set_xticks(range(10));ax[1, 0].set_yticks(np.arange(0,1,0.2));ax[1, max(1,i)].set_yticks(())
#     ax[1, max(1,i)].set_ylim(0,1)
#     #ax[1, i].grid(True)
#     ax[1, i].set_xlabel('Pedicted digit');ax[1, 0].set_ylabel('Probability');
#     l = max(proba[img_i])
#     m = sorted(proba[img_i])[-1] - sorted(proba[img_i])[-2]
#     e = entropy(proba[img_i])
#     l_a = np.argmax(proba[img_i])
#     l_b = np.argsort(proba[img_i])[-2]
#     ax[1, i].annotate(np.round(l,2),(l_a-0.4,np.round(l,2)+0.05))
#     ax[1, i].annotate(np.round(sorted(proba[img_i])[-2],2),(l_b-0.4,np.round(sorted(proba[img_i])[-2],2)+0.05))
#     ax[2, i].axhline(entropy(proba[i_list[1]]), color='y')
#     img_im = i_list[0]
#     img_il = i_list[0]
#     ax[2, i].axhline(max(proba[img_il]), color='g')
#     ax[2, i].axhline(sorted(proba[img_im])[-1] - sorted(proba[img_im])[-2], color='r')
#     ax[2, i].set_ylim(0,2.5);ax[2, max(1,i)].set_yticks(())
#     b_l = ax[2, i].bar(['Least confident','Margin sampling','Entropy'], [l, m, e])
#     b_l[0].set_color('g')
#     b_l[1].set_color('r')
#     b_l[2].set_color('y')
#     ax[2, i].annotate(str(np.round(l,3)),(0, 1),size=18);ax[2, i].annotate(str(np.round(m, 3)),(1, 1),size=18)
#     ax[2, i].annotate(str(np.round(e,3)),(2, 1),size=18)
#     format_axes(ax[0, i]);format_axes(ax[1, i]);format_axes(ax[2, i]);
#     #ax[2, i].set_xlim(0,3)

# plt.figtext(0.38,+0.02,'Figure 10: Various methods of uncertainty sampling',fontdict={'size':16});
# #plt.tight_layout();

# Figure 10: a 3x3 grid with one column per sample in `i_list`.
#   row 1 - the digit image
#   row 2 - predicted class probabilities
#   row 3 - the three uncertainty scores computed from those probabilities
fig = make_subplots(3, 3,
                    vertical_spacing=0.1,
                    subplot_titles=['Ground Truth<br>Sample 1',
                                    'Ground Truth<br>Sample 2',
                                    'Ground Truth<br>Sample 3'])

methods = ['Least confident','Margin sampling','Entropy']

# Row 1: digit images on bare axes.
for ind, i in enumerate(i_list,1):
    fig.add_layout_image(
        source=Image.fromarray(test_X[i].reshape(28, 28)),
        xref="x",
        yref="y",
        x=0.6,
        sizex=6,
        y=4,
        sizey=5,
        opacity=1,
        layer="below",
        row=1, col=ind)
    fig.update_xaxes(automargin=True, showgrid=False,
                     zeroline=False, showticklabels=False,
                     row=1,col=ind)
    fig.update_yaxes(automargin=True, showgrid=False,
                     zeroline=False, showticklabels=False,
                     row=1,col=ind)

# Row 2: predicted probability of each digit.
for ind, i in enumerate(i_list,1):
    fig.add_trace(go.Bar(x=list(range(10)), y=proba[i],text=list(map(lambda x: "%.2f"%x, proba[i])),
                  textposition='outside',
                  marker=dict(color=plotly.colors.DEFAULT_PLOTLY_COLORS[ind]),showlegend=False,
                  name='Predicted probabilities<br>Sample '+str(ind),
                  hovertemplate='(%{x:d},%{y:.2f})'),row=2, col=ind)
    fig.update_xaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                     zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1,
                     title=dict(text='Predicted digit',standoff=0),
                     tickvals=list(range(10)),row=2,col=ind)
    fig.update_yaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                     zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1,
                     tickvals=list(np.linspace(0,1,5)),
                     # only the left-most panel carries the y-axis title
                     title='Probability' if ind == 1 else '',
                     range=[0,1],
                     row=2,col=ind)

# Row 3: the three uncertainty scores of each sample.
for ind, i in enumerate(i_list,1):
    top_two = np.sort(proba[i])[-2:]          # [second highest, highest]
    least_conf = top_two[1]
    margin = top_two[1] - top_two[0]
    ent = entropy(proba[i])
    fig.add_trace(go.Bar(x=methods, y=[least_conf, margin, ent],showlegend=False,
                  text=["%.3f"%least_conf, "%.3f"%margin, "%.3f"%ent],
                  textposition='auto',
                  marker=dict(color=plotly.colors.DEFAULT_PLOTLY_COLORS[ind]),
                  hovertemplate='(%{x},%{y:.2f})'),row=3, col=ind)
    fig.update_xaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                     zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1,
                     row=3,col=ind)
    fig.update_yaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                     zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1,
                     range=[0,2],
                     row=3,col=ind)

# Shared x-axis styling for every panel; applied once because it does not
# depend on the sample.
fig.update_xaxes(automargin=True,gridcolor='rgba(128,128,128,0.2)',
                 gridwidth=1,zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1)

fig.layout.plot_bgcolor = '#ffffff'
fig.layout.paper_bgcolor = '#ffffff'
fig.update_layout(title_text='<b>Figure 10:</b> Various methods of uncertainty sampling',
                  title_x=0.5,title_y=0.97,height=600)

# Point out the highest and second-highest probability bars in row 2.
for ind, i in enumerate(i_list,1):
    fig.add_annotation(arrowcolor='rgba(128,128,128,0.4)',arrowwidth=0.1,
                hovertext="Samples having minimum 'Max proba' value are selected<br>for annotation in 'Least confident' method.",
                x=np.argmax(proba[i]),y=max(proba[i]),
                text='<b>'+'Max<br>Proba'+'</b>',font = dict(size = 12, color='grey'),
                ay=-40,
                ax=0,row=2,col=ind
                )
    fig.add_annotation(arrowcolor='rgba(128,128,128,0.4)',arrowwidth=0.1,
                hovertext="This is used to calculate the margin<br>in the 'Margin sampling' method.<br>margin = (Max Proba) - (2nd Max Proba)",
                x=np.argsort(proba[i])[-2],y=np.sort(proba[i])[-2],
                text='<b>'+'2nd<br>Max<br>Proba'+'</b>',font = dict(size = 12, color='grey'),
                ay=-40,
                ax=0,row=2,col=ind
                )

fig.show()

The figure above shows the model's predicted class probabilities for a few random test samples. We can apply the following uncertainty sampling strategies.
1. **Least confident** [16]: In this method, we choose samples for which the most probable class's probability is minimum. In the above example, sample 1 is least confident about its highest probable class digit '1'. So, we will choose sample 1 among all for labeling using this approach.

1. **Margin sampling** [17]: In this method, we choose samples for which the difference between the probability of the most probable class and the second most probable class is minimum. In the above example, sample 1 has the least margin; thus, we will choose sample 1 for labeling using this approach.

1. **Entropy** [15]: Entropy can be calculated for $N$ classes using the following equation, where $P(x_i)$ is the predicted probability for the $i^{th}$ class.
\begin{equation}
H(X) = -\sum\limits_{i=1}^{N}P(x_i)\log_2P(x_i)
\end{equation}
Entropy is likely to be higher if the probability is distributed over all classes. Thus, we can say that if entropy is higher, the model is more confused among all classes.
For the above example, sample 2 has the highest entropy in predictions. So, we can choose the same for labeling.

We will now see the effect of active learning with these strategies on test data (contains 10000 samples). We will continue using the Random Forest Classifier model for this problem. We start with 50 samples as the initial train set and add 100 actively chosen samples over 100 iterations.

In [None]:
from modAL.uncertainty import uncertainty_sampling, margin_sampling, entropy_sampling
from multiprocessing import Pool
from IPython.display import clear_output
np.random.seed(seed)  # reseed NumPy's global RNG (read by random_sampling below)
N = 100  # number of query iterations; also the x-axis length of the comparison figure
C = 1  # second argument handed to learner.query(...) in the (currently disabled) run loops

def random_sampling(clf, X, *args):
    """Random-baseline query strategy: pick one row of ``X`` uniformly at random.

    ``clf`` and any extra positional arguments are accepted but ignored; they
    only keep the call signature compatible with the other query strategies.

    Returns ``(ind, X[ind])`` where ``ind`` is a one-element list holding the
    chosen row index.
    """
    chosen_row = np.random.choice(range(len(X)))
    ind = [chosen_row]
    return ind, X[ind]
    
# def run(x):
#   s_i, strategy = x
#   list_pred_y = {s_i:[]}
#   list_new_X = {s_i:[]}
#   Pool_X_tmp, Pool_y_tmp = Pool_X.copy(), Pool_y.copy()
#   train_X_tmp, train_y_tmp = train_X.copy(), train_y.copy()
#   learner = ActiveLearner(estimator=LogisticRegression(random_state=seed, max_iter=500), 
#                         query_strategy=strategy, 
#                        X_training=train_X_tmp, y_training=train_y_tmp)

#   for i in range(N):
#       print(s_i, i)
#       clear_output(wait=True)
#       pred_y = learner.predict(test_X)
#       list_pred_y[s_i].append(pred_y)
#       new_inds, new_X = learner.query(Pool_X_tmp, C)
#       list_new_X[s_i].append(new_X)
#       learner.teach(Pool_X_tmp[new_inds], Pool_y_tmp[new_inds])
#       Pool_X_tmp, Pool_y_tmp = np.delete(Pool_X_tmp, new_inds, axis=0), np.delete(Pool_y_tmp, new_inds, axis=0)
#   return (list_pred_y, list_new_X)

# ans = [run((si,st)) for si,st in enumerate([uncertainty_sampling, margin_sampling, 
#                                             entropy_sampling, random_sampling])]
# list_pred_y = {i:ans[i][0][i] for i in range(4)}
# list_new_X = {i:ans[i][1][i] for i in range(4)}
# pd.to_pickle(list_pred_y,'../data/list_pred_y_LR')
# pd.to_pickle(list_new_X, '../data/list_new_X_LR')
# clear_output()
# from modAL.models import Committee
# def runQBC():
#     list_pred_y = {4:[]}
#     list_new_X = {4:[]}
#     Pool_X_tmp, Pool_y_tmp = Pool_X.copy(), Pool_y.copy()
#     train_X_tmp, train_y_tmp = train_X.copy(), train_y.copy()
#     learner1 = ActiveLearner(estimator=LogisticRegression(random_state=seed, max_iter=500),
#                        X_training=train_X_tmp, y_training=train_y_tmp)
#     learner2 = ActiveLearner(
#         estimator=RandomForestClassifier(random_state=seed),
#         X_training=train_X_tmp, y_training=train_y_tmp
#     )
#     learner3 = ActiveLearner(
#         estimator=SVC(random_state=seed, probability=True),
#         X_training=train_X_tmp, y_training=train_y_tmp
#     )
#     learner = Committee(learner_list=[learner1, learner2, learner3])

#     for i in range(N):
#       print(i)
#       clear_output(wait=True)
#       pred_y = learner.predict(test_X)
#       list_pred_y[4].append(pred_y)
#       new_inds, new_X = learner.query(Pool_X_tmp, C)
#       list_new_X[4].append(new_X)
#       learner.teach(Pool_X_tmp[new_inds], Pool_y_tmp[new_inds])
#       Pool_X_tmp, Pool_y_tmp = np.delete(Pool_X_tmp, new_inds, axis=0), np.delete(Pool_y_tmp, new_inds, axis=0)
#     return (list_pred_y, list_new_X)

In [None]:
# list_pred_y_QBC, list_new_X_QBC = runQBC()

In [None]:
# list_pred_y = pd.read_pickle('../data/list_pred_y_RF')
# list_new_X = pd.read_pickle('../data/list_new_X_RF')
# list_pred_y.update(list_pred_y_QBC)
# list_new_X.update(list_new_X_QBC)
# pd.to_pickle(list_new_X, '../data/list_new_X_ALL')
# pd.to_pickle(list_pred_y, '../data/list_pred_y_ALL')

In [None]:
# Cached outputs of the uncertainty-sampling runs. Both objects are indexed as
# [strategy_index][iteration] by the animation callback below.
# NOTE: pickle files execute code on load; only read files produced by this project.
list_pred_y = pd.read_pickle('../data/list_pred_y_RF')
list_new_X = pd.read_pickle('../data/list_new_X_RF')

fig = plt.figure(figsize=(13,7))
fig.suptitle(r"$\bf{" + 'Animation 1:' + "}$"+'Comparison among various querying methods of uncertainty sampling and random baseline')

# 3x4 grid of Axes handles. Use the builtin `object` dtype: the `np.object`
# alias was deprecated in NumPy 1.20 and removed in 1.24.
ax = np.empty((3,4), dtype=object)
ij = 1  # 1-based subplot position, row-major
for i in range(3):
    # The first panel of each row owns the y-axis; the rest of the row shares it.
    ax[i,0] = fig.add_subplot(3, 4, ij)
    ij += 1
    for j in range(1,4):
        ax[i,j] = fig.add_subplot(3, 4, ij, sharey = ax[i,0])
        ij += 1

plt.subplots_adjust(wspace=0.3, hspace=0.5)
def iter(i):
    """Draw frame ``i`` of Animation 1, one column per query strategy.

    Row 0: per-digit F1 bars for the predictions made at iteration ``i``.
    Row 1: the sample queried at iteration ``i``, shown as a 28x28 image.
    Row 2: overall F1 curve for iterations 1 .. i+1, i.e. up to and including
           the iteration shown in the other two rows.

    NOTE(review): the name shadows the builtin ``iter``; it is kept because the
    FuncAnimation call in this cell refers to it.
    """
    # Progress indicator while the animation is rendered (once per frame).
    print(i)
    clear_output(wait=True)
    for s_i, s_name in enumerate(['Least confident', 'Margin sampling', 'Entropy', 'Random sampling']):
        strategy_color = plotlyC(s_i, 0)
        bars_ax, image_ax, curve_ax = ax[0, s_i], ax[1, s_i], ax[2, s_i]
        for panel in (bars_ax, image_ax, curve_ax):
            panel.cla()

        # Row 0: individual F1-score per digit.
        bars_ax.set_title(s_name, color=strategy_color)
        bars_ax.set_xlabel('Digit')
        ax[0, 0].set_ylabel('Individual F1-score')
        bars_ax.set_xticks(range(10))
        bars_ax.set_yticks(np.linspace(0,1,5))
        bars_ax.grid(True)
        bars_ax.set_ylim(0,1)
        bars_ax.bar(range(10), individual_acc(list_pred_y[s_i][i]), color=strategy_color)

        # Row 1: the sample that was queried at this iteration.
        image_ax.imshow(list_new_X[s_i][i].reshape(28,28))
        image_ax.set_xticks(())
        image_ax.set_yticks(())
        image_ax.set_xlabel('Queried sample')

        # Row 2: overall F1 history. Slice with i+1 so the curve includes the
        # iteration shown above (previously it lagged by one and was empty on
        # the first frame).
        curve_ax.set_xlabel('Iterations')
        ax[2, 0].set_ylabel('Overall F1-score')
        curve_ax.grid(True)
        curve_ax.plot(range(1,i+2), overall_acc(list_pred_y[s_i][:i+1]), 'o-',
                      color=strategy_color, markersize=3)
        curve_ax.set_ylim(0.5,0.9)
# ax[0, 0].set_title('Animation 1: Comparison among various querying methods of Uncertainty sampling and random baseline')
# No plt.legend() here: nothing carries a label at this point, so the call only
# emitted a "no handles" warning, and every axes is cleared on the first frame.
# Close the pyplot figure handle; `fig` itself stays alive for the animation.
plt.close()
anim = FuncAnimation(fig, iter, frames=range(100))
# Display Animation objects as an interactive JavaScript player.
rc('animation', html='jshtml')
anim

The above animation shows F1-scores for samples of individual digits and overall F1-scores across all digits after each iteration. We can see that each of the strategies, except random sampling, tends to choose more samples of a digit class having a lower F1-score. Margin sampling performs better than the other strategies in terms of F1-score. Margin sampling and Least confident method easily outperform the random baseline. The entropy method, in this case, is comparable to the random baseline. The Figure below shows a comparison of all strategies.

In [None]:
# fig, ax = plt.subplots()
# for s_i, s_name in enumerate(['least_confident', 'margin_sampling', 'entropy', 'random']):
#   ax.plot(range(1,N+1), overall_acc(list_pred_y[s_i]), 'o-',label=s_name, markersize=1)
# ax.set_xlabel('Iterations');ax.set_ylabel('Overall F1-Score on test data')
# ax.legend(bbox_to_anchor=(1, 0.5));
# plt.figtext(-0.1,-0.08,'Figure 11: Comparison of overall F1-score among all strategies and random baseline',fontdict={'size':16})
# format_axes(ax);

# Create traces
layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)
for s_i, s_name in enumerate(['Least confident', 'Margin sampling', 'Entropy', 'Random sampling']):
    # Number the iterations from the data itself instead of the global N, which
    # other cells rebind to different values.
    # Assumes overall_acc returns a sized sequence with one score per iteration - TODO confirm.
    scores = overall_acc(list_pred_y[s_i])
    fig.add_trace(go.Scatter(x=list(range(1,len(scores)+1)), y=scores,
                    mode='lines+markers',
                    name=s_name,
                    line=dict(width=2,color=px.colors.DEFAULT_PLOTLY_COLORS[s_i]),
                    hovertemplate='(%{x:.2f},%{y:.2f})'))

############# Common
# Same light-grey grid on both axes.
grid_style = dict(automargin=True,gridcolor='rgba(128,128,128,0.2)',gridwidth=1,
                  zerolinecolor='rgba(128,128,128,0.2)',zerolinewidth=1)
fig.update_yaxes(**grid_style)
fig.update_xaxes(**grid_style)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(title_text='<b>Figure 11:</b> Comparison of F1-score between uncertainty sampling methods and random sampling',
                  title_x=0.5,
                  xaxis_title='Iterations',
                  yaxis_title='Overall F1-Score on test data',
                  )
fig['layout']['xaxis'].update(side='bottom')
fig.show()

Thus far, we have seen uncertainty for classification tasks. Now, we see an example of regression to understand uncertainty sampling.

### Regression on Noisy Sine Curve 

We will consider the sine curve dataset we used in an earlier discussion. We will fit a Gaussian Process regressor with a Matern kernel on 8 randomly chosen data points from the noisy sine curve dataset. The uncertainty measure for regression tasks is the standard deviation or the predictive variance. In this example, we will take predictive variance as our measure of uncertainty.

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel, WhiteKernel

np.random.seed(seed)

# Noisy sine curve sampled on 500 evenly spaced inputs in [-1, 1].
whole_X = np.linspace(-1,1,500).reshape(-1,1)
sine_noise = np.random.normal(size=whole_X.shape[0]).reshape(-1,1)/10
whole_y = np.sin(whole_X*10) + sine_noise

# 8 random rows form the initial train set; 20 evenly spaced rows form the pool.
train_ind = np.random.choice(np.arange(500), size=8,replace=False)
pool_ind = np.linspace(0,499,20).astype(int)
pool_X, pool_y = whole_X[pool_ind], whole_y[pool_ind]
train_X, train_y = whole_X[train_ind], whole_y[train_ind]
# The complete curve doubles as the test set.
test_X, test_y = whole_X.copy(), whole_y.copy()

# Initial GP fit; the full predictive covariance is requested so its diagonal
# can be drawn as the variance band.
model = GaussianProcessRegressor(kernel=ConstantKernel(1)*(Matern(length_scale=0.1)))
model.fit(train_X, train_y)
pred_y, var_y = model.predict(whole_X, return_cov=True)


# fig, ax = plt.subplots()
# ax.fill_between(whole_X.squeeze(), pred_y.squeeze()-var_y.diagonal(), 
#                  pred_y.squeeze()+var_y.diagonal(), alpha=0.2, label='Predictive variance',color='grey')
# ax.scatter(whole_X, whole_y, label='Test set',s=10,c=my_clr['l_b'])
# ax.scatter(pool_X, pool_y, label='Pool set',s=100,c=my_clr['y'])
# ax.scatter(train_X, train_y, label='Train set',s=100,c=my_clr['l_r'])
# ax.plot(whole_X, pred_y, label='Predictive mean', c='grey')
# ax.legend(bbox_to_anchor=(1,0.5));
# ax.set_xlabel('X');ax.set_ylabel('Y');
# plt.figtext(0.3,-0.08,'Initial fit on the dataset',fontdict={'size':16})
# format_axes(ax);

# Create traces
layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)

# Quantities reused by several traces.
hover_fmt = '(%{x:.2f},%{y:.2f})'
curve_x = whole_X.squeeze()
mean_curve = pred_y.squeeze()
variance_diag = var_y.diagonal()

# Data layers: dense test points, candidate pool points, initial train points.
fig.add_trace(go.Scatter(x=curve_x, y=whole_y.squeeze(),
                         mode='markers', opacity=0.6, name='Test points',
                         marker=dict(size=6, color=px.colors.DEFAULT_PLOTLY_COLORS[0]),
                         hovertemplate=hover_fmt))
fig.add_trace(go.Scatter(x=pool_X.squeeze(), y=pool_y.squeeze(),
                         mode='markers', name="Pool points<br>(potential train points)",
                         marker=dict(size=12, color='rgb(240,0,0)'),
                         hovertemplate=hover_fmt))
fig.add_trace(go.Scatter(x=train_X.squeeze(), y=train_y.squeeze(),
                         mode='markers', name='Train points',
                         marker=dict(size=12, color='black'),
                         hovertemplate=hover_fmt))

# Variance band: lower edge first, then the upper edge filled down to it.
# NOTE(review): the lower edge also carries fill='tonexty', which fills towards
# the previously added trace (the train points) - confirm this is intended.
fig.add_trace(go.Scatter(x=curve_x, y=mean_curve-variance_diag,
                         fill='tonexty', fillcolor='rgba(128,128,128,0.2)',
                         showlegend=False, name='Predictive variance',
                         hovertemplate=hover_fmt,
                         mode='none'))  # no markers or lines, fill only
fig.add_trace(go.Scatter(x=curve_x, y=mean_curve+variance_diag,
                         fill='tonexty', fillcolor='rgba(128,128,128,0.2)',
                         hovertemplate=hover_fmt,
                         mode='none', name='Predictive variance'))
fig.add_trace(go.Scatter(x=curve_x, y=mean_curve,
                         mode='lines', opacity=0.8, name='Predictive mean',
                         line=dict(width=4, color='gray', dash='dashdot'),
                         hovertemplate=hover_fmt))

############# Common
grid_style = dict(automargin=True, gridcolor='rgba(128,128,128,0.2)', gridwidth=1,
                  zerolinecolor='rgba(128,128,128,0.2)', zerolinewidth=1)
fig.update_yaxes(**grid_style)
fig.update_xaxes(**grid_style)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(title_text='<b>Figure 12:</b> GP model fitted on 8 random datapoints of noisy sine curve dataset',
                  title_x=0.5,
                  xaxis_title="X",
                  yaxis_title="Y"
                  )
fig['layout']['xaxis'].update(side='bottom')
fig.show()

As per uncertainty criteria, we should label the samples with higher predictive variance. Now, we will show a comparison of uncertainty sampling with random sampling for ten iterations. We also show the next sample to query at each iteration.

In [None]:
# Per-strategy histories, keyed 0 (uncertainty sampling) and 1 (random sampling);
# one entry is appended per query iteration by the loop below.
list_pred_y_gp = {strategy: [] for strategy in (0, 1)}    # predictive means
list_var_y_gp = {strategy: [] for strategy in (0, 1)}     # predictive covariances
list_new_X_gp = {strategy: [] for strategy in (0, 1)}     # queried inputs
list_new_y_gp = {strategy: [] for strategy in (0, 1)}     # targets of queried inputs
list_new_model = {strategy: [] for strategy in (0, 1)}
np.random.seed(seed)
N = 10  # number of query iterations for the GP example
def GP_regression_std(regressor, X):
    """Uncertainty query strategy: pick the row of ``X`` with the largest predictive std.

    ``regressor.predict(X, return_std=True)`` must return ``(mean, std)``.
    Returns ``([query_idx], X[[query_idx]])``.
    """
    predictive_std = regressor.predict(X, return_std=True)[1]
    query_idx = np.argmax(predictive_std)
    selected = X[[query_idx]]
    return [query_idx], selected
def GP_regression_random(regressor, X):
    """Random-baseline query strategy: pick one row of ``X`` uniformly at random.

    ``regressor`` is ignored. Returns ``([query_idx], X[[query_idx]])``.
    """
    n_candidates = len(X)
    query_idx = np.random.choice(range(n_candidates))
    selected = X[[query_idx]]
    return [query_idx], selected

for s_i, q_func in enumerate([GP_regression_std, GP_regression_random]):
    # Each strategy starts from its own copies of the pool and the initial train set.
    pool_X_tmp = pool_X.copy()
    pool_y_tmp = pool_y.copy()
    train_X_tmp = train_X.copy()
    train_y_tmp = train_y.copy()

    gp_kernel = ConstantKernel(0.1)*(Matern(length_scale=0.1))+WhiteKernel(0.1)
    learner = ActiveLearner(
        estimator=GaussianProcessRegressor(kernel=gp_kernel, random_state=seed),
        query_strategy=q_func,
        X_training=train_X_tmp, y_training=train_y_tmp,
    )

    for i in range(N):
        print(s_i, i)
        clear_output(wait=True)

        # Record the fit before the query so that frame i shows the model that
        # chose sample i.
        pred_y, var_y = learner.predict(whole_X, return_cov=True)
        list_pred_y_gp[s_i].append(pred_y)
        list_var_y_gp[s_i].append(var_y)

        # Query one pool point, record it and teach it to the learner.
        new_ind, new_X = learner.query(pool_X_tmp)
        new_y = pool_y_tmp[new_ind]
        list_new_X_gp[s_i].append(new_X[0])
        list_new_y_gp[s_i].append(new_y[0])
        learner.teach(new_X, new_y)

        # Remove the queried point from the pool.
        pool_X_tmp = np.delete(pool_X_tmp, new_ind, axis=0)
        pool_y_tmp = np.delete(pool_y_tmp, new_ind, axis=0)

clear_output()

In [None]:
# 2x2 canvas: top row shows the GP fits, bottom row the RMSE histories.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(13,7))
animation2_title = r"$\bf{Animation 2:}$" + 'Comparison of uncertainty sampling and random sampling on GP regression'
fig.suptitle(animation2_title)
plt.subplots_adjust(wspace=0.04)
def iter_gp(i):
    """Draw frame ``i`` of Animation 2.

    Column 0 is uncertainty sampling, column 1 random sampling. The top row
    shows the GP fit recorded at iteration ``i`` together with the train points
    known so far and the next sample to query; the bottom row shows the RMSE
    history for iterations 1 .. i+1.
    """
    for panel in (ax[0,0], ax[1,0], ax[0,1], ax[1,1]):
        panel.cla()

    for s_i, s_name in enumerate(['Uncertainty sampling', 'Random sampling']):
        fit_ax, rmse_ax = ax[0, s_i], ax[1, s_i]
        pred_y = list_pred_y_gp[s_i][i]
        var_y = list_var_y_gp[s_i][i]
        mean_curve = pred_y.squeeze()
        variance_diag = var_y.diagonal()

        # GP fit: variance band, pool points, predictive mean, test data.
        fit_ax.fill_between(whole_X.squeeze(), mean_curve-variance_diag,
                            mean_curve+variance_diag, alpha=0.2, label='Predictive variance')
        fit_ax.scatter(pool_X, pool_y,s=10,c='g')
        fit_ax.plot(whole_X, pred_y,label='Predictive mean')
        fit_ax.scatter(whole_X, whole_y,s=10,c=my_clr['l_b'],label='Test data')
        fit_ax.set_ylim(-1.5,3)
        ax[0, 1].set_yticks(())
        fit_ax.set_xlabel('X')
        ax[0, 0].set_ylabel('Y')
        fit_ax.set_title(s_name)

        # RMSE after each iteration up to and including this one.
        rmse_ax.plot(range(1,i+2), overall_rmse(list_pred_y_gp[s_i][:i+1]), 'o-', label=s_name)
        rmse_ax.set_xlabel('Iterations')
        ax[1, 0].set_ylabel('RMSE')

        # Train points known before this query, then the point about to be queried.
        known_X = train_X.tolist()+list_new_X_gp[s_i][:i]
        known_y = train_y.tolist()+list_new_y_gp[s_i][:i]
        fit_ax.scatter(known_X, y=known_y,
                       s=100, c='k',
                       label='Train points')
        fit_ax.scatter(list_new_X_gp[s_i][i], list_new_y_gp[s_i][i],
                       label='Next sample to query', s=100,c='tab:red')

        rmse_ax.set_xticks(np.arange(0,N+1,2))
        rmse_ax.set_ylim(0.1, 0.65)
        ax[1, 1].set_yticks(())
        format_axes(rmse_ax)
        format_axes(fit_ax)
    ax[0, 1].legend(loc='upper left',prop={'size':10})

#plt.tight_layout()
# Display Animation objects as an interactive JavaScript player.
rc('animation', html='jshtml')
# Close the pyplot figure handle; `fig` itself stays alive for the animation.
plt.close()
anim = FuncAnimation(fig, iter_gp, frames=range(N))
anim

Animation 2 demonstrates a comparison between uncertainty sampling and random sampling. We can observe that uncertainty sampling-based samples are more informative to the model and ultimately help reduce model uncertainty (variance) and RMSE compared to random sampling. 

Now, we will discuss the query by committee method.

## Query by Committee (QBC)

The query by committee approach involves creating a committee of two or more learners or models. Each learner votes on the samples in the pool set, and the samples on which the committee members disagree the most are considered for querying. For classification tasks, we can take the mode of the votes from all learners, and in regression settings, we can take the average of the predictions from all learners. The central intuition behind QBC is to minimize the <a style="text-decoration:none" href="http://www2.cs.uregina.ca/~dbd/cs831/notes/ml/vspace/3_vspace.html#:~:text=A%20version%20space%20is%20a,remembering%20any%20of%20the%20examples.">*version space*</a>. Initially, the models hold different hypotheses, which converge as we query more samples.

We can set up a committee for the QBC using the following approaches
1. Same model with different hyperparameters
1. Same model with different segments of the dataset
1. Different models with the same dataset

## Classification on Iris Dataset

We will explain the first approach (Same model with different hyperparameters) using SVC (Support Vector Classifier) model with <a style="text-decoration:none" href="https://en.wikipedia.org/wiki/Radial_basis_function_kernel">RBF</a> kernel on Iris dataset.

We initially train the model on six samples and actively choose 30 samples from the pool set. We will test the model performance at each iteration on the same test set of 30 samples.

In [None]:
from sklearn.datasets import load_iris
# Iris data restricted to its first two feature columns so it can be drawn in 2-D.
IR = load_iris()
X, y = IR['data'][:,:2], IR['target']
t_names, f_names = IR['target_names'], IR['feature_names']


# fig, ax = plt.subplots()
# ax.scatter(X[:,0][y==0],X[:,1][y==0], label=t_names[0], c=my_clr['y'])
# ax.scatter(X[:,0][y==1],X[:,1][y==1], label=t_names[1], c=my_clr['l_b'])
# ax.scatter(X[:,0][y==2],X[:,1][y==2], label=t_names[2], c=my_clr['l_r'])
# ax.legend(bbox_to_anchor=(1,0.5));plt.xlabel(f_names[0]);
# ax.set_ylabel(f_names[1]);
# plt.figtext(0.4,-0.1,'Iris dataset',fontdict={'size':16});
# format_axes(ax);
# SVC regularisation strengths (the `C` argument) of the three committee members.
aa,bb,cc=0.5,1,10

# Create traces
layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)

# One marker trace per class, coloured with the default plotly palette.
for i in range(3):
    in_class = (y == i)
    class_marker = dict(size=6, color=px.colors.DEFAULT_PLOTLY_COLORS[i])
    fig.add_trace(go.Scatter(x=X[:,0][in_class], y=X[:,1][in_class],
                             mode='markers',
                             name='Iris '+t_names[i].capitalize(),
                             marker=class_marker,
                             hovertemplate='(%{x:.2f},%{y:.2f})'))

############# Common
grid_style = dict(automargin=True, gridcolor='rgba(128,128,128,0.2)', gridwidth=1,
                  zerolinecolor='rgba(128,128,128,0.2)', zerolinewidth=1)
fig.update_yaxes(**grid_style)
fig.update_xaxes(**grid_style)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(title_text='<b>Figure 13:</b> Iris dataset',
                  title_x=0.5,
                  xaxis_title=f_names[0].capitalize(),
                  yaxis_title=f_names[1].capitalize(),
                  )
fig['layout']['xaxis'].update(side='bottom')
fig.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap
from modAL import Committee

# Lightened versions of the first three plotly colours, one per class region.
cmap_light = ListedColormap([plotlyC(class_id, 10) for class_id in range(3)])

# Hold out 20% as the test set, then keep 6 samples for training; the rest is the pool.
train_pool_X, test_X, train_pool_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=seed)
train_X, pool_X, train_y, pool_y = train_test_split(
    train_pool_X, train_pool_y, train_size=6, random_state=seed)

### dec
# Dense mesh (step h) covering the data range padded by 1 on every side; the
# classifiers are evaluated on it to draw decision regions.
h = 0.02
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
def get_Z(clf):
    """Predict a class for every node of the ``(xx, yy)`` mesh.

    Returns the predictions reshaped to the mesh shape. Three fixed cells are
    then overwritten with the labels 0, 1 and 2 - presumably so that every
    class value is always present and the colour map stays stable across
    frames (verify against plot_decision).
    """
    mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = clf.predict(mesh_points).reshape(xx.shape)

    last_row, last_col = Z.shape[0]-1, Z.shape[1]-1
    Z[0,0] = 0
    Z[last_row,last_col] = 1
    Z[0,1] = 2
    return Z

def plot_decision(Z, ax, i):
    """Draw decision regions ``Z`` on ``ax`` with the pool and queried points.

    Points queried before iteration ``i`` are drawn small, the point queried at
    iteration ``i`` is drawn large with a black edge; both are coloured by
    their true class. Returns ``ax``.
    """
    def class_color(label):
        # Labels 0 and 1 map to their own colour, anything else to the third.
        class_id = 0 if label == 0 else 1 if label == 1 else 2
        return plotlyC(class_id, 0)

    queried = np.array(list_new_x_iris)

    ax.pcolormesh(xx, yy, Z, cmap=cmap_light)
    ax.scatter(pool_X[:, 0], pool_X[:, 1], c=['white'],edgecolor='k',
               s=20, label='Pool set')
    # Samples that were already queried in earlier iterations.
    ax.scatter(queried[:i, 0], queried[:i, 1],
               c=[class_color(list_new_y_iris[j]) for j in range(i)],
               s=20)
    # The sample being queried at this iteration.
    ax.scatter(queried[i:i+1, 0], queried[i:i+1, 1],
               c=[class_color(list_new_y_iris[i])],
               edgecolor='k', s=100, label='Point to query')
    return ax

In [None]:
# al = lambda x: ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=x, weights='distance'),
#                        X_training=train_X, y_training=train_y)
def al(x):
    """Return an ActiveLearner wrapping an RBF-kernel SVC with ``C=x``, trained on the initial train set."""
    svc = SVC(C=x, probability=True, random_state=seed, kernel='rbf')
    return ActiveLearner(estimator=svc, X_training=train_X, y_training=train_y)

# Three members that differ only in their regularisation strength.
committee = Committee(learner_list=[al(c_value) for c_value in (aa, bb, cc)])

In [None]:
# Histories recorded per iteration; the dicts are keyed by committee member index.
list_pred_y_iris = {0: [], 1: [], 2: []}   # each member's test predictions
list_pred_all_iris = []                    # committee test predictions
list_Z_iris = {0: [], 1: [], 2: []}        # each member's decision regions on the mesh
list_new_x_iris = []                       # queried samples
list_new_y_iris = []                       # labels of queried samples
N = 30  # number of query iterations
committee.fit(train_X, train_y)
pool_X_tmp = pool_X.copy()
pool_y_tmp = pool_y.copy()
for i in range(N):
    print(i)
    clear_output(wait=True)

    # Snapshot the committee and every member before the next query.
    list_pred_all_iris.append(committee.predict(test_X))
    for l_i, learner in enumerate(committee):
        list_pred_y_iris[l_i].append(learner.predict(test_X))
        list_Z_iris[l_i].append(get_Z(learner))

    # Query one sample, record it, teach it, and drop it from the pool.
    new_ind, new_X = committee.query(pool_X_tmp, 1)
    new_y = pool_y_tmp[new_ind]
    list_new_x_iris.append(new_X[0])
    list_new_y_iris.append(new_y[0])
    committee.teach(new_X, new_y)
    pool_X_tmp = np.delete(pool_X_tmp, new_ind, axis=0)
    pool_y_tmp = np.delete(pool_y_tmp, new_ind, axis=0)

clear_output()

In [None]:
# Random-sampling baseline: same committee set-up, but the taught sample is
# drawn uniformly from the pool instead of being queried.
rand_pred_all_iris, rand_new_x_iris, rand_new_y_iris = [], [], []

def al(x):
    """Return an ActiveLearner wrapping an SVC with ``C=x``, trained on the initial train set."""
    svc = SVC(C=x, probability=True, random_state=seed)
    return ActiveLearner(estimator=svc, X_training=train_X, y_training=train_y)

np.random.seed(seed)
committee = Committee(learner_list=[al(aa), al(bb), al(cc)])

committee.fit(train_X, train_y)
pool_X_tmp = pool_X.copy()
pool_y_tmp = pool_y.copy()
for i in range(N):
    print(i)
    clear_output(wait=True)
    rand_pred_all_iris.append(committee.predict(test_X))

    # Draw one pool row at random, record it, teach it, and drop it from the pool.
    new_ind = [np.random.choice(range(len(pool_X_tmp)))]
    new_X = pool_X_tmp[new_ind]
    rand_new_x_iris.append(new_X[0])
    rand_new_y_iris.append(pool_y_tmp[new_ind[0]])
    committee.teach(new_X, pool_y_tmp[new_ind])
    pool_X_tmp = np.delete(pool_X_tmp, new_ind, axis=0)
    pool_y_tmp = np.delete(pool_y_tmp, new_ind, axis=0)

clear_output()

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session;
# consider narrowing the filter to the specific warning being avoided.
warnings.filterwarnings('ignore')

fig = plt.figure(figsize=(13,7))
fig.suptitle(r"$\bf{" + 'Animation 3:' + "}$"+'QBC is choosing points with maximum disagreement across learners')

# 2x3 grid of Axes handles. Use the builtin `object` dtype: the `np.object`
# alias was deprecated in NumPy 1.20 and removed in 1.24.
ax = np.zeros((2,3), dtype=object)
ij = 1  # 1-based subplot position, row-major
for i in range(2):
    for j in range(3):
        ax[i, j] = fig.add_subplot(2,3,ij)
        ij += 1
plt.subplots_adjust(wspace=0.,hspace=0.3)
# C value of each committee member, in column order (used for the panel titles).
C_l = [aa,bb,cc]
def iter_iris(i):
    """Draw frame ``i`` of Animation 3, one column per committee member.

    Top row: the member's decision regions with pool and queried points.
    Bottom row: the member's per-class scores on the test set.
    """
    for ii in range(3):
        region_ax, score_ax = ax[0, ii], ax[1, ii]
        for panel in (region_ax, score_ax):
            panel.cla()
            format_axes(panel)

        plot_decision(list_Z_iris[ii][i], region_ax, i)
        region_ax.set_xlabel(f_names[0])
        ax[0,0].set_ylabel(f_names[1])
        region_ax.set_title('SVC with C: '+str(C_l[ii]))

        # One bar per class, coloured like the class regions.
        b_l = score_ax.bar(t_names, individual_acc(list_pred_y_iris[ii][i]))
        for class_id in range(3):
            b_l[class_id].set_color(plotlyC(class_id, 0))

        score_ax.set_ylim(0,1)
        region_ax.set_ylim(0.5,5.5)

    # Only the left-most column keeps its y ticks, labels and legend.
    ax[0, 0].legend(loc='lower right')
    ax[1, 0].set_yticks(np.linspace(0,1,11))
    for tickless in (ax[1, 1], ax[1, 2], ax[0, 1], ax[0, 2]):
        tickless.set_yticks(())
    ax[1, 0].set_ylabel('F1-score on test set')

#plt.tight_layout()
# Display Animation objects as an interactive JavaScript player.
rc('animation', html='jshtml')
# Close the pyplot figure handle; `fig` itself stays alive for the animation.
plt.close()
anim = FuncAnimation(fig, iter_iris, frames=range(N))
clear_output()
anim

Separation boundaries between different colors are decision boundaries in Animation 3. Points queried by the committee are the points where the learners disagree the most. This can be observed from the above plot. We can see that initially, all models learn different decision boundaries for the same data. Iteratively they converge to a similar hypothesis and thus start learning similar decision boundaries.

We now show the comparison of the overall F1-score between random sampling and our model. QBC, most of the time, outperforms the random sampling method.

In [None]:
# fig, ax = plt.subplots(figsize=(12,4))
# ax.plot(range(1,1+len(rand_pred_all_iris)), overall_acc(rand_pred_all_iris), label='Random baseline', color=my_clr['l_b'])
# ax.plot(range(1,1+len(rand_pred_all_iris)), overall_acc(list_pred_all_iris), label='QBC',color=my_clr['l_r'])
# ax.legend();ax.set_xlabel('Iterations');ax.set_ylabel('Overall F1-score');
# ax.set_ylim(0,1)
# plt.figtext(0.2,-0.1,'Comparison between QBC and random baseline on Iris dataset',fontdict={'size':16});
# format_axes(ax);

# plt.xticks(np.arange(1,1+len(rand_pred_all_iris),2));

# Create traces
layout = Layout(
    paper_bgcolor='rgb(255,255,255)',
    plot_bgcolor='rgb(255,255,255)'
)
fig = go.Figure(layout=layout)

# (legend label, per-iteration committee predictions, plotly colour index)
curves = [
    ('Query by committee', list_pred_all_iris, 1),
    ('Random sampling', rand_pred_all_iris, 0),
]
for curve_name, curve_preds, color_idx in curves:
    fig.add_trace(go.Scatter(x=list(range(1,1+len(curve_preds))), y=overall_acc(curve_preds),
                             mode='lines+markers',
                             name=curve_name,
                             line=dict(width=2, color=px.colors.DEFAULT_PLOTLY_COLORS[color_idx]),
                             hovertemplate='(%{x:.2f},%{y:.2f})'))

############# Common
grid_style = dict(automargin=True, gridcolor='rgba(128,128,128,0.2)', gridwidth=1,
                  zerolinecolor='rgba(128,128,128,0.2)', zerolinewidth=1)
fig.update_yaxes(**grid_style)
# One x tick per iteration.
fig.update_xaxes(tickvals=list(range(1,31)), **grid_style)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(title_text='<b>Figure 14:</b> Comparison between QBC and Random baseline on Iris dataset',
                  title_x=0.5,
                  xaxis_title='Iterations',
                  yaxis_title='Overall F1-score',
                  )
fig['layout']['xaxis'].update(side='bottom')
fig.show()

# Comparison between Uncertainty sampling and QBC

Thus far, we have seen and understood various active learning strategies by examples. Now, let us compare the uncertainty sampling methods and query by committee (QBC).

We will use MNIST dataset to demonstrate the performance of various sampling techniques. For uncertainty sampling, we will use Random Forest classifier. For QBC, let us use three different classifiers (Random Forest Classifier, Logistic Regression [14], and Support Vector Classifier). Animation 4 shows the simulation of active learning for 100 iterations and testing F1-score on the test set.

In [None]:
np.random.seed(seed)
(train_pool_X, train_pool_y), (test_X, test_y) = tf.keras.datasets.mnist.load_data()
# 50 images form the initial train set; everything else is the pool.
train_X, Pool_X, train_y,  Pool_y = train_test_split(train_pool_X, train_pool_y, train_size=50, random_state=seed)

# 5x5 mosaic of random digits. `final` is only consumed by the disabled preview
# below, but the draws are kept so the global RNG state is unchanged.
stack = []
for i in range(5):
    inds = np.random.choice(np.arange(len(train_pool_X)), size=5)
    stack.append(np.hstack(train_pool_X[inds]))
final = np.vstack(stack)

# plt.axis('off')
# plt.imshow(final)
# plt.figtext(0.18,0,'Figure 9: Few samples from the MNIST dataset',fontdict={'size':16});

# Flatten every image to a single feature vector.
train_pool_X = train_pool_X.reshape(train_pool_X.shape[0], -1)
train_X = train_X.reshape(train_X.shape[0], -1)
test_X = test_X.reshape(test_X.shape[0], -1)
Pool_X = Pool_X.reshape(Pool_X.shape[0], -1)

# Cached results for all strategies, indexed as [strategy_index][iteration].
# NOTE: pickle files execute code on load; only read files produced by this project.
list_pred_y = pd.read_pickle('../data/list_pred_y_ALL')
list_new_X = pd.read_pickle('../data/list_new_X_ALL')
N = 100
## Replace the random-sampling slot (index 3) with the QBC results (index 4),
## so the fourth animation column shows QBC.
list_pred_y[3] = list_pred_y[4]
list_new_X[3] = list_new_X[4]

fig = plt.figure(figsize=(13,7))
fig.suptitle(r"$\bf{" + 'Animation 4:' + "}$"+'Comparison among all active learning techniques')
# 3x4 grid of Axes handles. Use the builtin `object` dtype: the `np.object`
# alias was deprecated in NumPy 1.20 and removed in 1.24.
ax = np.empty((3,4), dtype=object)
ij = 1  # 1-based subplot position, row-major
for i in range(3):
    # The first panel of each row owns the y-axis; the rest of the row shares it.
    ax[i,0] = fig.add_subplot(3, 4, ij)
    ij += 1
    for j in range(1,4):
        ax[i,j] = fig.add_subplot(3, 4, ij, sharey = ax[i,0])
        ij += 1

plt.subplots_adjust(wspace=0.3, hspace=0.5)
def iter(i):
    """Redraw every panel of the comparison figure for animation frame ``i``.

    For each of the four strategy columns this clears the three axes and draws:
      * row 0 -- a bar chart of ``individual_acc(list_pred_y[s_i][i])`` over the ten digits,
      * row 1 -- the sample queried at step ``i`` (``list_new_X[s_i][i]``) as a 28x28 image,
      * row 2 -- ``overall_acc`` of the predictions made before step ``i``, against iteration number.

    Relies on the module-level ``ax`` grid, ``list_pred_y`` and ``list_new_X``.

    NOTE: the name shadows the builtin ``iter``; it is kept because the
    ``FuncAnimation`` call that follows this definition refers to it.

    Parameters
    ----------
    i : int
        Zero-based frame index supplied by ``FuncAnimation``.
    """
    # Progress indicator: once per frame is enough (it does not depend on the strategy).
    print(i)
    clear_output(wait=True)

    for s_i, s_name in enumerate(['Least confident', 'Margin sampling', 'Entropy', 'QBC']):
        strategy_color = plotlyC(s_i, 0)

        # Start from blank axes so artists do not pile up frame after frame.
        ax[0, s_i].cla()
        ax[1, s_i].cla()
        ax[2, s_i].cla()

        ax[0, s_i].set_title(s_name, color=strategy_color)
        ax[0, s_i].set_xlabel('Digit')
        ax[0, 0].set_ylabel('Individual F1-score')
        ax[2, s_i].set_xlabel('Iterations')
        ax[2, 0].set_ylabel('Overall F1-score')
        ax[0, s_i].set_xticks(range(10))
        ax[0, s_i].set_yticks(np.linspace(0,1,5))
        #ax[1, s_i].set_ylim(0,28)
        #ax[2, s_i].set_ylim(0.4, 0.6)
        ax[0, s_i].grid(True)
        ax[2, s_i].grid(True)
        ax[0, s_i].set_ylim(0,1)

        # Row 0: per-digit score of the prediction made at step i.
        ax[0, s_i].bar(range(10), individual_acc(list_pred_y[s_i][i]), color=strategy_color)

        # Row 1: the sample that was queried at step i.
        ax[1, s_i].imshow(list_new_X[s_i][i].reshape(28,28))
        ax[1, s_i].set_xticks(())
        ax[1, s_i].set_yticks(())
        ax[1, s_i].set_xlabel('Queried sample')

        # Row 2: overall score history.
        # NOTE(review): the slice [:i] excludes step i itself, whereas the bar chart
        # above shows step i -- confirm whether the one-step lag is intended.
        ax[2, s_i].plot(range(1,i+1), overall_acc(list_pred_y[s_i][:i]), 'o-', color=strategy_color, markersize=3)
        ax[2, s_i].set_ylim(0.5,0.9)
        #ax[2, s_i].legend()
      
# No `plt.legend()` here: nothing has been drawn with a label at this point, so the
# call only produced a "no artists with labels" warning, and every animation frame
# clears the axes anyway.

# Close the figure so the notebook does not also render it as a static image;
# the animation object below keeps its own reference to `fig`.
plt.close(fig)
anim = FuncAnimation(fig, iter, frames=range(N))
# Render animations as interactive JavaScript/HTML when displayed in the notebook.
rc('animation', html='jshtml')
anim

In [None]:
# fig, ax = plt.subplots()
# for s_i, s_name in enumerate(['least_confident', 'margin_sampling', 'entropy', 'random']):
#   ax.plot(range(1,N+1), overall_acc(list_pred_y[s_i]), 'o-',label=s_name, markersize=1)
# ax.set_xlabel('Iterations');ax.set_ylabel('Overall F1-Score on test data')
# ax.legend(bbox_to_anchor=(1, 0.5));
# plt.figtext(-0.1,-0.08,'Figure 11: Comparison of overall F1-score among all strategies and random baseline',fontdict={'size':16})
# format_axes(ax);
# Number of active-learning iterations shown on the x-axis.
N = 100

# White canvas for both the paper and the plotting area.
layout = Layout(paper_bgcolor='rgb(255,255,255)', plot_bgcolor='rgb(255,255,255)')
fig = go.Figure(layout=layout)

# One line per strategy: overall score versus iteration number.
iterations = list(range(1, N+1))
strategy_names = ['Least confident', 'Margin sampling', 'Entropy', 'QBC']
for idx, label in enumerate(strategy_names):
    trace = go.Scatter(
        x=iterations,
        y=overall_acc(list_pred_y[idx]),
        mode='lines+markers',
        name=label,
        line=dict(width=2, color=px.colors.DEFAULT_PLOTLY_COLORS[idx]),
        hovertemplate='(%{x:.2f},%{y:.2f})',
    )
    fig.add_trace(trace)

# Shared light-grey grid styling for both axes.
grid_style = dict(
    automargin=True,
    gridcolor='rgba(128,128,128,0.2)',
    gridwidth=1,
    zerolinecolor='rgba(128,128,128,0.2)',
    zerolinewidth=1,
)
fig.update_yaxes(**grid_style)
fig.update_xaxes(**grid_style)
fig.update(layout_coloraxis_showscale=False)

# Centered title and axis captions.
fig.update_layout(
    title_text='<b>Figure 11:</b> Comparison of overall F1-score among all active learning strategies',
    title_x=0.5,
    xaxis_title='Iterations',
    yaxis_title='Overall F1-Score on test data',
)
fig['layout']['xaxis'].update(side='bottom')
fig.show()

Query by committee performs better than uncertainty sampling. The reason is that uncertainty sampling tends to be biased towards the actual learner, and it may miss important examples that are not in sight of the estimator [1, 5]. QBC overcomes this problem by taking votes either from different models on the same data points or from the same model on different data points. In our case, we used different models on the same data points.

# How many samples to query at once?

So far in this article, we have queried only one sample at a time. However, we should also consider the time needed to retrain the model on the train set and evaluate it on the pool set. Indeed, updating the model after each queried sample is ideal, as the informativeness of the samples is updated without adding noise in the form of non-informative samples. Let us assume that each sample's annotation cost is constant, so that we can ignore it here. We will now see the effect of selecting $ K $ samples at once on model improvement and on the overall time taken for the train-test process.

We will use the query by committee strategy to query 30 samples on the Iris dataset. The mean results of repeating each experiment 50 times with different train, validation, and test splits are shown below.

In [None]:
from time import time
# Load the Iris bunch once at module level and keep its class / feature names at hand.
IR = load_iris()
t_names, f_names = IR['target_names'], IR['feature_names']

def GetF1AndTime(K, rs, n_queries=30):
    """Run one query-by-committee experiment on Iris, querying ``K`` samples per round.

    The first two Iris features are split into a test set (20%, controlled by ``rs``),
    an initial training set of 5 samples and a pool. A committee of three RBF-kernel
    SVC learners is then taught ``n_queries`` pool samples in batches of at most ``K``.

    Parameters
    ----------
    K : int
        Number of samples requested from the committee per round (must be >= 1).
    rs : int
        ``random_state`` for the train-pool / test split.
    n_queries : int, optional
        Total number of samples to query (default 30, the original hard-coded value).

    Returns
    -------
    tuple
        ``(elapsed_seconds, macro_f1)``: time spent fitting, querying, teaching and
        predicting (progress display excluded), and the macro-averaged F1-score of
        the committee on the test set.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 (the query loop would otherwise never terminate).
    """
    if K < 1:
        raise ValueError('K must be a positive integer, got %r' % (K,))

    IR = load_iris()
    X = IR['data'][:,:2]
    y = IR['target']
    train_pool_X, test_X, train_pool_y, test_y = train_test_split(X, y, test_size=0.2, random_state=rs)
    train_X, pool_X, train_y, pool_y = train_test_split(train_pool_X, train_pool_y, train_size=5, random_state=0)

    def make_learner(C):
        # Each committee member is an SVC that differs only in its regularisation C.
        return ActiveLearner(estimator=SVC(C=C, probability=True, random_state=seed, kernel='rbf'),
                             X_training=train_X, y_training=train_y)

    # NOTE(review): aa, bb and cc are module-level values defined outside this cell
    # (presumably the three C values of the committee) -- confirm they are set
    # before this function is called.
    committee = Committee(learner_list=[make_learner(aa), make_learner(bb), make_learner(cc)])

    init = time()
    display_time = 0.0  # time spent on progress output, subtracted from the result
    committee.fit(train_X, train_y)

    remaining = n_queries
    while remaining > 0:
        # Progress display is notebook I/O, not part of the train/query cost. It runs
        # once per round (i.e. more often for small K), so it is timed separately.
        tic = time()
        print('K:',K,'query:',remaining)
        clear_output(wait=True)
        display_time += time() - tic

        # Pass the batch size by keyword so it cannot be mistaken for another
        # positional parameter of ``query``.
        new_ind, new_X = committee.query(pool_X, n_instances=min(remaining, K))
        committee.teach(new_X, pool_y[new_ind])
        # np.delete returns new arrays, so the caller-visible split is never mutated.
        pool_X, pool_y = np.delete(pool_X, new_ind, axis=0), np.delete(pool_y, new_ind, axis=0)
        remaining -= K

    pred_last = committee.predict(test_X)
    elapsed = time() - init - display_time
    return (elapsed, f1_score(test_y, pred_last, average='macro'))

# (train_pool_X, train_pool_y), (test_X, test_y) = tf.keras.datasets.mnist.load_data()
# train_pool_X = train_pool_X.reshape(train_pool_X.shape[0], -1)
# test_X = test_X.reshape(test_X.shape[0], -1)


##################
# Very time consuming
##################
# def GetF1andTimeMNIST(K, rs):
#     train_X, Pool_X, train_y,  Pool_y = train_test_split(train_pool_X, train_pool_y, train_size=50, 
#                                                          random_state=rs)
#     train_X = train_X.reshape(train_X.shape[0], -1)
#     Pool_X = Pool_X.reshape(Pool_X.shape[0], -1)
#     Pool_X_tmp, Pool_y_tmp = Pool_X.copy(), Pool_y.copy()
#     train_X_tmp, train_y_tmp = train_X.copy(), train_y.copy()
    
#     learner1 = ActiveLearner(estimator=LogisticRegression(C=0.5, random_state=seed, max_iter=500),
#                        X_training=train_X_tmp, y_training=train_y_tmp)
#     learner2 = ActiveLearner(estimator=LogisticRegression(C=1, random_state=seed, max_iter=500),
#                        X_training=train_X_tmp, y_training=train_y_tmp)
#     learner3 = ActiveLearner(estimator=LogisticRegression(C=10, random_state=seed, max_iter=500),
#                        X_training=train_X_tmp, y_training=train_y_tmp)
    
#     learner = Committee(learner_list=[learner1, learner2, learner3])

#     i = 100
#     while i>0:
#         print('K:',K,'query:',i)
#         clear_output(wait=True)
#         new_inds, new_X = learner.query(Pool_X_tmp, min(i, K))
#         learner.teach(Pool_X_tmp[new_inds], Pool_y_tmp[new_inds])
#         Pool_X_tmp, Pool_y_tmp = np.delete(Pool_X_tmp, new_inds, axis=0), np.delete(Pool_y_tmp, new_inds, axis=0)
#         i -= K
#     pred_last = committee.predict(test_X)
#     return (time()-init, f1_score(test_y, pred_last, average='macro'))

In [None]:
# For every batch size K in 1..10, repeat the experiment on 50 different splits and
# keep the column-wise mean of the (time, F1) pairs.
XY = []
for K in range(1, 11):
    runs = np.array([GetF1AndTime(K, split_seed) for split_seed in range(50)])
    XY.append(np.mean(runs, axis=0))
# Wipe the progress messages left behind by the runs.
clear_output()

In [None]:
# Rows: K = 1..10; column 0: mean time, column 1: mean macro F1.
XYN = np.array(XY)

# White canvas for both the paper and the plotting area.
layout = Layout(paper_bgcolor='rgb(255,255,255)', plot_bgcolor='rgb(255,255,255)')
fig = make_subplots(specs=[[{"secondary_y": True}]])

k_values = list(range(1, 11))
# (column of XYN, legend label, colour index, drawn on the secondary y-axis?)
series = [
    (1, 'Macro F1-score', 0, True),
    (0, 'Time taken to query 30 samples<br>by querying K samples at once', 1, False),
]
for column, label, color_idx, on_secondary in series:
    trace = go.Scatter(
        x=k_values,
        y=XYN[:, column],
        mode='lines+markers',
        name=label,
        line=dict(width=2, color=px.colors.DEFAULT_PLOTLY_COLORS[color_idx]),
        hovertemplate='(%{x:.2f},%{y:.2f})',
    )
    fig.add_trace(trace, secondary_y=on_secondary)

# Shared light-grey grid styling for all axes.
grid_style = dict(
    automargin=True,
    gridcolor='rgba(128,128,128,0.2)',
    gridwidth=1,
    zerolinecolor='rgba(128,128,128,0.2)',
    zerolinewidth=1,
)
fig.update_yaxes(secondary_y=True, title_text='Macro F1-score<br>(Higher is better)', **grid_style)
fig.update_xaxes(**grid_style)
fig.update_yaxes(title_text="Time (in seconds)", secondary_y=False, **grid_style)
fig.update(layout_coloraxis_showscale=False,layout=layout)

# Centered title and x-axis caption.
fig.update_layout(
    title_text='<b>Figure 15:</b> Trade-off between time and performance while queriying for K samples at once',
    title_x=0.5,
    xaxis_title='K',
)
fig['layout']['xaxis'].update(side='bottom')
fig.show()

We can see that, as K increases, the time taken to complete the task decreases. The macro-averaged F1-score also decreases with K. From this experiment, we can conclude that K should be chosen to strike a good trade-off between time and performance.

# Few More Active Learning Strategies

There are a few more active learning techniques that we do not cover in this article, but we describe them briefly here:
1. Expected model change: Selecting the samples that would have the most significant change in the model.
1. Expected error reduction: Selecting the samples likely to reduce the generalization error of the model.
1. Variance reduction: Selecting samples that may help reduce output variance.

With this, we end our visual tour of active learning techniques.

# References

1. Settles, Burr. Active learning literature survey. University of Wisconsin-Madison Department of Computer Sciences, 2009.
1. Danka, Tivadar, and Peter Horvath. "modAL: A modular active learning framework for Python." arXiv preprint arXiv:1805.00979, 2018.
1. I. Dagan and S. Engelson. Committee-based sampling for training probabilistic classifiers. In Proceedings of the International Conference on Machine Learning (ICML), pages 150–157. Morgan Kaufmann, 1995.
1. T. Mitchell. Generalization as search. Artificial Intelligence, 18:203–226, 1982.
1. H.S. Seung, M. Opper, and H. Sompolinsky. Query by committee. In Proceedings of the ACM Workshop on Computational Learning Theory, pages 287–294, 1992.
1. D. Cohn, L. Atlas, and R. Ladner. Improving generalization with active learning. Machine Learning, 15(2):201–221, 1994.
1. S. Dasgupta, D. Hsu, and C. Monteleoni. A general agnostic active learning algorithm. In Advances in Neural Information Processing Systems (NIPS), volume 20, pages 353–360. MIT Press, 2008.
1. D. Lewis and W. Gale. A sequential algorithm for training text classifiers. In Proceedings of the ACM SIGIR Conference on Research and Development in Information Retrieval, pages 3–12. ACM/Springer, 1994.
1. Imran, Ali, et al. "AI4COVID-19: AI enabled preliminary diagnosis for COVID-19 from cough samples via an app." arXiv preprint arXiv:2004.01275, 2020.
1. Breiman, Leo. "Random forests." Machine learning 45.1 (2001): 5-32.
1. Carl Eduard Rasmussen and Christopher K.I. Williams, “Gaussian Processes for Machine Learning”, MIT Press 2006.
1. D. Angluin. Queries and concept learning. Machine Learning, 2:319–342, 1988.
1. D. Cohn, L. Atlas, R. Ladner, M. El-Sharkawi, R. Marks II, M. Aggoune, and D. Park. Training connectionist networks with queries and selective sampling. In Advances in Neural Information Processing Systems (NIPS). Morgan Kaufmann, 1990.
1. Yu, Hsiang-Fu, Fang-Lan Huang, and Chih-Jen Lin. "Dual coordinate descent methods for logistic regression and maximum entropy models." Machine Learning 85.1-2 (2011): 41-75.
1. C.E. Shannon. A mathematical theory of communication. Bell System Technical Journal, 27:379–423,623–656, 1948.
1. D. Lewis and J. Catlett. Heterogeneous uncertainty sampling for supervised learning. In Proceedings of the International Conference on Machine Learning (ICML), pages 148–156. Morgan Kaufmann, 1994.
1. T. Scheffer, C. Decomain, and S. Wrobel. Active hidden Markov models for information extraction. In Proceedings of the International Conference on Advances in Intelligent Data Analysis (CAIDA), pages 309–318. Springer-Verlag, 2001.