<a href="https://colab.research.google.com/github/paridhika/DDL/blob/main/CSC2516_FinalProject(Code)_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Active Transfer Learning

In this project we experiment different active learning setups.

## Importing the Necessary Packages and Setting GPUs

In [1]:
import os
import torch
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

import json
import time
import random
import datetime

!pip install transformers
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig

import seaborn as sns
from sklearn.metrics import matthews_corrcoef

!pip install modAL

tf.random.set_seed(42)
np.random.seed(42)

Collecting modAL
  Downloading modal-0.63.46-py3-none-any.whl (498 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m498.1/498.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiostream~=0.5.2 (from modAL)
  Downloading aiostream-0.5.2-py3-none-any.whl (39 kB)
Collecting fastapi (from modAL)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpclib==0.4.7 (from modAL)
  Downloading grpclib-0.4.7.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting synchronicity~=0.6.6 (from modAL)
  Downloading synchronicity-0.6.7-py3-none-any.whl (28 kB)
Collecting type

In [2]:
# Get the GPU device name
device_name=tf.test.gpu_device_name()

# The device name should look like the following
if device_name=='/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU
    device=torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device=torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second
    elapsed_rounded=int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## Loading and Preprocessing Data

In [None]:
topic_num=0

all_topics=pickle.load(open("all_topics_with_meta.p","rb"))
data=pd.DataFrame.from_dict(all_topics[0]).T
data['topic']=0
tags=data.tag.values

for i in range(1,9):
  topic=pd.DataFrame.from_dict(all_topics[i]).T
  topic['topic']=i
  data=pd.concat([data,topic],axis=0)


# Converting boolean features into integers in the dataset
map={"rumours":1,"non-rumours":0}
data=data.replace({'tag':map})
map={"photo":1,"none":0}
data=data.replace({'media_type':map})
map={True:1,False:0}
data=data.replace({'verified':map})


# Normalizing features
import copy
df=copy.deepcopy(data)
cols=['favorite_count_log','retweet_count','followers','follow_ratio','length','capital_ratio']
for item in cols:
  column=item
  df[column]=(df[column]-df[column].min())/(df[column].max()-df[column].min())


data=df
train_df=data[data['topic']!=topic_num]
train_sentences=train_df.text.values
train_labels=train_df.tag.values

test_df=data[data['topic']==topic_num]
test_sentences=test_df.text.values
test_labels=test_df.tag.values


data2=data.reset_index()
def pd_iter_func(df,topic):
    for row in df.itertuples():
        # Define your criteria here
        if row.topic==topic:
            return row

start_test=pd_iter_func(data2,topic_num).Index

if topic_num==8:
  end_test=len(data2)
else:
  end_test=pd_iter_func(data2,topic_num+1).Index-1

train_start=start_test
train_end=end_test

## Active Learning Multiple Runs
Since every model might have a different input and output and training specific setting, we have multiple functions for a different model. The Multipart comes from the fact that we need to run the model multiple times to avoid the randomness effect of selecting a specific train test. We do this because our data set is around 2k, and we want concrete results.


### run_multy_epsilon_greedy()

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import uncertainty_sampling

from functools import partial
from collections import Counter

!pip install scikeras
from scikeras.wrappers import KerasClassifier

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def run_multy_epsilon_greedy(BATCH_SIZE, data_test, test_labels, num_of_run, model_name, strategy, train_size, is_random):

  test_pool_split=0.5
  preset_batch=partial(strategy, n_instances=40)
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True)
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(X_pool_0, y_pool_0, test_size=test_pool_split, shuffle=True )
  N_QUERIES=int(len(X_pool_0)/BATCH_SIZE)

  pref_hist_multy_accuracy=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_multy_f1=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_majority=np.zeros((num_of_run, N_QUERIES+1))
  pref_confusion=np.zeros((num_of_run, N_QUERIES+1,4))
  pref_random=0

  for i in range(num_of_run):
    clf = get_model(model_name)

    X_pool, X_train, y_pool, y_train = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True, random_state=random_seed_list[i])
    X_pool, X_test, y_pool, y_test   = train_test_split(X_pool, y_pool, test_size=test_pool_split, shuffle=True, random_state=random_seed_list[i])

    learner=ActiveLearner(
        estimator=clf,
        query_strategy=preset_batch,
        X_training=X_train,
        y_training=y_train)

    t1=time.time()

    # Allow our model to query our unlabeled dataset for the most informative points according to our query strategy (uncertainty sampling).
    counter_random= 1

    # Calculate initial batch
    y_pred=learner.predict(X_test)
    macro=f1_score(y_test,y_pred, average='macro')
    pref_hist_multy_f1[i][0]=macro
    tn, fp, fn, tp=confusion_matrix(y_test, y_pred).ravel()
    pref_confusion[i][0]=[tn, fp, fn, tp]

    # Calculate initial batch majority
    counts=np.bincount(y_train)
    value=np.argmax(counts)
    majority=[value for i in range(len(y_test))]
    macro=f1_score(y_test,majority, average='macro')
    pref_hist_majority[i][0]=macro

    # Calculate random
    y_rand=[random.randint(0,2) for i in range(len(y_test))]
    macro=f1_score(y_test, y_rand, average='macro')
    pref_random=pref_random + macro


    # Active Leraning loop
    for index in range(1, N_QUERIES+1):
      counter_random += 1
      t2=time.time()
      print(i,'th run and query number',index)
      if not is_random:
        query_index, query_instance=learner.query(X_pool)
        print('num if query',len(query_index))

      if is_random:
        index_list=range(len(X_pool))
        query_index=random.sample(index_list, BATCH_SIZE)

      X, y = X_pool[query_index], y_pool[query_index]
      learner.teach(X=X, y=y)

      # Remove the queried instance from the unlabeled pool.
      X_pool, y_pool=np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)

      index_list=range(len(X_pool))
      query_index=random.sample(index_list, 10)

      X, y = X_pool[query_index], y_pool[query_index]
      learner.teach(X=X, y=y)

      X_pool, y_pool=np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)

      # Calculate and report our model's accuracy.
      model_accuracy=learner.score(X_test, y_test)
      y_pred=learner.predict(X_test)
      macro=f1_score( y_test, y_pred, average='macro')
      print('after query {n}: Accuracy :{acc:0.4f} macro f1 :{f1:0.4f}'.format(n=index + 1, acc=model_accuracy,f1=macro))

      # Save our model's performance for plotting.
      tn, fp, fn, tp=confusion_matrix(y_test,y_pred).ravel()
      pref_confusion[i][index]=[tn, fp, fn, tp]
      pref_hist_multy_accuracy[i][index]=model_accuracy
      pref_hist_multy_f1[i][index]=macro

      # Calculate and add majority
      counts=np.bincount(y_train)
      value=np.argmax(counts)
      majority=[value for i in range(len(y_test))]
      macro=f1_score(y_test,majority, average='macro')
      pref_hist_majority[i][index]=macro
      print(format_time(time.time()-t2))

    print(format_time(time.time()-t1))
    pref_random= pref_random/num_of_run
    pref_hist_majority_avg=pref_hist_majority.mean(0)
    pref_hist_multy_acc_avg=pref_hist_multy_accuracy.mean(0)
    pref_hist_multy_f1_avg=pref_hist_multy_f1.mean(0)

  return pref_hist_multy_accuracy, pref_hist_multy_f1, pref_random, pref_hist_majority, N_QUERIES

### run_multy_sklearn()

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import uncertainty_sampling

from keras.wrappers.scikit_learn import KerasClassifier

from functools import partial
from collections import Counter

random_seed_list=[5,12,42,29,54]

In [None]:
def run_multy_sklearn(BATCH_SIZE, data_test, test_labels, num_of_run, model_name, strategy, train_size, is_random):

  test_pool_split=0.5
  preset_batch=partial(strategy, n_instances=BATCH_SIZE)
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True )
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(X_pool_0, y_pool_0, test_size=test_pool_split, shuffle=True )
  N_QUERIES=int(len(X_pool_0)/BATCH_SIZE)

  pref_hist_multy_accuracy=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_multy_f1=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_majority=np.zeros((num_of_run, N_QUERIES+1))
  pref_random=0

  for i in range(num_of_run):
    clf=get_model(model_name)

    X_pool, X_train, y_pool, y_train = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True, random_state=random_seed_list[i])
    X_pool, X_test, y_pool, y_test = train_test_split(X_pool, y_pool, test_size=test_pool_split, shuffle=True, random_state=random_seed_list[i])

    learner = ActiveLearner(
        estimator=clf,
        query_strategy=preset_batch,
        X_training=X_train,
        y_training=y_train)

    t1 = time.time()

    # Allow our model to query our unlabeled dataset for the most informative points according to our query strategy (uncertainty sampling).
    # counter_random= 1

    # Calculate initial batch
    y_pred=learner.predict(X_test)
    macro=f1_score(y_test, y_pred, average='macro')
    pref_hist_multy_f1[i][0]=macro

    # Calculate initial batch majority
    counts=np.bincount(y_train)
    value=np.argmax(counts)
    majority=[value for i in range(len(y_test))]
    macro=f1_score(y_test, majority, average='macro')
    pref_hist_majority[i][0]=macro

    # Calculate random
    y_rand=[random.randint(0,2) for i in range(len(y_test))]
    macro=f1_score( y_test,y_rand, average='macro')
    pref_random=pref_random + macro

    # Active leraning loop
    for index in range(1,N_QUERIES+1):
      # Counter_random += 1
      t2 = time.time()
      print(i,'th run and query number',index)
      query_index, query_instance = learner.query(X_pool)
      print('num if query',len(query_index))

      if is_random : # or counter_random % random_ratio == 0:
        index_list=range(len(X_pool))
        query_index=random.sample(index_list,BATCH_SIZE)

      # Teach our ActiveLearner model the record it has requested.
      X, y = X_pool[query_index], y_pool[query_index]

      for j in range(1):
        learner.teach(X=X, y=y)

      # Remove the queried instance from the unlabeled pool.
      X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)

      # Calculate and report our model's accuracy.
      model_accuracy=learner.score(X_test, y_test)
      y_pred=learner.predict(X_test)
      macro=f1_score(y_test, y_pred, average='macro')
      print('after query {n}: Accuracy :{acc:0.4f} macro f1 :{f1:0.4f}'.format(n=index + 1, acc=model_accuracy, f1=macro))

      # Save our model's performance for plotting.
      pref_hist_multy_accuracy[i][index]=model_accuracy
      pref_hist_multy_f1[i][index]=macro

      # Calculate and add majority
      counts=np.bincount(y_train)
      value=np.argmax(counts)
      majority=[value for i in range(len(y_test))]
      macro=f1_score(y_test,majority, average='macro')
      pref_hist_majority[i][index]=macro

      print(format_time(time.time()-t2))
      print(format_time(time.time()-t1))
      pref_random= pref_random/num_of_run
      pref_hist_majority_avg=pref_hist_majority.mean(0)
      pref_hist_multy_acc_avg=pref_hist_multy_accuracy.mean(0)
      pref_hist_multy_f1_avg=pref_hist_multy_f1.mean(0)

  return pref_hist_multy_acc_avg, pref_hist_multy_f1_avg, pref_random, pref_hist_majority_avg, N_QUERIES


### run_multy()

In [None]:
from sklearn.metrics import f1_score

from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling

from functools import partial

random_seed_list=[5,12,42,29,54]

In [None]:
def run_multy(BATCH_SIZE, data_test, test_labels, num_of_run, model, strategy, train_size, is_random):

  test_pool_split=0.5
  preset_batch=partial(strategy, n_instances=BATCH_SIZE)
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True )
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(X_pool_0, y_pool_0, test_size=test_pool_split, shuffle=True )
  N_QUERIES=int(len(X_pool_0)/BATCH_SIZE)
  print(num_of_run, N_QUERIES)

  pref_hist_multy_accuracy=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_multy_f1=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_majority=np.zeros((num_of_run, N_QUERIES+1))
  pref_random=0

  test_labels=keras.utils.to_categorical(test_labels, 2)

  for i in range(num_of_run):
    X_pool, X_train, y_pool, y_train = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True, random_state=random_seed_list[i])
    X_pool, X_test, y_pool, y_test = train_test_split(X_pool, y_pool, test_size=test_pool_split, shuffle=True, random_state=random_seed_list[i])

    clf = KerasClassifier(model)
    ## clf = get_model(model)

    print('x pool length', X_train.shape)
    print('y pool length', y_train.shape)

    learner = ActiveLearner(
        estimator=clf,
        query_strategy=preset_batch,
        X_training=X_train,
        y_training=y_train)

    t1 = time.time()

    # Allow our model to query our unlabeled dataset for the most informative points according to our query strategy (uncertainty sampling).
    # Calculate initial batch
    y_pred=learner.predict(X_test)
    y_pred=y_pred[:,1]
    y_test_cat = np.argmax(y_test,axis=1)
    macro=f1_score(y_test_cat, y_pred, average='macro')
    pref_hist_multy_f1[i][0]=macro

    # Calculate initial batch majority
    y_train_cat=np.argmax(y_test,axis=1)
    counts=np.bincount(y_train_cat)
    value=np.argmax(counts)
    majority=[value for i in range(len(y_test_cat))]
    macro=f1_score(y_test_cat, majority, average='macro')
    pref_hist_majority[i][0]=macro

    # Calculate random
    y_rand=[random.randint(0,2) for i in range(len(y_test_cat))]
    macro=f1_score(y_test_cat, y_rand, average='macro')
    pref_random=pref_random + macro

    for index in range(1,N_QUERIES+1):
      query_index, query_instance=learner.query(X_pool)
      # print('num if query',len(query_index))

      if is_random:
        index_list=range(len(X_pool))
        query_index=random.sample(index_list,BATCH_SIZE)

      # Teach our ActiveLearner model the record it has requested.
      X, y = X_pool[query_index], y_pool[query_index]

      for j in range(5):
        learner.teach(X=X, y=y)

      # Remove the queried instance from the unlabeled pool.
      X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)
      print('sec',y_pool.shape)

      # Calculate and report our model's accuracy.
      model_accuracy=learner.score(data_test, test_labels)
      y_pred=learner.predict(X_test)
      y_pred=y_pred[:,1]

      y_test_cat=np.argmax(y_test,axis=1)
      # print('shapeè',y_test_cat.shape)
      macro=f1_score(y_test_cat, y_pred, average='macro')
      print('after query {n}: Accuracy :{acc:0.4f} macro f1 :{f1:0.4f}'.format(n=index + 1, acc=model_accuracy, f1=macro))

      # Save our model's performance for plotting.
      pref_hist_multy_accuracy[i][index]=model_accuracy
      pref_hist_multy_f1[i][index]=macro

      # Calculate and add majority
      y_train_cat=np.argmax(y_test, axis=1)
      counts=np.bincount(y_train_cat)
      value=np.argmax(counts)
      majority =[value for i in range(len(y_test_cat))]
      macro=f1_score(y_test_cat, majority, average='macro')
      pref_hist_majority[i][index]=macro

    print(format_time(time.time()-t1))
    pref_random= pref_random/num_of_run
    pref_hist_majority_avg=pref_hist_majority.mean(0)
    pref_hist_multy_acc_avg=pref_hist_multy_accuracy.mean(0)
    pref_hist_multy_f1_avg=pref_hist_multy_f1.mean(0)

  return pref_hist_multy_acc_avg, pref_hist_multy_f1_avg, pref_random, pref_hist_majority_avg, N_QUERIES


### run_multy_committe()

In [None]:
from sklearn.metrics import f1_score

from modAL.models import ActiveLearner
from modAL.models import ActiveLearner, Committee
from modAL.batch import uncertainty_batch_sampling

from functools import partial

random_seed_list=[5,12,42,29,54]

In [None]:
commite_size = 5
def run_multy_commite(BATCH_SIZE, data_test, test_labels, num_of_run,model_name, strategy, train_size, is_random, mode):

  test_pool_split=0.5
  preset_batch=partial(strategy, n_instances=BATCH_SIZE)
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(data_test, test_labels, test_size=train_size,shuffle=True )
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(X_pool_0, y_pool_0, test_size=test_pool_split,shuffle=True )
  N_QUERIES=int(len(X_pool_0)/BATCH_SIZE)

  pref_hist_multy_accuracy=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_multy_f1=np.zeros((num_of_run, N_QUERIES+1))
  pref_hist_majority=np.zeros((num_of_run, N_QUERIES+1))

  pref_random=0
  boost=False
  if mode=='boost'or mode=='bag':
    boost=True

  for i in range(num_of_run):
    clf=get_model(model_name)

    X_pool, X_train, y_pool, y_train = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True, random_state=random_seed_list[i])
    X_pool, X_test, y_pool, y_test = train_test_split(X_pool, y_pool, test_size=test_pool_split, shuffle=True, random_state=random_seed_list[i])

    learner_list=[]
    for j in range(commite_size):
      learner = ActiveLearner(
          estimator=clf,
          query_strategy=preset_batch,
          X_training=X_train,
          y_training=y_train,
          bootstrap_init=boost,)

      learner_list.append(learner)

    committee=Committee(learner_list=learner_list)
    if mode=='bag':
      committee.rebag()

    t1=time.time()

    # Calculate initial batch
    y_pred=committee.predict(X_test)
    macro=f1_score(y_test,y_pred, average='macro')
    pref_hist_multy_f1[i][0]=macro

    # Calculate initial batch majority
    counts=np.bincount(y_train)
    value=np.argmax(counts)
    majority=[value for i in range(len(y_test))]
    macro=f1_score(y_test,majority, average='macro')
    pref_hist_majority[i][0]=macro

    # Calculate random
    y_rand=[random.randint(0,2) for i in range(len(y_test))]
    macro=f1_score( y_test,y_rand, average='macro')
    pref_random=pref_random + macro

    # Active leraning loop
    for index in range(1,N_QUERIES+1):
      t2=time.time()

      query_index, query_instance=committee.query(X_pool)

      if is_random : # or counter_random % random_ratio == 0:
        index_list=range(len(X_pool))
        query_index=random.sample(index_list,BATCH_SIZE)

      X, y = X_pool[query_index], y_pool[query_index]

      for j in range(1):
        committee.teach(X=X, y=y)

      # Remove the queried instance from the unlabeled pool.
      X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)

      # Calculate and report our model's accuracy.
      model_accuracy=committee.score(X_test, y_test)
      y_pred=committee.predict(X_test)
      macro=f1_score( y_test,y_pred, average='macro')
      print('after query {n}: Accuracy :{acc:0.4f} macro f1 :{f1:0.4f}'.format(n=index + 1, acc=model_accuracy,f1=macro))

      pref_hist_multy_accuracy[i][index]=model_accuracy
      pref_hist_multy_f1[i][index]=macro

      # Calculate and add majority
      counts=np.bincount(y_train)
      value=np.argmax(counts)
      majority=[value for i in range(len(y_test))]
      macro=f1_score(y_test,majority, average='macro')
      pref_hist_majority[i][index]=macro
      print(format_time(time.time()-t2))

    print(format_time(time.time()-t1))
    pref_random= pref_random/num_of_run
    pref_hist_majority_avg=pref_hist_majority.mean(0)
    pref_hist_multy_acc_avg=pref_hist_multy_accuracy.mean(0)
    pref_hist_multy_f1_avg=pref_hist_multy_f1.mean(0)

  return pref_hist_multy_accuracy, pref_hist_multy_f1, pref_random, pref_hist_majority, N_QUERIES



### run_multy_cross()

In [None]:
def run_multy_cross(BATCH_SIZE, data_test, test_labels, num_of_run, model_name, strategy, train_size, is_random):

  test_pool_split=0.5
  preset_batch=partial(strategy, n_instances=40)
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True )
  X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(X_pool_0, y_pool_0, test_size=test_pool_split, shuffle=True )
  N_QUERIES=int(len(X_pool_0)/BATCH_SIZE)

  pref_hist_multy_accuracy=np.zeros((num_of_run,N_QUERIES+1))
  pref_hist_multy_f1=np.zeros((num_of_run,N_QUERIES+1))
  pref_hist_majority=np.zeros((num_of_run,N_QUERIES+1))
  pref_random = 0

  for i in range(num_of_run):
    clf=get_model(model_name)

    X_pool, X_train, y_pool, y_train = train_test_split(data_test, test_labels, test_size=train_size, shuffle=True, random_state=random_seed_list[i])
    X_pool, X_test, y_pool, y_test = train_test_split(X_pool, y_pool, test_size=test_pool_split, shuffle=True, random_state=random_seed_list[i])

    learner = ActiveLearner(
        estimator=clf,
        query_strategy=preset_batch,
        X_training=X_train,
        y_training=y_train)

    t1 = time.time()


    #Allow our model to query our unlabeled dataset for the most informative points according to our query strategy (uncertainty sampling)
    counter_random= 1

    # Calculate initial batch
    y_pred=learner.predict(X_test)
    macro=f1_score(y_test,y_pred, average='macro')
    pref_hist_multy_f1[i][0]=macro

    # Calculate initial batch majority
    counts=np.bincount(y_train)
    value=np.argmax(counts)
    majority=[value for i in range(len(y_test))]
    macro=f1_score(y_test,majority, average='macro')
    pref_hist_majority[i][0]=macro

    # Calculate random
    y_rand=[random.randint(0,2) for i in range(len(y_test))]
    macro=f1_score(y_test, y_rand, average='macro')
    pref_random = pref_random + macro

    # Active Leraning loop
    for index in range(1, N_QUERIES+1):
      counter_random += 1
      t2=time.time()
      print(i,'th run and query number',index)

      query_index, query_instance = learner.query(X_pool)
      print('num if query',len(query_index))

      if is_random:
        index_list=range(len(X_pool))
        query_index=random.sample(index_list, BATCH_SIZE)

      X, y = X_pool[query_index], y_pool[query_index]
      for j in range(1):
        learner.teach(X=X, y=y)

      # Remove the queried instance from the unlabeled pool.
      X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)
      index_list=range(len(X_pool))
      query_index=random.sample(index_list,10)

      X, y = X_pool[query_index], y_pool[query_index]
      X_train=np.concatenate((X_train, X))
      y_train=np.concatenate((y_train, y))

      for j in range(1):
        learner.teach(X=X, y=y)

      X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)

      # Calculate and report our model's accuracy.
      model_accuracy=learner.score(X_test, y_test)
      y_pred=learner.predict(X_test)
      macro=f1_score(y_test, y_pred, average='macro')
      print('after query {n}: Accuracy :{acc:0.4f} macro f1 :{f1:0.4f}'.format(n=index + 1, acc=model_accuracy,f1=macro))

      # Save our model's performance for plotting.
      pref_hist_multy_accuracy[i][index]=model_accuracy
      pref_hist_multy_f1[i][index]=macro

      # Calculate and add majority
      counts=np.bincount(y_train)
      value=np.argmax(counts)
      majority=[value for i in range(len(y_test))]
      macro=f1_score(y_test,majority, average='macro')
      pref_hist_majority[i][index]=macro
      print(format_time(time.time()-t2))

  print(format_time(time.time()-t1))
  pref_random=pref_random/num_of_run
  pref_hist_majority_avg=pref_hist_majority.mean(0)
  pref_hist_multy_acc_avg=pref_hist_multy_accuracy.mean(0)
  pref_hist_multy_f1_avg=pref_hist_multy_f1.mean(0)

  return pref_hist_multy_accuracy, pref_hist_multy_f1, pref_random, pref_hist_majority, N_QUERIES

### MLP Model
#### create_keras_model_berts() \& create_keras_model_GLOVE()

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, GlobalMaxPooling1D, Activation
from keras.layers import Embedding, LSTM

from scikeras.wrappers import KerasClassifier

In [None]:
def create_keras_model_berts():
  model=Sequential()
  model.add(keras.Input(shape=(64,),name="source"))
  model.add(Dense(128, activation='relu'))
  model.add(Dropout(0.3))
  # model.add(Dense(64, activation='relu'))
  # model.add(Dropout(0.3))
  model.add(Dense(2, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def create_keras_model_GLOVE():
  model=Sequential()
  model.add(keras.Input(shape=(50,),name="source"))
  model.add(Dense(250, activation='relu'))
  model.add(Dropout(0.4))
  # model.add(Dense(128, activation='relu'))
  # model.add(Dropout(0.5))
  # model.add(Dense(64, activation='relu'))
  # model.add(Dropout(0.5))
  model.add(Dense(2, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

### get_model()\_Other ML Models

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBClassifier

In [None]:
def get_model(model_name):

  if model_name=="mlp":
    clf = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000)

  if model_name=="bag":
    clf = BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=100, max_samples=0.8)

  if model_name=="gbc":
    clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1, max_depth=1)

  if model_name=="ada":
    clf = AdaBoostClassifier(n_estimators=200,learning_rate=0.01)

  if model_name=="svm":
    clf=svm.SVC(probability=True)

  if model_name=="rf":
    clf=RandomForestClassifier(max_depth=1000, random_state=0)

  if model_name=="lr":
    clf=LogisticRegression(random_state=0,class_weight='balanced')

  if model_name=="knn3":
    clf=KNeighborsClassifier(n_neighbors=3)

  if model_name=="knn5":
    clf=KNeighborsClassifier(n_neighbors=5)

  if model_name=="lda":
    clf=LinearDiscriminantAnalysis()

  if model_name=="qda":
    clf=QuadraticDiscriminantAnalysis()

  return clf

### Dimension Reduction with PCA for All Features (befor Metadata)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca=PCA(0.75)
sentence_feature_normal  =pca.fit_transform(np.load('normal_bert.npy'))
sentence_feature_tweet   =pca.fit_transform(np.load('tweet_bert.npy'))
sentence_feature_GLOVE   =pca.fit_transform(np.load('GLOVE.np'))
sentence_feature_GLOVE25 =pca.fit_transform(np.load('GLOVE25.np'))
sentence_feature_GLOVE50 =pca.fit_transform(np.load('GLOVE50.np'))
sentence_feature_GLOVE100=pca.fit_transform(np.load('GLOVE100.np'))


### Sentence Features Concatenated with Metadata

In [None]:
s0=data[cols]
s1=pd.DataFrame(sentence_feature_normal)
s2=pd.DataFrame(sentence_feature_GLOVE)
s3=pd.DataFrame(sentence_feature_GLOVE25)
s4=pd.DataFrame(sentence_feature_GLOVE50)
s5=pd.DataFrame(sentence_feature_GLOVE100)
s6=pd.DataFrame(sentence_feature_tweet)
s1.reset_index(drop=True,inplace=True)
s0.reset_index(drop=True,inplace=True)
s3.reset_index(drop=True,inplace=True)
s2.reset_index(drop=True,inplace=True)
s1.reset_index(drop=True,inplace=True)
s5.reset_index(drop=True,inplace=True)
s6.reset_index(drop=True,inplace=True)

sentence_feature_normal_new  =pd.concat([s1,s0],axis=1)
sentence_feature_tweet_new   =pd.concat([s6,s0],axis=1)
sentence_feature_GLOVE_new   =pd.concat([s2,s0],axis=1)
sentence_feature_GLOVE25_new =pd.concat([s3,s0],axis=1)
sentence_feature_GLOVE50_new =pd.concat([s4,s0],axis=1)
sentence_feature_GLOVE100_new=pd.concat([s5,s0],axis=1)

sentence_feature_normal_new  =sentence_feature_normal_new.to_numpy()
sentence_feature_tweet_new   =sentence_feature_tweet_new.to_numpy()
sentence_feature_GLOVE_new   =sentence_feature_GLOVE_new.to_numpy()
sentence_feature_GLOVE25_new =sentence_feature_GLOVE25_new.to_numpy()
sentence_feature_GLOVE50_new =sentence_feature_GLOVE50_new.to_numpy()
sentence_feature_GLOVE100_new=sentence_feature_GLOVE100_new.to_numpy()

sentence_feature_GLOVE.shape

(6425, 44)

In [None]:
reps_dict={
    "BERT_pca":sentence_feature_normal,
    "tweetBERT_pca":sentence_feature_tweet,
    "GLOVE_pca":sentence_feature_GLOVE,
    "GLOVE50_pca":sentence_feature_GLOVE25,
    "GLOVE25_pca":sentence_feature_GLOVE50,
    "GLOVE100_pca":sentence_feature_GLOVE100,
    "BERT_pca_concat":sentence_feature_normal_new,
    "tweetBERT_pca_concat":sentence_feature_tweet_new,
    "GLOVE_pca_concat":sentence_feature_GLOVE_new,
    "GLOVE50_pca_concat":sentence_feature_GLOVE25_new,
    "GLOVE25_pca_concat":sentence_feature_GLOVE50_new,
    "GLOVE100_pca_concat":sentence_feature_GLOVE100_new,
}

### Run Settings
### AL_loop()

In [None]:
number_of_run = 1

def AL_loop(rep, model, AL_stg):

  sentence_feature=reps_dict[rep]
  data_test=sentence_feature[train_start:train_end + 1]
  strat = uncertainty_sampling

  if AL_stg=="lc": stg=False


  if AL_stg=="Random": stg=True


  if AL_stg=="batch_lc":
     stg=False
     strat=uncertainty_batch_sampling


  if AL_stg=="qbc" or AL_stg=='boost' or AL_stg=='bag':
     stg=False
     print(AL_stg,'   ',rep,'    ',model)
     acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES = run_multy_commite(50, data_test, test_labels, number_of_run, model, strat, 20, stg, AL_stg)


  if AL_stg=="epsilon_in_batch" or AL_stg=="cross":
     stg=False


  if model=="mlp":
    if rep!= "GLOVE_pca_concat":
      acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES = run_multy(50, data_test, test_labels, number_of_run, create_keras_model_berts, strat, 20, stg)
    else:
      acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES = run_multy(50, data_test, test_labels, number_of_run, create_keras_model_GLOVE, strat, 20, stg)


  elif AL_stg=="epsilon_in_batch":
    acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES = run_multy_epsilon_greedy(50, data_test, test_labels, number_of_run,model, strat, 20, False)

  elif AL_stg=="cross":
    acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES = run_multy_cross(50, data_test, test_labels, number_of_run, model, strat, 20, False)

  elif AL_stg!="qbc":
    acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES = run_multy_sklearn(50, data_test, test_labels, number_of_run, model, strat, 20, stg)



  return acc_avg, f1_avg, pref_random, pref_hist_majority, N_QUERIES


In [None]:
representation = "GLOVE_pca_concat"
model          = "mlp"
AL_Strategy    = "lc"

acc, f1, pref_random, pref_hist_majority, N_QUERIES = AL_loop(representation, model, AL_Strategy)


1 20
x pool length (20, 50)
y pool length (20, 2)
sec (979, 2)
after query 2: Accuracy :0.8196 macro f1 :0.6567
sec (929, 2)
after query 3: Accuracy :0.8292 macro f1 :0.7067
sec (879, 2)
after query 4: Accuracy :0.8499 macro f1 :0.7551
sec (829, 2)
after query 5: Accuracy :0.8413 macro f1 :0.7437
sec (779, 2)
after query 6: Accuracy :0.8408 macro f1 :0.7229
sec (729, 2)
after query 7: Accuracy :0.8567 macro f1 :0.7596
sec (679, 2)
after query 8: Accuracy :0.8538 macro f1 :0.7391
sec (629, 2)
after query 9: Accuracy :0.8595 macro f1 :0.7577
sec (579, 2)
after query 10: Accuracy :0.8581 macro f1 :0.7535
sec (529, 2)
after query 11: Accuracy :0.8543 macro f1 :0.7372
sec (479, 2)
after query 12: Accuracy :0.8547 macro f1 :0.7433
sec (429, 2)
after query 13: Accuracy :0.8552 macro f1 :0.7523
sec (379, 2)
after query 14: Accuracy :0.8567 macro f1 :0.7479
sec (329, 2)
after query 15: Accuracy :0.8504 macro f1 :0.7352
sec (279, 2)
after query 16: Accuracy :0.8509 macro f1 :0.7364
sec (229, 2)


In [None]:
# acc_25perc_avg=acc[:,:int(0.25*N_QUERIES)].mean(0)
# f1_25perc_avg=f1[:,:int(0.25*N_QUERIES)].mean(0)

# print(acc_25perc_avg[-1])
# print(f1_25perc_avg[-1])


acc_25perc_avg=acc[:int(0.25*N_QUERIES)].mean(0)
f1_25perc_avg=f1[:int(0.25*N_QUERIES)].mean(0)

print(acc_25perc_avg)
print(f1_25perc_avg)


0.6680134680134681
0.6696482184128798


## For Fine_Tuning


In [None]:
# topic_num=0
# test_df=data[data['topic']==topic_num]
# train_df=data[data['topic']!=topic_num]
# train_df.head()

In [None]:
# data2=data.reset_index()
# start_test=data2[data2['topic']==topic_num].iloc[0]
# def pd_iter_func(df,topic):
    # for row in df.itertuples():
        ## Define your criteria here
        # if row.topic==topic:
            # return row

# start_test=pd_iter_func(data2, 1).Index
# end_test=pd_iter_func(data2, 2).Index-1
# train_start=pd_iter_func(data2, topic_num).Index
# train_end=len(data2)

# print(start_test,',', end_test)


In [None]:
# train_sentences=train_df.text.values
# test_sentences=test_df.text.values
# train_labels=train_df.tag.values
# test_labels=test_df.tag.values
# len(train_labels)

# train_sentences.shape


# test_labels=test_df.tag.values
# train_start=0
# train_end=2078

### Word Representation
#### BERT Representation

In [None]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification

bert_tweet = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
tokenizer_tweet = AutoTokenizer.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
def tokenizer_func(tokenizer_kind, sentences, labels):
  '''
  inputs:
    tokenizer_kind: is the the tokenizer of choice (normal bert, tweet bert)
    sentences: train , dev, test
  outputs:
  torchs of
    ids
    attention_mask
    labels
  '''
  input_ids=[]
  attention_masks=[]

  # For every sentence...
  for sent in sentences:
      # `encode_plus` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length`
      #   (6) Create attention masks for [PAD] tokens.
      encoded_dict=tokenizer_kind.encode_plus(
                          sent,                       # Sentence to encode.
                          add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
                          max_length=128,             # Pad & truncate all sentences.
                          pad_to_max_length=True,
                          return_attention_mask=True, # Construct attn. masks.
                          return_tensors='pt',        # Return pytorch tensors.
                          truncation=True,
                    )

      # Add the encoded sentence to the list.
      input_ids.append(encoded_dict['input_ids'])

      # And its attention mask (simply differentiates padding from non-padding).
      attention_masks.append(encoded_dict['attention_mask'])

  # Convert the lists into tensors.
  input_ids=torch.cat(input_ids, dim=0)
  attention_masks=torch.cat(attention_masks, dim=0)
  labels=torch.tensor(labels)

  return input_ids, attention_masks, labels

In [None]:
sentences=data.text.values
train_labels=data.tag.values
input_ids, attention_masks, labels=tokenizer_func(tokenizer_tweet, sentences, train_labels)




### train_valid_split()

In [None]:
def train_valid_split(input_ids, attention_masks, labels, batch_size=128):

    # Use 70% for training and 30% for validation.
    train_inputs, validation_inputs,  train_masks, validation_masks, train_labels, validation_labels = train_test_split(
        input_ids, attention_masks, labels, random_state=32, test_size=0.3, stratify=labels)

    print('example train_input:    ', train_inputs[0])
    print('example attention_mask: ', train_masks[0])

    train_labels=torch.tensor(train_labels)
    validation_labels=torch.tensor(validation_labels)

    # Create the DataLoader for our training set.
    train_data=TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader=DataLoader(train_data, shuffle=True, batch_size=batch_size)

    # Create the DataLoader for our validation set.
    validation_data=TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_dataloader=DataLoader(validation_data, shuffle=False, batch_size=batch_size)

    return train_dataloader, validation_dataloader

In [None]:
bert_train_dataloader, bert_validation_dataloader = train_valid_split(
    input_ids=input_ids,
    attention_masks=attention_masks,
    labels=labels,
    batch_size=32)

example train_input:     tensor([  101, 16098,  2021,  1138,  1208,  3626,  1103, 14551,  1321,  1197,
         1120,   108,  3122,  1708,  1663,  2176,   119,  4222, 21832,  1138,
         5742,   117,  1405,  1234,  1132,  2475,  1253,  1217,  1316,   119,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,    

  train_labels=torch.tensor(train_labels)
  validation_labels=torch.tensor(validation_labels)


### Load Data

In [None]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [None]:
sentence_feature_normal=np.load('normal_bert.npy')
X_pretrain=sentence_feature_normal[train_start:train_end + 1]
X_pretrain.shape

y_pretrain=test_labels
y_pretrain.shape

(2079,)

In [None]:
X_pretrain.dtype

dtype('float32')

In [None]:
test_labels.dtype

dtype('int64')

### Define and Pre-train the Model

In [None]:
import tensorflow as tf
import numpy as np
from keras.wrappers.scikit_learn import KerasClassifier
from modAL.models import ActiveLearner
import keras
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential

In [None]:
# X_pretrain=np.random.randn(1000,768)
# y_pretrain=np.random.randint(2,size=(1000,))
y_pretrain_categorical=keras.utils.to_categorical(y_pretrain , 2)

print("Define and Pre-train a model")
def create_keras_model():
    model = Sequential()
    model.add(Dense(768, activation='relu'))
    model.add(Dense(300, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_pretrain , y_pretrain_categorical, verbose=1, epochs=15)
    for layer in model.layers:
        layer.trainable = True
    for layer in model.layers[:-2]:
        layer.trainable = False
    return model

classifier = KerasClassifier(create_keras_model)

Define and Pre-train a model


  classifier = KerasClassifier(create_keras_model)


### Test ActiveLearner on the Model and a different Dataset

In [None]:
# X_2 = np.random.randn(500,768)
# y_2 = np.random.randint(2,size=(500,))
# y_2_categorical = keras.utils.to_categorical(y_2, 2)
# print("ActiveLearner")
# learner = ActiveLearner(
#     estimator=classifier,
#     X_training=X_2, y_training=y_2_categorical,
#     verbose=1)

In [None]:
sentence_feature_tweet=np.load('tweet_bert.npy')
data_test=sentence_feature_tweet[train_start:train_end + 1]

data_test.shape

(2079, 768)

In [None]:
test_labels.shape

(2079,)

In [None]:
BATCH_SIZE, num_of_run, strategy, is_random = 50, 1, uncertainty_sampling, False

test_pool_split=0.5
preset_batch=partial(strategy, n_instances=BATCH_SIZE)
X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(data_test, test_labels, test_size=20, shuffle=True)
X_pool_0, X_test_0, y_pool_0, y_test_0 = train_test_split(X_pool_0, y_pool_0, test_size=test_pool_split, shuffle=True)

N_QUERIES=int(len(X_pool_0)/BATCH_SIZE)

pref_hist_multy_accuracy=np.zeros((num_of_run, N_QUERIES+1))
pref_hist_multy_f1=np.zeros((num_of_run, N_QUERIES+1))
pref_random=0

# test_labels=keras.utils.to_categorical(test_labels, 2)

for i in range(num_of_run):
  X_pool, X_train, y_pool, y_train = train_test_split(data_test, test_labels, test_size=20, shuffle=True, random_state=random_seed_list[i])
  X_pool, X_test, y_pool, y_test = train_test_split(X_pool, y_pool, test_size=test_pool_split, shuffle=True, random_state=random_seed_list[i])

  # y_train_categorical = keras.utils.to_categorical(y_train, 2)
  # y_test_cat = keras.utils.to_categorical(y_test, 2)
  # y_pool_cat = keras.utils.to_categorical(y_pool, 2)

  clf = classifier

  # clf = KerasClassifier(model)

  print('x pool length', X_train.shape)
  print('y pool length', y_train.shape)

  learner = ActiveLearner(
      estimator=clf,
      query_strategy=preset_batch,
      X_training=X_train,
      y_training=y_train)


  t1 = time.time()

  # Calculate initial batch
  y_pred=learner.predict(X_test)
  # y_test_cat=np.argmax(y_test,axis=1)
  macro=f1_score(y_test, y_pred, average='macro')
  pref_hist_multy_f1[i][0]=macro


  for index in range(1,N_QUERIES+1):
    query_index, query_instance=learner.query(X_pool)

    # indices=[i for i, val in enumerate(y_pool) if val >= len(y_pool)]
    # query_index, query_instance = np.delete(query_index, indices, axis=0), np.delete(query_instance, indices,axis=0)
    # print('num if query',len(query_index))

    if is_random:
      index_list=range(len(X_pool))
      query_index=random.sample(index_list,BATCH_SIZE)

    # Teach our ActiveLearner model the record it has requested.
    X, y = X_pool[query_index], y_pool[query_index]

    for j in range(1):
      learner.teach(X=X, y=y)

      # Remove the queried instance from the unlabeled pool.
      X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index,axis=0)
      print('sec',y_pool.shape)

      # Calculate and report our model's accuracy.
      model_accuracy=learner.score(data_test, test_labels)
      y_pred=learner.predict(X_test)
      # y_test_cat=np.argmax(y_test, axis=1)
      # print('shapeè',y_test_cat.shape)
      macro=f1_score(y_test, y_pred, average='macro')
      print('after query {n}: Accuracy :{acc:0.4f} macro f1 :{f1:0.4f}'.format(n=index + 1, acc=model_accuracy, f1=macro))

      # Save our model's performance for plotting.
      pref_hist_multy_accuracy[i][index]=model_accuracy
      pref_hist_multy_f1[i][index]=macro

  pref_hist_multy_acc_avg=pref_hist_multy_accuracy.mean(0)
  pref_hist_multy_f1_avg=pref_hist_multy_f1.mean(0)


x pool length (20, 768)
y pool length (20,)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
sec (979,)
after query 2: Accuracy :0.2867 macro f1 :0.2771
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
sec (929,)
after query 3: Accuracy :0.7797 macro f1 :0.4381
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
sec (879,)
after query 4: Accuracy :0.7797 macro f1 :0.4381
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoc

In [None]:
f1_25perc_avg=pref_hist_multy_f1[:,:int(0.25*N_QUERIES)].mean(0)

print(f1_25perc_avg[-1])

In [None]:
query_instance

array([[-0.10937989, -0.32409355, -0.22804809, ..., -0.31877008,
         0.3237741 ,  0.44401392],
       [-0.12156366, -0.32348505, -0.238762  , ..., -0.3617471 ,
         0.4970476 ,  0.5812354 ],
       [-0.15743075, -0.46739998,  0.21498464, ..., -0.6134482 ,
         0.10459223,  0.27613413],
       ...,
       [-0.15280154, -0.35130453,  0.1686634 , ..., -0.11777963,
         0.29943648,  0.23850745],
       [-0.15289503, -0.26090252,  0.11517086, ..., -0.12757997,
         0.43884858,  0.32419595],
       [-0.32888773, -0.08943853, -0.22164007, ..., -0.23371893,
         0.47447217,  0.4885551 ]], dtype=float32)

In [None]:
query_index

array([222, 252, 158,   3, 105, 636, 930, 798, 488, 413, 559, 944,  38,
       288, 822, 191, 857, 520, 302, 128, 946, 842, 121, 704, 820, 926,
       956, 785, 836, 603, 176, 827, 273,  15, 517, 682, 750, 890, 840,
       328, 925, 582, 834, 792, 228, 131, 708, 684, 193, 241])

In [None]:
X_pool.shape

(1029, 768)

In [None]:
X.shape

(50, 768)

In [None]:
y.shape

(50,)

In [None]:
test_sentences.shape

(2079,)

In [None]:
X.shape

(50, 768)

In [None]:
y.shape

(50,)

In [None]:
y_train_categorical.shape

(20, 2)

In [None]:
X_train.shape

(20,)

In [None]:
batch_size=128
prediction_data=TensorDataset(input_ids, attention_masks, labels)
prediction_sampler=SequentialSampler(prediction_data)
prediction_dataloader=DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


bert_tweet.cuda()
bert_tweet.eval()

# Tracking variables
sentence_feature=[]
t1=time.time()

for batch in prediction_dataloader:
  t0=time.time()

  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
      outputs=bert_tweet(b_input_ids, attention_mask=b_input_mask)

  sentence_features_slice = outputs[0][:,0,:].cpu().numpy()

  # Store predictions and true labels
  sentence_feature.append(sentence_features_slice)
  elapsed = format_time(time.time() - t0)

  print("time elapse:",elapsed)

print("full time",format_time(time.time()-t1))

sentence_feature=np.concatenate(sentence_feature, axis=0)
print(sentence_feature.shape)

In [None]:
with open('tweet_bert.npy', 'wb') as f:
    np.save(f, sentence_feature)