## Self-training Implementation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input,Dense,Dropout,Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore")

### Loading and preparing data

In [2]:
import tensorflow_hub as hub

# Load the text-embedding module from a local directory. The returned object
# is later called directly on a list of review strings to produce one feature
# vector per review.
NNLM_MODULE_DIR = "../other/nnlm-en-dim128_2"
nnlm_embed = hub.load(NNLM_MODULE_DIR)

In [3]:
# Upper bound on how many reviews are read from disk and embedded.
MAX_REVIEWS = 20000

# Each line of the review file is parsed positionally: the first character is
# the integer label and everything from the third character onward is the
# review text (the character in between is skipped as a separator).
all_text = []
all_labels = []
with open("../data/sentiment/amazon_electronics_reviews/reviews.txt") as infile:
    # Stream the file and stop as soon as enough reviews were collected,
    # instead of materialising every line with readlines() only to discard
    # everything past the first MAX_REVIEWS entries.
    for line_no, line in enumerate(infile):
        if line_no >= MAX_REVIEWS:
            break
        all_labels.append(int(line[0]))
        all_text.append(line[2:])

# Embed every review, then append the label as the last column so features
# and labels stay aligned when the rows are shuffled as a single array.
data_x = nnlm_embed(all_text)
data_y = np.array(all_labels)
data = np.hstack([data_x,np.expand_dims(data_y,-1)])

In [4]:
# Seed NumPy's global RNG so the row shuffle below is repeatable.
np.random.seed(11)
data = shuffle(data)

# Carve the shuffled rows into three consecutive, non-overlapping partitions:
# rows [0, 100) are labeled, [100, 5100) unlabeled and [5100, 6100) test.
lab_end, unl_end, test_end = 100, 5100, 6100
labeled_data, unlabeled_data, test_data = np.split(data[:test_end], [lab_end, unl_end])

# The last column holds the label; every column before it is a feature.
x_lab, y_lab = labeled_data[:, :-1], labeled_data[:, -1]
x_unl = unlabeled_data[:, :-1]  # labels of the unlabeled pool are not extracted
x_test, y_test = test_data[:, :-1], test_data[:, -1]
print(x_lab.shape,x_unl.shape,x_test.shape)

(100, 128) (5000, 128) (1000, 128)


### Modeling

In [5]:
def get_model(input_dim=128):
    """Build and compile a small binary classifier.

    The network has one 50-unit ReLU hidden layer followed by a single
    sigmoid output unit. It is compiled with binary cross-entropy loss and
    the Adam optimizer at a learning rate of 0.01.

    Args:
        input_dim: Number of features in each input vector.

    Returns:
        A compiled Keras ``Model`` that maps ``(batch, input_dim)`` inputs to
        ``(batch, 1)`` sigmoid outputs.
    """
    # `(input_dim)` is just a parenthesised int; the trailing comma makes it
    # the 1-tuple that `Input(shape=...)` expects.
    inputs = Input(shape=(input_dim,))
    hidden = Dense(50,activation="relu")(inputs)
    outputs = Dense(1,activation="sigmoid")(hidden)

    model = Model(inputs=inputs,outputs=outputs)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    model.compile(loss=BinaryCrossentropy(),optimizer=Adam(learning_rate=0.01))
    return model

In [6]:
def train_model(x_train,y_train,sample_weights,batch_size=25,epochs=50):
    """Train a fresh model with per-sample weights and score it on the test split.

    A new model is created via ``get_model()`` on every call, trained with
    mini-batches for ``epochs`` passes over the data, and then evaluated with
    balanced accuracy at a 0.5 decision threshold.

    Note:
        Evaluation reads the module-level ``x_test`` and ``y_test``; they must
        be defined before this function is called.

    Args:
        x_train: Feature matrix, one row per training sample.
        y_train: Binary targets aligned with ``x_train``.
        sample_weights: Per-sample loss weights aligned with ``x_train``.
        batch_size: Number of samples per ``train_on_batch`` call.
        epochs: Number of passes over the training data.

    Returns:
        Tuple ``(model, test_acc)`` with the trained model and its balanced
        accuracy on the test split.
    """
    model = get_model()

    for _ in range(epochs):
        # Shuffle features, targets AND weights with the same permutation.
        # Shuffling only x/y would leave every weight attached to whatever
        # sample happens to land at its old position.
        x_train, y_train, sample_weights = shuffle(x_train, y_train, sample_weights)
        for start in range(0, len(x_train), batch_size):
            stop = start + batch_size
            model.train_on_batch(
                x_train[start:stop],
                y_train[start:stop],
                sample_weight=sample_weights[start:stop],
            )

    # Flatten the (n, 1) sigmoid output and threshold it into hard 0/1 labels.
    test_prob = model(x_test).numpy()[:, 0]
    test_pred = np.where(test_prob >= 0.5, 1.0, 0.0)
    test_acc = balanced_accuracy_score(y_test, test_pred)

    return model, test_acc

In [7]:
# Self-training: repeatedly train on the current labeled set, pseudo-label the
# unlabeled samples the model is most confident about, and move them into the
# training set for the next round.
n_iter_total=5
epochs=[10]*n_iter_total  # epochs per round; kept in step with n_iter_total
neg_threshold,pos_threshold = 0.001,0.999  # confidence cut-offs for pseudo-labels

x_train,y_train = x_lab,y_lab
x_unl_all = x_unl
sample_weights = np.ones((len(x_train))) # if wanting to weigh additional datapoints

for n_iter in range(1,n_iter_total+1):
    model,test_acc = train_model(x_train,y_train,sample_weights,epochs=epochs[n_iter-1])
    print("Acc:",round(test_acc,3))

    if len(x_unl_all)<=1:
        break

    # Boolean masks over the remaining unlabeled pool.
    unl_predictions = model(x_unl_all).numpy()[:,0]
    neg_mask = unl_predictions<=neg_threshold
    pos_mask = unl_predictions>=pos_threshold
    neg_pred_indices = np.flatnonzero(neg_mask)
    pos_pred_indices = np.flatnonzero(pos_mask)
    n_new = len(neg_pred_indices)+len(pos_pred_indices)
    print("New labels:",n_new)

    # Append confident negatives first, then confident positives.
    x_train = np.vstack([x_train,x_unl_all[neg_mask],x_unl_all[pos_mask]])
    y_train = np.concatenate([y_train,np.zeros(len(neg_pred_indices)),np.ones(len(pos_pred_indices))])
    # Pseudo-labels added in round k receive weight 1/k.
    sample_weights = np.concatenate([sample_weights,np.ones(n_new)/n_iter])

    # Keep only the samples that were not pseudo-labeled this round. A mask
    # avoids the per-index `in <ndarray>` scan, which is quadratic in the
    # size of the unlabeled pool.
    new_unl_indices = np.flatnonzero(~(neg_mask|pos_mask))
    x_unl_all = x_unl_all[new_unl_indices]

Acc: 0.686
New labels: 57
Acc: 0.707
New labels: 93
Acc: 0.706
New labels: 100
Acc: 0.703
New labels: 251
Acc: 0.703
New labels: 275
