## Co-training Implementation

Based on papers: <i>Combining Labeled and Unlabeled Data with Co-Training</i> and <i>Co-Training for Cross-Lingual Sentiment Classification</i>.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
import spacy
nlp = spacy.load("en_core_web_lg")


import warnings
warnings.filterwarnings("ignore")

### Cleaning data

In [2]:
import tensorflow_hub as hub
universal_embed = hub.load("../other/universal-sentence-encoder_4")

In [3]:
all_text = []
all_labels = []
with open("../data/sentiment/amazon_electronics_reviews/reviews.txt") as infile:
    lines = infile.readlines()
    for line in lines:
        all_labels.append(int(line[0]))
        all_text.append(line[2:])
        
all_text = all_text[:10000]
all_labels = all_labels[:10000]

all_text_nc = [] # noun chunks
all_text_no_nc = [] # no noun chunks
for text in all_text:
    doc = nlp(text)
    noun_chunks = [tok.text for tok in doc.noun_chunks]
    text_nc = " ".join(noun_chunks).lower()
    all_text_nc.append(text_nc)
    text_no_nc = text
    for nc in noun_chunks:
        text_no_nc = re.sub(re.escape(nc),"",text_no_nc)
    text_no_nc = re.sub("\\s+"," ",text_no_nc)
    all_text_no_nc.append(text_no_nc.lower())

In [4]:
data_x = universal_embed(all_text_nc).numpy()
data_y = np.array(all_labels)
data_x_o = universal_embed(all_text_no_nc).numpy()

In [5]:
np.random.seed(11)
data_x,data_x_o,data_y = shuffle(data_x,data_x_o,data_y)
x_lab,x_lab_o,y_lab = data_x[:100],data_x_o[:100],data_y[:100]
x_unl,x_unl_o = data_x[100:5100],data_x_o[100:5100]
x_test,x_test_o,y_test = data_x[5100:6100],data_x_o[5100:6100],data_y[5100:6100]

print(x_lab.shape,x_unl.shape,x_test.shape)
print(x_lab_o.shape,x_unl_o.shape,x_test_o.shape)

(100, 512) (5000, 512) (1000, 512)
(100, 512) (5000, 512) (1000, 512)


### Modeling

In [6]:
n_iter_total=5
n_add=10

# initializing U and U'
u_ind = [i for i in range(len(x_unl))] # U for both models
np.random.seed(10)
u_p_ind_o = list(np.random.choice(u_ind,400,replace=False)) # U'
u_p_ind = u_p_ind_o # U'
u_ind = [i for i in u_ind if i not in u_p_ind_o]

lx,y = x_lab,y_lab
lx_o,y_o = x_lab_o,y_lab

for n_iter in range(1,n_iter_total+1):
    # adding to U'
    np.random.seed(10)
    u_p_ind_o_addition = list(np.random.choice(u_ind,100,replace=False))
    u_ind = [i for i in u_ind if i not in u_p_ind_o_addition]
    u_p_ind_o = u_p_ind_o+u_p_ind_o_addition
    u_p_ind = u_p_ind+u_p_ind_o_addition
    
    lr = LogisticRegression()
    lr_o = LogisticRegression()
    lr.fit(lx,y)
    lr_o.fit(lx_o,y_o)
    
    print(lr_o.score(x_test_o,y_test),lr.score(x_test,y_test))
    
    ind_o = [i for i in range(len(u_p_ind_o))]
    ind = ind_o
    
    lr_prob = lr.predict_proba(x_unl[u_p_ind])[:,1] # prob of class 1
    lr_prob_o = lr_o.predict_proba(x_unl[u_p_ind_o])[:,1]

    rank_i,lr_prob_sorted = zip(*sorted(zip(ind,list(lr_prob)),key=lambda x:x[1]))
    rank_i_o,lr_prob_o_sorted = zip(*sorted(zip(ind_o,list(lr_prob_o)),key=lambda x:x[1]))    
    
    # adding the new labels for L for both models
    lx_neg,y_neg = x_unl[np.array(u_p_ind_o)[list(rank_i_o[:n_add])]],np.zeros((n_add))
    lx_pos,y_pos = x_unl[np.array(u_p_ind_o)[list(rank_i_o[-n_add:])]],np.ones((n_add))
    lx,y = np.vstack([lx,lx_neg,lx_pos]),np.concatenate([y,y_neg,y_pos])
    
    lx_o_neg = x_unl[np.array(u_p_ind)[list(rank_i[:n_add])]]
    lx_o_pos = x_unl[np.array(u_p_ind)[list(rank_i[-n_add:])]]
    lx_o,y_o = np.vstack([lx_o,lx_o_neg,lx_o_pos]),np.concatenate([y_o,y_neg,y_pos])
    
    # updating U' for both models
    lx_o_indices = list(np.array(u_p_ind_o)[list(rank_i_o[:n_add])])+list(np.array(u_p_ind_o)[list(rank_i_o[-n_add:])])
    u_p_ind_o = [i for i in u_p_ind_o if i not in lx_o_indices]
    lx_indices = list(np.array(u_p_ind)[list(rank_i[:n_add])])+list(np.array(u_p_ind)[list(rank_i[-n_add:])])
    u_p_ind = [i for i in u_p_ind if i not in lx_indices]

0.754 0.659
0.775 0.691
0.784 0.69
0.793 0.69
0.788 0.697
