In [1]:
import numpy as np
import pickle
import copy

from utils import load_imdb, load_yelp, load_toxic_comment, load_amazon

from sklearn.model_selection import train_test_split

from proglearn.forest import LifelongForest, UncertaintyForest

In [2]:
# Experimental parameters.
n_estimators = 10
subsample_fracs = [6e-5, 6e-4, 6e-3, 6e-2]
verbose = True

In [3]:
source_tasks = [
    {
        'name' : 'Yelp Review Sentiment Analysis',
        'filename' : 'yelp',
        'load' : load_yelp,
        'subsample_frac' : 0.02,
        'task_id' : 0,
    },
    {
        'name' : 'IMDB Review Sentiment Analysis',
        'filename' : 'imdb',
        'load' : load_imdb,
        'subsample_frac' : 0.2,
        'task_id' : 1,
    },
#     {
#         'name' : 'Amazon Review Sentiment Analysis',
#         'filename' : 'amazon',
#         'load' : load_amazon,
#         'subsample_frac' : 0.01,
#         'task_id' : 3,
#     }
]
target_task = {
        'name' : 'Toxic Comment Identification',
        'filename' : 'toxic_comment',
        'load' : load_toxic_comment,
        'task_id' : 2,
}

In [4]:
# Load data.

for task in source_tasks:
    print("------------------------------------------------------")
    print("LOADING TASK:", task['name'])
    print("------------------------------------------------------")
    task['X_train'], task['y_train'], task['X_test'], task['y_test'] = task['load'](verbose = verbose, subsample_frac = task['subsample_frac'])
print("------------------------------------------------------")

------------------------------------------------------
LOADING TASK: Yelp Review Sentiment Analysis
------------------------------------------------------
'X_train' and 'X_test' are each an n-by-d array of BERT embedded reviews of a business.
'y_train' and 'y_test' are each list of binary sentiments, where 0 = 'negative' and 1 = 'positive'.
Number of training examples = 11200
Input dimension d = 512
Number of testing examples = 38000
------------------------------------------------------
LOADING TASK: IMDB Review Sentiment Analysis
------------------------------------------------------
'X_train' and 'X_test' are each an n-by-d array of BERT embedded movie reviews.
'y_train' and 'y_test' are each list of binary sentiments, where 0 = 'negative' and 1 = 'positive'.
Number of training examples = 9000
Input dimension d = 512
Number of testing examples = 5000
------------------------------------------------------


In [5]:
# Source task training.

lf = LifelongForest(n_estimators=n_estimators)
num_tasks = len(source_tasks) + 1

for task in source_tasks:
    print("TRAINING TASK:", task['name'])
        
    X_train, y_train, X_test, y_test = task['X_train'], task['y_train'], task['X_test'], task['y_test']
    
    task['n_train'] = len(X_train)
    task['n_test'] = len(y_train)
    
    # uf = UncertaintyForest(n_estimators=num_tasks*n_estimators)
    # uf.fit(X_train, y_train)
    lf.add_task(X_train, y_train, task_id=task['task_id'])
    
    # task['err_uf_train'] = np.mean(np.abs(uf.predict(X_train) - y_train))
    # task['err_uf_test'] = np.mean(np.abs(uf.predict(X_test) - y_test))

TRAINING TASK: Yelp Review Sentiment Analysis
TRAINING TASK: IMDB Review Sentiment Analysis


In [6]:
X_train_full, y_train_full, X_test, y_test = load_toxic_comment(verbose = verbose, subsample_frac = None)

'X_train' and 'X_test' are each an n-by-d array of BERT embedded reviews of a business.
'y_train' and 'y_test' are each list of multilabel binary sentiments, 
                where the columns indicate 'toxic', 'severe_toxic', 'obscene', 'threat', 
                'insult', 'identity_hate', and 'not_toxic', in that order.
Number of training examples = 159571
Input dimension d = 512
Number of testing examples = 63978


In [7]:
# Target task training and testing.
task = target_task

# n_train, err_uf_train, err_uf_test, err_lf_train, err_lf_test, te
results = np.zeros((len(subsample_fracs), 6))

for s, subsample_frac in enumerate(subsample_fracs):
    
    task['subsample_frac'] = subsample_frac
    _, X_train, _, y_train = train_test_split(X_train_full, y_train_full, test_size=subsample_frac)
    n_train = len(X_train)
    print("TESTING TASK:", task['name'], "at sample size n =", n_train)
    
    uf = UncertaintyForest(n_estimators=num_tasks*n_estimators)
    uf.fit(X_train, y_train)
    
    lf2 = copy.deepcopy(lf)
    lf2.add_task(X_train, y_train, task_id=task['task_id'])
    
    results[s, 0] = n_train
    results[s, 1] = np.mean(np.abs(uf.predict(X_train) - y_train))
    results[s, 2] = np.mean(np.abs(uf.predict(X_test) - y_test))
    results[s, 3] = np.mean(np.abs(lf2.predict(X_train, task['task_id']) - y_train))
    results[s, 4] = np.mean(np.abs(lf2.predict(X_test, task['task_id']) - y_test))
    results[s, 5] = results[s, 2] / results[s, 4]
    
pickle.dump(results, open("output/toxic_comment_sweep_%d.p" % n_estimators, "wb"))

TESTING TASK: Toxic Comment Identification at sample size n = 10


  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors

TESTING TASK: Toxic Comment Identification at sample size n = 96


  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors

TESTING TASK: Toxic Comment Identification at sample size n = 958


  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors

TESTING TASK: Toxic Comment Identification at sample size n = 9575


  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors = np.nan_to_num(np.array(label_counts) / np.sum(label_counts))
  posteriors

In [8]:
# n_train, err_uf_train, err_uf_test, err_lf_train, err_lf_test, te
print(results[0])
print(results[1])
print(results[2])

[10.          0.          0.03776819  0.          0.03776819  1.        ]
[9.60000000e+01 3.47222222e-02 3.77681911e-02 3.29861111e-02
 3.77681911e-02 1.00000000e+00]
[9.58000000e+02 1.54836465e-01 1.39813686e-01 3.82741823e-02
 4.48096742e-02 3.12016743e+00]


In [9]:
# Display results.

# num_tasks = len(tasks)

# for task_ in tasks:
    
#     task = pickle.load(open("output/%s.p" % task_['filename'], "rb"))
    
#     print("-------------------------------------------------------")
#     print("TASK:", task['name'])
#     print("-------------------------------------------------------")

#     print("Number of training examples:", len(task['X_train']))
#     print("Number of testing examples:", len(task['X_test']))
#     print(n_estimators, "estimators per task for Lifelong Forest.")
#     print(num_tasks*n_estimators, "estimators for Uncertainty Forest.")

#     print("UF train error: ", task['err_uf_train'])
#     print("UF test error: ", task['err_uf_test'])
#     print("LF train error: ", task['err_lf_train'])
#     print("LF test error: ", task['err_lf_test'])
    
#     print("Transfer Efficiency: ", task['te'])
#     print("-------------------------------------------------------")