# Prose Kaleidoscopes - RNN Experiment

In [1]:
# Imports

from collections import Counter

import copy
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import torch.nn as nn

from torch_rnn_classifier import TorchRNNClassifier
from torch_tree_nn import TorchTreeNN
import sst
import cs224u_utils

# If Jupyter complains that torch is an unknown module, try:
# conda install pytorch torchvision -c pytorch
import torch

# Transformers
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import BertModel

%matplotlib inline

# Own files
import dataset_io


# These params should never change
# NOTE(review): none of the hyperparameters below are referenced by the cells
# visible in this notebook (fit_rnn_classifier hard-codes its own batch size
# and learning rate) — presumably carried over from a sibling experiment;
# confirm before relying on or removing them.
NUM_EPOCHS = 4 
SEED = 42
BATCH_SIZE = 8
LEARNING_RATE = 4e-5
NUM_WARMUP_STEPS = 100
MAX_SEQ_LEN = 512

# Constants
# Dataset identifiers. Functions taking a `dataset_type` argument compare it
# against these values to pick the label scheme (IMDB: "positive"/"negative"
# strings, Amazon: ratings 1-5, SST-2: 0/1).
DATASET_TYPE_IMDB = "imdb"
DATASET_TYPE_AMAZON = "amazon"
DATASET_TYPE_SST2 = "sst2"

FileNotFoundError: [Errno 2] No such file or directory: 'glove/glove.6B.50d.txt'

In [15]:
# Retrieve GloVe pretrained vectors.
# This cell makes sure the 50-dimensional GloVe text file exists locally
# (downloading and unzipping the archive when it does not) and then loads it
# into GLOVE_LOOKUP through cs224u_utils.glove2dict.

GLOVE_DIR = "glove"
GLOVE_ZIP_FILE = "{0}/glove.6B.zip".format(GLOVE_DIR)
GLOVE_PATH = "{0}/glove.6B.50d.txt".format(GLOVE_DIR)
GLOVE_FILE_EXISTS = os.path.exists(GLOVE_PATH)

# A missing directory implies a missing vector file, so create the directory
# and force the download/unzip path below.
if not os.path.exists(GLOVE_DIR):
    GLOVE_FILE_EXISTS = False
    os.mkdir(GLOVE_DIR)

if not GLOVE_FILE_EXISTS:
    # Works on both Mac and Linux. If you are on Windows,
    # please download the zip file manually.
    # The archive is only fetched when it is not already on disk; it is
    # unzipped in either case.
    if not os.path.exists(GLOVE_ZIP_FILE):
        !cd {GLOVE_DIR}; curl -LO http://nlp.stanford.edu/data/glove.6B.zip
    !cd {GLOVE_DIR}; unzip glove.6B.zip


# Lookup consumed later by load_glove when building the pretrained embedding.
GLOVE_LOOKUP = cs224u_utils.glove2dict(GLOVE_PATH)

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [16]:
def normalize_list_labels(lst, dataset_type):
    """Map a list of raw labels onto the numeric scale used for evaluation.

    - IMDB: "negative" becomes 0, every other value becomes 1.
    - Amazon: a rating v in [1, 5] becomes (v - 1) * 0.2.
    - Anything else (SST-2): the input list is returned unchanged.
    """
    if dataset_type == DATASET_TYPE_IMDB:
        # Labels are either "positive" or "negative"
        binary = []
        for label in lst:
            binary.append(0 if label == "negative" else 1)
        return binary
    elif dataset_type == DATASET_TYPE_AMAZON:
        # Labels are in the range [1, 5]
        step = 1.0 / 5
        return [(rating - 1) * step for rating in lst]
    # SST-2 are 0 (negative) or 1 (positive).
    return lst

def normalize_labels(df, label_field):
    """Return the `label_field` column of `df` as zero-based numeric labels.

    The dataset is detected from the label values themselves:
    - exactly two distinct labels containing both "positive" and "negative"
      (IMDB): returns the "positive" indicator column from pd.get_dummies;
    - exactly five distinct labels spanning 1..5 (Amazon): returns labels - 1;
    - otherwise the raw label array is returned unchanged.
    """
    raw = getattr(df, label_field).to_numpy()
    distinct = set(raw)

    # Fix IMDB labels.
    looks_like_imdb = (
        len(distinct) == 2
        and not str(raw[0]).isdigit()
        and "positive" in raw
        and "negative" in raw
    )
    if looks_like_imdb:
        return pd.get_dummies(raw)["positive"]  # 0 is neg, 1 is pos

    # Fix Amazon labels.
    looks_like_amazon = len(distinct) == 5 and min(distinct) == 1 and max(distinct) == 5
    if looks_like_amazon:
        return raw - 1
    return raw

def vsm_phi(text, lookup, np_func=np.mean):
    """Combine the vectors of the whitespace tokens of `text` found in `lookup`.

    Tokens missing from `lookup` are skipped. The collected vectors are
    reduced along axis 0 with `np_func`. When no token is found, a zero vector
    with the same length as the first vector stored in `lookup` is returned.
    """
    found = []
    for token in text.split():
        if token in lookup:
            found.append(lookup[token])
    allvecs = np.array(found)

    if len(allvecs) > 0:
        return np_func(allvecs, axis=0)

    # No known token: fall back to zeros sized like the lookup's vectors.
    dim = len(next(iter(lookup.values())))
    return np.zeros(dim)

def glove_phi(text, np_func=np.mean):
    """Featurize `text` by combining the GloVe vectors of its tokens.

    Thin wrapper around vsm_phi that supplies the module-level GLOVE_LOOKUP.
    The previous body referenced `glove_lookup`, a name this notebook never
    defines, so every call raised NameError.
    """
    return vsm_phi(text, GLOVE_LOOKUP, np_func=np_func)

def simple_leaves_phi(text):
    """Return the whitespace-separated tokens of `text` as a list."""
    tokens = text.split()
    return tokens

#  {'embed_dim': 50, 'eta': 0.005, 'hidden_dim': 100}
def fit_rnn_classifier(X, y):
    """Build a TorchRNNClassifier with the fixed settings below and fit it.

    The vocabulary and pretrained embedding are read from the module-level
    GLOVE_VOCAB and GLOVE_EMBEDDING, so those must be assigned before this
    function is called. Returns the fitted model.
    """
    settings = dict(
        embedding=GLOVE_EMBEDDING,
        batch_size=25,
        embed_dim=50,
        hidden_dim=100,
        bidirectional=True,
        early_stopping=True,
        eta=0.005,
    )
    classifier = TorchRNNClassifier(GLOVE_VOCAB, **settings)
    classifier.fit(X, y)
    return classifier

In [17]:
from sklearn.metrics import accuracy_score, roc_curve, auc

def get_roc_metrics(probs, y_true, num_classes, dataset_type):
    """Compute ROC curves, AUC and accuracy for a set of predictions.

    Parameters
    ----------
    probs : array-like
        One score per example. For the Amazon dataset these are expected to
        be the normalised class scores k / num_classes produced by
        normalize_list_labels.
    y_true : array-like
        True labels (zero-based class ids).
    num_classes : int
        Number of classes; also fixes the decision threshold 1 / num_classes.
    dataset_type : str
        One of the DATASET_TYPE_* constants. Anything other than
        DATASET_TYPE_AMAZON takes the two-class path.

    Returns
    -------
    dict
        Two-class: keys 'fpr', 'tpr', 'roc_auc', 'accuracy', in that
        insertion order (evaluate_roc_twoclass unpacks the values
        positionally).
        Multi-class: 'accuracy' plus 'fpr_<k>', 'tpr_<k>', 'roc_auc_<k>' for
        every class index k and for the "micro" and "macro" averages.
    """
    preds = np.array(probs)
    if dataset_type != DATASET_TYPE_AMAZON:
        fpr, tpr, _ = roc_curve(y_true, preds)
        roc_auc = auc(fpr, tpr)

        # Get accuracy over the test set
        y_pred = np.where(preds >= 1.0 / num_classes, 1, 0)
        accuracy = accuracy_score(y_true, y_pred)
        return {
            'fpr': fpr,
            'tpr': tpr,
            'roc_auc': roc_auc,
            'accuracy': accuracy
        }

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    # Force any empty categories to be present.
    # The scores are snapped back to integer class indices before being made
    # categorical: matching on the floats themselves fails because
    # 3 * (1.0 / 5) == 0.6000000000000001 is not equal to round(3 / 5, 1),
    # which silently turned every class-3 prediction into a NaN category.
    class_idx = np.rint(preds * num_classes).astype(int)
    cat_preds = pd.DataFrame(class_idx).astype(
        pd.CategoricalDtype(categories=list(range(num_classes))))
    # astype(int) keeps the indicators numeric on pandas versions whose
    # get_dummies returns booleans.
    y_true = pd.get_dummies(y_true).to_numpy().astype(int)
    preds = pd.get_dummies(cat_preds).to_numpy().astype(int)

    # One-vs-rest curve per class.
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], preds[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro average: pool every (example, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), preds.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro average: interpolate each class curve on the union of FPR points
    # and average them. Uses the num_classes argument rather than the
    # NUM_CLASSES global so the function honours what the caller passed.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(num_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= num_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # With one-hot rows on both sides this is exact-row-match accuracy.
    y_pred = np.where(preds >= 1.0 / num_classes, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'accuracy': accuracy}
    for k in fpr.keys():
        metrics["fpr_{0}".format(k)] = fpr[k]
        metrics["tpr_{0}".format(k)] = tpr[k]
        metrics["roc_auc_{0}".format(k)] = roc_auc[k]
    return metrics
    
def evaluate_roc_twoclass(probs, y_true, num_classes, dataset_type):
    """
    Print AUC and accuracy for a two-class run and plot its ROC curve.

    @params    probs: predicted scores, forwarded unchanged to get_roc_metrics
    @params    y_true: true labels, forwarded unchanged to get_roc_metrics
    @params    num_classes (int): forwarded unchanged to get_roc_metrics
    @params    dataset_type (str): forwarded unchanged to get_roc_metrics
    """
    metrics = get_roc_metrics(probs, y_true, num_classes, dataset_type)
    # Relies on the dict holding exactly fpr, tpr, roc_auc, accuracy in order.
    fpr, tpr, roc_auc, accuracy = metrics.values()

    print(f'AUC: {roc_auc:.4f}')
    print(f'Accuracy: {accuracy*100:.4f}%')

    # Plot ROC AUC
    auc_label = 'AUC = %0.2f' % roc_auc
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=auc_label)
    plt.legend(loc='lower right')
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    
def evaluate_roc_multiclass(probs, y_true, num_classes, dataset_type):
    """Print AUC/accuracy for a multi-class run and plot all ROC curves.

    Draws the micro- and macro-averaged curves plus one curve per class, all
    taken from get_roc_metrics.

    Parameters
    ----------
    probs, y_true, num_classes, dataset_type
        Forwarded unchanged to get_roc_metrics. `num_classes` also decides
        how many per-class curves are read and drawn (previously the
        NUM_CLASSES global was used, ignoring the argument).
    """
    metrics = get_roc_metrics(probs, y_true, num_classes, dataset_type)
    ks = ["macro", "micro"] + list(range(num_classes))
    fpr = {k: metrics["fpr_{0}".format(k)] for k in ks}
    tpr = {k: metrics["tpr_{0}".format(k)] for k in ks}
    roc_auc = {k: metrics["roc_auc_{0}".format(k)] for k in ks}
    accuracy = metrics['accuracy']

    # The old f-string 'AUC: {0}' printed a literal 0 before the dict.
    print('AUC: {0}'.format(roc_auc))
    print(f'Accuracy: {accuracy*100:.4f}%')

    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    # Colour is given only through `color=`; combining it with a 'b' format
    # string made matplotlib warn about a redundantly defined colour.
    plt.plot(fpr["micro"], tpr["micro"],
             label='Micro-avg AUC = %0.2f' % roc_auc["micro"], color='navy')
    plt.plot(fpr["macro"], tpr["macro"],
             label='Macro-avg AUC = %0.2f' % roc_auc["macro"], color='darkviolet')
    colors = ['orange', 'forestgreen', 'cornflowerblue', 'darkgoldenrod', 'tomato', 'dodgerblue']
    lw = 2
    for i in range(num_classes):
        # Cycle through the palette so classes beyond the sixth are still
        # drawn instead of being silently dropped by zip().
        plt.plot(fpr[i], tpr[i], color=colors[i % len(colors)], lw=lw,
                 label='AUC of class {0} = {1:0.2f}'.format(i, roc_auc[i]))
    plt.legend(loc='lower right')
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

def evaluate_roc(probs, y_true, num_classes, dataset_type):
    """Dispatch to the multi-class plot for Amazon, the two-class plot otherwise."""
    if dataset_type == DATASET_TYPE_AMAZON:
        report = evaluate_roc_multiclass
    else:
        report = evaluate_roc_twoclass
    report(probs, y_true, num_classes, dataset_type)

In [18]:
# Execution and Metrics
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score, cohen_kappa_score, roc_auc_score

def get_pred(value, num_classes):
    """Return the index of the equal-width bucket that `value` falls into.

    The unit range is cut into `num_classes` buckets of width
    1 / num_classes; the result is the first i with value < (i + 1) * width.
    Values at or above the last boundary map to the last bucket.
    """
    interval = 1.0 / num_classes
    matching = (i for i in range(num_classes) if value < (i + 1) * interval)
    return next(matching, num_classes - 1)

def run_and_eval(train, test, text_field, label_field, test_labels, num_classes, dataset_type):
    """Train the RNN on `train`, evaluate on `test`, and return class predictions.

    Runs sst.experiment with simple_leaves_phi and fit_rnn_classifier, takes
    the first entry of its 'predictions' (presumably the predicted labels for
    `test` — confirm against sst.experiment), normalises them with
    normalize_list_labels, prints/plots ROC metrics, and returns an array with
    one bucket index (see get_pred) per normalised prediction.
    """
    # Evaluate on test set
    experiment_options = dict(
        assess_dataframes=[test],
        vectorize=False,
        text_field=text_field,
        label_field=label_field,
    )
    outcome = sst.experiment(
        train, simple_leaves_phi, fit_rnn_classifier, **experiment_options)

    raw_predictions = outcome['predictions'][0]
    scores = normalize_list_labels(raw_predictions, dataset_type)

    # Evaluate the classifier
    print("Test Set")
    evaluate_roc(scores, test_labels, num_classes, dataset_type)

    buckets = [get_pred(score, num_classes) for score in scores]
    return copy.deepcopy(np.array(buckets))

def get_metrics_report(train, test, test_preds, test_labels,
                       train_path, test_path, label_field,
                       dataset_type, expr_type, num_samples, run_id, report_destpath=None):
    """Build a Markdown metrics report for one run and optionally append it to a file.

    Parameters
    ----------
    train, test : DataFrame
        Only used to report the label distribution of `label_field`.
    test_preds : array-like of int
        Predicted zero-based class ids for the test set.
    test_labels : array-like
        True zero-based class ids for the test set.
    train_path, test_path : str
        Echoed in the report header.
    label_field : str
        Name of the label column in `train` and `test`.
    dataset_type : str
        DATASET_TYPE_AMAZON selects the five-class label set; anything else
        is treated as two-class.
    expr_type, num_samples, run_id
        Run identifiers echoed in the report.
    report_destpath : str, optional
        File the report text is appended to. Nothing is written when None.

    Returns
    -------
    dict
        The classification_report dictionary extended with the run
        identifiers. (Earlier versions built this dict and discarded it.)
    """
    labels = [0, 1] if dataset_type != DATASET_TYPE_AMAZON else [0, 1, 2, 3, 4]

    # Use the labels that were passed in, not the _TEST_LABELS global.
    metrics_report = classification_report(test_labels, test_preds, output_dict=True, digits=4)
    metrics_report.update({
        "dataset": dataset_type,
        "expr": expr_type,
        "num_samples": num_samples,
        "run_id": run_id
    })

    if len(labels) > 2:
        # Multi-class roc_auc_score needs an (n_samples, n_classes) score
        # matrix whose rows sum to 1; one-hot encode the hard predictions.
        onehot_preds = np.eye(len(labels))[np.asarray(test_preds, dtype=int)]
        auc_value = roc_auc_score(test_labels, onehot_preds, multi_class='ovr', labels=labels)
    else:
        auc_value = roc_auc_score(test_labels, test_preds, multi_class='ovr')

    report_str = "### RNN | Dataset: {0}, Expr: {1}, N{2}, R{3}\n".format(
        dataset_type, expr_type, num_samples, run_id)
    report_str += "```\n"
    report_str += "Train: {0}\nTest: {1}\n-------------------------------\n".format(
        train_path, test_path)

    report_str += "AUC: {0:.4f}\n".format(auc_value)
    # sklearn metrics take (y_true, y_pred). Balanced accuracy averages
    # recall over the true classes, so swapping the arguments changes it.
    report_str += "Accuracy:\t{0:.4f}\t\tBalanced Acc: {1:.4f}\n".format(
        accuracy_score(test_labels, test_preds),
        balanced_accuracy_score(test_labels, test_preds))
    report_str += "Kappa:\t{0}\n".format(cohen_kappa_score(test_labels, test_preds))
    report_str += classification_report(test_labels, test_preds, digits=4)
    report_str += "\n"
    report_str += "Train distribution:\t{0}\nTest distribution:\t{1}\n".format(
        getattr(train, label_field).value_counts().to_dict(),
        getattr(test, label_field).value_counts().to_dict())

    # Each per-label matrix is laid out as [[TN, FP], [FN, TP]].
    cm = multilabel_confusion_matrix(test_labels, test_preds, labels=labels)
    report_str += "Confusion matrix:\n\t{0}".format(
        "\n\t".join(["Label {0}: TP {1}, FP {2}, TN {3}, FN {4}".format(
            labels[i], cm[i][1][1], cm[i][0][1], cm[i][0][0], cm[i][1][0]) for i in range(len(cm))]))
    report_str += "\n```\n\n"

    if report_destpath is not None:
        # Context manager closes the handle even if the write fails.
        with open(report_destpath, 'a') as report_file:
            report_file.write(report_str)

    return metrics_report


In [20]:
def get_train_path(expr_type, num_samples, run_id, dataset_filename, file_ext):
    """Return the training-file path for one experiment type, sample size and run."""
    run_dir = f"expr_data/{expr_type}/{num_samples}/r{run_id}"
    file_name = f"expr_{expr_type}_n{num_samples}_r{run_id}_{dataset_filename}.{file_ext}"
    return f"{run_dir}/{file_name}"

# RNN Processing
def proc_rnn_x(df):
    """Return the TEXT_FIELD column of `df` as a list of whitespace-token lists."""
    text_column = getattr(df, TEXT_FIELD)
    tokenized = text_column.apply(lambda sentence: sentence.split())
    return list(tokenized)

def load_glove(train):
    """Build the GloVe-backed embedding and vocabulary for a training frame.

    Tokenizes `train` with proc_rnn_x, asks cs224u_utils.get_vocab for the
    vocabulary with mincount=2, and passes it together with GLOVE_LOOKUP to
    cs224u_utils.create_pretrained_embedding. Returns (embedding, vocab).
    """
    tokenized_train = proc_rnn_x(train)
    vocab = cs224u_utils.get_vocab(tokenized_train, mincount=2)
    pretrained = cs224u_utils.create_pretrained_embedding(GLOVE_LOOKUP, vocab)
    glove_embedding, glove_vocab = pretrained
    return glove_embedding, glove_vocab

"""
# SST-2
DATASET_FILENAME = "sst2_train"
DATASET_TYPE = "sst2"
FILE_EXT = "tsv"
LABEL_FIELD = "label"
TEXT_FIELD = "sentence"
TEST_PATH = "expr_data/devtest/{0}_test.{1}".format(DATASET_TYPE, FILE_EXT)
"""

"""
# IMDB
DATASET_FILENAME = "imdb"
DATASET_TYPE = "imdb"
FILE_EXT = "csv"
LABEL_FIELD = "sentiment"
TEXT_FIELD = "review"
TEST_PATH = "expr_data/devtest/{0}_test.{1}".format(DATASET_FILENAME, FILE_EXT)
"""

#"""
# Amazon Reviews
# Active dataset configuration. The other presets in this cell are disabled
# by being wrapped in bare triple-quoted strings; this one is live because its
# delimiters are commented out (#""").
DATASET_FILENAME = "amazon_reviews_digital_music"
DATASET_TYPE = "amazon"
FILE_EXT = "json"
LABEL_FIELD = "overall"
TEXT_FIELD = "reviewText"
# NOTE(review): the name below is misspelled (RESLTS) and is not referenced by
# any cell visible in this notebook — confirm whether it is still needed.
OUTPUT_RESLTS_PATH = "results/amazon_results.json"
TEST_PATH = "expr_data/devtest/{0}_test.{1}".format(DATASET_FILENAME, FILE_EXT)
#"""


# Load the held-out test set once; it is shared by every run below.
TEST = dataset_io.to_df(TEST_PATH)
# Zero-based test labels (see normalize_labels).
_TEST_LABELS = normalize_labels(TEST, LABEL_FIELD)
# Number of distinct raw label values present in the test set.
NUM_CLASSES = len(set(getattr(TEST, LABEL_FIELD)))

### Experiment Changes Start Here

In [21]:
# NUM_SAMPLES can be 10 or 50.
NUM_SAMPLES = 10

# Make sure the output directory exists before deriving the report path.
RESULTS_DIR = "results"
if not os.path.exists(RESULTS_DIR):
    os.mkdir(RESULTS_DIR)

REPORT_RESULTS_PATH = "{0}/{1}_{2}_rnn_report.md".format(RESULTS_DIR, DATASET_TYPE, NUM_SAMPLES)

# An existing report is appended to; otherwise an empty file is created first.
if os.path.exists(REPORT_RESULTS_PATH):
    print("Metrics report will be appended to {0}".format(REPORT_RESULTS_PATH))
else:
    with open(REPORT_RESULTS_PATH, 'a'):
        pass
    print("Metrics report will be written to {0}".format(REPORT_RESULTS_PATH))

Metrics report will be written to results/amazon_10_rnn_report.md


In [22]:
# Experiment driver: for every augmentation type and every run id, load the
# matching training file, train/evaluate the RNN against the shared TEST set,
# and append a metrics report to REPORT_RESULTS_PATH.
RUN_IDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
EXPR_TYPES =  ["orig", "para", "para_tc", "para_editdist", "para_tc_editdist", "tc", "tc_editdist", "eda", "bt"]

for expr_type in EXPR_TYPES:
    for run_id in RUN_IDS:
        train_path = get_train_path(expr_type, NUM_SAMPLES, run_id, DATASET_FILENAME, FILE_EXT)
        train = dataset_io.to_df(train_path)
        # These two module-level names are read by fit_rnn_classifier, so they
        # must be reassigned for each training set before run_and_eval is called.
        GLOVE_EMBEDDING, GLOVE_VOCAB = load_glove(train)
        # NOTE(review): train_labels is not used again in this loop.
        train_labels = normalize_labels(train, LABEL_FIELD)
        print("\n\nRunning {0} experiment on {1}, N{2}, run# {3}".format(
            DATASET_TYPE, expr_type, NUM_SAMPLES, run_id))
        test_preds = run_and_eval(train, TEST, TEXT_FIELD, LABEL_FIELD, _TEST_LABELS, NUM_CLASSES, DATASET_TYPE)
        get_metrics_report(train, TEST, test_preds, _TEST_LABELS, 
                          train_path, TEST_PATH, LABEL_FIELD, 
                          DATASET_TYPE, expr_type, NUM_SAMPLES, run_id, REPORT_RESULTS_PATH)

print("Finished. Please see the experiment metrics report at {0}".format(REPORT_RESULTS_PATH))



Running amazon experiment on orig, N10, run# 0


AttributeError: module 'cs224u_utils' has no attribute 'progress_bar'