In [4]:
import csv
from transformers import BertModel, BertTokenizer, AutoTokenizer
import torch
import numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression as sklearn_LR

model_name = 'bert-base-uncased'

ModuleNotFoundError: ignored

**The following cell uses the load data function defined in the LogisticRegression file provided by the course staff of CS490A @Umass Amherst. It is used to extract the texts and labels from each data split**

In [5]:
# code provided by CS490A course staff

def load_data(filename):
    '''
    Read an annotated tsv file into parallel lists of texts and labels.

    Input:
    - filename (str): path to a tab-separated file whose header row
                      names the columns "label" and "text"

    Returns a pair of lists of strings (texts, labels), one entry per
    data row, in file order
    '''
    texts = []
    labels = []
    # newline='' lets the csv module handle line endings itself.
    with open(filename, encoding="utf-8", newline='') as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            texts.append(record['text'])
            labels.append(record['label'])
    return texts, labels

**Here we used the load data function to extract texts and labels from each split**

In [None]:
# Directory holding the train/test/dev tsv splits. Edit this single line to
# point the notebook at a different copy of the data.
SPLITS_DIR = "/Users/seanmurphy/Downloads/AP4 Supporting Notebooks-20221210/splits"

# Each split yields a list of document texts and a parallel list of labels.
train_texts, train_labels = load_data(f"{SPLITS_DIR}/train.tsv")
test_texts, test_labels = load_data(f"{SPLITS_DIR}/test.tsv")
dev_texts, dev_labels = load_data(f"{SPLITS_DIR}/dev.tsv")

**Here we define Bert model and tokenizer**

In [None]:
# Checkpoint shared by the encoder and its tokenizer.
model_name = 'bert-base-uncased'

# output_hidden_states=True asks the model to return the hidden states of
# every layer; extract_bert_features reads them from `.hidden_states`.
# NOTE(review): the training cell further down rebinds `model` to a
# Classifier, so feature extraction must run before that cell.
model = BertModel.from_pretrained(model_name, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**The following cell is a function provided by the course staff of CS490A @Umass Amherst. It is used to compute a confidence interval for a given classification accuracy**

In [None]:
# code provided by CS490A course staff

import scipy

def get_confidence_intervals(accuracy, sample_size, confidence_level):
  """
  Normal-approximation confidence interval for an accuracy estimate.

  Input:
  - accuracy (float): observed proportion of correct predictions
  - sample_size (int): number of examples the accuracy was measured on
  - confidence_level (float): desired coverage, strictly between 0 and 1
                              (e.g. 0.95)

  Returns a pair (lower_ci, upper_ci). The bounds are not clipped, so they
  can fall outside [0, 1] when accuracy is close to 0 or 1.

  Raises ValueError if sample_size is not positive or confidence_level is
  not strictly between 0 and 1.
  """
  # Import the submodule explicitly: on older SciPy releases a bare
  # `import scipy` does not make `scipy.stats` available.
  from scipy.stats import norm

  if sample_size <= 0:
    raise ValueError("sample_size must be positive")
  if not 0 < confidence_level < 1:
    raise ValueError("confidence_level must be strictly between 0 and 1")

  # ppf of the lower tail is negative; negate it to get the two-sided z-score.
  z_score = -norm.ppf((1 - confidence_level) / 2)
  standard_error = numpy.sqrt(accuracy * (1 - accuracy) / sample_size)
  margin = standard_error * z_score
  return accuracy - margin, accuracy + margin

**The following functions are used to extract two binary features from the documents. The first is whether or not the question contains a blank line (`_______`). This is because most questions which contain a blank line are Sports and Leisure questions. The second checks whether there is a date present in the question by matching 3-4 adjacent digits. Questions with dates are usually History.**

In [None]:
import re

# check if a question contains a fill in the blank. If this is the case it is almost certainly Sports and Leisure
def contains_line(doc):
    """Return 1 if `doc` contains at least seven consecutive underscores, else 0."""
    return int("_______" in doc)

# check if a question contains a date. If this is the case, it is likely History
def contains_date(doc):
    """Return 1 if `doc` contains a run of three or four adjacent digits, else 0.

    Any such run counts, including one inside a longer number.
    """
    found = re.search("[0-9]{3,4}", doc)
    return 0 if found is None else 1

**Here is a function partially created in HW4 of the course. It has been modified from its original state. The function is used to extract the hidden layers provided by BERT. It also adds two binary features to the end of each hidden layer vector (the two features defined above)**

In [None]:
def extract_bert_features(input_texts):
  """
  Build per-layer feature vectors for each text.

  For every text, the vector at sequence position 0 (the [CLS] token for
  BERT tokenizers) is taken from each hidden state after the first entry of
  `hidden_states`, giving a (12, 768) array. The two binary features
  contains_line(text) and contains_date(text) are then appended to every
  layer's vector.

  Input:
  - input_texts: iterable of strings

  Returns a float64 numpy array of shape (len(input_texts), 12, 770).
  An empty input yields an array of shape (0, 12, 770).
  """
  n_layers, hidden_size, n_extra = 12, 768, 2
  features = []
  for text in input_texts:
    token_ids = tokenizer.encode(text, truncation=True,
                                 return_tensors="pt")
    # Inference only: skip autograd bookkeeping so no graph is kept per text.
    with torch.no_grad():
      hidden_states = model(token_ids).hidden_states
      layer_vectors = torch.stack(
          [layer_states[0][0] for layer_states in hidden_states[1:]]
      ).cpu().numpy()
    assert layer_vectors.shape == (n_layers, hidden_size)

    # The binary features depend only on the text, so compute them once
    # and repeat them for every layer.
    extra = numpy.array([contains_line(text), contains_date(text)])
    extra_per_layer = numpy.tile(extra, (n_layers, 1))

    # Mixing float32 vectors with integer flags promotes to float64, the
    # same dtype numpy.append produced before.
    features.append(numpy.concatenate([layer_vectors, extra_per_layer], axis=1))

  if not features:
    # numpy.stack rejects an empty list; return an empty array of the right shape.
    return numpy.empty((0, n_layers, hidden_size + n_extra))
  return numpy.stack(features)

**Here we extract the train, test, and dev embeddings to use as features in the future**

In [None]:
# Each call runs one model forward pass per text in the split.
# The calls are kept as three separate statements so that splits already
# extracted stay bound if a later call fails.
train_features = extract_bert_features(train_texts)
test_features = extract_bert_features(test_texts)
dev_features = extract_bert_features(dev_texts)


**The following cell is code provided by the CS490A course staff. It creates a class for a logistic regression classifier**

In [None]:
# code provided by CS490A course staff

class Classifier:
    '''
    A l2-regularized logistic regression classifier.

    The regularization strength C is chosen by accuracy on a dev set.
    After training, `self.model` holds the fitted estimator and `self.C`
    the selected value. Before training they are None and -1.
    '''
    def __init__(self):
        self.model = None
        self.C = -1

    def train_and_optimize(self, train_vectors, train_labels,
                            dev_vectors, dev_labels,
                            C_values=(0.1, 1, 10, 100), max_iter=1000):
        '''
        Fit one model per candidate C and keep the one with the best dev accuracy.

        Input:
        - train_vectors, train_labels: training features and labels
        - dev_vectors, dev_labels: held-out features and labels used to pick C
        - C_values: candidate inverse regularization strengths, tried in order
        - max_iter: iteration cap passed to each logistic regression fit

        Ties are resolved in favour of the earliest candidate. Prints the
        train/dev accuracy for every candidate and the selected C.
        '''
        assert self.model is None, "ERROR: Model already trained."
        best_C = -1
        # Start below any achievable accuracy so that a model is always
        # selected, even when every candidate scores 0 on the dev set.
        best_score = float("-inf")
        best_model = None
        for C in C_values:
            model = sklearn_LR(C=C, max_iter=max_iter)
            model.fit(train_vectors, train_labels)
            train_acc = model.score(train_vectors, train_labels)
            dev_acc = model.score(dev_vectors, dev_labels)
            print("C = {} - Train Accuracy: {:.3f}, Dev Accuracy: {:.3f}".format(
                  C, train_acc, dev_acc))
            if dev_acc > best_score:
                best_C = C
                best_model = model
                best_score = dev_acc
        print("Selected C = {}".format(best_C))
        self.C = best_C
        self.model = best_model

    def predict(self, vectors):
        '''Return the trained model's predicted labels for `vectors`.'''
        assert self.model is not None, "ERROR: Must train model first"
        return self.model.predict(vectors)

    def test(self, test_vectors, test_labels):
        '''
        Print test accuracy with a 95% normal-approximation confidence interval.

        `test_vectors` must expose `.shape[0]` as the number of test examples.
        '''
        assert self.model is not None, "ERROR: Must train model first"
        # Import the submodule explicitly: on older SciPy releases a bare
        # `import scipy` does not make `scipy.stats` available.
        from scipy.stats import norm

        # Compute test accuracy
        accuracy = self.model.score(test_vectors, test_labels)
        n = test_vectors.shape[0]

        # Compute 95% confidence interval using normal approximation
        confidence_level = 0.95
        z_score = -norm.ppf((1 - confidence_level) / 2)
        standard_error = numpy.sqrt(accuracy * (1 - accuracy) / n)
        lower_ci = accuracy - standard_error * z_score
        upper_ci = accuracy + standard_error * z_score
        print("Test Accuracy: {:.3f} with 95% CI: [{:.3f}, {:.3f}]".format(
              accuracy, lower_ci, upper_ci))

    @staticmethod
    def _feature_label(feature_names, idx):
        '''Return the name of feature `idx`, or `idx` itself when no names are given.'''
        # `is None` / `len` instead of truthiness: numpy arrays of names
        # raise on `if feature_names`.
        if feature_names is None or len(feature_names) == 0:
            return idx
        return feature_names[idx]

    def print_weights(self, display_k=5, feature_names=None):
        '''
        Print the highest-weighted features for each class.

        Input:
        - display_k (int): number of features to show per class
        - feature_names: optional sequence (list or numpy array) mapping
                         feature index to a printable name; indices are
                         printed when omitted or empty

        Each line is "label<TAB>feature<TAB>weight". With two classes, only
        positive weights are listed for the positive class and only negative
        weights for the negative class.
        '''
        assert self.model is not None, "ERROR: Must train model first"

        n_classes = len(self.model.classes_)
        # binary (2 label classes)
        if n_classes == 2:
            weights = self.model.coef_[0]
            sorted_idx = numpy.argsort(weights)

            # positive class: largest weights first
            label = self.model.classes_[1]
            for i in sorted_idx[-display_k:][::-1]:
                weight = weights[i]
                if weight <= 0:
                    continue
                feat = self._feature_label(feature_names, i)
                print("{}\t{}\t{:.3f}".format(label, feat, weight))
            print()

            # negative class: most negative weights first
            label = self.model.classes_[0]
            for i in sorted_idx[:display_k]:
                weight = weights[i]
                if weight >= 0:
                    continue
                feat = self._feature_label(feature_names, i)
                print("{}\t{}\t{:.3f}".format(label, feat, weight))

        # multiclass (3+ label classes)
        else:
            for class_idx, label in enumerate(self.model.classes_):
                weights = self.model.coef_[class_idx]
                sorted_idx = numpy.argsort(weights)
                for feat_idx in sorted_idx[-display_k:][::-1]:
                    weight = weights[feat_idx]
                    feat = self._feature_label(feature_names, feat_idx)
                    print("{}\t{}\t{:.3f}".format(label, feat, weight))
                print()

**The following code is used to extract a specific hidden layer from each split**

In [None]:
def get_layer(i):
    """
    Select the features of one hidden layer from every split.

    Input:
    - i (int): 1-based layer number, from 1 up to the size of axis 1 of
               train_features

    Returns a dict with keys "train", "test" and "dev", each holding the
    slice [:, i-1, :] of the corresponding features array.

    Raises IndexError if `i` is out of range. Without this check, i=0 would
    silently select the last layer through index -1.
    """
    n_layers = train_features.shape[1]
    if not 1 <= i <= n_layers:
        raise IndexError(
            "layer must be between 1 and {}, got {}".format(n_layers, i))
    layer_idx = i - 1
    return {
        "train": train_features[:, layer_idx, :],
        "test": test_features[:, layer_idx, :],
        "dev": dev_features[:, layer_idx, :],
    }

**Here we extract layer 11 from each split (it had the best performance of all layers). Then we instantiate a classifier and use the code provided to optimize it. Lastly the model is evaluated on the test set.**

In [None]:

# Select the layer-11 features (plus the two appended binary features)
# for the train, test and dev splits.
feature_data = get_layer(11)

print("\nTraining & Optimizing Model...")
# Train one classifier per candidate C and keep the best one on the dev set.
# NOTE(review): this rebinds `model`, which until now named the BERT encoder
# used by extract_bert_features; re-running feature extraction after this
# cell will fail until the BERT setup cell is run again.
model = Classifier()
model.train_and_optimize(feature_data["train"], train_labels,
                            feature_data["dev"], dev_labels
                            )
print("\nEvaluating Model...")
# Report test accuracy and its 95% confidence interval.
model.test(feature_data["test"], test_labels)



Training & Optimizing Model...
C = 0.1 - Train Accuracy: 0.950, Dev Accuracy: 0.765
C = 1 - Train Accuracy: 1.000, Dev Accuracy: 0.795
C = 10 - Train Accuracy: 1.000, Dev Accuracy: 0.780
C = 100 - Train Accuracy: 1.000, Dev Accuracy: 0.775
Selected C = 1

Evaluating Model...
Test Accuracy: 0.800 with 95% CI: [0.745, 0.855]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
