# Logistic Regression
This notebook performs text classification on an annotated text dataset (with train, dev, and test splits) using L2-regularized logistic regression. Documents are represented as binarized bag-of-words vectors.

- Training: The classifier is trained using the train split (`train.tsv`).
- Optimization: The regularization strength *hyperparameter* is optimized (i.e. selected) using the dev split (`dev.tsv`).
- Evaluation: Classifier performance is evaluated using the test split (`test.tsv`).

The notebook reports the trained classifier's test accuracy with 95% confidence intervals. It also prints out the top feature weights for each label class.  

In [None]:
import csv
from collections import Counter

import nltk
import numpy as np
import scipy
import scipy.stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression as sklearn_LR

# nltk.word_tokenize relies on the Punkt sentence tokenizer data. Older NLTK
# releases load it from the "punkt" package, while newer releases look for
# "punkt_tab" instead, so fetch both to keep the notebook runnable on either.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def load_data(filename):
    '''
    Read an annotated dataset from a tab-separated file.

    Input:
    - filename (str): path to a tsv file whose header row names (at least)
                      the columns "text" and "label"

    Returns a pair (texts, labels): two parallel lists of strings holding
    each row's document text and its label, in file order.
    '''
    with open(filename, encoding="utf-8", newline='') as handle:
        # Materialize the rows while the file is still open.
        rows = list(csv.DictReader(handle, delimiter='\t'))

    texts = [row['text'] for row in rows]
    labels = [row['label'] for row in rows]
    return texts, labels

In [None]:
def text_to_feature_vectors(train, dev, test):
    '''
    Convert the three text splits into binarized bag-of-words vectors.

    Texts are lowercased and tokenized with nltk.word_tokenize. The
    vocabulary is fit on the training split only, keeping word types that
    occur in at least 5 training documents; dev and test are transformed
    with that same fitted vocabulary.

    Input:
    - train, dev, test: the document texts of each split

    Returns a dict with the keys
    - "vocab": the vocabulary terms ordered by their index in "vocab_index"
    - "vocab_index": the fitted vectorizer's vocabulary_ mapping
    - "train", "dev", "test": the vectorized splits
    '''
    vectorizer = CountVectorizer(
        min_df=5,
        binary=True,
        lowercase=True,
        tokenizer=nltk.word_tokenize,
    )

    # Fit on train first; the held-out splits only reuse the fitted vocabulary.
    fitted_train = vectorizer.fit_transform(train)
    held_out = {
        split_name: vectorizer.transform(split_texts)
        for split_name, split_texts in (("dev", dev), ("test", test))
    }

    term_to_index = vectorizer.vocabulary_
    ordered_terms = sorted(term_to_index, key=term_to_index.get)

    return {
        "vocab": ordered_terms,
        "vocab_index": term_to_index,
        "train": fitted_train,
        **held_out,
    }

In [None]:
class Classifier:
    '''
    An L2-regularized logistic regression classifier whose regularization
    strength is selected on a held-out dev split.

    Attributes:
    - model: the fitted model selected by train_and_optimize
             (None until training has run)
    - C: the C value of the selected model (-1 until training has run)
    '''
    def __init__(self):
        self.model = None
        self.C = -1

    def train_and_optimize(self, train_vectors, train_labels,
                            dev_vectors, dev_labels,
                            C_values=(0.1, 1, 10, 100)):
        '''
        Fit one model per candidate C on the train split and keep the one
        with the highest dev accuracy (the earliest candidate wins ties).

        Input:
        - train_vectors, train_labels: features and labels to fit on
        - dev_vectors, dev_labels: features and labels used for selection
        - C_values (iterable): candidate values of C, tried in order

        Prints the train/dev accuracy of every candidate and the selected C,
        then stores the winner in self.model and self.C.

        Raises ValueError if C_values is empty.
        '''
        assert self.model is None, "ERROR: Model already trained."
        candidates = list(C_values)
        if not candidates:
            raise ValueError("C_values must contain at least one candidate")

        best_C = -1
        # Start below any attainable accuracy so that a model is always
        # selected, even when every candidate scores 0 on the dev split.
        best_score = float("-inf")
        best_model = None
        for C in candidates:
            model = sklearn_LR(C=C, max_iter=1000)
            model.fit(train_vectors, train_labels)
            train_acc = model.score(train_vectors, train_labels)
            dev_acc = model.score(dev_vectors, dev_labels)
            print("C = {} - Train Accuracy: {:.3f}, Dev Accuracy: {:.3f}".format(
                  C, train_acc, dev_acc))
            if dev_acc > best_score:
                best_C = C
                best_model = model
                best_score = dev_acc
        print("Selected C = {}".format(best_C))
        self.C = best_C
        self.model = best_model

    def predict(self, vectors):
        '''
        Return the trained model's predicted labels for the given vectors.
        '''
        assert self.model is not None, "ERROR: Must train model first"
        return self.model.predict(vectors)

    def test(self, test_vectors, test_labels, confidence_level=0.95):
        '''
        Print the test accuracy together with a confidence interval computed
        with the normal approximation.

        Input:
        - test_vectors, test_labels: features and labels to evaluate on
        - confidence_level (float): coverage of the interval, e.g. 0.95

        Returns the tuple (accuracy, lower_ci, upper_ci). The interval is
        clipped to [0, 1] because the normal approximation can overshoot the
        valid accuracy range when accuracy is close to 0 or 1.
        '''
        assert self.model is not None, "ERROR: Must train model first"
        # Compute test accuracy
        accuracy = self.model.score(test_vectors, test_labels)
        n = test_vectors.shape[0]

        # Two-sided interval: z is the upper (1 - alpha/2) normal quantile
        z_score = -1 * scipy.stats.norm.ppf((1 - confidence_level) / 2)
        standard_error = np.sqrt(accuracy * (1 - accuracy) / n)
        margin = standard_error * z_score
        lower_ci = max(0.0, accuracy - margin)
        upper_ci = min(1.0, accuracy + margin)
        print("Test Accuracy: {:.3f} with {:g}% CI: [{:.3f}, {:.3f}]".format(
              accuracy, confidence_level * 100, lower_ci, upper_ci))
        return accuracy, lower_ci, upper_ci

    @staticmethod
    def _print_features(label, weights, indices, feature_names):
        '''
        Print one "label<TAB>feature<TAB>weight" line per feature index.
        The raw index is printed when no feature names are available.
        '''
        # Explicit None/length checks: truth-testing a NumPy array of names
        # would raise, while an empty list must still fall back to indices.
        has_names = feature_names is not None and len(feature_names) > 0
        for idx in indices:
            feat = feature_names[idx] if has_names else idx
            print("{}\t{}\t{:.3f}".format(label, feat, weights[idx]))

    def print_weights(self, display_k=5, feature_names=None):
        '''
        Print the highest-weighted features for each label class.

        Input:
        - display_k (int): maximum number of features printed per class
        - feature_names (sequence or None): name of each feature column;
                                            column indices are printed if
                                            omitted

        With two classes there is a single weight vector: its largest
        positive weights are printed for classes_[1] and its most negative
        weights for classes_[0]. With three or more classes the top weights
        of each class's own weight vector are printed.
        '''
        assert self.model is not None, "ERROR: Must train model first"

        n_classes = len(self.model.classes_)
        # binary (2 label classes)
        if n_classes == 2:
            weights = self.model.coef_[0]
            ascending = np.argsort(weights)
            # Reverse first, then truncate: slicing with [-display_k:] would
            # return every feature when display_k is 0.
            descending = ascending[::-1]

            # positive class
            top_positive = [i for i in descending[:display_k] if weights[i] > 0]
            self._print_features(self.model.classes_[1], weights,
                                 top_positive, feature_names)
            print()

            # negative class
            top_negative = [i for i in ascending[:display_k] if weights[i] < 0]
            self._print_features(self.model.classes_[0], weights,
                                 top_negative, feature_names)

        # multiclass (3+ label classes)
        else:
            for class_idx, label in enumerate(self.model.classes_):
                weights = self.model.coef_[class_idx]
                descending = np.argsort(weights)[::-1]
                self._print_features(label, weights,
                                     descending[:display_k], feature_names)
                print()

In [None]:
def run(train_tsv, dev_tsv, test_tsv):
    '''
    Run the full pipeline: load the three splits, featurize them, train and
    select a classifier, report test accuracy and top feature weights, then
    print an error analysis of the test predictions.

    Input:
    - train_tsv, dev_tsv, test_tsv (str): paths to the tsv file of each split

    The error analysis covers every test example and prints, in order:
    - one line "predicted gold text" per misclassified example
    - a dict counting each (predicted, gold) confusion pair
    - a dict counting misclassified examples per gold label
    - a dict counting all test examples per gold label
    - one line "label accuracy" for every gold label in the test split
    '''
    print("Loading & Featurizing Data...")
    # Load Data
    train_texts, train_labels = load_data(train_tsv)
    dev_texts, dev_labels = load_data(dev_tsv)
    test_texts, test_labels = load_data(test_tsv)

    # Featurize texts
    feature_data = text_to_feature_vectors(train_texts, dev_texts, test_texts)

    print("\nTraining & Optimizing Model...")
    # Train model
    model = Classifier()
    model.train_and_optimize(feature_data["train"], train_labels,
                             feature_data["dev"], dev_labels
                             )
    print("\nEvaluating Model...")
    # Evaluate Model
    model.test(feature_data["test"], test_labels)
    print("\nPrinting Top Weights...")
    # Print weights
    model.print_weights(display_k=10, feature_names=feature_data["vocab"])

    # Error analysis. Iterate over the actual test examples rather than a
    # fixed number of rows, so any test-set size is handled.
    predictions = model.predict(feature_data["test"])
    confusion_counts = Counter()
    missed_per_label = Counter()
    total_per_label = Counter(test_labels)
    for predicted, gold, text in zip(predictions, test_labels, test_texts):
        if predicted != gold:
            print(predicted, gold, text)
            confusion_counts[(predicted, gold)] += 1
            missed_per_label[gold] += 1

    # Print as plain dicts (keys in first-seen order)
    print(dict(confusion_counts))
    print(dict(missed_per_label))
    print(dict(total_per_label))

    # Per-label accuracy for every gold label, including labels that were
    # never misclassified (a Counter yields 0 for absent keys).
    for label, total in total_per_label.items():
        print(label, 1 - missed_per_label[label] / total)

In [None]:
# Paths to the three dataset splits; edit these to point to your data
train_fn, dev_fn, test_fn = "train.tsv", "dev.tsv", "test.tsv"

# Train, select, evaluate and report on the splits configured above
run(train_fn, dev_fn, test_fn)

Loading & Featurizing Data...

Training & Optimizing Model...
C = 0.1 - Train Accuracy: 0.700, Dev Accuracy: 0.570
C = 1 - Train Accuracy: 0.888, Dev Accuracy: 0.635
C = 10 - Train Accuracy: 0.955, Dev Accuracy: 0.615
C = 100 - Train Accuracy: 0.980, Dev Accuracy: 0.570
Selected C = 1

Evaluating Model...
Test Accuracy: 0.615 with 95% CI: [0.548, 0.682]

Printing Top Weights...
AL	novel	1.942
AL	wrote	1.531
AL	harry	1.428
AL	italian	1.305
AL	according	1.237
AL	series	1.138
AL	created	1.130
AL	god	1.118
AL	word	1.114
AL	novels	1.046

E	movie	3.136
E	film	2.186
E	song	1.575
E	actor	1.507
E	game	1.191
E	kf	1.169
E	has	1.149
E	show	1.123
E	actress	1.100
E	name	0.877

G	located	1.583
G	is	1.315
G	u.s.	1.035
G	state	0.941
G	city	0.906
G	famous	0.830
G	only	0.732
G	through	0.732
G	country	0.693
G	of	0.642

H	were	1.557
H	true	1.242
H	president	1.079
H	war	1.041
H	was	1.021
H	great	1.001
H	us	0.917
H	battle	0.864
H	ii	0.846
H	british	0.837

SL	fashion	1.156
SL	but	1.132
SL	german	1.055
SL	next