# Classification

Implement the textbook's system for supervised learning, modifying as needed to run on our literary corpus. Classify the novels as authored by either British/American or female/male writers. Assess the accuracy of the results.

As an optional challenge, see what you can do to speed up the system and/or to get better classification accuracy. In general, this will mean using different classifiers and/or other methods in the pipeline, though it could involve some additional feature engineering as well.

You can find the textbook's code via its [GitHub repo](https://github.com/foxbook/atap). You're free to copy what you need, but be aware that you'll need to make some changes to work with our data (not least, to use our corpus readers from problem sets 3/4). As ever, make sure you understand what the supplied code is doing. You can only modify it if you have a strong sense of how it works.

## Code

In [41]:
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split as tts

class CorpusLoader(object):
    """
    Serve a categorized corpus as k-fold train/test splits.

    Iterating over the loader yields one ``(X_train, X_test, y_train,
    y_test)`` tuple per fold, where the ``X`` values are generators of
    documents and the ``y`` values are lists of category labels.
    """

    def __init__(self, reader, folds=12, shuffle=True, categories=None,
                 random_state=None):
        """
        Parameters
        ----------
        reader : corpus reader exposing ``fileids(categories=...)``,
            ``docs(fileids=...)`` and ``categories(fileids=...)``.
        folds : number of cross-validation splits.
        shuffle : whether ``KFold`` shuffles the files before splitting.
        categories : optional category name(s) used both to select files
            and to decide which of a file's categories is its label.
        random_state : optional seed forwarded to ``KFold`` so that every
            iteration over the loader produces the same folds. Only set it
            together with ``shuffle=True``.
        """
        self.reader = reader
        # Remember the requested categories so labels() can pick the
        # matching one when a file carries several categories.
        self.categories = categories
        self.folds  = KFold(
            n_splits=folds, shuffle=shuffle, random_state=random_state
        )
        self.files  = np.asarray(self.reader.fileids(categories=categories))

    def fileids(self, idx=None):
        """Return all file ids, or only those at the positions in ``idx``."""
        if idx is None:
            return self.files
        return self.files[idx]

    def documents(self, idx=None):
        """
        Yield one item per selected file: the list of everything the
        reader's ``docs`` produces for that single file id.
        """
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        """
        Return one label per selected file.

        When the loader was created with ``categories``, the label is the
        first of the file's categories that is among the requested ones.
        Without requested categories (or when none match) the first
        category reported by the reader is used.
        """
        wanted = self.categories
        if isinstance(wanted, str):
            wanted = [wanted]

        result = []
        for fileid in self.fileids(idx):
            file_cats = self.reader.categories(fileids=[fileid])
            label = file_cats[0]
            if wanted is not None:
                # A file may belong to several categories; do not rely on
                # the requested one happening to come first.
                for cat in file_cats:
                    if cat in wanted:
                        label = cat
                        break
            result.append(label)
        return result

    def __iter__(self):
        """Yield ``(X_train, X_test, y_train, y_test)`` for every fold."""
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


In [42]:
#!/usr/bin/env python3

# Raw-text documents: any file whose name ends in '.txt'.
DOC_PATTERN = r'.+\.txt'

# Pickled (preprocessed) documents: any file whose name ends in '.pickle'.
PKL_PATTERN = r'.+\.pickle'

# Fallback only: derive the category from the directory name when no
# other category information is supplied to a reader.
CAT_PATTERN = r'([a-z_\s]+)/.*'

import codecs
import time
import nltk
import os
import pickle
from   glob import glob
from   nltk.corpus.reader.api import CorpusReader
from   nltk.corpus.reader.api import CategorizedCorpusReader
from   nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def make_cat_map(path, extension):
    """
    Build a file-to-categories mapping from standard-style file names.

    Takes a directory path and a file extension (e.g., 'txt') and looks at
    every ``*.<extension>`` file directly inside the directory. File names
    are expected to follow the pattern

      nation-author-title-year-gender.<extension>

    Returns a dictionary mapping each file name to the list
    ``[nation, gender, nation + gender]``.

    Raises ``IndexError`` if a file name has fewer than five
    hyphen-separated fields.
    """
    suffix = f'.{extension}'
    file_paths = glob(os.path.join(path, f'*{suffix}'))
    category_map = {} # Dict to hold filename:[categories] mappings
    for file_path in file_paths:
        fname = os.path.basename(file_path)
        # Remove the extension as a suffix. str.rstrip() would strip a *set*
        # of characters and eat into the last field (e.g. 'female.pickle'
        # would become 'fema').
        if fname.endswith(suffix):
            stem = fname[:-len(suffix)]
        else:
            stem = os.path.splitext(fname)[0]
        parsed = stem.split('-')
        nation = parsed[0]
        gender = parsed[4]
        category_map[fname] = [nation, gender, nation + gender]
    return category_map

class TMNCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A corpus reader for categorized plain-text documents to enable
    preprocessing. Provides access to the raw documents as well as to
    paragraphs, sentences and word tokens derived from them.
    """

    def __init__(
        self,
        root,
        fileids=DOC_PATTERN,
        encoding='utf8',
        **kwargs
    ):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.

        If no ``cat_*`` argument is given, a category map is built from
        the '.txt' file names under ``root``; if that fails, categories
        fall back to the directory-name pattern.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            # First, try to build a cat_map from standard-style filenames
            try:
                kwargs['cat_map'] = make_cat_map(root, 'txt')
            # On error, fall back to dir names for categories
            except Exception as e:
                print(type(e), e, "\nUnable to build category map from file names.\nFalling back to categories by directory name.")
                kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

    def resolve(self, fileids, categories):
        """
        Returns the fileids to operate on: the files belonging to
        ``categories`` when categories are given, otherwise ``fileids``
        unchanged (possibly ``None``). Implemented similarly to the NLTK
        ``CategorizedPlaintextCorpusReader``.

        Raises ``ValueError`` if both arguments are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Yields the complete text of each selected document, closing every
        file after it has been read so that only one document is loaded
        at a time.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def sizes(self, fileids=None, categories=None):
        """
        Yields the size on disk (in bytes) of each selected file.
        This function is used to detect oddly large files in the corpus.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

    def paras(self, fileids=None, categories=None):
        """
        Yields the paragraphs of the selected documents. A paragraph is
        any non-empty line as produced by ``str.splitlines()``.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        for doc in self.docs(fileids):
            for par in doc.splitlines():
                if par:
                    yield par

    def sents(self, fileids=None, categories=None):
        """
        Yields the sentences of the selected documents, using NLTK's
        ``sent_tokenize`` on each paragraph.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        for paragraph in self.paras(fileids):
            yield from sent_tokenize(paragraph)

    def words(self, fileids=None, categories=None):
        """
        Yields the tokens of the selected documents, using NLTK's
        ``wordpunct_tokenize`` on each sentence.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        for sentence in self.sents(fileids):
            yield from wordpunct_tokenize(sentence)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 if undefined."""
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.

        Ratio metrics are reported as 0.0 when their denominator is zero
        (e.g. for an empty selection) instead of raising
        ``ZeroDivisionError``.
        """
        started = time.time()

        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in sent_tokenize(para):
                counts['sents'] += 1

                for word in wordpunct_tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        resolved = self.resolve(fileids, categories)
        if isinstance(resolved, str):
            # A single file id given as a string is one file, not
            # len(string) files.
            resolved = [resolved]
        n_fileids = len(resolved or self.fileids())
        n_topics  = len(self.categories(resolved))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'categories': n_topics,
            'paragraphs':  counts['paras'],
            'sentences':  counts['sents'],
            'words':  counts['words'],
            'vocabulary_size':  len(tokens),
            'lexical_diversity': self._ratio(counts['words'], len(tokens)),
            'paras_per_doc':  self._ratio(counts['paras'], n_fileids),
            'words_per_doc':  self._ratio(counts['words'], n_fileids),
            'sents_per_para':  self._ratio(counts['sents'], counts['paras']),
            'secs':   time.time() - started,
        }
    
class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A categorized corpus reader for documents stored as pickle files.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up the reader. Any categorization keyword (``cat_pattern``,
        ``cat_map``, ``cat_file``) goes to ``CategorizedCorpusReader``;
        ``root`` and ``fileids`` go to ``CorpusReader``.

        Without a ``cat_*`` keyword, a category map is derived from the
        '.pickle' file names under ``root``; if that fails, categories
        fall back to the directory-name pattern.
        """
        has_cat_args = any(key.startswith('cat_') for key in kwargs)
        if not has_cat_args:
            try:
                # Preferred: categories parsed from standard-style filenames
                kwargs['cat_map'] = make_cat_map(root, 'pickle')
            except Exception as e:
                # Otherwise use directory names as categories
                print(type(e), e, "\nUnable to build category map from file names.\nFalling back to categories by directory name.")
                kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    def resolve(self, fileids, categories):
        """
        Decide which fileids to read: the files of ``categories`` when
        categories are supplied, else ``fileids`` as given (may be
        ``None``). Supplying both raises ``ValueError``.
        """
        if categories is None:
            return fileids

        if fileids is not None:
            raise ValueError("Specify fileids or categories, not both")

        return self.fileids(categories)

    def docs(self, fileids=None, categories=None):
        """
        Yield the unpickled object stored in each selected file, one file
        at a time.
        """
        selected = self.resolve(fileids, categories)

        # NOTE: pickle.load executes arbitrary code from the file; only
        # point this reader at pickles you created yourself.
        for path, _enc, _fileid in self.abspaths(selected, True, True):
            with open(path, 'rb') as handle:
                yield pickle.load(handle)

    def paras(self, fileids=None, categories=None):
        """
        Yield every paragraph, i.e. every element obtained by iterating
        over a document.
        """
        for document in self.docs(fileids, categories):
            yield from document

    def sents(self, fileids=None, categories=None):
        """
        Yield every sentence, i.e. every element obtained by iterating
        over a paragraph.
        """
        for para in self.paras(fileids, categories):
            yield from para

    def tagged(self, fileids=None, categories=None):
        """
        Yield every token entry, i.e. every element obtained by iterating
        over a sentence.
        """
        for sentence in self.sents(fileids, categories):
            yield from sentence

    def words(self, fileids=None, categories=None):
        """
        Yield the first item of every token entry produced by ``tagged``.
        """
        yield from (
            entry[0] for entry in self.tagged(fileids, categories)
        )
 

In [43]:
import nltk
import unicodedata
import numpy as np
import os

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline



def identity(words):
    """
    Hand back ``words`` exactly as received.

    Serves as a do-nothing tokenizer for input that is already a list
    of tokens.
    """
    return words


class TextNormalizer(BaseEstimator, TransformerMixin):
    """
    Transformer that turns POS-tagged documents into flat lists of
    lower-cased lemmas, dropping punctuation and stopwords.
    """

    def __init__(self, language='english'):
        """
        Parameters
        ----------
        language : name of the NLTK stopword list to use.
        """
        # scikit-learn estimators must keep every constructor argument as
        # a same-named attribute; get_params(), clone() and repr() read it.
        self.language   = language
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        """Return True if every character of ``token`` is punctuation."""
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        """Return True if the lower-cased ``token`` is a stopword."""
        return token.lower() in self.stopwords

    def normalize(self, document):
        """
        Return the list of lower-cased lemmas for ``document``, which is
        iterated as paragraphs -> sentences -> (token, tag) pairs.
        Punctuation-only tokens and stopwords are skipped.
        """
        return [
            self.lemmatize(token, tag).lower()
            for paragraph in document
            for sentence in paragraph
            for (token, tag) in sentence
            if not self.is_punct(token) and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        """
        Lemmatize ``token`` using the WordNet part of speech selected by
        the first letter of ``pos_tag``; unknown or empty tags are
        treated as nouns.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[:1], wn.NOUN)  # slice, so an empty tag cannot raise

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, X, y=None):
        """No fitting is needed; returns ``self``."""
        return self

    def transform(self, documents):
        """
        Lazily yield the normalized token list for each item in
        ``documents``. Each item is indexed with ``[0]`` to obtain the
        document itself, so items are expected to be one-element
        sequences wrapping the document.
        """
        for document in documents:
            yield self.normalize(document[0])


def create_pipeline(estimator, reduction=False, n_components=10000):
    """
    Build a text-classification pipeline around ``estimator``.

    The pipeline normalizes the documents, vectorizes the resulting token
    lists with TF-IDF, optionally reduces dimensionality with
    ``TruncatedSVD``, and finally applies the estimator.

    Parameters
    ----------
    estimator : the classifier to use as the final 'classifier' step.
    reduction : if true, insert a 'reduction' (TruncatedSVD) step before
        the classifier.
    n_components : number of components kept by the reduction step; only
        used when ``reduction`` is true.

    Returns
    -------
    An unfitted ``Pipeline``.
    """
    steps = [
        ('normalize', TextNormalizer()),
        # Documents arrive already tokenized, so the vectorizer must not
        # tokenize, preprocess or lowercase them again.
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=n_components)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


# Categories to classify by, and the corpus restricted to those categories
labels = ["M","F"]
reader = PickledCorpusReader('../data/pickled')
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

# Candidate pipelines: each linear model with and without SVD reduction,
# then the two naive Bayes variants.
models = []
for form in (LogisticRegression, SGDClassifier):
    models.extend(
        create_pipeline(form(), use_svd) for use_svd in (True, False)
    )

models.extend([
    create_pipeline(MultinomialNB(), False),
    create_pipeline(GaussianNB(), True),
])

import time
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def score_models(models, loader):
    """
    Cross-validate every pipeline in ``models`` on the folds of ``loader``.

    Yields one dictionary per model holding its string representation, a
    display name, and per-fold lists of accuracy, weighted precision,
    weighted recall, weighted F1 and elapsed fit-plus-predict seconds.
    """
    for model in models:
        classifier = model.named_steps['classifier']
        suffix = " (TruncatedSVD)" if 'reduction' in model.named_steps else ""

        scores = {
            'model': str(model),
            'name': classifier.__class__.__name__ + suffix,
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'time': [],
        }

        for X_train, X_test, y_train, y_test in loader:
            # Time training and prediction together
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            elapsed = time.time() - start

            scores['time'].append(elapsed)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))

            # The remaining metrics all use weighted averaging
            weighted_metrics = (
                ('precision', precision_score),
                ('recall', recall_score),
                ('f1', f1_score),
            )
            for key, metric in weighted_metrics:
                scores[key].append(metric(y_test, y_pred, average='weighted'))

        yield scores

if __name__ == '__main__':
    # Append each model's scores to results.json as soon as they are
    # available, one JSON object per line.
    for model_scores in score_models(models, loader):
        with open('results.json', 'a') as results_file:
            record = json.dumps(model_scores)
            results_file.write(record + "\n")


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Analysis

Offer a brief set of observations on the system you've built. Does it perform well? How so? How not? What would you try to change in order to make it better (for whatever definitions of "better" seem appropriate to you)?