In [1]:
# Labels produced by the sentence-level gender classifier.
MALE, FEMALE, UNKNOWN, BOTH = 'male', 'female', 'unknown', 'both'

# Lower-cased tokens that mark a sentence as referring to men.
MALE_WORDS = {
    'guy', 'spokesman', 'chairman', "men's", 'men', 'him', "he's", 'his',
    'boy', 'boyfriend', 'boyfriends', 'boys', 'brother', 'brothers', 'dad',
    'dads', 'dude', 'father', 'fathers', 'fiance', 'gentleman', 'gentlemen',
    'god', 'grandfather', 'grandpa', 'grandson', 'groom', 'he', 'himself',
    'husband', 'husbands', 'king', 'male', 'man', 'mr', 'nephew', 'nephews',
    'priest', 'prince', 'son', 'sons', 'uncle', 'uncles', 'waiter', 'widower',
    'widowers',
}

# Lower-cased tokens that mark a sentence as referring to women.
# NOTE(review): contains 'queens' but not 'queen' -- confirm whether that is intended.
FEMALE_WORDS = {
    'heroine', 'spokeswoman', 'chairwoman', "women's", 'actress', 'women',
    "she's", 'her', 'aunt', 'aunts', 'bride', 'daughter', 'daughters', 'female',
    'fiancee', 'girl', 'girlfriend', 'girlfriends', 'girls', 'goddess',
    'granddaughter', 'grandma', 'grandmother', 'herself', 'ladies', 'lady',
    'mom', 'moms', 'mother', 'mothers', 'mrs', 'ms', 'niece', 'nieces',
    'priestess', 'princess', 'queens', 'she', 'sister', 'sisters', 'waitress',
    'widow', 'widows', 'wife', 'wives', 'woman',
}

In [19]:
def genderize(words):
    """
    Classify a collection of tokens by the gendered words it contains.

    Returns MALE or FEMALE when only one word list is hit, BOTH when
    both are hit, and UNKNOWN when neither is.
    """
    has_male = bool(MALE_WORDS.intersection(words))
    has_female = bool(FEMALE_WORDS.intersection(words))

    if has_male and has_female:
        return BOTH
    if has_male:
        return MALE
    if has_female:
        return FEMALE
    return UNKNOWN

In [20]:
from collections import Counter

def count_gender(sentences):
    """
    Tally sentences and tokens per gender label.

    Each item of ``sentences`` is passed to ``genderize``; the returned
    label is credited with one sentence and ``len(item)`` words.
    Returns a ``(sentence_counts, word_counts)`` pair of Counters.
    """
    sentence_counts, word_counts = Counter(), Counter()

    for tokens in sentences:
        label = genderize(tokens)
        sentence_counts[label] += 1
        word_counts[label] += len(tokens)

    return sentence_counts, word_counts

In [21]:
import nltk

def parse_gender(text):
    """
    Print, for every gender label found in ``text``, the percentage of
    all tokens that sit in sentences with that label and the number of
    such sentences.
    """
    # split into sentences, then into lower-cased word tokens
    tokenized = []
    for raw_sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(raw_sentence)
        tokenized.append([token.lower() for token in tokens])

    sentence_counts, word_counts = count_gender(tokenized)
    word_total = sum(word_counts.values())

    for label, n_words in word_counts.items():
        share = (n_words / word_total) * 100
        line = '{}% {} ({} sentences)'.format(share, label, sentence_counts[label])
        print(line)

In [22]:
# Text from the New York Times article entitled “Rehearse, Ice Feet, Repeat: The Life of a New York City Ballet Corps Dancer”

# Sample text for parse_gender. It is written as two adjacent string literals
# (joined by the parser) because a double-quoted literal cannot span a physical
# line break; the paragraph break is kept as an explicit "\n".
article = (
    "With apologies to James Brown, the hardest working people in show business may well be ballet dancers. And at New York City Ballet, none work harder than the dancers in its lowest rank, the corps de ballet. During the first week of the company’s winter season, Claire Kretzschmar, 24, a rising corps member, danced in all seven performances, appearing in five ballets, sometimes changing costumes at intermission to dance two roles in a night.     But her work onstage did not even begin to capture the stamina required to be in the corps. Spending a week shadowing Ms. Kretzschmar was exhausting — she gave new meaning to the idea of being on your feet all day. Twelve-hour days at the David H. Koch Theater, the company’s Lincoln Center home, were hardly unusual: Company class each morning was followed by back-to-back-to-back rehearsals, with occasional breaks for costume fittings or physical therapy, and then by the hair-makeup-costume-dance routine of daily performances. This weekend will be even more frenetic. Ms. Kretzschmar will appear in seven ballets from Friday evening to Sunday afternoon, when she faces a new test: taking on the title role of the Sleepwalker in George Balanchine’s eerie, proto-goth ballet “La Sonnambula.” Balanchine, one of ballet’s most important choreographers, was a founder of City Ballet, and remains its guiding spirit more than three decades after his death. Being in City Ballet’s corps is not like being a member of a chorus line, or a backup singer. The company promotes almost all of its stars, the principal dancers, from within. So while the corps is expected to be able to move in startling unison, like a school of fish, and to assemble in straight lines and keep all rippling swan arms parallel, its 54 members are also competing for bigger roles and promotions. \n"
    "“There is an element of competition, and people get different opportunities, but everybody just wants to do their best onstage, and everyone wants each other to just do their best onstage,” Ms. Kretzschmar said during a break between rehearsals last week. “We have all experienced so many extreme highs and lows that it’s almost that you have to bond with this group of people. Here are scenes from one week in the busy life of a corps member."
)


In [23]:
# Print, for each gender label, its share of the article's words and its sentence count.
parse_gender(article)

73.57630979498862% unknown (11 sentences)
9.79498861047836% female (2 sentences)
16.62870159453303% both (1 sentences)


In [25]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Relative path of a document: a category directory (word characters or
# whitespace, not starting with a dot), a slash, then a ".txt" file name.
# The character class previously read [\w)\s], which also accepted a literal
# ")" -- a typo for "_" (already covered by \w).
DOC_PATTERN = r'(?!\.)[\w\s]+/[\w\s\d\-]+\.txt'
# Captures the leading directory of a document path as its category.
CAT_PATTERN = r'([\w\s]+)/.*'

# corpus = CategorizedPlaintextCorpusReader(
#     '/path/to/corpus/root', DOC_PATTERN, cat_pattern=CAT_PATTERN)

In [27]:
import codecs
import os

from nltk.corpus.reader.api import CategorizedCorpusReader
from nltk.corpus.reader.api import CorpusReader

# Captures the leading directory of a document path as its category.
# The character class previously read [\w)\s], which also accepted a literal
# ")" -- a typo for "_" (already covered by \w).
CAT_PATTERN = r'([\w\s]+)/.*'
# Relative path of a document: a category directory (not starting with a
# dot), a slash, then a ".txt" file name.
DOC_PATTERN = r'(?!\.)[\w\s]+/[\w\s\d\-]+\.txt'

# HTML tag names: headings h1-h7, paragraphs and list items.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Corpus reader for raw HTML documents, used as the entry point for
    preprocessing. Documents are selected either by file id or by category.
    """

    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                tags=TAGS, **kwargs):
        """
        root:     corpus root directory, passed to CorpusReader
        fileids:  file id list or regex selecting the documents
        encoding: text encoding of the documents
        tags:     HTML tag names stored on the instance as ``self.tags``
        kwargs:   category options (any ``cat_*`` argument) forwarded to
                  CategorizedCorpusReader
        """
        # add default category pattern if not passed into the class
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # initialize NLTK corpus reader objects; CategorizedCorpusReader
        # takes the options as a single dict, not as keyword arguments
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # save the tags we specifically want to extract
        self.tags = tags

    def resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) pair into the file ids to read.

        Returns the file ids of ``categories`` when categories are given,
        otherwise ``fileids`` unchanged (possibly None). Raises ValueError
        when both are specified.
        """
        if fileids is not None and categories is not None:
            raise ValueError('specify fileids or categories, not both')

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Yield the complete text of each selected document, opening and
        closing one file at a time so only one document is held in memory.
        """
        # resolve fileids and categories
        fileids = self.resolve(fileids, categories)

        # create generator, loading one doc into memory at a time
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def sizes(self, fileids=None, categories=None):
        """
        Yield the size on disk of each selected file, one value per file
        in the order returned by ``abspaths``. Useful for spotting oddly
        large files in the corpus.
        """
        fileids = self.resolve(fileids, categories)

        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

In [28]:
import sqlite3

class SqliteCorpusReader(object):
    """
    Reads review ids, scores and texts from a SQLite database that has
    ``content`` and ``reviews`` tables.

    Every accessor yields the rows of its query exactly as sqlite3 returns
    them, i.e. as one-element tuples.
    """

    def __init__(self, path):
        # Keep the connection so each query can run on its own cursor and
        # so the database can be closed explicitly.
        self._conn = sqlite3.connect(path)
        # Kept for backward compatibility with code that used ``_cur``.
        self._cur = self._conn.cursor()

    def _select(self, query):
        # Run ``query`` on a dedicated cursor so that several accessors can
        # be consumed at the same time (e.g. zip(ids(), texts())) without
        # one query overwriting the pending results of another.
        cursor = self._conn.cursor()
        cursor.execute(query)
        for row in iter(cursor.fetchone, None):
            yield row

    def close(self):
        """
        closes the underlying database connection
        """
        self._conn.close()

    def ids(self):
        """
        returns review ids, which enable joins to other metadata
        """
        return self._select('SELECT reviewid FROM content')

    def scores(self):
        """
        returns review score to be used as the target for
        later supervised learning problems
        """
        return self._select('SELECT score FROM reviews')

    def texts(self):
        """
        returns full review texts, to be preprocessed and
        vectorized for supervised learning
        """
        return self._select('SELECT content FROM content')
        

In [33]:
from readability.readability import Unparseable
from readability.readability import Document as Paper

def html(self, fileids=None, categories=None):
    """
    yields the HTML content of each document after cleaning it with
    the readability-lxml library; documents that cannot be parsed are
    reported on stdout and skipped
    """
    for raw_doc in self.docs(fileids, categories):
        try:
            yield Paper(raw_doc).summary()
        except Unparseable as err:
            print('Could not parse HTML: {}'.format(err))
    

In [34]:
import bs4

# HTML tag names: headings h1-h7, paragraphs and list items (same values as the
# TAGS list defined earlier next to HTMLCorpusReader).
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

def paras(self, fileids=None, categories=None):
    """
    uses BeautifulSoup to parse paragraphs from HTML: yields the text of
    every element whose tag name is listed in ``self.tags``
    """
    for markup in self.html(fileids, categories):
        soup = bs4.BeautifulSoup(markup, 'lxml')
        try:
            # the tag list lives on the reader instance (set in __init__);
            # the bare name ``tags`` was never defined
            for element in soup.find_all(self.tags):
                yield element.text
        finally:
            # destroy the tree when done with each file (the call
            # parentheses were missing, so nothing was freed before)
            soup.decompose()

In [35]:
from nltk import sent_tokenize
from nltk import wordpunct_tokenize
from nltk import pos_tag, sent_tokenize

def sents(self, fileids=None, categories=None):
    """
    yields every sentence of every paragraph, splitting each paragraph
    returned by ``self.paras`` with NLTK's built in sentence tokenizer
    """
    for para in self.paras(fileids, categories):
        yield from sent_tokenize(para)
            
def words(self, fileids=None, categories=None):
    """
    yields every token of every sentence, splitting each sentence
    returned by ``self.sents`` with ``wordpunct_tokenize``
    """
    for sent in self.sents(fileids, categories):
        yield from wordpunct_tokenize(sent)
            
def tokenize(self, fileids=None, categories=None):
    """
    segments, tokenizes and tags the selected documents: yields one list
    per paragraph, holding a list of (token, tag) pairs per sentence
    """
    # forward categories as well; it used to be dropped, so selecting by
    # category silently tokenized whatever ``fileids`` alone resolved to
    for paragraph in self.paras(fileids, categories):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]

In [36]:
import time

def describe(self, fileids=None, categories=None):
    """
    performs a single pass over the corpus and returns a dict with a
    variety of metrics concerning the state of the corpus

    Each paragraph from ``self.paras`` is iterated as a sequence of
    sentences, and each sentence as a sequence of (word, tag) pairs.

    Keys: files, topics, paras, sents, words, vocab, lexdiv (words per
    distinct word), ppdoc (paragraphs per file), sppar (sentences per
    paragraph) and secs (elapsed seconds). A ratio whose denominator is
    zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    started = time.time()

    # plain counters are enough: only totals and the vocabulary size are used
    n_paras = n_sents = n_words = 0
    vocabulary = set()

    # perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        n_paras += 1

        for sent in para:
            n_sents += 1

            for word, tag in sent:
                n_words += 1
                vocabulary.add(word)

    # compute number of files and categories in corpus
    resolved = self.resolve(fileids, categories)
    n_fileids = len(resolved or self.fileids())
    n_topics = len(self.categories(resolved))

    def _ratio(numerator, denominator):
        # guard against an empty corpus or selection
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    # return data structure with info
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': n_paras,
        'sents': n_sents,
        'words': n_words,
        'vocab': len(vocabulary),
        'lexdiv': _ratio(n_words, len(vocabulary)),
        'ppdoc': _ratio(n_paras, n_fileids),
        'sppar': _ratio(n_sents, n_paras),
        'secs': time.time() - started,
    }

In [37]:
import os
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
import pickle

class Preprocessor(object):
    """
    preprocessor wraps an 'HTMLCorpusReader' and performs tokenization
    and part-of-speech tagging, writing each document as a pickle file
    under a target directory that mirrors the corpus layout
    """
    def __init__(self, corpus, target=None, **kwargs):
        # corpus: reader providing resolve/fileids/abspath/root/paras
        # target: directory the pickles are written to
        # kwargs: accepted but not used
        self.corpus = corpus
        self.target = target

    def fileids(self, fileids=None, categories=None):
        """
        returns the file ids selected by fileids/categories, or every
        file id of the corpus when the selection is empty
        """
        fileids = self.corpus.resolve(fileids, categories)
        if fileids:
            return fileids
        return self.corpus.fileids()

    def abspath(self, fileid):
        """
        returns the output path for fileid: same directory relative to
        the corpus root and same base name, but under the target
        directory and with a '.pickle' extension
        """
        # find directory, relative to corpus root
        parent = os.path.relpath(
            os.path.dirname(self.corpus.abspath(fileid)), self.corpus.root)

        # compute the name parts to reconstruct
        basename = os.path.basename(fileid)
        name, ext = os.path.splitext(basename)

        # create pickle file extension
        basename = name + '.pickle'

        # return path to file relative to the target
        return os.path.normpath(os.path.join(self.target, parent, basename))

    def tokenize(self, fileid):
        """
        yields one list per paragraph of fileid, holding a list of
        (token, tag) pairs per sentence
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ]

    def process(self, fileid):
        """
        for a single file, checks the output location on disk, uses
        tokenize() to perform preprocessing and writes the transformed
        doc as a pickle to the target location; returns the path written.
        raises ValueError when the output parent exists but is not a
        directory
        """
        # compute outpath to write the file to
        target = self.abspath(fileid)
        parent = os.path.dirname(target)

        # make sure directory exists
        if not os.path.exists(parent):
            os.makedirs(parent)

        # make sure that the parent is a directory and not a file
        if not os.path.isdir(parent):
            raise ValueError('Please supply a directory to write preprocessed data to.')

        # create a data structure for the pickle
        document = list(self.tokenize(fileid))

        # open and serialize the pickle to disk
        # (was ``pickle.dmp``, which raised AttributeError on every call)
        with open(target, 'wb') as f:
            pickle.dump(document, f, pickle.HIGHEST_PROTOCOL)

        # clean up the document
        del document

        return target

    def transform(self, fileids=None, categories=None):
        """
        processes every selected file, yielding each output path as it
        is written
        """
        # make target directory if it doesn't already exist
        if not os.path.exists(self.target):
            os.makedirs(self.target)

        # resolve fileids to start processing
        for fileid in self.fileids(fileids, categories):
            yield self.process(fileid)

In [39]:
import pickle

# Relative path pattern for pickled documents: a category directory made of
# lowercase letters, underscores or whitespace (not starting with a dot), then
# a file name of lowercase hex digits ending in ".pickle".
# NOTE(review): Preprocessor.abspath keeps the source file's base name, which
# this pattern only matches when that name is a hex string -- confirm the
# corpus file naming.
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'

class PickledCorpusReader(HTMLCorpusReader):
    """
    Corpus reader for pickled documents. Each document is iterated as
    paragraphs, each paragraph as sentences, and each sentence as tagged
    tokens whose first item is the word.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        # fall back to the default category pattern when the caller
        # passed no cat_* option
        has_cat_option = any(name.startswith('cat_') for name in kwargs)
        if not has_cat_option:
            kwargs['cat_pattern'] = CAT_PATTERN
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    def docs(self, fileids=None, categories=None):
        """yields each selected document, unpickled one file at a time"""
        selected = self.resolve(fileids, categories)
        for pickle_path in self.abspaths(selected):
            # NOTE: pickle.load can execute arbitrary code from the file;
            # only read corpora from a trusted source.
            with open(pickle_path, 'rb') as handle:
                yield pickle.load(handle)

    def paras(self, fileids=None, categories=None):
        """yields every paragraph of every selected document"""
        for document in self.docs(fileids, categories):
            yield from document

    def sents(self, fileids=None, categories=None):
        """yields every sentence of every paragraph"""
        for paragraph in self.paras(fileids, categories):
            yield from paragraph

    def tagged(self, fileids=None, categories=None):
        """yields every tagged token of every sentence"""
        for sentence in self.sents(fileids, categories):
            yield from sentence

    def words(self, fileids=None, categories=None):
        """yields the first item (the word) of every tagged token"""
        for tagged_token in self.tagged(fileids, categories):
            yield tagged_token[0]

In [40]:
import nltk
import string

def tokenize(text):
    """
    Lower-case ``text``, split it into word tokens and yield the Snowball
    (English) stem of every token that is not punctuation.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # was ``string.punctation``, which raised AttributeError on the
        # first token. This is a substring test against the punctuation
        # string, so single punctuation characters are always skipped.
        if token in string.punctuation:
            continue
        yield stem.stem(token)
        
# Toy corpus: three short documents, each a plain string.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]