# Intro to NLP course (2017 - 2018)

## Homework 1 : Tokenization and Corpus Statistics

Peter Weber and Jonatan Piñol

Objectives:

1) Load and tokenize the treebank corpus from NLTK using regexp_tokenize()
- obtain the corpus using get_corpus_t1()
- obtain the gold standard using get_gold_tokens()
- extend the existing regexp grammar to improve its coverage
- modify the corpus prior to the tokenization (if needed)
- tokenize the corpus with regexp_tokenize()
- evaluate the tokenization using evaluate_t1()
- improve the regexp grammar until satisfied with the result

2) Print basic statistics for the corpus (after the tokenization)
- The number of tokens in the corpus
- The number of types in the corpus (case insensitive!)
- The number of hapaxes - tokens that appear in the corpus only once (case insensitive!) 
- The most frequent types with length >=5 
- The average token length
- The most frequent token length in the corpus
- The number of bi-, tri-, and five-grams in the corpus (you need to write your own function for extracting five-grams)
- The most frequent bi- and tri-grams that do NOT contain punctuation (for the task, assume punctuation to be , . ! ? )
- The most frequent five-grams
- The percentage of bi-, tri-, and five-grams that appear only once
- The 10 most frequent collocates of "man" and "woman" in the corpus, within a window of 4
- The 10 most frequent collocates of "man" and "woman", with a frequency of 5 or more, according to the PPMI score (within a window of 4)


In [1]:
# Import section

# Import nltk
import nltk
from nltk import word_tokenize
from nltk import regexp_tokenize
from nltk import FreqDist
from nltk import bigrams, trigrams
from nltk.collocations import *

# Import regular expressions
import re

# Import corpora
from nltk.corpus import treebank_raw

In [2]:
## Functions given in the task
## You should not change anything here
def get_corpus_t1(nr_files=199):
    """Return the raw treebank text as one long string.

    'nr_files' limits how many corpus files are concatenated;
    the default of 199 covers the whole corpus.
    """
    reader = nltk.corpus.treebank_raw
    selected_ids = reader.fileids()[:nr_files]
    # Each file opens with a ".START" marker that is not part of the text.
    return reader.raw(selected_ids).replace(".START", "")

def fix_gold_tokens(tokens):
    """Rewrite gold tokens so they look like the raw corpus text.

    Both quote spellings ('' and ``) become a plain double quote and the
    escaped slash (backslash + slash) becomes a plain slash.
    Returns a new list; 'tokens' is not modified.
    """
    # Applied in this order to every token.
    substitutions = (("''", '"'), ("``", '"'), (r"\/", "/"))
    fixed = []
    for token in tokens:
        for old, new in substitutions:
            token = token.replace(old, new)
        fixed.append(token)
    return fixed

def get_gold_tokens(nr_files=199):
    """Return the gold-standard tokens as a list of strings.

    'nr_files' limits how many corpus files are read;
    the default of 199 covers the whole corpus.
    The tokens are passed through fix_gold_tokens() before being returned.
    """
    reader = nltk.corpus.treebank_chunk
    selected_ids = reader.fileids()[:nr_files]
    return fix_gold_tokens(reader.words(selected_ids))

def evaluate_t1(test_tokens, gold_tokens):
    """Compare 'test_tokens' against 'gold_tokens' and print a report.

    Every chunk where the two sequences differ is printed side by side,
    followed by the sizes of both sequences, the number of differing
    chunks, and precision / recall / F-score as percentages.

    A measure whose denominator is zero (e.g. both sequences empty, or no
    token in common) is reported as 0.00 instead of raising
    ZeroDivisionError. For all other inputs the output is unchanged.

    Returns None; the report is written to standard output.
    """
    import difflib

    def _ratio(numerator, denominator):
        # Degenerate case: nothing to divide by -> report 0.
        return 1.0 * numerator / denominator if denominator else 0.0

    matcher = difflib.SequenceMatcher()
    matcher.set_seqs(test_tokens, gold_tokens)
    error_chunks = true_positives = false_positives = false_negatives = 0
    print(" Token%30s | %-30sToken" % ("Error", "Correct"))
    print("-" * 38 + "+" + "-" * 38)
    for difftype, test_from, test_to, gold_from, gold_to in matcher.get_opcodes():
        if difftype == "equal":
            true_positives += test_to - test_from
            continue
        # Any non-equal opcode is one error chunk: tokens on the test side
        # are spurious, tokens on the gold side were missed.
        false_positives += test_to - test_from
        false_negatives += gold_to - gold_from
        error_chunks += 1
        test_chunk = " ".join(test_tokens[test_from:test_to])
        gold_chunk = " ".join(gold_tokens[gold_from:gold_to])
        print("%6d%30s | %-30s%d" % (test_from, test_chunk, gold_chunk, gold_from))
    precision = _ratio(true_positives, true_positives + false_positives)
    recall = _ratio(true_positives, true_positives + false_negatives)
    fscore = _ratio(2.0 * precision * recall, precision + recall)
    print()
    print("Test size: %5d tokens" % len(test_tokens))
    print("Gold size: %5d tokens" % len(gold_tokens))
    print("Nr errors: %5d chunks" % error_chunks)
    print("Precision: %5.2f %%" % (100 * precision))
    print("Recall: %5.2f %%" % (100 * recall))
    print("F-score: %5.2f %%" % (100 * fscore))
    print()

In [3]:
# HOMEWORK 1. PART 1.
# Dummy function
# Feel free to make it more verbose and include prints/status updates
def hw1_part1():
    """Tokenize the raw corpus with a regexp grammar and evaluate the result.

    The corpus is fetched with get_corpus_t1(), lightly pre-processed,
    tokenized with regexp_tokenize() and compared against the gold tokens
    from get_gold_tokens() via evaluate_t1(), which prints the report.

    Returns the list of tokens produced by the tokenizer.
    """
    # Get the corpus
    print("\n------------------------- FIRST PART ---------------------------------")
    corpus = get_corpus_t1()
    # Detach the clitic "n't" from its host word so the grammar below can
    # match it as a token of its own.
    corpus = corpus.replace("n't", " n't")

    # Get the gold standard
    gold_tokens = get_gold_tokens()

    # Regular expression grammar. Alternatives are tried in order, so the
    # more specific patterns must come before the generic word pattern.
    #
    # The final character class used to contain ':-_', which the regex
    # engine reads as the RANGE ':' .. '_' rather than three literals: a
    # lone '-' was never matched, while '<', '=', '>', '@', '\' and '^'
    # were matched by accident. The hyphen is now escaped and the
    # accidentally matched characters are listed explicitly, so every
    # token the old class produced is still produced.
    re_grammar = r'''(?x)  # set flag to allow verbose regexps
     \'[a-z][a-z]?
     | (?:n\'t)                           # n't
     | Corp\. | Calif\. | Sept\. | Conn\. # usual abbreviations
     |[A-Z][a-z]{1,2}\.                  # more abreviations
     | [A-Z]*[a-z]+\/[A-Z]*[a-z]+
     | \d+(?:\,\d+)*\-\w+\-?\w*         # 19-years-old
     | [A-Z][A-Z]?\&[A-Z]
     | [\$\%]                           # dollar symbol and percentage symbol
     | \'?[0-9]+s                       # 1950s
     | \d+\:\d{2}\s[ap]\.m              # 12:30 p.m
     | \d+(?:[\.\,\/\-]\d+)?(?:[\,\-\/\-]\d+)?           # numbers, e.g. 12.40, 82,2, 2/7,9-0, 122,222,222
     | (?:[A-Z]\.)+\.?                  # abbreviations, e.g. U.S.A.
     | (?:[a-z]\.)+\.?
     | \w+(?:-\w+)*                     # words with optional internal hyphens
     | \.\.\.                           # ellipsis
     | \-\-
     | [][.,;"'?!():\-_`#&{}<=>@\\^]    # these are separate tokens; includes ], [

    '''

    # Tokenize the corpus
    test_tokens = regexp_tokenize(corpus, re_grammar)

    # Evaluate the results
    evaluate_t1(test_tokens, gold_tokens)

    return test_tokens

In [4]:
# HOMEWORK 1. PART 2.
# Dummy function
# Feel free to make it more verbose and include prints/status updates
def _is_punctuation(token):
    """Return True when 'token' consists only of the characters , . ! ?"""
    return all(char in ",.!?" for char in token)


def _extract_ngrams(sequence, n):
    """Return every contiguous n-gram of the list 'sequence' as a tuple."""
    return list(zip(*(sequence[start:] for start in range(n))))


def _percentage(part, whole):
    """Return 'part' as a percentage of 'whole' (0.0 when 'whole' is zero)."""
    return (part / whole) * 100 if whole else 0.0


def hw1_part2(tokens):
    """Print corpus statistics for the given list of token strings.

    All statistics are case-insensitive: the tokens are lower-cased first.
    Printed, in order: token / type / hapax counts, the most frequent long
    types, average and most frequent token length, n-gram counts, the most
    frequent punctuation-free bi- and tri-grams, the most frequent
    five-grams, the share of n-grams occurring once, and the collocates of
    "man" and "woman" ranked by raw frequency and by PPMI.

    Returns None; everything is written to standard output.
    """
    print("\n-------------------------- SECOND PART -----------------------------")
    print("\nThis function prints the corpus statistics for the SECOND PART of the HOMEWORK 1")
    print("\n -------------------------------------------------------------------------------")

    # Lower-case everything once; the n-gram statistics are assumed to be
    # case-insensitive as well.
    words = [w.lower() for w in tokens]

    # number of tokens in the corpus
    n_tokens = len(words)
    print("\nThe number of tokens is : ",n_tokens)

    # number of types
    n_types = len(set(words))
    print("\nThe number of types is : ",n_types)

    # number of hapaxes
    word_dist = FreqDist(words)
    n_hapaxes = len(word_dist.hapaxes())
    print("\nThe number of hapaxes is : ",n_hapaxes)

    # most frequent types with length >= 5
    long_word_dist = FreqDist(w for w in words if len(w) >= 5)
    frequent_long_types = long_word_dist.most_common(10)
    print("\nThe most frequent types with length >= 5 are (case insensitive!!): ",frequent_long_types)

    # average token length (0.0 for an empty corpus instead of crashing)
    average_length = sum(len(w) for w in words) / n_tokens if n_tokens else 0.0
    print("\nThe average token length is: ",average_length)

    # the most frequent token length in text (None for an empty corpus)
    top_length = FreqDist(len(w) for w in words).most_common(1)
    most_frequent_length = top_length[0][0] if top_length else None
    print("\nThe most frequent token length in text: ",most_frequent_length)

    # the number of bi-grams, tri-grams, five-grams
    bigr_list = list(bigrams(words))
    print("\nThe number of bigrams is : ",len(bigr_list))
    trigr_list = list(trigrams(words))
    print("\nThe number of trigrams is : ",len(trigr_list))
    # The assignment asks for a hand-written five-gram extractor.
    fivegr_list = _extract_ngrams(words, 5)
    print("\nThe number of fivegrams is : ",len(fivegr_list))

    # The most frequent bi- and tri-grams that do NOT contain punctuation
    # (punctuation is assumed to be , . ! ?).
    # The n-grams themselves are filtered. Removing punctuation tokens
    # first and then pairing the survivors would join words across
    # sentence boundaries into n-grams that never occurred in the text.
    clean_bigrams = [gram for gram in bigr_list
                     if not any(_is_punctuation(w) for w in gram)]
    print("\nMost frequent bigram", FreqDist(clean_bigrams).most_common(10))
    clean_trigrams = [gram for gram in trigr_list
                      if not any(_is_punctuation(w) for w in gram)]
    print("\nMost frequent trigram",FreqDist(clean_trigrams).most_common(10))

    # The most frequent five-grams (the task does not exclude punctuation
    # here, so these may contain it).
    fivegr_dist = FreqDist(fivegr_list)
    print("\nMost frequent fivegram",fivegr_dist.most_common(10))

    # The percentage of bi-, tri-, and five-grams that appear only once
    # (these may contain punctuation).
    bigr_onetime_perc = _percentage(len(FreqDist(bigr_list).hapaxes()), len(bigr_list))
    print("\nPercentage of bigrams that appear once ",bigr_onetime_perc,"%" )

    trigr_onetime_perc = _percentage(len(FreqDist(trigr_list).hapaxes()), len(trigr_list))
    print("\nPercentage of trigrams that appear once ",trigr_onetime_perc,"%" )

    fivegr_onetime_perc = _percentage(len(fivegr_dist.hapaxes()), len(fivegr_list))
    print("\nPercentage of fivegrams that appear once ",fivegr_onetime_perc,"%" )

    # The 10 most frequent collocates of "man" and "woman" in the corpus,
    # within a window of 4. NLTK pairs each word with the words that follow
    # it inside a window of 'window_size' tokens, so window_size=5 reaches
    # the 4 following words.
    finder = BigramCollocationFinder.from_words(words, window_size=5)

    man_by_freq = []
    woman_by_freq = []
    for pair, freq in finder.ngram_fd.items():
        # Two independent tests: a ('man', 'woman') pair belongs to both lists.
        if 'man' in pair:
            man_by_freq.append([pair, freq])
        if 'woman' in pair:
            woman_by_freq.append([pair, freq])
    man_by_freq.sort(key=lambda entry: entry[1], reverse=True)
    woman_by_freq.sort(key=lambda entry: entry[1], reverse=True)
    print("\nCo-occurrence of 'man' in a window of 4 : ",man_by_freq[:10])
    print("\nCo-occurrence of 'woman' in a window of 4 : ",woman_by_freq[:10])

    # The 10 top collocates of "man" and "woman", with a co-occurrence
    # frequency of 5 or more, ranked by PPMI (within a window of 4).
    # The frequency threshold is checked against the pair count, not the
    # score, and PPMI is PMI clamped at zero. score_ngrams() yields pairs
    # from highest to lowest score, so the lists are already ranked.
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    man_by_ppmi = []
    woman_by_ppmi = []
    for pair, pmi in finder.score_ngrams(bigram_measures.pmi):
        if finder.ngram_fd[pair] < 5:
            continue
        ppmi = max(pmi, 0.0)
        if 'man' in pair:
            man_by_ppmi.append([pair, ppmi])
        if 'woman' in pair:
            woman_by_ppmi.append([pair, ppmi])
    print("\nThe 10 most frequent collocates of 'man', with a frequency of 5 or more, according to the PPMI score :",man_by_ppmi[:10])
    print("\nThe 10 most frequent collocates of 'woman', with a frequency of 5 or more, according to the PPMI score :",woman_by_ppmi[:10])

In [5]:
# Main program: run Part 1 (tokenize the corpus and print the evaluation),
# then feed the tokens it returns into Part 2 (corpus statistics).
tokens = hw1_part1()
hw1_part2(tokens)


------------------------- FIRST PART ---------------------------------


  return [tok for tok in self._regexp.split(text) if tok]


 Token                         Error | Correct                       Token
--------------------------------------+--------------------------------------
   444                        Mass . | Mass.                         444
   818                      Donoghue |                               817
  1300                         S. p. | S.p .                         1298
  1499                         Corp. | Corp .                        1497
  2700                        Mass . | Mass.                         2699
  2757                         Conn. | Conn .                        2755
  2917                         . . . | ...                           2916
  3330                         . . . | ...                           3327
  4502                          Ltd. | Ltd .                         4497
  4741                        Colo . | Colo.                         4737
  4790                      Messrs . | Messrs.                       4785
  6852                          US 

------------------------

--------

---------