Introduction to NLP course (2017-2018).

Homework 2.1: Markov Models. Hidden Markov Models and Part of Speech Tagging.

Objectives:

1) Create a tri-gram model for generating pseudo-Trump sentences 
- load the corpus, tokenize it and obtain list of trigrams 
- define a function that obtains the counts of the "model" 
- define a function that generates a pseudo-sentence 
- when generating a sentence, make sure that your sentence fulfils the following requirements
    - it is at least 5 words long
    - the last token of the pseudo-sentence is a ".", "!", or "?"
    - it does not contain any other ".", "!", "?" tokens other than the final one
- print 5 pseudo-sentences

2) Use the built-in n-gram HMM models in nltk to tag a corpus 
- load the brown corpus
- split each category in the corpus to test and train
- for each category in the corpus, train on the train set and evaluate on the test set the following taggers:
    - default
    - affix
    - unigram
    - bigram
    - trigram
    
    Each tagger should have backoff configured on the previous tagger.
    
    Print the results in a table.
    
    
- repeat the previous experiment using universal tagset. Print the results in a table.
- cross evaluate between different genres (train on one category, evaluate on all the other categories). Print and compare the results
- Only for the "news" portion of the corpus, compare
    - the best performing tagger (with backoff)
    - the naive bayes tagger
    
    Compare the accuracy as well as the execution time.
    
    Use both the universal tagset and the full tagset.

In [75]:
# Import section

# Import nltk
import nltk
from nltk import bigrams, trigrams

# Import numpy
import numpy as np
import pandas as pd

# Import codecs
import codecs

# Import taggers
from nltk import DefaultTagger, AffixTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk import ClassifierBasedPOSTagger

# Import the brown corpus
from nltk.corpus import brown

import time

In [76]:
# Homework 2 part 1

# Dummy function
# Extend and rework

def hw2_part1():
    """Load and tokenize the speeches corpus (skeleton for part 1).

    Reads ``speeches.txt`` as UTF-8 from the current working directory and
    tokenizes it with ``nltk.word_tokenize``. The trigram model, sentence
    generation and printing steps are still placeholders below.
    """
    # Trump speeches file location
    fname = "speeches.txt"
    # Read the corpus; the context manager closes the file handle even if
    # reading/decoding raises.
    with codecs.open(fname, 'r', 'utf8') as corpus_file:
        raw_corpus = corpus_file.read()

    # Tokenize the corpus
    corpus = nltk.word_tokenize(raw_corpus)

    # Generate list of trigrams


    # Initialize the "markov model"
    # Preferably, you should define a function (or an object)
    # which "trains" the model. You should just invoke the function here.


    # Fill in all the counts 


    # Generate a sentence
    # Preferably you should define a function (or an object)
    # which "generates" a sentence following the requirements (min length, ending with punctuation, etc)
    # You should just invoke the code here


    # Print the sentences





In [83]:
# Homework 2 part 2

# Function that splits a corpus in train and test
def split_train_test(corpus, test_size=500):
    """Split a sliceable sequence into ``(train, test)``.

    The first ``test_size`` items form the test part; everything after
    them forms the training part.
    """
    training_part = corpus[test_size:]
    held_out_part = corpus[:test_size]
    return training_part, held_out_part

def split_each_category_in_train_and_test(corpus, fraction_ts, tagset=None):
    """Build per-category train/test splits of a tagged corpus.

    For every category reported by ``corpus.categories()`` the tagged
    sentences are fetched (with the given ``tagset``) and handed to
    ``split_train_test``; the test size is ``fraction_ts`` times the number
    of sentences, truncated to an integer.

    Returns a pair ``(train_sets, test_sets)`` of dictionaries keyed by
    category name.
    """
    train_sets, test_sets = {}, {}
    for category in corpus.categories():
        tagged = corpus.tagged_sents(categories=category, tagset=tagset)
        n_test = int(fraction_ts * len(tagged))
        # split_train_test returns (train, test) in that order
        train_sets[category], test_sets[category] = split_train_test(tagged, test_size=n_test)

    return train_sets, test_sets

def get_most_common_tag(train_set):
    """Return the tag that occurs most often in ``train_set``.

    ``train_set`` is iterated as sentences, each of which is iterated as
    ``(word, tag)`` pairs.
    """
    # Flatten sentences into a single list of tags
    all_tags = []
    for sentence in train_set:
        for (_word, tag) in sentence:
            all_tags.append(tag)
    # The frequency distribution picks the most frequent tag
    tag_counts = nltk.FreqDist(all_tags)
    return tag_counts.max()

# For each category, train and evaluate taggers. Use backoff.
def train_and_evaluate_taggers(taggers_list, corpus, train_sets, test_sets):
    """Train a backoff chain of taggers per category and score every link.

    Parameters:
        taggers_list: tagger classes, in chaining order. ``DefaultTagger``
            is built from the most common training tag; every other class
            is called as ``tagger(train_sentences, backoff=previous_model)``.
        corpus: object whose ``categories()`` lists the categories to
            process (these are also the keys of the two dictionaries).
        train_sets: dict mapping category -> training sentences.
        test_sets: dict mapping category -> test sentences.

    Returns:
        ``(benchmarks_df, models_df)``: two DataFrames indexed by category
        with one column per tagger class name, holding the accuracy in
        percent (two decimals) and the trained model respectively.
    """
    # Use the corpus that was passed in rather than a hard-coded global.
    categories = corpus.categories()

    # accuracy of each tagger on each category
    benchmarks = {t.__name__: [] for t in taggers_list}
    # models trained on each category
    models = {t.__name__: [] for t in taggers_list}

    for category in categories:
        # Restart the chain for every category: without this, a list that
        # does not begin with DefaultTagger would either fail on the first
        # category or back off to a model trained on the previous category.
        tagger_model = None
        for tagger in taggers_list:
            if tagger is DefaultTagger:
                # the default tagger does not have a backoff tagger
                tagger_model = tagger(get_most_common_tag(train_sets[category]))
            else:
                # each tagger takes as backoff tagger the previous tagger
                tagger_model = tagger(train_sets[category], backoff=tagger_model)
            # Scale first, then round: rounding before multiplying by 100
            # leaves float noise such as 90.17000000000002.
            accuracy = round(tagger_model.evaluate(test_sets[category]) * 100, 2)
            benchmarks[tagger.__name__].append(accuracy)
            models[tagger.__name__].append(tagger_model)

    # statistics and models for all taggers and all categories
    benchmarks_df = pd.DataFrame(benchmarks, index=categories)
    models_df = pd.DataFrame(models, index=categories)
    return benchmarks_df, models_df

# Dummy function
# Extend and rework
def hw2_part2():
    """Run the tagging experiments of part 2 and print the results.

    Steps performed:
      1. Split every Brown category into train/test and benchmark the
         backoff chain of taggers with the full Brown tagset.
      2. Repeat with ``tagset='universal'``.
      3. On the "news" category, for both tagsets, print accuracy and
         evaluation wall-clock time of the stored BigramTagger and of a
         freshly trained ``ClassifierBasedPOSTagger`` (labelled
         "Naive Bayes" in the output).

    Only evaluation is timed; training time is not included.
    """

    def report(label, tagger, test_set):
        # Evaluate `tagger` on `test_set` and print accuracy (percent) and
        # elapsed evaluation time, both rounded to two decimals. A single
        # pre-formatted string is printed so the output is identical under
        # Python 2 and Python 3.
        tic = time.time()
        accuracy = tagger.evaluate(test_set) * 100
        runtime = round(time.time() - tic, 2)
        print("%s Accuracy: %s Evaluation runtime (s): %s"
              % (label, round(accuracy, 2), runtime))

    # list of taggers to benchmark
    taggers = [DefaultTagger, AffixTagger, UnigramTagger, BigramTagger, TrigramTagger]
    fraction_ts = 0.2  # fraction of each category held out as test set

    ### BROWN TAGSET
    # Split each category in the brown corpus into train and test
    train_sets_full, test_sets_full = split_each_category_in_train_and_test(brown, fraction_ts)
    brown_tagset_df, models_df_full = train_and_evaluate_taggers(taggers, brown, train_sets_full, test_sets_full)
    print(brown_tagset_df)

    ### UNIVERSAL TAGSET
    # Split each category in the brown corpus into train and test using tagset='universal'
    train_sets_uni, test_sets_uni = split_each_category_in_train_and_test(brown, fraction_ts, tagset='universal')
    universal_tagset_df, models_df_uni = train_and_evaluate_taggers(taggers, brown, train_sets_uni, test_sets_uni)
    print(universal_tagset_df)

    ### NB classifier

    # Performance and runtime of the best performing n-gram tagger (full tagset)
    report("BigramTagger on news, (full tagset).",
           models_df_full.loc['news', 'BigramTagger'],
           test_sets_full['news'])

    # Train and evaluate nb tagger on the "news" category (full tagset)
    nb_tagger = ClassifierBasedPOSTagger(train=train_sets_full['news'])
    report("Naive Bayes on news, (full tagset).", nb_tagger, test_sets_full['news'])

    # Performance and runtime of the best performing n-gram tagger (universal tagset)
    report("BigramTagger on news, (universal tagset).",
           models_df_uni.loc['news', 'BigramTagger'],
           test_sets_uni['news'])

    # Train and evaluate nb tagger on the "news" category (universal tagset)
    nb_tagger = ClassifierBasedPOSTagger(train=train_sets_uni['news'])
    report("Naive Bayes on news, (universal tagset).", nb_tagger, test_sets_uni['news'])

    ### Cross evaluation

    # Cross-evaluate between categories (using universal tagset)
    # Example: train on news_train, evaluate on the "test" of every other category
    # Do this for all categories in the corpus
    # Print the results

In [84]:
hw2_part2()

                 AffixTagger  BigramTagger  DefaultTagger  TrigramTagger  UnigramTagger
adventure              23.71         90.17          10.87          90.05          88.79
belles_lettres         28.15         90.38          12.86          90.34          89.11
editorial              29.28         87.09          13.93          86.85          86.12
fiction                24.46         88.74          11.38          88.69          87.44
government             31.51         86.49          13.67          86.48          85.25
hobbies                28.19         85.97          13.63          85.94          85.01
humor                  24.44         83.24          11.73          83.33          82.82
learned                34.94         90.24          18.04          90.31          89.19
lore                   27.94         88.32          14.05          88.18          87.16
mystery                23.33         89.47          11.30          89.25          87.53
news                   29.60    

In [85]:
#90.05 90.34 86.85 88.69 tri none
#93.96 93.96 92.69 93.23 tri universal