# Preprocessing the data

In [40]:
import nltk

SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"

def add_sentence_tokens(sentences, n):
    
    sos = SOS * (n-1) if n > 1 else SOS
    return ['{}{} {}'.format(sos, s, EOS) for s in sentences]

def replace_singletons(tokens):
   
    vocab = nltk.FreqDist(tokens)
    return [token if vocab[token] > 1 else UNK for token in tokens]

def preprocess(sentences, n):
    """Add SOS/EOS/UNK tokens to given sentences and tokenize.

    Args:
        sentences (list of str): the sentences to preprocess.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        The preprocessed sentences, tokenized by words.

    """
    sentences = add_sentence_tokens(sentences, n)
    tokens = ' '.join(sentences).split(' ')
    tokens = replace_singletons(tokens)
    return tokens

# Data Loading

In [57]:
from itertools import product
import math
import nltk
from pathlib import Path

def load_data(data_path):
    """Load data from the given data path.

    Args:
        data_path (Path) -- full path of the file to load data.

    Returns:
        Data sets, as lists of sentences.

    """

    with open(data_path, 'r') as f:
        data = [l.strip().split(".") for l in f.readlines()]    
    res = sum(data, [])
    res = list(filter(lambda x: x != "", res))
    return res

# Language modeling

In [20]:
class LanguageModel(object):
    """An n-gram language model trained on a given corpus.
    
    For a given n and given training corpus, constructs an n-gram language
    model for the corpus by:
    1. preprocessing the corpus (adding SOS/EOS/UNK tokens)
    2. calculating (smoothed) probabilities for each n-gram

    Also contains methods for calculating the perplexity of the model
    against another corpus, and for generating sentences.

    Args:
        train_data (list of str): list of sentences comprising the training corpus.
        n (int): the order of language model to build (i.e. 1 for unigram, 2 for bigram, etc.).
        laplace (int): lambda multiplier to use for laplace smoothing (default 1 for add-1 smoothing).

    """

    def __init__(self, train_data, n, laplace=1):
        self.n = n
        self.laplace = laplace
        self.tokens = preprocess(train_data, n)
        self.vocab  = nltk.FreqDist(self.tokens)
        self.model  = self._create_model()
        self.masks  = list(reversed(list(product((0,1), repeat=n))))

    def _smooth(self):
        """Apply Laplace smoothing to n-gram frequency distribution.
        
        Here, n_grams refers to the n-grams of the tokens in the training corpus,
        while m_grams refers to the first (n-1) tokens of each n-gram.

        Returns:
            dict: Mapping of each n-gram (tuple of str) to its Laplace-smoothed 
            probability (float).

        """
        vocab_size = len(self.vocab)

        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)

        m_grams = nltk.ngrams(self.tokens, self.n-1)
        m_vocab = nltk.FreqDist(m_grams)

        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }

    def _create_model(self):
        """Create a probability distribution for the vocabulary of the training corpus.
        
        If building a unigram model, the probabilities are simple relative frequencies
        of each token with the entire corpus.

        Otherwise, the probabilities are Laplace-smoothed relative frequencies.

        Returns:
            A dict mapping each n-gram (tuple of str) to its probability (float).

        """
        if self.n == 1:
            num_tokens = len(self.tokens)
            return { (unigram,): count / num_tokens for unigram, count in self.vocab.items() }
        else:
            return self._smooth()
    

In [60]:
import nltk
from nltk import word_tokenize, sent_tokenize
# Load and prepare train/test data
data_path = Path("/Users/Raj/Documents/NEU/NLP/Assignment1/lang_modelingdata.txt")
data = load_data(data_path)


'We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness'

# Unigram Model

In [61]:
ngram_size = 1

unigram_lm = LanguageModel(data, ngram_size, laplace=1)
print("Number of sentences: {}".format(len(data)))
print("Vocabulary size: {}".format(len(unigram_lm.vocab)))
print(f"Number of tokens : {len(unigram_lm.tokens)}")


Number of sentences: 45
Vocabulary size: 136
Number of tokens : 1433


# Bigram Model

In [None]:
ngram_size = 2

lm = LanguageModel(data, ngram_size, laplace=1)
print("Number of sentences: {}".format(len(data)))
print("Vocabulary size: {}".format(len(lm.vocab)))
print(f"Number of tokens : {len(lm.tokens)}")

In [65]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.util import ngrams
unigram=[]
tokenized_text = []
bigram = []
trigram = []
fourgram = []
for sentence in data:
    sentence = sentence.lower()
    sequence = word_tokenize(sentence) 
    for word in sequence:
        if (word =='.'):
            sequence.remove(word) 
        else:
            unigram.append(word)
            
    tokenized_text.append(sequence) 
            
    bigram.extend(list(ngrams(sequence, 2)))  
#unigram, bigram, trigram, and fourgram models are created
    trigram.extend(list(ngrams(sequence, 3)))
    fourgram.extend(list(ngrams(sequence, 4)))

In [66]:
def removal(x):     
#removes ngrams containing only stopwords
    y = []
    for pair in x:
        count = 0
        for word in pair:
            if word in stop_words:
                count = count or 0
            else:
                count = count or 1
        if (count==1):
            y.append(pair)
    return(y)
bigram = removal(bigram)
trigram = removal(trigram)             
fourgram = removal(fourgram)
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)
print("Most common n-grams without stopword removal and without add-1 smoothing: \n")
print ("Most common bigrams: ", freq_bi.most_common(5))
print ("\nMost common trigrams: ", freq_tri.most_common(5))
print ("\nMost common fourgrams: ", freq_four.most_common(5))

Most common n-grams without stopword removal and without add-1 smoothing: 

Most common bigrams:  [((',', 'and'), 22), ((',', 'that'), 5), ((',', 'it'), 4), ((',', 'the'), 4), ((';', 'and'), 4)]

Most common trigrams:  [((',', 'and', 'to'), 4), ((',', 'it', 'is'), 3), (('he', 'has', 'refused'), 3), (('his', 'assent', 'to'), 3), (('united', 'states', 'of'), 2)]

Most common fourgrams:  [(('united', 'states', 'of', 'america'), 2), (('states', 'of', 'america', ','), 2), ((',', 'it', 'is', 'their'), 2), (('his', 'assent', 'to', 'laws'), 2), (('on', 'the', 'inhabitants', 'of'), 2)]


In [49]:
# noofsentences = 2
# print("Generating sentences...")
for sentence, prob in lm.generate_sentences(6):
    print("{} ({:.5f})".format(sentence, prob))

<s> He has refused his Assent to the most </s> (0.08694)
<s>  We have been the most </s> (0.09970)
<s> For transporting us in the most </s> (0.09117)
<s> The history of the most </s> (0.09366)
<s> We have been the most </s> (0.08855)
<s> --That to the most </s> (0.09517)


In [67]:
#stopwords = code for downloading stop words through nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
#prints top 10 unigrams, bigrams after removing stopwords
print("Most common n-grams with stopword removal and without add-1 smoothing: \n")
unigram_sw_removed = [p for p in unigram if p not in stop_words]
fdist = nltk.FreqDist(unigram_sw_removed)
print("Most common unigrams: ", fdist.most_common(10))
bigram_sw_removed = []
bigram_sw_removed.extend(list(ngrams(unigram_sw_removed, 2)))
fdist = nltk.FreqDist(bigram_sw_removed)
print("\nMost common bigrams: ", fdist.most_common(10))

Most common n-grams with stopword removal and without add-1 smoothing: 

Most common unigrams:  [(',', 105), ('us', 11), ('people', 10), (';', 10), ('laws', 9), (':', 9), ('states', 8), ('right', 7), ('government', 6), ('among', 5)]

Most common bigrams:  [((',', 'right'), 5), (('government', ','), 4), (('usurpations', ','), 3), (('independent', 'states'), 3), (('united', 'states'), 2), (('states', 'america'), 2), (('america', ','), 2), (('rights', ','), 2), (('great', 'britain'), 2), (('assent', 'laws'), 2)]
