# Learning a Predictive N-Gram Model

## Obtain

In [None]:
import oldp_client 

# Build an authenticated client for the OLDP API.
conf = oldp_client.Configuration()
conf.api_key['api_key'] = '123abc'  # Replace this with your API key
api_client = oldp_client.ApiClient(conf)
cases_api = oldp_client.CasesApi(api_client)
# Keep at most the first ten results; later cells use cases[0:9] for training
# and cases[9] for testing, so ten results are presumably expected here.
cases = cases_api.cases_list(court_id=2).results[0:10]  # court_id=2: European Court of Justice (Europäischer Gerichtshof)

## Clean

In [None]:
from utils import preprocessing

def clean(content):
    """Return *content* with HTML markup removed and whitespace normalized.

    The work is delegated to helpers from ``utils.preprocessing``; their exact
    behavior is not visible here, so the step comments below describe the
    intent suggested by the helper names and arguments (to be confirmed).

    Args:
        content: Raw case content as a string, presumably HTML.

    Returns:
        The cleaned text produced by the last preprocessing step.
    """
    # Turn line breaks, newlines and tabs into spaces so that the words on
    # either side stay separated. ``[^>]*`` confines the match to a single
    # ``<br ...>`` tag; a greedy ``.*`` would run to the last '>' on the line
    # and swallow all text and tags in between.
    content = preprocessing.remove_pattern(content, r'<br[^>]*>|\n|\t', replace_with=' ')
    # Drop every remaining tag.
    content = preprocessing.remove_pattern(content, r'<[^>]+>')
    # Presumably decodes HTML entities such as &amp; -- confirm in utils.
    content = preprocessing.replace_html_special_ents(content)
    # Presumably collapses runs of whitespace -- confirm in utils.
    content = preprocessing.remove_whitespace(content)
    return content

# Training text: the first nine cases. They are joined with a single space
# (after stripping each one) so that the last word of one case cannot fuse
# with the first word of the next into a spurious token.
text = ' '.join(clean(case.content).strip() for case in cases[0:9])

# Held-out test text: the tenth case.
test_text = clean(cases[9].content)

## Explore

In [None]:
import spacy
import collections

class Corpus:
    """A text parsed with spaCy, exposing its words, sentences and n-grams."""

    # Loaded spaCy pipelines keyed by model name. Shared by all instances so
    # that creating several corpora does not reload the model each time.
    _pipelines = {}

    def __init__(self, text, model_name='de_core_news_sm'):
        """Parse *text* and keep the resulting document in ``self.doc``.

        Args:
            text: The raw text to analyse.
            model_name: Name of the spaCy pipeline to load. Defaults to the
                small German model.
        """
        nlp = self._pipelines.get(model_name)
        if nlp is None:
            nlp = self._pipelines[model_name] = spacy.load(model_name)
        self.doc = nlp(text)

    def get_words(self):
        """Yield the text of every token in the document, in order."""
        for token in self.doc:
            yield token.text

    def get_sentences(self):
        """Yield the sentences of the document (``self.doc.sents``), in order."""
        yield from self.doc.sents

    def get_ngrams(self, n, min_sentence_length=10):
        """Yield every n-gram of token texts that lies within one sentence.

        N-grams never cross a sentence boundary, and sentences shorter than
        *min_sentence_length* tokens are skipped entirely.

        Args:
            n: Number of tokens per n-gram; must be at least 1.
            min_sentence_length: Minimum number of tokens a sentence needs in
                order to contribute n-grams.

        Yields:
            Tuples of ``n`` token texts.

        Raises:
            ValueError: If *n* is smaller than 1. As this is a generator, the
                error surfaces when iteration starts.
        """
        if n < 1:
            raise ValueError('n has to be at least 1, got {}!'.format(n))
        for sent in self.get_sentences():
            if len(sent) < min_sentence_length:
                continue
            words = [token.text for token in sent]
            # The range is empty when the sentence has fewer than n tokens.
            for start in range(len(words) - n + 1):
                yield tuple(words[start:start + n])

In [None]:
def print_most_common(n):
    """Print the five most frequent n-grams of the global ``corpus``.

    Args:
        n: Number of tokens per n-gram.
    """
    frequencies = collections.Counter(corpus.get_ngrams(n))
    top_five = frequencies.most_common(5)
    print('\nThe most common {}-grams:'.format(n))
    for ngram, occurrences in top_five:
        print('{}: {}'.format(ngram, occurrences))

# Parse the training text once and report some basic statistics about it.
corpus = Corpus(text)

print('Number of words in corpus: ', sum(1 for _ in corpus.get_words()))
print('Number of sentences in corpus: ', sum(1 for _ in corpus.get_sentences()))
# "Alphabet" here means the set of distinct token texts.
print('Size of alphabet:', len({*corpus.get_words()}))

# Most frequent unigrams, trigrams and 5-grams.
for n in (1, 3, 5):
    print_most_common(n)

## Learning a Model

In [None]:
class NgramModel:
    """Count-based n-gram model that predicts the next word from a context.

    The probability of a word given the preceding ``n - 1`` words is its
    relative frequency among all n-grams seen in training that start with
    those words. No smoothing is applied.
    """

    def __init__(self, n=3):
        """Create an untrained model of order *n* (tokens per n-gram)."""
        self.n = n
        # Counter mapping each n-gram tuple to its training count.
        self.ngrams = None
        # Set of all distinct words of the training corpus.
        self.alphabet = None
        # Index built by learn(): context tuple -> Counter of following words.
        self._continuations = None

    def learn(self, corpus):
        """Count the n-grams of *corpus* and index them by context.

        Args:
            corpus: Object providing ``get_ngrams(n)`` and ``get_words()``.
        """
        self.ngrams = collections.Counter(corpus.get_ngrams(self.n))
        self.alphabet = set(corpus.get_words())

        # Group the counts by their (n-1)-word prefix so that predict() needs
        # a single lookup instead of a scan over the whole alphabet.
        continuations = collections.defaultdict(collections.Counter)
        for ngram, count in self.ngrams.items():
            continuations[ngram[:-1]][ngram[-1]] += count
        self._continuations = dict(continuations)

    def predict(self, context):
        """Return the distribution of the next word given *context*.

        Args:
            context: Whitespace-separated words. Only the last ``n - 1`` words
                are used; any earlier words are ignored.

        Returns:
            A dict mapping each word observed after the context to its
            relative frequency. Empty if the context was never observed.

        Raises:
            ValueError: If *context* has fewer than ``n - 1`` words.
            RuntimeError: If the model has not been trained yet.
        """
        context = tuple(context.split())
        order = self.n - 1
        if len(context) < order:
            raise ValueError('The context has to be at least of length {}!'.format(self.n - 1))
        # Keep exactly the last `order` words. Slicing from an explicit start
        # index also works for order == 0, where `context[-0:]` would wrongly
        # keep the whole context.
        context = context[len(context) - order:]

        if self._continuations is None:
            raise RuntimeError('The model has not been trained yet; call learn() first.')

        matches = {
            word: count
            for word, count in self._continuations.get(context, {}).items()
            if word in self.alphabet
        }
        total_count = sum(matches.values())
        return {word: count / total_count for word, count in matches.items()}

# Fit a trigram model on the training text.
model = NgramModel(n=3)
corpus = Corpus(text)
model.learn(corpus)

## Predict

In [None]:
# Distribution over the words that followed "der Europäischen" in training;
# as the last expression of the cell its value is shown by the notebook.
model.predict('der Europäischen')

In [None]:
import numpy as np

def eval(n):
    """Train an n-gram model of order *n* and print its cross entropies.

    The model is fitted on the global ``text`` and scored on both that
    training text and the global ``test_text``.

    NOTE(review): this name shadows the builtin ``eval``; it is kept because
    the calls in this notebook use it.

    Args:
        n: Number of tokens per n-gram.
    """
    training_set = Corpus(text)
    held_out_set = Corpus(test_text)

    ngram_model = NgramModel(n=n)
    ngram_model.learn(training_set)

    print('\nN={}:'.format(n))
    training_score = cross_ent(ngram_model, training_set, n)
    print('Training cross ent:', training_score)
    held_out_score = cross_ent(ngram_model, held_out_set, n)
    print('Test cross ent:', held_out_score)

def cross_ent(model, corpus, n):
    """Return the average cross entropy, in bits per n-gram, of *model* on *corpus*.

    For every n-gram of the corpus the model predicts the last word from the
    first ``n - 1`` words. N-grams whose last word receives no probability
    (context or continuation unseen in training) are skipped, so the result
    only reflects the n-grams the model can score.

    Args:
        model: Object with ``predict(context)`` returning a dict that maps
            words to probabilities.
        corpus: Object with ``get_ngrams(n)`` yielding tuples of words.
        n: Number of tokens per n-gram; should match the model's order.

    Returns:
        The mean of ``-log2(p)`` over all scored n-grams, or NaN when no
        n-gram could be scored.
    """
    total_bits = 0.0
    scored = 0
    for ngram in corpus.get_ngrams(n):
        context = ' '.join(ngram[:n - 1])
        target = ngram[n - 1]
        distribution = model.predict(context)

        # only count ngrams that occurred in the training data
        if target in distribution:
            total_bits -= np.log2(distribution[target])
            scored += 1

    # Without any scored n-gram the average is undefined; report NaN rather
    # than failing with a ZeroDivisionError.
    if scored == 0:
        return float('nan')
    return total_bits / scored

# Compare bigram, trigram and 5-gram models.
for n in (2, 3, 5):
    eval(n)