<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/ken_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KenLM Framework for Language Modeling

## Install KenLM

### Reference: https://github.com/kpu/kenlm

In [1]:
# !pip install https://github.com/kpu/kenlm/archive/master.zip

In [2]:
import sys
sys.path.append('utils/')

In [3]:
import kenlm
import os
import re
import utils.ngram_utils as ngram_utils
import numpy as np

In [4]:
# Read data from .txt files and create lists of reviews

train_data = []
# create a list of all the reviews 
with open('../data/amazon_train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/amazon_valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [5]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
vocab = list(set(all_tokens_train))
len(vocab)

23115

In [7]:
train_data = []
for t in train_data_tokenized:
    train_data.append(' '.join(t))
train_data[:3]

['this is a great tutu and at a really great price .',
 "it doesn ' t look cheap at all .",
 "i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly ."]

In [8]:
valid_data = []
for t in valid_data_tokenized:
    valid_data.append(' '.join(t))
valid_data[:3]

['these are not sized right .',
 'a 3x is always big on me and these r cut wrong !',
 "i ' m returning them ."]

In [9]:
len(train_data), len(valid_data)

(107790, 15172)

In [10]:
# # Change directory where you have the data
# path_to_data = '../data/'
# os.chdir(path_to_data)
# path_to_data

## 3-gram model with KenLM

In [14]:
cat ../data/amazon_train.txt | ../../kenlm/build/bin/lmplz -o 3 > amazonLM3.arpa

/bin/sh: 1: /home/roberta/venv/robertavenv/lib/python3.5/site-packages/kenlm/build/bin/lmplz: not found
cat: write error: Broken pipe


In [12]:
!../../kenlm/build/bin/build_binary amazonLM3.arpa amazonLM3.klm

/bin/sh: 1: ../../kenlm/build/bin/build_binary: not found


In [13]:
model_3n = kenlm.LanguageModel('amazonLM3.klm')
model_3n

OSError: Cannot read model 'amazonLM3.klm' (util/file.cc:76 in int util::OpenReadOrThrow(const char*) threw ErrnoException because `-1 == (ret = open(name, 00))'. No such file or directory while opening /home/roberta/AMMI-2019-NLP-Part2/01-day-LM/amazonLM3.klm)

## 5-gram KenLM

In [None]:
cat ../data/amazon_train.txt | ../../kenlm/build/bin/lmplz -o 5 > amazonLM5.arpa

In [None]:
!../../kenlm/build/bin/build_binary amazonLM5.arpa amazonLM5.klm

In [None]:
model_5n = kenlm.LanguageModel('amazonLM5.klm')
model_5n

## Perplexity (Train + Valid Data)

### The KenLM model reports negative log likelihood, not perplexity. So we'll be converting the score and report net perplexity. The following function calculate the perpelxity.

### Pereplexity is defined as follows, $$ PPL = b^{- \frac{1}{N} \sum_{i=1}^N \log_b q(x_i)} $$ 

### All probabilities here are in log base 10 so to convert to perplexity, we do the following 

### $$PPL = 10^{-\log(P) / N} $$ 

### where $P$ is the total NLL, and $N$ is the word count.

In [None]:
def get_ppl(lm, sentences):
    """
    Assume sentences is a list of strings (space delimited sentences)
    """
    total_nll = 0
    total_wc = 0
    for sent in sentences:
        sent = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", sent)
        words = sent.strip().split()
        score = lm.score(sent, bos=False, eos=False)
        word_count = len(words)
        total_wc += word_count
        total_nll += score
    ppl = 10**-(total_nll/total_wc)
    return ppl


In [None]:
# 3-gram
train_ppl = get_ppl(model_3n, train_data)
valid_ppl = get_ppl(model_3n, valid_data)
train_ppl, valid_ppl

In [None]:
# 5-gram
train_ppl = get_ppl(model_5n, train_data)
valid_ppl = get_ppl(model_5n, valid_data)
train_ppl, valid_ppl

## Score Sentences

In [None]:
sentences = ['i like this product very much .']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

In [None]:
sentences = ['i like pandas']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

Function for loading the data

In [None]:
sentences = ['this color is very ugly']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

In [None]:
sentences = ['kigali is an awesome city !']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

In [None]:
sentences = ['i want to get a refund']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

In [None]:
sentences = ['this watch is not what i expected']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

In [None]:
sentences = ['this dress fits me perfectly !']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

In [None]:
sentences = ['my wife loves the color of this dress']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl3, ppl5

## Generate Sentences

In [None]:
def generate(lm, context='<s>', max_num_tokens=20):
    generated_tokens = []
    cur_sent = context
    for j in range(max_num_tokens):
        scores = []
        for i, token in enumerate(vocab):
            sent = cur_sent + ' ' + token
            if token == '</s>':
                eos = True
            else:
                eos = False
            token_score = lm.score(sent, bos=True, eos=eos)
            scores.append(token_score)
        best_token = vocab[np.argmax(scores)]
        generated_tokens.append(best_token)
        cur_sent = cur_sent + ' ' + best_token
        if best_token == '</s>':
            break
    return generated_tokens

In [None]:
s3 = generate(model_3n)
s5 = generate(model_5n)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> i will'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> i like'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> i am'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> this'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> this dress'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> this animal'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))

In [None]:
context = '<s> what'
s3 = generate(model_3n, context=context)
s5 = generate(model_5n, context=context)
print(' '.join(word for word in s3))
print(' '.join(word for word in s5))