<a href="https://colab.research.google.com/github/mocamocamo/inflearn-llm-colab/blob/main/3_N_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# N-Gram 예제
- Reference : https://www.kaggle.com/alvations/n-gram-language-model-with-nltk

## NLTK 라이브러리 설치 및 import

In [1]:
!pip install -U nltk



In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [3]:
import nltk

# sent_tokenize / word_tokenize are the only NLTK features used in this notebook that
# need downloaded data, and they only need the Punkt sentence-tokenizer resources.
# Fetch just those instead of "all", which downloads every NLTK corpus and model.
# Recent NLTK releases load "punkt_tab"; older releases load "punkt".
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

True

In [4]:
text = [['apple', 'grape', 'carrot'], ['apple', 'carrot', 'melon', 'grape', 'melon', 'watermelon']]

In [5]:
list(bigrams(text[0]))

[('apple', 'grape'), ('grape', 'carrot')]

In [6]:
list(ngrams(text[1], n=3))

[('apple', 'carrot', 'melon'),
 ('carrot', 'melon', 'grape'),
 ('melon', 'grape', 'melon'),
 ('grape', 'melon', 'watermelon')]

# 문장의 시작(`<s>`)과 끝(`</s>`)을 나타내는 Padding 추가

In [7]:
from nltk.util import pad_sequence
list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=2))

['<s>', 'apple', 'grape', 'carrot', '</s>']

In [8]:
# Pad the first sentence with a start symbol and an end symbol (n=2 adds one of each),
# so that the sentence boundaries also show up in the bigrams extracted below.
padded_sent = list(pad_sequence(text[0],
                                pad_left=True, left_pad_symbol="<s>",
                                pad_right=True, right_pad_symbol="</s>",
                                n=2))
# Bigrams over the padded sentence.
list(ngrams(padded_sent, n=2))

[('<s>', 'apple'), ('apple', 'grape'), ('grape', 'carrot'), ('carrot', '</s>')]

# pad_both_ends 함수를 이용해서 이 과정을 좀더 쉽게 수행할 수 있습니다.

In [9]:
from nltk.lm.preprocessing import pad_both_ends

list(pad_both_ends(text[0], n=2))

['<s>', 'apple', 'grape', 'carrot', '</s>']

In [10]:
list(bigrams(pad_both_ends(text[0], n=2)))

[('<s>', 'apple'), ('apple', 'grape'), ('grape', 'carrot'), ('carrot', '</s>')]

# everygrams 함수를 이용해서 padding이 적용된 문장으로부터 1-gram부터 max_len-gram까지(e.g. 1-gram, 2-gram, 3-gram) 모든 N-gram을 한 번에 생성할 수 있습니다.

In [11]:
from nltk.util import everygrams
# NOTE: despite its name, `padded_bigrams` holds the padded token sequence, not bigrams.
padded_bigrams = list(pad_both_ends(text[0], n=2))
print(padded_bigrams)
# All 1-, 2- and 3-grams of the padded sentence.
print(list(everygrams(padded_bigrams, max_len=3)))

['<s>', 'apple', 'grape', 'carrot', '</s>']
[('<s>',), ('<s>', 'apple'), ('<s>', 'apple', 'grape'), ('apple',), ('apple', 'grape'), ('apple', 'grape', 'carrot'), ('grape',), ('grape', 'carrot'), ('grape', 'carrot', '</s>'), ('carrot',), ('carrot', '</s>'), ('</s>',)]


# flatten 함수를 이용해서 여러 문장의 토큰들을 하나의 시퀀스로 펼칠 수 있습니다.

In [12]:
from nltk.lm.preprocessing import flatten
list(flatten(pad_both_ends(sent, n=2) for sent in text))

['<s>',
 'apple',
 'grape',
 'carrot',
 '</s>',
 '<s>',
 'apple',
 'carrot',
 'melon',
 'grape',
 'melon',
 'watermelon',
 '</s>']

In [16]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# padded_everygram_pipeline(order, sentences) returns two single-use iterators:
#   1. for each sentence, the everygrams (here 1- and 2-grams) of the padded sentence
#   2. all padded sentences flattened into one token stream
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
# Materialize and show the everygrams of each sentence.
for ngramlize_sent in training_ngrams:
  print(f"ngramlize_sent: {list(ngramlize_sent)}")

# Materialize the flattened, padded token stream for display.
list(padded_sentences)

ngramlize_sent: [('<s>',), ('<s>', 'apple'), ('apple',), ('apple', 'grape'), ('grape',), ('grape', 'carrot'), ('carrot',), ('carrot', '</s>'), ('</s>',)]
ngramlize_sent: [('<s>',), ('<s>', 'apple'), ('apple',), ('apple', 'carrot'), ('carrot',), ('carrot', 'melon'), ('melon',), ('melon', 'grape'), ('grape',), ('grape', 'melon'), ('melon',), ('melon', 'watermelon'), ('watermelon',), ('watermelon', '</s>'), ('</s>',)]


['<s>',
 'apple',
 'grape',
 'carrot',
 '</s>',
 '<s>',
 'apple',
 'carrot',
 'melon',
 'grape',
 'melon',
 'watermelon',
 '</s>']

# 테스트를 위한 텍스트 파일(language-never-random.txt)을 다운받습니다.

In [18]:
import os
import requests
import io

# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
# Reuse the local copy when it exists; otherwise download it once and cache it on disk.
if os.path.isfile('language-never-random.txt'):
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors so that an error page is never cached as the corpus.
    response.raise_for_status()
    text = response.content.decode('utf8')
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)


text

'                       Language is never, ever, ever, random\n\n                                                               ADAM KILGARRIFF\n\n\n\n\nAbstract\nLanguage users never choose words randomly, and language is essentially\nnon-random. Statistical hypothesis testing uses a null hypothesis, which\nposits randomness. Hence, when we look at linguistic phenomena in cor-\npora, the null hypothesis will never be true. Moreover, where there is enough\ndata, we shall (almost) always be able to establish that it is not true. In\ncorpus studies, we frequently do have enough data, so the fact that a rela-\ntion between two phenomena is demonstrably non-random, does not sup-\nport the inference that it is not arbitrary. We present experimental evidence\nof how arbitrary associations between word frequencies and corpora are\nsystematically non-random. We review literature in which hypothesis test-\ning has been used, and show how it has often led to unhelpful or mislead-\ning results.\n

In [20]:
from nltk import word_tokenize, sent_tokenize
# Split the document into sentences, then lowercase every word token of each sentence.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

tokenized_text[0]

['language',
 'is',
 'never',
 ',',
 'ever',
 ',',
 'ever',
 ',',
 'random',
 'adam',
 'kilgarriff',
 'abstract',
 'language',
 'users',
 'never',
 'choose',
 'words',
 'randomly',
 ',',
 'and',
 'language',
 'is',
 'essentially',
 'non-random',
 '.']

In [21]:
print(text[:500])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 


# 3-gram 모델 선정

In [23]:
# Preprocess the tokenized text for 3-grams language modelling
n = 3
# Both returned objects are single-use iterators: once consumed they are exhausted,
# so re-run this cell before fitting a model again.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

# Displaying it shows a generator object, not the n-grams themselves.
train_data

<generator object padded_everygram_pipeline.<locals>.<genexpr> at 0x7fdd0974b530>

# MLE(Maximum Likelihood Estimation) 기반 언어 모델 학습

In [24]:
from nltk.lm import MLE
model = MLE(n) # Let's train a 3-gram model; n was set to 3 above

In [25]:
len(model.vocab)

0

In [26]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1391 items>


In [27]:
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


# 만약 Vocab 집합에 포함되지 않는 단어라면 `<UNK>`라는 특수 토큰으로 처리됩니다.

In [29]:
# If we look up unseen sentences that are not from the training data,
# the vocabulary automatically replaces words it does not contain with `<UNK>`.
print(model.vocab.lookup('language is never random nsfkalfnsaklfsankllah .'.split()))

('language', 'is', 'never', 'random', '<UNK>', '.')


In [30]:
print(model.counts)

<NgramCounter with 3 ngram orders and 19611 ngrams>


In [31]:
# How many times does the word 'language' occur? 25 times
model.counts['language'] # i.e. Count('language')

25

In [32]:
# How many times is 'language' immediately followed by 'is'? 11 times
model.counts[['language']]['is'] # i.e. Count('is'|'language')

11

In [33]:
# How many times is the sequence 'language is' followed by 'never'? 7 times
model.counts[['language', 'is']]['never'] # i.e. Count('never'|'language is')

7

In [34]:
model.score('language') # P('language')

0.003691671588895452

In [35]:
model.score('is', 'language'.split())  # P('is'|'language')

0.44

In [36]:
model.score('never', 'language is'.split())  # P('never'|'language is')

0.6363636363636364

In [38]:
# Unknown words, i.e. words that are not in the vocabulary, get probability 0
print(model.score("<UNK>") == model.score("lah"))
print(model.score("<UNK>"))
print(model.score("lah"))

True
0.0
0.0


In [40]:
model.score("<UNK>") == model.score("nsfkalfnsaklfsankllah")

True

In [41]:
model.score("<UNK>") == model.score("lor")

True

In [42]:
# For convenience, the score can also be computed as a log probability
# (the value shown matches log2 of the 0.636... score computed above)
model.logscore("never", "language is".split())

-0.6520766965796932

# N-gram 모델을 이용해서 랜덤한 새로운 텍스트를 생성합니다.


In [43]:
print(model.generate(20, random_seed=7))

['and', 'carroll', 'used', 'hypothesis', 'testing', 'has', 'been', 'used', ',', 'and', 'a', 'half', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [44]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate one detokenized sentence from an n-gram language model.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The generated tokens up to (not including) the first `</s>`,
        with `<s>` tokens dropped, joined into one string by `detokenize`.
    """
    generated = model.generate(num_words, random_seed=random_seed)
    # `generate` returns a single token (a str) instead of a list when
    # num_words == 1; wrap it so the loop below walks tokens, not characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            # Sentence-start padding carries no text.
            continue
        if token == '</s>':
            # Stop at the first end-of-sentence marker.
            break
        content.append(token)
    return detokenize(content)

In [45]:
generate_sent(model, 20, random_seed=7)

'and carroll used hypothesis testing has been used, and a half.'

In [46]:
print(model.generate(28, random_seed=0))

['the', 'scf-verb', 'link', 'is', 'motivated', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [47]:
generate_sent(model, 28, random_seed=0)

'the scf-verb link is motivated.'

In [48]:
generate_sent(model, 20, random_seed=1)

'237⫺246.'

In [49]:
generate_sent(model, 20, random_seed=30)

'hypothesis is ever a useful construct.'

In [50]:
generate_sent(model, 20, random_seed=42)

'more (or cold) weather, or on saturday nights, or by people in (or poorer)'