In [1]:
import numpy as np
import pandas as pd
import nltk
import math
import nltk
import re
from collections import defaultdict
import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found beneath it (sub-directory names themselves are ignored).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/twitter/en_US.twitter.txt


In [2]:
# Read the whole tweet corpus into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open("../input/twitter/en_US.twitter.txt", "r", encoding="utf-8") as f:
    data = f.read()
print(type(data))

<class 'str'>


In [3]:
def build_vocab(sentences, min_freq):
    """Collect the words that occur at least ``min_freq`` times.

    Args:
        sentences: iterable of tokenized sentences (each an iterable of words).
        min_freq: minimum number of occurrences for a word to be kept.

    Returns:
        A list of the retained words, ordered by first appearance in
        ``sentences``.
    """
    frequencies = defaultdict(int)
    for tokens in sentences:
        for token in tokens:
            frequencies[token] += 1
    # Dict iteration follows insertion order, so the result keeps the order
    # in which words were first seen.
    return [token for token, count in frequencies.items() if count >= min_freq]

def replace_oov_with_unk(sentences, vocab, unknown_word='<UNK>'):
    """Replace every word that is not in ``vocab`` with ``unknown_word``.

    Args:
        sentences: iterable of tokenized sentences (each an iterable of words).
        vocab: collection of known words (list, set, ...).
        unknown_word: token substituted for out-of-vocabulary words.

    Returns:
        A new list of lists with the same shape as ``sentences``.
    """
    # Membership tests on a list are linear in the vocabulary size; build a
    # set once so each lookup is constant time on average.
    known = vocab if isinstance(vocab, (set, frozenset)) else set(vocab)
    return [
        [word if word in known else unknown_word for word in sentence]
        for sentence in sentences
    ]

In [4]:
def preprocess(data, min_freq, train_fraction=0.8):
    """Split raw text into tokenized train/test sentences and a vocabulary.

    The text is split on newlines, blank lines are dropped, each line is
    lower-cased and tokenized with ``nltk.word_tokenize``. The first
    ``train_fraction`` of the sentences (in file order, no shuffling) become
    the training set and the remainder the test set. The vocabulary is built
    from the training set only, and words outside it are replaced by the
    unknown-word token in both sets.

    Args:
        data: the raw corpus as a single string, one sentence per line.
        min_freq: minimum training-set frequency for a word to enter the
            vocabulary.
        train_fraction: share of sentences used for training, in [0, 1].

    Returns:
        A ``(train_data, test_data, vocab)`` tuple; the first two are lists
        of token lists, ``vocab`` is a list of words.

    Raises:
        ValueError: if ``train_fraction`` is outside [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    stripped_lines = (line.strip() for line in data.split("\n"))
    tokenized_sentences = [
        nltk.word_tokenize(line.lower()) for line in stripped_lines if line
    ]

    train_size = int(len(tokenized_sentences) * train_fraction)
    train_data = tokenized_sentences[:train_size]
    test_data = tokenized_sentences[train_size:]

    vocab = build_vocab(train_data, min_freq)
    # Pass a set for the membership tests: scanning the vocabulary list for
    # every token is linear in the vocabulary size.
    vocab_lookup = set(vocab)
    train_data = replace_oov_with_unk(train_data, vocab_lookup)
    test_data = replace_oov_with_unk(test_data, vocab_lookup)
    return train_data, test_data, vocab

In [5]:
def cal_n_gram_counts(data, n, start="<s>", end="<e>"):
    """Count every n-gram of length ``n`` in the tokenized sentences.

    Each sentence is padded with ``n`` copies of ``start`` in front and a
    single ``end`` token at the back before the n-grams are extracted.

    Args:
        data: iterable of tokenized sentences (each a sequence of words).
        n: n-gram length.
        start: start-of-sentence padding token.
        end: end-of-sentence token.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram (a tuple of ``n`` tokens)
        to its number of occurrences.
    """
    n_grams = defaultdict(int)
    for sentence in data:
        padded = tuple([start] * n + list(sentence) + [end])
        # Stop at the last position where a full window still fits; going
        # further would count truncated tuples shorter than n as n-grams.
        for i in range(len(padded) - n + 1):
            n_grams[padded[i:i + n]] += 1
    return n_grams

In [6]:
def estimate_probability(word, n_gram, n_grams, np1_grams, vocab_size, k=1):
    """Estimate P(word | n_gram) with add-k smoothing.

    Computes ``(count(n_gram + word) + k) / (count(n_gram) + vocab_size * k)``.

    Args:
        word: candidate next word.
        n_gram: the preceding tokens (any sequence; converted to a tuple).
        n_grams: mapping from n-gram tuples to counts.
        np1_grams: mapping from (n+1)-gram tuples to counts.
        vocab_size: number of words in the vocabulary.
        k: smoothing constant.

    Returns:
        The smoothed probability as a float.
    """
    n_gram = tuple(n_gram)
    np1_gram = n_gram + (word,)
    # Use .get() rather than indexing: the count tables are defaultdicts, and
    # indexing would insert a zero entry for every unseen n-gram queried,
    # growing the tables on each lookup.
    numerator = np1_grams.get(np1_gram, 0) + k
    denominator = n_grams.get(n_gram, 0) + vocab_size * k
    return numerator / denominator

def estimate_probabilities(n_gram, n_grams, np1_grams, vocab, k=1):
    """Score every candidate next word for the given context.

    The candidate set is ``vocab`` extended with the end-of-sentence token
    ``"<e>"`` and the unknown-word token ``"<UNK>"``; its length is used as
    the vocabulary size for smoothing.

    Args:
        n_gram: the preceding tokens.
        n_grams: mapping from n-gram tuples to counts.
        np1_grams: mapping from (n+1)-gram tuples to counts.
        vocab: list of known words.
        k: smoothing constant.

    Returns:
        A dict mapping each candidate word to its smoothed probability.
    """
    candidates = vocab + ["<e>", "<UNK>"]
    total = len(candidates)
    return {
        candidate: estimate_probability(candidate, n_gram, n_grams, np1_grams, total, k)
        for candidate in candidates
    }

def next_word(prev_tokens, n_grams, np1_grams, vocab, k=1, start="<s>"):
    """Return the most probable next word given the preceding tokens.

    The n-gram order ``n`` is read from the length of the first key in
    ``n_grams``; the last ``n`` tokens of ``prev_tokens`` form the context.
    If fewer than ``n`` tokens are available the context is left-padded with
    ``start`` tokens. On ties the first candidate in vocabulary order wins.

    Args:
        prev_tokens: sequence of tokens generated so far.
        n_grams: mapping from n-gram tuples to counts (must be non-empty).
        np1_grams: mapping from (n+1)-gram tuples to counts.
        vocab: list of known words.
        k: smoothing constant.
        start: start-of-sentence token used to pad short contexts.

    Returns:
        The candidate word with the highest smoothed probability.

    Raises:
        ValueError: if ``n_grams`` is empty, since ``n`` cannot be inferred.
    """
    if not n_grams:
        raise ValueError("n_grams is empty; cannot infer the n-gram order")
    # Peek at one key without materialising the full key list.
    n = len(next(iter(n_grams)))

    context = list(prev_tokens)[-n:]
    if len(context) < n:
        context = [start] * (n - len(context)) + context

    probs = estimate_probabilities(context, n_grams, np1_grams, vocab, k)
    # max() returns the first maximal key in iteration order.
    return max(probs, key=probs.get)

In [7]:
# Work on the first 10,000,000 characters of the corpus only; a minimum
# frequency of 1 keeps every training word in the vocabulary.
train_data, test_data, vocab = preprocess(data[:10000000], 1)
print(len(train_data), len(vocab))
# Count trigrams (the context) and 4-grams (context plus next word).
n_grams = cal_n_gram_counts(train_data, 3)
np1_grams = cal_n_gram_counts(train_data, 4)

115056 73658


In [11]:
# Start from a three-token prompt and extend it one predicted word at a time.
s = ["what", "are", "you"]
# Cap generation at 100 predicted tokens in case the end marker never comes.
for i in range(100):
    # The final positional argument (3) is the smoothing constant k.
    nw = next_word(s, n_grams, np1_grams, vocab, 3)
    s.append(nw)
    # Stop as soon as the end-of-sentence marker is predicted.
    if nw == '<e>':
        break
print(s)

['what', 'are', 'you', 'doing', '?', '<e>']
