In [None]:
import random
import nltk
from nltk import bigrams, FreqDist, ConditionalFreqDist

In [None]:
# word_tokenize relies on the Punkt sentence-tokenizer models. Recent NLTK
# releases load them from the 'punkt_tab' package, older ones from the pickled
# 'punkt' package, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mshos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import os
import string

# Folder that holds the training documents.
input_data_dir = "data"

# Every character of string.punctuation except '.'; the full stop is retained
# in the text while the other marks are stripped during loading.
punctuation = ''.join(mark for mark in string.punctuation if mark != '.')

def is_hidden(filepath):
    """Return True when the last component of ``filepath`` begins with a dot."""
    _, leaf = os.path.split(filepath)
    return leaf[:1] == '.'

# Read every visible regular file in the data folder and concatenate its
# non-blank lines, with all punctuation except full stops removed.

# One deletion table, built once, replaces the per-character str.replace loop
# that previously ran for every line.
strip_punctuation = str.maketrans('', '', punctuation)

cleaned_lines = []
# sorted(): the listing order is not guaranteed, so fix it to make the corpus
# (and every bigram spanning a file boundary) reproducible between runs.
for filename in sorted(os.listdir(input_data_dir)):
    filepath = os.path.join(input_data_dir, filename)
    # Skip dot-files and anything that is not a regular file (e.g. a
    # sub-folder, which open() cannot read).
    if is_hidden(filepath) or not os.path.isfile(filepath):
        continue
    # NOTE(review): files are decoded with the platform default encoding, as
    # before; pass encoding=... here once the corpus encoding is confirmed.
    with open(filepath) as infile:
        for line in infile:
            if line.strip():  # ignore lines that are only whitespace
                cleaned_lines.append(line.translate(strip_punctuation))

# Join once at the end instead of growing a string line by line.
text_data = "".join(cleaned_lines)

In [None]:
# Sanity check: number of characters in the cleaned, concatenated corpus.
len(text_data)

8441834

In [None]:
# Lowercase the whole corpus so tokens that differ only by case are counted
# together, then split it into word tokens.
words = nltk.word_tokenize(text_data.lower())

# Pair every token with the one that follows it.
bi_grams = [pair for pair in bigrams(words)]

# Count the occurrences of each distinct bigram.
bi_gram_freq_dist = FreqDist(bi_grams)

In [None]:
from itertools import islice

# Peek at the first five (bigram, count) entries; islice stops after five
# items, so the full distribution is never copied.
first_five_items = [entry for entry in islice(bi_gram_freq_dist.items(), 5)]
for item in first_five_items:
    print(item)

(('asian', 'exporters'), 1)
(('exporters', 'fear'), 2)
(('fear', 'damage'), 1)
(('damage', 'from'), 8)
(('from', 'u'), 27)


In [None]:
# Group the bigrams by their first word: bi_gram_freq[w] is the frequency
# distribution of the words observed immediately after w.
bi_gram_freq = ConditionalFreqDist(bi_grams)

In [None]:
# Spot check: words that follow 'natural', with their counts.
bi_gram_freq['natural']

FreqDist({'gas': 216, 'rubber': 39, 'resources': 9, 'for': 3, 'float': 3, 'disasters': 2, 'that': 2, 'lt': 2, 'lower': 1, 'beverages': 1, ...})

In [None]:
import heapq

# Number of most frequent continuations to keep for each first word.
topk=3

# first word -> min-heap of (freq, second_word) pairs, capped at topk entries.
top_bigrams_per_first_word = {}

for (first_word, second_word), freq in bi_gram_freq_dist.items():
    # Fetch this word's heap, creating an empty one on first sight.
    heap = top_bigrams_per_first_word.setdefault(first_word, [])

    # Push the new pair; if that makes the heap too large, drop its smallest
    # entry so only the topk largest pairs survive.
    heapq.heappush(heap, (freq, second_word))
    if len(heap) > topk:
        heapq.heappop(heap)


In [None]:
# Spot check: the retained (freq, second_word) pairs for 'natural'. The list
# is in heap order, not sorted by frequency.
top_bigrams_per_first_word['natural']

[(9, 'resources'), (216, 'gas'), (39, 'rubber')]

In [None]:
# Replace each first word's heap of (freq, second_word) pairs with a plain
# list of second words, highest pair first.
for first_word, entries in top_bigrams_per_first_word.items():
    # Re-run guard: after one pass the entries are bare strings. Unpacking
    # them as (freq, second_word) would raise, or silently split a
    # two-character word, so leave already-converted lists untouched.
    if entries and not isinstance(entries[0], tuple):
        continue
    # Assigning to an existing key while iterating is safe: the dict's size
    # does not change.
    top_bigrams_per_first_word[first_word] = [
        second_word for _freq, second_word in sorted(entries, reverse=True)
    ]

# Rebuild the conditional distribution from the retained bigrams only. Each
# pair appears once, so every kept continuation has a count of 1.
filtered_bi_grams = [
    (first_word, second_word)
    for first_word, second_words in top_bigrams_per_first_word.items()
    for second_word in second_words
]

# NOTE: this rebinds bi_gram_freq from the full distribution to the filtered
# one that generate_sentence samples from.
bi_gram_freq = ConditionalFreqDist(filtered_bi_grams)

In [None]:
def generate_sentence(word, num_words, model=None, rng=None):
    """Print a chain of up to ``num_words`` words that starts at ``word``.

    The start word is lowercased and printed. Each following word is drawn
    uniformly at random from the words the model lists after the current one.
    The chain stops early when the current word has no listed followers.
    Every word is followed by a single space and the output ends with a
    newline.

    Args:
        word: Start word, in any casing.
        num_words: Maximum number of words to print.
        model: Mapping from a word to a mapping (or other iterable) of its
            candidate next words. Defaults to the notebook-level
            ``bi_gram_freq``.
        rng: Object providing ``choice(seq)``, e.g. ``random.Random(42)`` for
            a repeatable sentence. Defaults to the ``random`` module.

    Returns:
        None. The sentence is written to standard output.
    """
    if model is None:
        model = bi_gram_freq
    if rng is None:
        rng = random

    word = word.lower()
    for _ in range(num_words):
        print(word, end=' ')
        # .get() rather than indexing: on a defaultdict-style model, indexing
        # an unknown word would insert an empty entry as a side effect.
        followers = model.get(word)
        next_words = list(followers) if followers else []
        if not next_words:
            break  # dead end: nothing recorded after this word
        word = rng.choice(next_words)
    print()

In [None]:
# Print up to 100 words starting from 'Asia' (the start word is lowercased
# inside generate_sentence). The output differs between runs because the next
# word is chosen at random.
generate_sentence('Asia', 100)

asia pacific ltd said the dollar . the dollar and other major industrial countries and other than the u . 5 . the u card rate cut its board declared a new company . the dollar and a new york . the company . s lt xon unit of a new york investor asher s stock dividend to be the u card rate cut the dollar s lt bp s lt c and a new zealand ltd said . s lt xon . 5 mln dlrs in the u trans world bank said . the u card to the u 
