In [None]:
pip install datasets

In [17]:
import os

# Training text
train_file = 'EN_train.txt'
train_text = ''
with open(train_file, 'r', encoding='utf-8') as file:
    file_content = file.read()
    train_text = file_content

print("====== Training Text")
print(train_text[:100])

# Vallidation text
validate_file = 'EN_validate.txt'
validate_text = ''
with open(validate_file, 'r', encoding='utf-8') as file:
    file_content = file.read()
    validate_text = file_content
    

print("====== Validate Text")
print(validate_text[:100])



CHAPTER I. START IN LIFE


I was born in the year 1632, in the city of York, of a good family,
thoug
Chapter I. Into the Primitive


 Old longings nomadic leap,
Chafing at custom s chain;
Again from it


In [18]:
import torch
import math
from collections import defaultdict, Counter

class NGramModel:
    def __init__(self, n):
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocab = set()  # Vocabulary of all unique words (for smoothing)
        
        # Check if M1 GPU (Metal) or CUDA is available, else fall back to CPU
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")  # for mac with M series SoC
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")  # CUDA GPU
        else:
            self.device = torch.device("cpu")  # Default to CPU
        
        print(f"Using device: {self.device}")

    def tokenize(self, text):
        """Simple whitespace-based tokenization."""
        return text.split()

    def ngrams(self, corpus):
        """
        Generate n-grams from the corpus with padding.
        """
        # Pad the corpus with <s> and </s> tokens
        padded_corpus = ['<s>'] * (self.n - 1) + corpus + ['</s>']
        return [tuple(padded_corpus[i:i + self.n]) for i in range(len(padded_corpus) - self.n + 1)]

    def train(self, corpus):
        """
        Train the n-gram model on the provided corpus.
        Args:
            corpus (list): List of tokenized words.
        """
        # Tokenize the corpus and create n-grams
        ngrams_list = self.ngrams(corpus)
        
        # Add each word to the vocabulary set
        self.vocab.update(corpus)
        
        # Count n-grams and contexts
        for ngram in ngrams_list:
            context = ngram[:-1]  # First (n-1) words
            token = ngram[-1]     # Last word
            self.ngram_counts[context][token] += 1
            self.context_counts[context] += 1

    def predict_next(self, context):
        """
        Predict the next word based on the given context using add-one smoothing.
        Args:
            context (tuple): Tuple of (n-1) words as the context.
        Returns:
            dict: Probabilities of next possible words.
        """
        if context not in self.ngram_counts:
            return None

        # Get the counts of possible next words for the context
        possible_next_words = self.ngram_counts[context]
        total_count = self.context_counts[context]
        
        # Apply add-one smoothing: Add 1 to each count and include the vocabulary size in the denominator
        smoothed_probabilities = {}
        vocab_size = len(self.vocab)
        
        # Transfer calculations to the selected device (CPU, CUDA, or MPS)
        for word in self.vocab:
            count = possible_next_words[word] if word in possible_next_words else 0
            smoothed_prob = (count + 1) / (total_count + vocab_size)
            smoothed_probabilities[word] = smoothed_prob

        return smoothed_probabilities

    def evaluate_perplexity(self, corpus):
        """
        Calculate the perplexity of the model on a given corpus.
        Args:
            corpus (list): List of tokenized words.
        Returns:
            float: Perplexity score.
        """
        ngrams_list = self.ngrams(corpus)
        
        log_prob_sum = 0
        N = len(corpus)  # Total number of words
        
        for ngram in ngrams_list:
            context = ngram[:-1]  # First (n-1) words
            token = ngram[-1]     # Last word
            
            # Get the smoothed probabilities for the context
            smoothed_probs = self.predict_next(context)
            
            if smoothed_probs is None or token not in smoothed_probs:
                # If no valid probability exists (e.g., unseen context), assign a small probability
                smoothed_prob = 1 / (len(self.vocab) + 1)  # Small probability
            else:
                # Get the probability of the actual token
                smoothed_prob = smoothed_probs[token]
            
            # Add log of the probability to the log sum
            log_prob_sum += math.log(smoothed_prob, 2)  # Log probability (base 2)
        
        # Calculate perplexity
        perplexity = math.pow(2, -log_prob_sum / N)
        return perplexity


In [25]:
from nltk.tokenize import word_tokenize

print("Total Text:", len(train_text))
tokens = word_tokenize(train_text, language='english', preserve_line=False)
print("Tokens:", tokens[:20])

      
token_lengths = 100000 
tokens = tokens[:token_lengths]
print("Tokens:", tokens[:20])

print("Only taking Tokens:", len(tokens))

print("====== Validation Token")

print("Total Validation Text:", len(validate_text))
validate_tokens = word_tokenize(validate_text, language='english', preserve_line=False)
print("Validation Tokens:", validate_tokens[:20])

      
token_lengths = 100000 
validate_tokens = validate_tokens[:token_lengths]
print("Validation Tokens:", validate_tokens[:20])

print("Only taking Validation Tokens:", len(validate_tokens))

Total Text: 27282132
Tokens: ['CHAPTER', 'I', '.', 'START', 'IN', 'LIFE', 'I', 'was', 'born', 'in', 'the', 'year', '1632', ',', 'in', 'the', 'city', 'of', 'York', ',']
Tokens: ['CHAPTER', 'I', '.', 'START', 'IN', 'LIFE', 'I', 'was', 'born', 'in', 'the', 'year', '1632', ',', 'in', 'the', 'city', 'of', 'York', ',']
Only taking Tokens: 100000
Total Validation Text: 4418100
Validation Tokens: ['Chapter', 'I', '.', 'Into', 'the', 'Primitive', 'Old', 'longings', 'nomadic', 'leap', ',', 'Chafing', 'at', 'custom', 's', 'chain', ';', 'Again', 'from', 'its']
Validation Tokens: ['Chapter', 'I', '.', 'Into', 'the', 'Primitive', 'Old', 'longings', 'nomadic', 'leap', ',', 'Chafing', 'at', 'custom', 's', 'chain', ';', 'Again', 'from', 'its']
Only taking Validation Tokens: 100000


In [26]:
import pickle
# For digram, n=1
model = NGramModel(n=1)
model.train(tokens)
# Save the trained model to a file
with open('unigram_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Sample corpus (tokenized)
context = ()
print(f"Input: {context}")

# Evaluate the model's perplexity on the same corpus (you can use a test corpus)
perplexity = model.evaluate_perplexity(validate_tokens)
print(f"Perplexity of the model: {perplexity}")
predictions = model.predict_next(context)
                                 

# Sort the dictionary by probability in descending order and get the top 2 entries
top_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:2]
# Extract the top 1st and 2nd word and their probabilities
first_word, first_prob = top_predictions[0]
second_word, second_prob = top_predictions[1]

# Print the results
print(f"1st Word: {first_word}, Probability: {first_prob}")
print(f"2nd Word: {second_word}, Probability: {second_prob}")

Using device: mps
Input: ()
Perplexity of the model: 607.0094541846838
1st Word: ,, Probability: 0.07696310804545498
2nd Word: I, Probability: 0.039342587073149964


In [28]:
import pickle
# For digram, n=2
model = NGramModel(n=2)
model.train(tokens)
# Save the trained model to a file
with open('digram_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Sample corpus (tokenized)
corpus = tokens
context = ('nursing',)

print(f"Input: {context}")

# Evaluate the model's perplexity on the same corpus (you can use a test corpus)
perplexity = model.evaluate_perplexity(corpus)
print(f"Perplexity of the model: {perplexity}")
predictions = model.predict_next(context)
                                 

# Sort the dictionary by probability in descending order and get the top 2 entries
top_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:2]
# Extract the top 1st and 2nd word and their probabilities
first_word, first_prob = top_predictions[0]
second_word, second_prob = top_predictions[1]

# Print the results
print(f"1st Word: {first_word}, Probability: {first_prob}")
print(f"2nd Word: {second_word}, Probability: {second_prob}")

Using device: mps
Input: ('nursing',)
Perplexity of the model: 595.0719326577995
1st Word: it, Probability: 0.0003516792685071215
2nd Word: slipped, Probability: 0.00017583963425356076


In [30]:
import pickle
# For digram, n=3
model = NGramModel(n=3)
model.train(tokens)
# Save the trained model to a file
with open('trigram_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Sample corpus (tokenized)
corpus = tokens
context = ('My', 'father', )

print(f"Input: {context}")
# Evaluate the model's perplexity on the same corpus (you can use a test corpus)
perplexity = model.evaluate_perplexity(corpus)
print(f"Perplexity of the model: {perplexity}")
predictions = model.predict_next(context)


# Sort the dictionary by probability in descending order and get the top 2 entries
top_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:2]
# Extract the top 1st and 2nd word and their probabilities
first_word, first_prob = top_predictions[0]
second_word, second_prob = top_predictions[1]

# Print the results
print(f"1st Word: {first_word}, Probability: {first_prob}")
print(f"2nd Word: {second_word}, Probability: {second_prob}")

Using device: mps
Input: ('My', 'father')
Perplexity of the model: 1846.0709715429957
1st Word: ,, Probability: 0.0005274261603375527
2nd Word: slipped, Probability: 0.00017580872011251758
