In [11]:
import sys
from collections import defaultdict

import re

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    # text = re.sub(r'[^\w\s]', '', text)
    return text

def build_unigram_model(training_file):
    word_counts = defaultdict(int)
    total_words = 0
    
    with open(training_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Preprocess the line
            line = preprocess_text(line)
            words = line.strip().split()
            for word in words:
                word_counts[word] += 1
                total_words += 1
    
    word_probs = {word: count/total_words for word, count in word_counts.items()}
    return word_probs




In [7]:
# training_file = 'train2.txt'
# wrds = []
# with open(training_file, 'r', encoding='utf-8') as f:
#         for line in f:
#             # Preprocess the line
#             line = preprocess_text(line)
#             words = line.strip().split()
#             wrds.extend(words)

In [10]:
training_file = 'train2.txt'
word_probs=build_unigram_model(training_file)

In [13]:
import math
def test_unigram_model(model, test_file):
    # Process each line in the test file
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            sentence_prob = 1
            for word in words:
                # Use a small probability for unknown words
                word_prob = model.get(word, 1e-6)
                sentence_prob *= word_prob
            
            print(f"Sentence: {line.strip()}\nProbability: {math.log(sentence_prob)}\n")

In [14]:
model = word_probs
test_file = 'test1.txt'
test_unigram_model(model, test_file)

Sentence: a
Probability: -4.03731822540442

Sentence: b c
Probability: -24.91484902136226

Sentence: a b c d
Probability: -40.967796442038825



In [15]:
model = word_probs
test_file = 'test2.txt'
test_unigram_model(model, test_file)

Sentence: Wolf
Probability: -13.815510557964274

Sentence: In the jungle
Probability: -24.64663467280577

Sentence: Rustle in the grass .
Probability: -33.0732457544786

Sentence: What could go wrong ?
Probability: -41.35452071759269

Sentence: I swear I am not making this up .
Probability: -76.1685325974044

Sentence: But old Mr. Toad will leave one day .
Probability: -77.37797250211233



In [16]:
model = word_probs
test_file = 'test2.txt'
test_unigram_model(model, test_file)

Sentence: Wolf
Probability: -13.815510557964274

Sentence: In the jungle
Probability: -24.64663467280577

Sentence: Rustle in the grass .
Probability: -33.0732457544786

Sentence: What could go wrong ?
Probability: -41.35452071759269

Sentence: I swear I am not making this up .
Probability: -76.1685325974044

Sentence: But old Mr. Toad will leave one day .
Probability: -77.37797250211233



In [None]:
def test_unigram_model(model, test_file):
    # Process each line in the test file
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            sentence_prob = 1
            for word in words:
                # Use a small probability for unknown words
                word_prob = model.get(word, 1e-6)
                sentence_prob *= word_prob
            
            print(f"Sentence: {line.strip()}\nProbability: {sentence_prob}\n")

def main():
    if len(sys.argv) != 3:
        print("Usage: python3 ngrams.py [training file] [test file]")
        sys.exit(1)
    
    training_file, test_file = sys.argv[1], sys.argv[2]
    model = build_unigram_model(training_file)
    test_unigram_model(model, test_file)

if __name__ == "__main__":
    main()


In [3]:
from collections import defaultdict, Counter
import math

In [4]:
def preprocess_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
    return text


In [5]:
def build_unigram_model(training_file):
    training_text = preprocess_text_file(training_file)
    words = training_text.split()
    total_words = len(words)
    word_counts = Counter(words)
    return word_counts, total_words

def unigram_probability(sentence, word_counts, total_words):
    words = preprocess_text(sentence).split()
    log_prob = 0
    for word in words:
        word_prob = word_counts[word] / total_words
        log_prob += math.log2(word_prob)
    return log_prob

In [6]:
def build_bigram_model(training_file):
    training_text = preprocess_text_file(training_file)
    sentences = training_text.split('\n')
    bigram_counts = defaultdict(int)
    word_counts = defaultdict(int)
    
    for sentence in sentences:
        words = ['<s>'] + sentence.split()
        for i in range(len(words) - 1):
            bigram = (words[i], words[i+1])
            bigram_counts[bigram] += 1
            word_counts[words[i]] += 1
            
    return bigram_counts, word_counts


def bigram_probability(sentence, bigram_counts, word_counts):
    words = ['<s>'] + preprocess_text(sentence).split()
    log_prob = 0
    for i in range(len(words) - 1):
        bigram = (words[i], words[i+1])
        bigram_prob = bigram_counts[bigram] / word_counts[words[i]]
        log_prob += math.log2(bigram_prob) if bigram_prob > 0 else float('-inf')
    return log_prob


In [7]:
def build_bigram_model(training_file):
    # sentences = training_text.lower().split('\n')

    bigram_counts = defaultdict(int)
    word_counts = defaultdict(int)

    with open(training_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Preprocess the line
            # line = preprocess_text(line)
            # words = line.strip().split()
            words = ['<s>'] + preprocess_text(line).split()
            for i in range(len(words) - 1):
                bigram = (words[i], words[i+1])
                bigram_counts[bigram] += 1
                word_counts[words[i]] += 1        
            
    return bigram_counts, word_counts

def bigram_probability(sentence, bigram_counts, word_counts):
    words = ['<s>'] + preprocess_text(sentence).split()
    log_prob = 0
    for i in range(len(words) - 1):
        bigram = (words[i], words[i+1])
        bigram_prob = bigram_counts[bigram] / word_counts[words[i]]
        log_prob += math.log2(bigram_prob) if bigram_prob > 0 else float('-inf')
    return log_prob


In [8]:
def bigram_add_one_smoothing(sentence, bigram_counts, word_counts, V):
    words = ['<s>'] + preprocess_text(sentence).split()
    log_prob = 0
    for i in range(len(words) - 1):
        bigram = (words[i], words[i+1])
        bigram_prob = (bigram_counts[bigram] + 1) / (word_counts[words[i]] + V)
        log_prob += math.log2(bigram_prob)
    return log_prob


In [16]:
Counter([2,5,8,2,2,2,5,5,5,5,5,5,5,5])

Counter({5: 9, 2: 4, 8: 1})

In [17]:
defaultdict(int)

defaultdict(int, {})

In [162]:
from collections import defaultdict, Counter
import math

# reading and preprocessing the text data file
def preprocess_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
    return text

# unigram model 
def build_unigram_model(training_file):
    training_text = preprocess_text_file(training_file)
    words = training_text.split() # all words or unigrams in the text file
    total_words = len(words) # total number of unigrams
    word_counts = Counter(words) # make dictionary of words with counts
    return word_counts, total_words

# unigram model probability calculating function
def unigram_probability(sentence, word_counts, total_words):
    words = sentence.split() # split the sentance to words
    log_prob = 0
    for word in words:
        word_prob = word_counts[word] / total_words if word in word_counts else 0 # calculating the probabilty of each word in sentence
        log_prob += math.log2(word_prob) if word_prob > 0 else float('-inf') # adding the each words probability to total probability after applying log2
    return log_prob if log_prob!=float('-inf') else "undefined"
# bigram model
def build_bigram_model(training_file):
    training_text = preprocess_text_file(training_file)
    sentences = training_text.split('\n')
    bigram_counts = defaultdict(int) # initializing an empty dictionary for bigram counts
    word_counts = defaultdict(int) # initializing an empty dictionary for words/unigram counts
    
    for sentence in sentences:
        words = ['<s>'] + sentence.split() # adding special character <s> in each
        for i in range(len(words) - 1): # 
            bigram = (words[i], words[i+1])  # making bigrams -> (bigram = current_word + next_word)
            bigram_counts[bigram] += 1 # bigram frequency calculation
            word_counts[words[i]] += 1 # unigram frequency calculation
            
    return bigram_counts, word_counts

# bigram probability calculation without smoothing 
def bigram_probability(sentence, bigram_counts, word_counts):
    words = ['<s>'] + sentence.split()
    log_prob = 0
    for i in range(len(words) - 1):
        bigram = (words[i], words[i+1])
        bigram_prob = bigram_counts[bigram] / word_counts[words[i]] if bigram_counts[bigram] > 0 else 1
        log_prob += math.log2(bigram_prob) if bigram_prob > 0 else float('-inf')
    return log_prob if log_prob!=float('-inf') else "undefined"

# bigram probability calculation smoothing 
def bigram_add_one_smoothing(sentence, bigram_counts, word_counts, V): # V = len(unigram_counts_for_bigrams) + 1  # +1 for <s> token
    words = ['<s>'] + sentence.split()
    log_prob = 0
    for i in range(len(words) - 1):
        bigram = (words[i], words[i+1])
        bigram_prob = (bigram_counts[bigram] + 1) / (word_counts[words[i]] + V)
        log_prob += math.log2(bigram_prob)
    return log_prob

In [136]:
def unigram_probability(sentence, word_counts, total_words):
    words = sentence.split()
    log_prob = 0
    for word in words:
        if word in word_counts:
            word_prob = word_counts[word] / total_words
        else:
            word_prob = word_counts['<UNK>'] / total_words  # Use <UNK> token for unknown words
        log_prob += math.log2(word_prob)
    return log_prob

def bigram_probability(sentence, bigram_counts, word_counts):
    words = ['<s>'] + sentence.split()
    log_prob = 0
    for i in range(len(words) - 1):
        bigram = (words[i], words[i+1])
        if bigram in bigram_counts and words[i] in word_counts:
            bigram_prob = bigram_counts[bigram] / word_counts[words[i]]
        else:
            bigram_prob = bigram_counts[('<UNK>', words[i+1])] / word_counts['<UNK>']  # Use <UNK> token for unknown bigrams or words
        log_prob += math.log2(bigram_prob)
    return log_prob

def build_unigram_model(training_file):
    training_text = preprocess_text_file(training_file)
    words = training_text.split()
    total_words = len(words)
    word_counts = Counter(words)
    word_counts['<UNK>'] = 1  # Initialize count for <UNK> token to 1
    return word_counts, total_words

def build_bigram_model(training_file):
    training_text = preprocess_text_file(training_file)
    sentences = training_text.split('\n')
    bigram_counts = defaultdict(int)
    word_counts = defaultdict(int)

    for sentence in sentences:
        words = ['<s>'] + sentence.split()
        for i in range(len(words) - 1):
            bigram = (words[i], words[i+1])
            bigram_counts[bigram] += 1
            word_counts[words[i]] += 1
            if words[i+1] not in word_counts:  # Add unseen words to word_counts with count 1
                word_counts[words[i+1]] = 1
            else:
                word_counts[words[i+1]] += 1
    bigram_counts[('<UNK>', '<UNK>')] = 1  # Initialize count for unknown bigrams to 1
    return bigram_counts, word_counts


In [134]:
('"This is a funny-looking sentence", she said!').split()

['"This', 'is', 'a', 'funny-looking', 'sentence",', 'she', 'said!']

In [123]:
bigram_counts, word_counts = build_bigram_model(training_file)

In [125]:
word_counts

defaultdict(int, {'<s>': 1, 'a': 1, 'b': 1, 'c': 1, 'd': 1})

In [124]:
bigram_counts

defaultdict(int,
            {('<s>', 'a'): 1,
             ('a', 'b'): 1,
             ('b', 'c'): 1,
             ('c', 'd'): 1,
             ('d', 'b'): 1})

In [163]:
def test_models_with_file(training_file, test_file):
    """
    Test the specified language model with sentences from a test file.
    
    Parameters:
    - test_file: Path to the test text file.
    - training_file: Path to the training text file.
    """
    # unigram model
    word_counts, total_words = build_unigram_model(training_file)
    # bigram model
    bigram_counts, unigram_counts_for_bigrams = build_bigram_model(training_file)
    V = len(unigram_counts_for_bigrams)-1  # +1 for <s> token

    with open(test_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()

    for sentence in sentences:
        preprocessed_sentence = sentence.lower().strip()

        # unigram model probs
        log_prob_uni = unigram_probability(preprocessed_sentence, word_counts, total_words)

        # bigram probs
        log_prob_bi = bigram_probability(preprocessed_sentence, bigram_counts, unigram_counts_for_bigrams)

        # bigram probs with smoothing
        log_prob_bi_smoth = bigram_add_one_smoothing(preprocessed_sentence, bigram_counts, unigram_counts_for_bigrams, V)
        
        print(f"S = {sentence.strip()}")
        print(f"Unsmoothed Unigrams, logprob(S) = {round(log_prob_uni, 4) if log_prob_uni!='undefined' else log_prob_uni}")
        print(f"Unsmoothed Bigrams, logprob(S) = {round(log_prob_bi, 4) if log_prob_bi!='undefined' else log_prob_bi}")
        print(f"Smoothed Bigrams, logprob(S) = {log_prob_bi_smoth:.4f}")
        print() # add empty line 


In [165]:
training_file = "train1.txt"
test_file = "test1.txt"
test_models_with_file(training_file, test_file)

S = a
Unsmoothed Unigrams, logprob(S) = -2.3219
Unsmoothed Bigrams, logprob(S) = 0.0
Smoothed Bigrams, logprob(S) = -1.3219

S = b c
Unsmoothed Unigrams, logprob(S) = -3.6439
Unsmoothed Bigrams, logprob(S) = 0.0
Smoothed Bigrams, logprob(S) = -3.6439

S = a b c d
Unsmoothed Unigrams, logprob(S) = -8.2877
Unsmoothed Bigrams, logprob(S) = 0.0
Smoothed Bigrams, logprob(S) = -5.2877

