In [18]:
from nltk.util import ngrams
from collections import Counter

corpus=[
    'This is a dog',
    'This is a cat',
    'I love my cat',
    'This is my name'
]

def preprocess(text):
    return text.lower().split()


In [19]:
#Combine all sentences into a single list of words
all_words=[]
for sentence in corpus:
    all_words.extend(preprocess(sentence))


#Unigram bigram trigram
unigrams = list(ngrams(all_words,1))
bigrams = list(ngrams(all_words,2))
trigrams = list(ngrams(all_words,3))

unigram_counts=Counter(unigrams)
bigram_counts=Counter(bigrams)
trigram_counts=Counter(trigrams)
    

In [20]:
#Display above
print("Unigrams:")
for unigram, count in unigram_counts.items():
    print(f"{unigram}: {count}")
    
print("Bigrams:")
for bigram, count in bigram_counts.items():
    print(f"{bigram}: {count}")    
    
print("Trigrams:")
for trigram, count in trigram_counts.items():
    print(f"{trigram}: {count}")

Unigrams:
('this',): 3
('is',): 3
('a',): 2
('dog',): 1
('cat',): 2
('i',): 1
('love',): 1
('my',): 2
('name',): 1
Bigrams:
('this', 'is'): 3
('is', 'a'): 2
('a', 'dog'): 1
('dog', 'this'): 1
('a', 'cat'): 1
('cat', 'i'): 1
('i', 'love'): 1
('love', 'my'): 1
('my', 'cat'): 1
('cat', 'this'): 1
('is', 'my'): 1
('my', 'name'): 1
Trigrams:
('this', 'is', 'a'): 2
('is', 'a', 'dog'): 1
('a', 'dog', 'this'): 1
('dog', 'this', 'is'): 1
('is', 'a', 'cat'): 1
('a', 'cat', 'i'): 1
('cat', 'i', 'love'): 1
('i', 'love', 'my'): 1
('love', 'my', 'cat'): 1
('my', 'cat', 'this'): 1
('cat', 'this', 'is'): 1
('this', 'is', 'my'): 1
('is', 'my', 'name'): 1


In [6]:
import nltk
from nltk.corpus import gutenberg, brown
from nltk.util import ngrams
from collections import Counter

#Download necessary NLTK data files
nltk.download('gutenberg')
nltk.download('brown')

#Choose corpus
#corpus=gutenberg.words('austen-emma.txt')
corpus=brown.words(categories='news')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\vyasm\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vyasm\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [17]:
#Function to find ngram

def find_ngrams(corpus,n):
    ngrams_list=list(ngrams(corpus,n))
    return Counter(ngrams_list)

#Find unigrams

unigrams=find_ngrams(corpus,1)
bigrams=find_ngrams(corpus,2)
trigrams=find_ngrams(corpus,3)

print("Most common unigrams:", unigrams.most_common(10))
print("Most common bigrams:", bigrams.most_common(10))
print("Most common trigrams:", trigrams.most_common(10))





Most common unigrams: [(('The Internet of things describes devices with sensors, processing ability, software and other technologies that connect and exchange data with other devices and systems over the Internet or other communications networks. The Internet of things encompasses electronics, communication, and computer science engineering.',), 1)]
Most common bigrams: []
Most common trigrams: []


In [8]:
from collections import defaultdict, Counter

corpus=[
    'This is a dog',
    'This is a cat',
    'I love my cat',
    'This is my name'
]

def preprocess(text):
    return text.lower().split()

#Build bigram model

bigram_counts=defaultdict(Counter)
unigram_counts=Counter()
for sentences in corpus:
    words=preprocess(sentence)
    unigram_counts.update(words)
    for i in range(len(words)-1):
        bigram_counts[words[i]][words[i+1]]+=1
    

In [13]:
def bigram_probability(sentence):
    words= preprocess(sentence)
    probability=1.0
    for i in range(len(words)-1):
         first_word, second_word = words[i], words[i + 1]
         # Calculate probability of the bigram
         prob = bigram_counts[first_word][second_word] / unigram_counts[first_word]
         probability *= prob
         print(f"Probability of '{first_word} {second_word}': {prob}")
    return probability

# Test Sentence 
sentence = "This is my cat"
probability = bigram_probability(sentence)
print(f"\nProbability of sentence '{sentence}': {probability}")


Probability of 'this is': 0.75
Probability of 'is my': 0.25
Probability of 'my cat': 0.25

Probability of sentence 'This is my cat': 0.046875


In [10]:
from collections import defaultdict, Counter

# Initialize your defaultdicts
trigram_counts = defaultdict(Counter)
bigram_counts = defaultdict(Counter)

for sentence in corpus:
    words = preprocess(sentence)  # preprocess the sentence to get the list of words
    for i in range(len(words) - 2):
        trigram_counts[(words[i], words[i+1])][words[i+2]] += 1  # Increment the trigram count by 1
    for i in range(len(words) - 1):
        bigram_counts[words[i]][words[i + 1]] += 1  # Increment the bigram count by 1


In [11]:
def trigram_probability(sentence):
    words = preprocess(sentence)
    probability = 1.0
    
    for i in range(len(words) - 2):
        first_word, second_word, third_word = words[i], words[i + 1], words[i + 2]
        
        if (first_word, second_word) in trigram_counts and third_word in trigram_counts[(first_word, second_word)]:
            prob = trigram_counts[(first_word, second_word)][third_word] / bigram_counts[first_word][second_word]
            probability *= prob
            print(f"P({third_word} | {first_word} {second_word}) = {prob}")
        else:
            print(f"Trigram ({first_word}, {second_word}, {third_word}) not found.")
            probability *= 0 # Probability becomes zero if the trigram doesn't exist
    
    return probability

# Test Sentence 
sentence = "This is my name"
probability = bigram_probability(sentence)
print(f"\nProbability of sentence '{sentence}': {probability}")

Probability of 'this is': 0.75
Probability of 'is my': 0.25
Probability of 'my name': 0.25

Probability of sentence 'This is my name': 0.046875


In [12]:
"The Internet of things describes devices with sensors, processing ability, software and other technologies that connect and exchange data with other devices and systems over the Internet or other communications networks. The Internet of things encompasses electronics, communication, and computer science engineering."


'The Internet of things describes devices with sensors, processing ability, software and other technologies that connect and exchange data with other devices and systems over the Internet or other communications networks. The Internet of things encompasses electronics, communication, and computer science engineering.'