In [34]:
 # Step 1: Import the NLTK library
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

In [35]:
# Step 2: Download the punkt tokenizer
# Fetches the tokenizer data packages into the local nltk_data directory;
# presumably these are what word_tokenize() (used in Step 4) loads at run time.
nltk.download('punkt')
# Kept as the last statement: its return value is what the cell displays
# (True in the recorded output).
nltk.download('punkt_tab')  # Alternative for newer NLTK versions

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [36]:
# Step 3: Load a sample text corpus
# The triple-quoted literal opens and closes with a newline, so the corpus
# carries a leading and a trailing blank line when printed.
sample_text = """
Natural Language Processing (NLP) is a subfield of artificial intelligence
that focuses on the interaction between computers and human language.
It enables computers to understand, interpret, and generate human language.
NLP applications include machine translation, sentiment analysis,
chatbots, and speech recognition. Language models are fundamental to NLP.
"""
# Heading and corpus are emitted by a single call, one per line.
print("Sample Text Corpus:", sample_text, sep="\n")

Sample Text Corpus:

Natural Language Processing (NLP) is a subfield of artificial intelligence 
that focuses on the interaction between computers and human language. 
It enables computers to understand, interpret, and generate human language. 
NLP applications include machine translation, sentiment analysis, 
chatbots, and speech recognition. Language models are fundamental to NLP.



In [37]:
# Step 4: Tokenize the text into words
# The corpus is lower-cased first so that case variants of a word
# ("Language" / "language") become the same token.
words = word_tokenize(sample_text.lower())
# Heading, token list and token count, one item per output line.
print("\nTokenized Words:", words, f"Total tokens: {len(words)}", sep="\n")


Tokenized Words:
['natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'human', 'language', '.', 'it', 'enables', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', '.', 'nlp', 'applications', 'include', 'machine', 'translation', ',', 'sentiment', 'analysis', ',', 'chatbots', ',', 'and', 'speech', 'recognition', '.', 'language', 'models', 'are', 'fundamental', 'to', 'nlp', '.']
Total tokens: 58


In [38]:
# Step 5: Generate N-grams using nltk.ngrams()
# n is the window size: 2 gives bigrams, 3 trigrams, 4 four-grams, and so on.
n = 2
# The result is stored as a list so later cells can slice, count and reuse it.
generated_ngrams = [*ngrams(words, n)]
print(f"\nGenerated {n}-grams:")
# Preview only the first 10 n-grams, numbered from 1.
for position, gram in enumerate(generated_ngrams[:10], start=1):
    print(f"{position}. {gram}")
print(f"Total {n}-grams generated: {len(generated_ngrams)}")


Generated 2-grams:
1. ('natural', 'language')
2. ('language', 'processing')
3. ('processing', '(')
4. ('(', 'nlp')
5. ('nlp', ')')
6. (')', 'is')
7. ('is', 'a')
8. ('a', 'subfield')
9. ('subfield', 'of')
10. ('of', 'artificial')
Total 2-grams generated: 57


In [39]:
# Step 6: Count frequency of each N-gram
ngram_freq = Counter(generated_ngrams)
print(f"\nFrequency of {n}-grams:")
# Preview the first 10 entries in the order they were first counted.
# These are NOT the 10 most frequent n-grams; Step 7 uses most_common() for that.
for shown, (gram, freq) in enumerate(ngram_freq.items()):
    if shown == 10:
        break
    print(f"{gram}: {freq} times")


Frequency of 2-grams:
('natural', 'language'): 1 times
('language', 'processing'): 1 times
('processing', '('): 1 times
('(', 'nlp'): 1 times
('nlp', ')'): 1 times
(')', 'is'): 1 times
('is', 'a'): 1 times
('a', 'subfield'): 1 times
('subfield', 'of'): 1 times
('of', 'artificial'): 1 times


In [40]:
# Step 7: Display the most common N-grams
# most_common(10) yields up to ten (ngram, count) pairs, highest count first.
most_common_ngrams = ngram_freq.most_common(10)
print(f"\nTop 10 Most Common {n}-grams:")
rank = 0
for gram, freq in most_common_ngrams:
    rank += 1  # ranks are 1-based in the printed list
    print(f"{rank}. {gram}: {freq} times")


Top 10 Most Common 2-grams:
1. ('human', 'language'): 2 times
2. ('language', '.'): 2 times
3. (',', 'and'): 2 times
4. ('natural', 'language'): 1 times
5. ('language', 'processing'): 1 times
6. ('processing', '('): 1 times
7. ('(', 'nlp'): 1 times
8. ('nlp', ')'): 1 times
9. (')', 'is'): 1 times
10. ('is', 'a'): 1 times


In [41]:
# Step 8: Analyze text patterns
# Summarises the n-gram counts from Steps 5-7 and reports the relative
# frequency (count / total n-grams) of the three most common n-grams.
total_ngrams = len(generated_ngrams)  # hoisted: reused by every probability line

print("\nText Pattern Analysis:")
if most_common_ngrams:
    # When several n-grams tie for the highest count, the first one listed by
    # most_common() is reported.
    top_gram, top_freq = most_common_ngrams[0]
    print(f"1. Most frequent {n}-gram: {top_gram} ({top_freq} times)")
else:
    # If n is larger than the number of tokens, no n-gram exists and indexing
    # most_common_ngrams[0] would raise IndexError; report that instead.
    print(f"1. Most frequent {n}-gram: none (no {n}-grams were generated)")
print(f"2. Unique {n}-grams: {len(ngram_freq)}")
print(f"3. Total {n}-grams: {sum(ngram_freq.values())}")

# Calculate basic probabilities
print("\nBasic Probability Analysis:")
# The loop body never runs on an empty list, so total_ngrams is non-zero here.
for gram, freq in most_common_ngrams[:3]:
    probability = freq / total_ngrams
    print(f"P({gram}) = {freq}/{total_ngrams} = {probability:.4f}")


Text Pattern Analysis:
1. Most frequent 2-gram: ('human', 'language') (2 times)
2. Unique 2-grams: 54
3. Total 2-grams: 57

Basic Probability Analysis:
P(('human', 'language')) = 2/57 = 0.0351
P(('language', '.')) = 2/57 = 0.0351
P((',', 'and')) = 2/57 = 0.0351


In [42]:
# Step 9: Save results for NLP applications
import json

# Save n-grams and their frequencies to a file
# N-gram tuples are converted with str() because JSON object keys must be
# strings; the same representation is used in 'most_common' for consistency.
results = {
    'text_corpus': sample_text,
    'tokens': words,
    'n_value': n,
    'ngrams_frequencies': {str(k): v for k, v in ngram_freq.items()},
    'most_common': [(str(k), v) for k, v in most_common_ngrams]
}

# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, so the files would differ between machines and a
# corpus containing characters outside that encoding could fail to write.
with open('ngrams_language_model.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)

print("\nResults saved to 'ngrams_language_model.json'")

# Also save as text file for readability
with open('ngrams_language_model.txt', 'w', encoding='utf-8') as f:
    f.write("N-GRAMS AND LANGUAGE MODELING RESULTS\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"Sample Text: {sample_text}\n\n")
    f.write(f"Token Count: {len(words)}\n")
    f.write(f"N-gram Type: {n}-grams\n")
    f.write(f"Total {n}-grams: {len(generated_ngrams)}\n")
    f.write(f"Unique {n}-grams: {len(ngram_freq)}\n\n")
    f.write("Top 10 Most Common N-grams:\n")
    # Ranked list, numbered from 1, in the order produced by most_common().
    for i, (gram, freq) in enumerate(most_common_ngrams, 1):
        f.write(f"{i}. {gram}: {freq} times\n")

print("Results saved to 'ngrams_language_model.txt'")


Results saved to 'ngrams_language_model.json'
Results saved to 'ngrams_language_model.txt'


In [43]:
# PRACTICAL 4 (ii): Unigrams, Bigrams and Trigrams
# Section banner: a blank line, then the title framed by two 60-character rules.
print("\n" + "=" * 60, "PRACTICAL 4 (ii): Unigrams, Bigrams and Trigrams", "=" * 60, sep="\n")

# Step 1: Import NLTK and download the punkt dataset
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the tokenizer data if not already downloaded.
# Both packages are fetched, matching the download cell at the start of the
# notebook: newer NLTK versions look for 'punkt_tab', so downloading 'punkt'
# alone can leave word_tokenize() without its data when this part is run on
# its own in a fresh environment.
nltk.download('punkt', quiet=True)
# Last statement of the cell, so its return value is what the cell displays.
nltk.download('punkt_tab', quiet=True)


PRACTICAL 4 (ii): Unigrams, Bigrams and Trigrams


True

In [44]:
# Step 2: Load a sample corpus
# The literal opens and closes with a newline, so the printed corpus is
# surrounded by blank lines.
corpus_text = """
Language modeling is a fundamental task in natural language processing.
Models predict the probability of word sequences. Unigrams consider single words.
Bigrams consider pairs of words. Trigrams consider three words together.
N-grams help in text generation, speech recognition, and machine translation.
"""
# Heading and corpus are emitted by a single call, one per line.
print("Sample Corpus:", corpus_text, sep="\n")

Sample Corpus:

Language modeling is a fundamental task in natural language processing. 
Models predict the probability of word sequences. Unigrams consider single words. 
Bigrams consider pairs of words. Trigrams consider three words together.
N-grams help in text generation, speech recognition, and machine translation.



In [45]:
# Step 3: Tokenize text
# Lower-casing first merges case variants ("Language" / "language") into one token.
tokens = word_tokenize(corpus_text.lower())
# Heading, token list and token count, one item per output line.
print("\nTokenized Text:", tokens, f"Total tokens: {len(tokens)}", sep="\n")


Tokenized Text:
['language', 'modeling', 'is', 'a', 'fundamental', 'task', 'in', 'natural', 'language', 'processing', '.', 'models', 'predict', 'the', 'probability', 'of', 'word', 'sequences', '.', 'unigrams', 'consider', 'single', 'words', '.', 'bigrams', 'consider', 'pairs', 'of', 'words', '.', 'trigrams', 'consider', 'three', 'words', 'together', '.', 'n-grams', 'help', 'in', 'text', 'generation', ',', 'speech', 'recognition', ',', 'and', 'machine', 'translation', '.']
Total tokens: 49


In [46]:
# Step 4: Generate unigrams, bigrams and trigrams
# One pass per window size (1, 2, 3); each result is stored as a list so it
# can be sliced and counted by later cells.
unigrams, bigrams, trigrams = (list(ngrams(tokens, size)) for size in (1, 2, 3))

print("\nGenerated N-grams:")
# Same report line for every size: total count plus a preview of the first 10.
for label, grams in (("Unigrams", unigrams), ("Bigrams", bigrams), ("Trigrams", trigrams)):
    print(f"{label}: {len(grams)} (showing first 10: {grams[:10]})")


Generated N-grams:
Unigrams: 49 (showing first 10: [('language',), ('modeling',), ('is',), ('a',), ('fundamental',), ('task',), ('in',), ('natural',), ('language',), ('processing',)])
Bigrams: 48 (showing first 10: [('language', 'modeling'), ('modeling', 'is'), ('is', 'a'), ('a', 'fundamental'), ('fundamental', 'task'), ('task', 'in'), ('in', 'natural'), ('natural', 'language'), ('language', 'processing'), ('processing', '.')])
Trigrams: 47 (showing first 10: [('language', 'modeling', 'is'), ('modeling', 'is', 'a'), ('is', 'a', 'fundamental'), ('a', 'fundamental', 'task'), ('fundamental', 'task', 'in'), ('task', 'in', 'natural'), ('in', 'natural', 'language'), ('natural', 'language', 'processing'), ('language', 'processing', '.'), ('processing', '.', 'models')])


In [47]:
# Step 5: Compute their frequency distribution
# Each Counter maps an n-gram tuple to the number of times it occurs.
unigram_freq, bigram_freq, trigram_freq = map(Counter, (unigrams, bigrams, trigrams))

print("\nFrequency Distribution:")
# len() of a Counter is the number of distinct n-grams.
for label, counts in (
    ("Unigrams", unigram_freq),
    ("Bigrams", bigram_freq),
    ("Trigrams", trigram_freq),
):
    print(f"Unique {label}: {len(counts)}")


Frequency Distribution:
Unique Unigrams: 36
Unique Bigrams: 47
Unique Trigrams: 47


In [48]:
# Step 6: Display most common N-grams
print("\nMOST COMMON N-GRAMS:")
# Each row: heading label, its Counter, and whether to unwrap the 1-tuple.
# Unigrams are printed as the bare word; bigrams and trigrams as the tuple.
for heading, counts, unwrap in (
    ("Unigrams", unigram_freq, True),
    ("Bigrams", bigram_freq, False),
    ("Trigrams", trigram_freq, False),
):
    print(f"\nTop 5 {heading}:")
    for rank, (gram, freq) in enumerate(counts.most_common(5), start=1):
        shown = gram[0] if unwrap else gram
        print(f"{rank}. {shown}: {freq} times")


MOST COMMON N-GRAMS:

Top 5 Unigrams:
1. .: 6 times
2. consider: 3 times
3. words: 3 times
4. language: 2 times
5. in: 2 times

Top 5 Bigrams:
1. ('words', '.'): 2 times
2. ('language', 'modeling'): 1 times
3. ('modeling', 'is'): 1 times
4. ('is', 'a'): 1 times
5. ('a', 'fundamental'): 1 times

Top 5 Trigrams:
1. ('language', 'modeling', 'is'): 1 times
2. ('modeling', 'is', 'a'): 1 times
3. ('is', 'a', 'fundamental'): 1 times
4. ('a', 'fundamental', 'task'): 1 times
5. ('fundamental', 'task', 'in'): 1 times


In [49]:
# Step 7: Analyze language structure
# Section banner: a blank line, then the title framed by two 40-character rules.
print("\n" + "=" * 40, "LANGUAGE STRUCTURE ANALYSIS", "=" * 40, sep="\n")

# Vocabulary analysis
# Distinct tokens; punctuation tokens emitted by the tokenizer are included.
vocabulary = {token for token in tokens}
print(f"1. Vocabulary Size: {len(vocabulary)} unique words")

# N-gram coverage analysis
total_unigrams, total_bigrams, total_trigrams = len(unigrams), len(bigrams), len(trigrams)

print("\n2. N-gram Statistics:")
for label, counts, total in (
    ("Unigrams", unigram_freq, total_unigrams),
    ("Bigrams", bigram_freq, total_bigrams),
    ("Trigrams", trigram_freq, total_trigrams),
):
    print(f"   {label}: {len(counts)} unique / {total} total")

# Context window analysis
print(
    "\n3. Context Window Analysis:",
    "   Unigrams show individual word frequency",
    "   Bigrams show word pair relationships",
    "   Trigrams show three-word phrase patterns",
    sep="\n",
)

# Example of language patterns
print("\n4. Example Patterns Found:")
# most_common(1)[0] is the (ngram, count) pair with the highest count.
top_unigram = unigram_freq.most_common(1)[0][0]
print(f"   Most common unigram: '{top_unigram[0]}'")  # unwrap the 1-tuple
top_bigram = bigram_freq.most_common(1)[0][0]
print(f"   Most common bigram: {top_bigram}")
top_trigram = trigram_freq.most_common(1)[0][0]
print(f"   Most common trigram: {top_trigram}")


LANGUAGE STRUCTURE ANALYSIS
1. Vocabulary Size: 36 unique words

2. N-gram Statistics:
   Unigrams: 36 unique / 49 total
   Bigrams: 47 unique / 48 total
   Trigrams: 47 unique / 47 total

3. Context Window Analysis:
   Unigrams show individual word frequency
   Bigrams show word pair relationships
   Trigrams show three-word phrase patterns

4. Example Patterns Found:
   Most common unigram: '.'
   Most common bigram: ('words', '.')
   Most common trigram: ('language', 'modeling', 'is')


In [50]:
# Step 8: Save results
import json

# Save all results to a JSON file
# For each n-gram size the summary holds the total count, the number of
# distinct n-grams and the ten most common entries. Unigram keys are unwrapped
# to the bare word; bigram/trigram tuples are stored as their str() form.
all_results = {
    'corpus': corpus_text,
    'tokens': tokens,
    'vocabulary_size': len(vocabulary),
    'unigrams': {
        'total': total_unigrams,
        'unique': len(unigram_freq),
        'top_10': [(str(k[0]), v) for k, v in unigram_freq.most_common(10)]
    },
    'bigrams': {
        'total': total_bigrams,
        'unique': len(bigram_freq),
        'top_10': [(str(k), v) for k, v in bigram_freq.most_common(10)]
    },
    'trigrams': {
        'total': total_trigrams,
        'unique': len(trigram_freq),
        'top_10': [(str(k), v) for k, v in trigram_freq.most_common(10)]
    }
}

# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, so the files would differ between machines and a
# corpus containing characters outside that encoding could fail to write.
with open('unigrams_bigrams_trigrams_results.json', 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=4)

print("\nResults saved to 'unigrams_bigrams_trigrams_results.json'")

# Also create a readable text report
with open('ngrams_analysis_report.txt', 'w', encoding='utf-8') as f:
    f.write("UNIGRAMS, BIGRAMS AND TRIGRAMS ANALYSIS\n")
    f.write("=" * 50 + "\n\n")
    f.write("CORPUS TEXT:\n")
    f.write(corpus_text + "\n\n")

    f.write("TOKEN ANALYSIS:\n")
    f.write(f"Total tokens: {len(tokens)}\n")
    f.write(f"Vocabulary size: {len(vocabulary)} words\n\n")

    f.write("UNIGRAMS ANALYSIS:\n")
    f.write(f"Total unigrams: {total_unigrams}\n")
    f.write(f"Unique unigrams: {len(unigram_freq)}\n")
    f.write("Top 10 Unigrams:\n")
    for i, (gram, freq) in enumerate(unigram_freq.most_common(10), 1):
        # gram is a 1-tuple; write the bare word.
        f.write(f"  {i}. {gram[0]}: {freq} times\n")
    f.write("\n")

    f.write("BIGRAMS ANALYSIS:\n")
    f.write(f"Total bigrams: {total_bigrams}\n")
    f.write(f"Unique bigrams: {len(bigram_freq)}\n")
    f.write("Top 10 Bigrams:\n")
    for i, (gram, freq) in enumerate(bigram_freq.most_common(10), 1):
        f.write(f"  {i}. {gram}: {freq} times\n")
    f.write("\n")

    f.write("TRIGRAMS ANALYSIS:\n")
    f.write(f"Total trigrams: {total_trigrams}\n")
    f.write(f"Unique trigrams: {len(trigram_freq)}\n")
    f.write("Top 10 Trigrams:\n")
    for i, (gram, freq) in enumerate(trigram_freq.most_common(10), 1):
        f.write(f"  {i}. {gram}: {freq} times\n")

print("Report saved to 'ngrams_analysis_report.txt'")
print("\n" + "=" * 60)
print("PRACTICAL 4 COMPLETED SUCCESSFULLY!")
print("=" * 60)


Results saved to 'unigrams_bigrams_trigrams_results.json'
Report saved to 'ngrams_analysis_report.txt'

PRACTICAL 4 COMPLETED SUCCESSFULLY!
