In [11]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases load
# it from the "punkt_tab" resource, older ones from the pickled "punkt"
# models, so fetch both to keep this cell runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

# Three example sentences; each one is treated as a separate document by the
# n-gram printing loop and the TF-IDF vectorizer below.
corpus = [
    "WordPiece tokenization splits words into smaller units for efficient encoding.",
    "Natural Language Processing enables machines to understand human language.",
    "Embedding layers convert words into dense vector representations."
]


def generate_ngrams(text, n):
    """Return the n-grams of ``text`` as a list of token tuples.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``;
    every run of ``n`` consecutive tokens then becomes one tuple.

    Args:
        text: The sentence to split into n-grams.
        n: Number of tokens per n-gram.

    Returns:
        A list of ``n``-length tuples of tokens, in order of appearance.
    """
    lowered = text.lower()
    return [*ngrams(word_tokenize(lowered), n)]

# Show the unigrams, bigrams and trigrams of every sentence in the corpus.
for i, text in enumerate(corpus):
    print(f"\nSentence {i+1}: {text}")
    for label, size in (("Unigrams:", 1), ("Bigrams:", 2), ("Trigrams:", 3)):
        print(label, generate_ngrams(text, size))

# Fit a TF-IDF model over the unigrams, bigrams and trigrams of the corpus.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(corpus)

# Densify the matrix: one row per document, with the score at position j
# belonging to feature_names[j].
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()

print("\nTF-IDF Scores:")
for i, doc in enumerate(tfidf_scores):
    print(f"\nDocument {i+1}:")

    # Drop n-grams with a zero score in this document, then rank the rest
    # from highest to lowest weight.
    scored_terms = [pair for pair in enumerate(doc) if pair[1] > 0]
    scored_terms_sorted = sorted(
        scored_terms, key=lambda pair: pair[1], reverse=True
    )

    for j, score in scored_terms_sorted:
        print(f"{feature_names[j]}: {score:.4f}")




Sentence 1: WordPiece tokenization splits words into smaller units for efficient encoding.
Unigrams: [('wordpiece',), ('tokenization',), ('splits',), ('words',), ('into',), ('smaller',), ('units',), ('for',), ('efficient',), ('encoding',), ('.',)]
Bigrams: [('wordpiece', 'tokenization'), ('tokenization', 'splits'), ('splits', 'words'), ('words', 'into'), ('into', 'smaller'), ('smaller', 'units'), ('units', 'for'), ('for', 'efficient'), ('efficient', 'encoding'), ('encoding', '.')]
Trigrams: [('wordpiece', 'tokenization', 'splits'), ('tokenization', 'splits', 'words'), ('splits', 'words', 'into'), ('words', 'into', 'smaller'), ('into', 'smaller', 'units'), ('smaller', 'units', 'for'), ('units', 'for', 'efficient'), ('for', 'efficient', 'encoding'), ('efficient', 'encoding', '.')]

Sentence 2: Natural Language Processing enables machines to understand human language.
Unigrams: [('natural',), ('language',), ('processing',), ('enables',), ('machines',), ('to',), ('understand',), ('human',

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import re
import math
from collections import defaultdict

# Three example sentences; each one is treated as a separate document by the
# hand-written TF-IDF functions below.
corpus = [
    "WordPiece tokenization splits words into smaller units for efficient encoding.",
    "Natural Language Processing enables machines to understand human language.",
    "Embedding layers convert words into dense vector representations."
]


def tokenize_and_ngram(text, ngram_range=(1, 3)):
    """Lower-case ``text``, extract its word tokens and build their n-grams.

    Tokens are the matches of the regex ``\\b\\w+\\b`` on the lower-cased
    text, so punctuation is discarded.

    Args:
        text: The string to tokenize.
        ngram_range: ``(min_n, max_n)`` pair; n-grams of every size from
            ``min_n`` to ``max_n`` inclusive are produced.

    Returns:
        A list of token tuples: all n-grams of the smallest size first (in
        order of appearance), followed by each larger size in turn.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    smallest, largest = ngram_range[0], ngram_range[1]

    grams = []
    for size in range(smallest, largest + 1):
        # A window of `size` tokens can start at any index that leaves room
        # for `size` tokens; there are none when the text is too short.
        for start in range(len(tokens) - size + 1):
            grams.append(tuple(tokens[start:start + size]))
    return grams

def compute_tf(doc_ngrams):
    """Count how often each n-gram occurs in a single document.

    Args:
        doc_ngrams: Iterable of hashable n-grams from one document.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram to its raw occurrence
        count, keyed in order of first appearance.
    """
    counts = defaultdict(int)
    for gram in doc_ngrams:
        counts[gram] = counts.get(gram, 0) + 1
    return counts

def compute_idf(corpus_ngrams):
    """Compute a smoothed inverse document frequency for every n-gram.

    The weight of an n-gram is ``ln((1 + N) / (1 + df)) + 1``, where ``N`` is
    the number of documents and ``df`` the number of documents that contain
    the n-gram at least once.

    Args:
        corpus_ngrams: Sequence of documents, each an iterable of n-grams.

    Returns:
        A ``defaultdict(float)`` mapping each n-gram to its IDF weight.
    """
    n_docs = len(corpus_ngrams)

    # Document frequency: repeats inside one document count only once.
    doc_freq = defaultdict(float)
    for doc in corpus_ngrams:
        for gram in set(doc):
            doc_freq[gram] += 1

    weights = defaultdict(float)
    for gram, df in doc_freq.items():
        weights[gram] = math.log((1 + n_docs) / (1 + df)) + 1
    return weights


def compute_tfidf(corpus):
    """Compute L2-normalized TF-IDF weights for every document in ``corpus``.

    Each document is turned into n-grams with ``tokenize_and_ngram``; the
    raw count of each n-gram is multiplied by its IDF weight from
    ``compute_idf``, and the resulting vector is scaled to unit Euclidean
    length.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A ``(tfidf_corpus, idf)`` tuple: ``tfidf_corpus`` is a list with one
        ``defaultdict(float)`` per document mapping n-gram to normalized
        weight, and ``idf`` is the mapping returned by ``compute_idf``.
    """
    docs_as_ngrams = [tokenize_and_ngram(doc) for doc in corpus]
    idf = compute_idf(docs_as_ngrams)

    tfidf_corpus = []
    for doc_ngrams in docs_as_ngrams:
        weights = defaultdict(float)
        for gram, count in compute_tf(doc_ngrams).items():
            weights[gram] = count * idf[gram]  # TF * IDF

        # Scale to unit L2 length; a document with no n-grams has norm 0 and
        # is left untouched to avoid dividing by zero.
        norm = math.sqrt(sum(value ** 2 for value in weights.values()))
        if norm != 0:
            for gram, value in weights.items():
                weights[gram] = value / norm

        tfidf_corpus.append(weights)

    return tfidf_corpus, idf


# Show the combined 1- to 3-gram list produced for every sentence.
for i, text in enumerate(corpus):
    # Named `doc_ngrams` rather than `ngrams` so that the `ngrams` function
    # imported from nltk.util earlier in the notebook is not overwritten by
    # a list in the shared kernel namespace.
    doc_ngrams = tokenize_and_ngram(text)
    print(f"\nSentence {i+1}: {text}")
    print("N-grams (1-3):", doc_ngrams)

tfidf_corpus, idf = compute_tfidf(corpus)
print("\nTF-IDF Scores (Aligned with sklearn):")
for i, tfidf in enumerate(tfidf_corpus):
    print(f"\nDocument {i+1}:")
    # Highest-weighted n-grams first; ties keep their insertion order.
    ranked = sorted(tfidf.items(), key=lambda item: item[1], reverse=True)
    for ngram, score in ranked:
        if not score > 0:
            continue
        print(f"{' '.join(ngram)}: {score:.4f}")


Sentence 1: WordPiece tokenization splits words into smaller units for efficient encoding.
N-grams (1-3): [('wordpiece',), ('tokenization',), ('splits',), ('words',), ('into',), ('smaller',), ('units',), ('for',), ('efficient',), ('encoding',), ('wordpiece', 'tokenization'), ('tokenization', 'splits'), ('splits', 'words'), ('words', 'into'), ('into', 'smaller'), ('smaller', 'units'), ('units', 'for'), ('for', 'efficient'), ('efficient', 'encoding'), ('wordpiece', 'tokenization', 'splits'), ('tokenization', 'splits', 'words'), ('splits', 'words', 'into'), ('words', 'into', 'smaller'), ('into', 'smaller', 'units'), ('smaller', 'units', 'for'), ('units', 'for', 'efficient'), ('for', 'efficient', 'encoding')]

Sentence 2: Natural Language Processing enables machines to understand human language.
N-grams (1-3): [('natural',), ('language',), ('processing',), ('enables',), ('machines',), ('to',), ('understand',), ('human',), ('language',), ('natural', 'language'), ('language', 'processing'),