In [1]:
# ---------------------------------------------------------
# IMPORT REQUIRED LIBRARIES
# ---------------------------------------------------------

import pandas as pd                      # For DataFrame creation and display
from nltk.tokenize import word_tokenize   # For splitting sentences into words
from nltk.corpus import stopwords         # For removing common English words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
                                          # For BOW and TF-IDF vectorization


# =========================================================
# DEMO CORPUS 1: input for the Bag-of-Words and N-gram sections
# =========================================================
# Four short documents. They mix upper/lower case ("IT", "Education"),
# repeat words ("good", "sem", "meaning") and contain one-character
# tokens ("7", "H").

sentences = [
    "this is IT 7 sem H div of sem 7",
    "students of IT sem 7 are good and are good",
    "good sem system is good for education",
    "Education is good for human education we want to carry semantic meaning for meaning",
]


# =========================================================
# DEMO CORPUS 2: input for the TF-IDF section
# =========================================================
# Three short documents; "good" occurs in every one of them, while
# "department"/"Department" and "IT" occur in the first two only.

for_tfidf = [
    "IT department is good",
    "Students in IT Department are good",
    "No good is Bad",
]


# ---------------------------------------------------------
# COUNT VECTORIZER (Bag of Words)
# ---------------------------------------------------------

# CountVectorizer with its default settings lower-cases the text and keeps
# only word tokens of two or more characters, so one-character tokens such
# as "7" and "H" in `sentences` never become vocabulary entries.
countvectorizer = CountVectorizer()

# fit_transform() learns the vocabulary from `sentences` and returns the
# sparse word-count matrix: one row per sentence, one column per word.
count_wm = countvectorizer.fit_transform(sentences)

# Vocabulary words, used below as the column labels of the count matrix.
count_tokens = countvectorizer.get_feature_names_out()

print("BOW Vocabulary =", count_tokens)
print("Vocabulary size =", len(count_tokens))


# Densify the sparse matrix and wrap it in a DataFrame for readability.
# toarray() returns a plain numpy.ndarray (the same call the TF-IDF and
# N-gram sections use); todense() would return the discouraged numpy.matrix.
# The matrix is tiny here, so materialising it in full is fine.
cv = pd.DataFrame(count_wm.toarray(), columns=count_tokens)
print("\nBag of Words Matrix (CountVectorizer):\n")
print(cv)


# =========================================================
# TF-IDF (Term Frequency–Inverse Document Frequency)
# =========================================================

# Build a TF-IDF vectorizer with library defaults, then learn the vocabulary
# of `for_tfidf` and weight every document in one fit_transform() pass.
tfidfvectorizer = TfidfVectorizer()
tfidf_vect = tfidfvectorizer.fit_transform(for_tfidf)

# Words that make up the TF-IDF vocabulary; they label the matrix columns.
tfidf_tokens = tfidfvectorizer.get_feature_names_out()
print("\nTF-IDF Vocabulary =", tfidf_tokens)

# Dense copy of the sparse result, wrapped in a DataFrame for display.
# NOTE: the variable is spelled `ftidf_v` (not `tfidf_v`); the spelling is
# kept because other cells may refer to it by this name.
ftidf_v = pd.DataFrame(data=tfidf_vect.toarray(), columns=tfidf_tokens)

# Heading and table in a single call; sep="\n" reproduces the line break
# that two consecutive print() calls would emit between them.
print("\nTF-IDF Matrix:\n", ftidf_v, sep="\n")


# ---------------------------------------------------------
# N-GRAM FEATURES (Here: 5-grams)
# ---------------------------------------------------------

# Number of consecutive tokens per feature. Defined once so that the
# vectorizer setting and the printed headings below cannot drift apart.
NGRAM_SIZE = 5

# ngram_range=(n, n) extracts ONLY n-word sequences (no shorter n-grams).
# N-grams are built from the tokens that survive the default tokenizer,
# which lower-cases text and drops one-character tokens such as "7" and "H";
# a generated n-gram can therefore join words that were separated by such a
# token in the original sentence.
ngcov = CountVectorizer(ngram_range=(NGRAM_SIZE, NGRAM_SIZE))

# Learn the n-gram vocabulary from `sentences` and count each n-gram
# per sentence (one row per sentence, one column per n-gram).
ngram_matrix = ngcov.fit_transform(sentences)

print(f"\nN-Gram Vocabulary ({NGRAM_SIZE}-grams):\n")
# vocabulary_ is a dict mapping each n-gram to its column index in the matrix.
print(ngcov.vocabulary_)

print(f"\n{NGRAM_SIZE}-Gram Count Matrix:\n")
print(pd.DataFrame(ngram_matrix.toarray(), columns=ngcov.get_feature_names_out()))


# ---------------------------------------------------------
# END OF CODE
# ---------------------------------------------------------


BOW Vocabulary = ['and' 'are' 'carry' 'div' 'education' 'for' 'good' 'human' 'is' 'it'
 'meaning' 'of' 'sem' 'semantic' 'students' 'system' 'this' 'to' 'want'
 'we']
Vocabulary size = 20

Bag of Words Matrix (CountVectorizer):

   and  are  carry  div  education  for  good  human  is  it  meaning  of  \
0    0    0      0    1          0    0     0      0   1   1        0   1   
1    1    2      0    0          0    0     2      0   0   1        0   1   
2    0    0      0    0          1    1     2      0   1   0        0   0   
3    0    0      1    0          2    2     1      1   1   0        2   0   

   sem  semantic  students  system  this  to  want  we  
0    2         0         0       0     1   0     0   0  
1    1         0         1       0     0   0     0   0  
2    1         0         0       1     0   0     0   0  
3    0         1         0       0     0   1     1   1  

TF-IDF Vocabulary = ['are' 'bad' 'department' 'good' 'in' 'is' 'it' 'no' 'students']

TF-IDF Matrix: