In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Toy corpus: three short sentences with overlapping vocabulary, used as the
# input for every vectorizer in this notebook.
documents = [
    "Deep learning is a subset of machine learning.",
    "Machine learning is used in data science.",
    "This is a Deep Learning Lab Manual.",
]

In [None]:
# a) Bag-of-Words using CountVectorizer

def bow_with_countvectorizer(docs):
    """Build a Bag-of-Words representation of ``docs`` with ``CountVectorizer``.

    A ``CountVectorizer`` with default settings is fitted on ``docs`` and used
    to transform those same documents.

    Args:
        docs: The text documents to vectorize (this notebook passes a list of
            strings).

    Returns:
        A ``(feature_names, matrix)`` tuple: the result of
        ``get_feature_names_out()`` on the fitted vectorizer, and the
        transformed documents converted to a dense array via ``toarray()``.
    """
    count_vectorizer = CountVectorizer()
    term_counts = count_vectorizer.fit_transform(docs)
    vocabulary = count_vectorizer.get_feature_names_out()
    return vocabulary, term_counts.toarray()


In [None]:
# b) Bag-of-n-grams using CountVectorizer (bi-grams here)

def ngrams_with_countvectorizer(docs, ngram_range=(2, 2)):
    """Build a Bag-of-n-grams representation of ``docs`` with ``CountVectorizer``.

    Args:
        docs: The text documents to vectorize (this notebook passes a list of
            strings).
        ngram_range: Forwarded unchanged to ``CountVectorizer(ngram_range=...)``.
            The default ``(2, 2)`` is what the notebook uses for bi-grams.

    Returns:
        A ``(feature_names, matrix)`` tuple: the result of
        ``get_feature_names_out()`` on the fitted vectorizer, and the
        transformed documents converted to a dense array via ``toarray()``.
    """
    ngram_vectorizer = CountVectorizer(ngram_range=ngram_range)
    ngram_counts = ngram_vectorizer.fit_transform(docs)
    ngram_names = ngram_vectorizer.get_feature_names_out()
    return ngram_names, ngram_counts.toarray()



In [None]:
# c) Bag-of-Words using TfidfVectorizer

def bow_with_tfidfvectorizer(docs):
    """Build a TF-IDF representation of ``docs`` with ``TfidfVectorizer``.

    A ``TfidfVectorizer`` with default settings is fitted on ``docs`` and used
    to transform those same documents.

    Args:
        docs: The text documents to vectorize (this notebook passes a list of
            strings).

    Returns:
        A ``(feature_names, matrix)`` tuple: the result of
        ``get_feature_names_out()`` on the fitted vectorizer, and the
        transformed documents converted to a dense array via ``toarray()``.
    """
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_weights = tfidf_vectorizer.fit_transform(docs)
    vocabulary = tfidf_vectorizer.get_feature_names_out()
    return vocabulary, tfidf_weights.toarray()



In [None]:
# Vectorize the same corpus three ways; each helper returns a
# (feature names, dense matrix) pair that the display cell prints.
features_bow, matrix_bow = bow_with_countvectorizer(documents)
features_ngrams, matrix_ngrams = ngrams_with_countvectorizer(
    documents, ngram_range=(2, 2)  # bi-grams only; same as the helper's default
)
features_tfidf, matrix_tfidf = bow_with_tfidfvectorizer(documents)


In [None]:
# Display the results: one section per vectorizer, each showing the feature
# names followed by the corresponding matrix.
def _print_section(title, features, matrix):
    """Print a section title, then the feature names and the matrix."""
    print(title)
    print("Features:", features)
    print("Matrix:\n", matrix)


_print_section("a) Bag-of-Words (CountVectorizer):", features_bow, matrix_bow)
# The leading "\n\n" in the next two titles separates consecutive sections.
_print_section(
    "\n\nb) Bag-of-n-grams (CountVectorizer - Bigrams):",
    features_ngrams,
    matrix_ngrams,
)
_print_section(
    "\n\nc) Bag-of-Words (TfidfVectorizer):",
    features_tfidf,
    matrix_tfidf,
)

a) Bag-of-Words (CountVectorizer):
Features: ['data' 'deep' 'in' 'is' 'lab' 'learning' 'machine' 'manual' 'of'
 'science' 'subset' 'this' 'used']
Matrix:
 [[0 1 0 1 0 2 1 0 1 0 1 0 0]
 [1 0 1 1 0 1 1 0 0 1 0 0 1]
 [0 1 0 1 1 1 0 1 0 0 0 1 0]]


b) Bag-of-n-grams (CountVectorizer - Bigrams):
Features: ['data science' 'deep learning' 'in data' 'is deep' 'is subset' 'is used'
 'lab manual' 'learning is' 'learning lab' 'machine learning' 'of machine'
 'subset of' 'this is' 'used in']
Matrix:
 [[0 1 0 0 1 0 0 1 0 1 1 1 0 0]
 [1 0 1 0 0 1 0 1 0 1 0 0 0 1]
 [0 1 0 1 0 0 1 0 1 0 0 0 1 0]]


c) Bag-of-Words (TfidfVectorizer):
Features: ['data' 'deep' 'in' 'is' 'lab' 'learning' 'machine' 'manual' 'of'
 'science' 'subset' 'this' 'used']
Matrix:
 [[0.         0.34353772 0.         0.26678769 0.         0.53357537
  0.34353772 0.         0.45171082 0.         0.45171082 0.
  0.        ]
 [0.43535684 0.         0.43535684 0.25712876 0.         0.25712876
  0.3311001  0.         0.         0.43535684