In [7]:
# pip install scikit-learn gensim nltk
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus: four short sentences shared by the vectorization steps below.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Bag-of-Words: learn the vocabulary from the corpus and count, for every
# document (row), how often each vocabulary term (column) occurs.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(documents)

# Column labels of the count matrix, i.e. the learned vocabulary.
feature_names = vectorizer.get_feature_names_out()

# fit_transform returns a sparse matrix; densify it so it prints as a table.
dense_array_bow = X_bow.toarray()

print("Bag-of-Words Matrix:")
print(dense_array_bow)

print("\nFeature Names:")
print(feature_names)

# Term Frequency (TF): each count divided by the total number of counted
# terms in its document, so every non-empty row sums to 1.
doc_lengths = dense_array_bow.sum(axis=1, keepdims=True)
# A document that contributes no terms has a row sum of 0; dividing by it
# would produce NaN (and a RuntimeWarning). Dividing by 1 instead keeps such
# a row at all zeros and leaves every other row untouched.
doc_lengths[doc_lengths == 0] = 1
TF = dense_array_bow / doc_lengths

print("\nNormalized Count Occurrence (TF):")
print(TF)

from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting of the Bag-of-Words counts.
# NOTE: the input is the raw count matrix X_bow, not the manually
# normalized TF array.
tfidf_transformer = TfidfTransformer()

# Learn the weighting from the counts, then apply it to the same counts
# (two explicit steps, equivalent to a single fit_transform call).
tfidf_transformer.fit(X_bow)
X_tfidf = tfidf_transformer.transform(X_bow)

# The transformed result is sparse; densify it for printing.
dense_array_tfidf = X_tfidf.toarray()

# Header and matrix on consecutive lines.
print("\nTF-IDF Matrix:", dense_array_tfidf, sep="\n")

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases load
# it from the 'punkt' resource, while newer releases look up 'punkt_tab' and
# raise LookupError if only 'punkt' is installed, so fetch both.
# nltk.download skips resources that are already up to date.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Tokenize the documents: lower-case each one, then split it into tokens.
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]

# Train Word2Vec model on the tokenized corpus.
# min_count=1 keeps every token; the corpus is tiny, so no token is frequent.
# NOTE(review): no seed is set and workers=4, so the learned vectors are
# presumably not reproducible between runs — confirm whether that matters.
word2vec_model = Word2Vec(
    sentences=tokenized_documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Example: Get the embedding for the word 'document'
embedding_document = word2vec_model.wv['document']
print("\nWord2Vec Embedding for 'document':")
print(embedding_document)




Bag-of-Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

Normalized Count Occurrence (TF):
[[0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.33333333 0.         0.16666667 0.         0.16666667
  0.16666667 0.         0.16666667]
 [0.16666667 0.         0.         0.16666667 0.16666667 0.
  0.16666667 0.16666667 0.16666667]
 [0.         0.2        0.2        0.2        0.         0.
  0.2        0.         0.2       ]]

TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

ModuleNotFoundError: No module named 'gensim'