**Assignment No. 2: Perform the bag-of-words approach (count occurrence, normalized count occurrence) and TF-IDF on data. Create embeddings using Word2Vec.**

In [None]:
import nltk
# Fetch the 'punkt_tab' tokenizer resource into the local NLTK data directory.
# NOTE(review): presumably this is what word_tokenize (used in the next cell)
# loads on recent NLTK releases — confirm against the installed NLTK version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Fetch NLTK's 'punkt' tokenizer data. The call itself is unconditional; it runs
# every time this cell executes.
# NOTE(review): the first cell already downloads 'punkt_tab'; confirm whether
# the installed NLTK version still needs the legacy 'punkt' package as well.
nltk.download('punkt')

# Toy corpus: five short English sentences. The same list feeds the
# CountVectorizer, the TfidfVectorizer and the Word2Vec tokenization below.
documents = [
    "Machine learning is amazing",
    "Deep learning is a part of machine learning",
    "Natural language processing uses machine learning",
    "Word embeddings improve NLP models",
    "Text processing is a crucial part of NLP"
]


# --- Bag-of-Words: raw term counts per document ---
# Learn the vocabulary from the corpus and count terms in a single pass.
# `vectorizer` and `bow_matrix` are reused by the term-frequency step below.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

# Densify the count matrix for display and label each column with its term.
bow_terms = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=bow_terms)
print("\nBag-of-Words (BoW):\n", bow_df)


# --- Normalized term frequency: each count divided by its document's total ---
# Densify first and do the arithmetic on a plain ndarray. Summing the sparse
# matrix directly returns a legacy np.matrix, which would then propagate into
# the quotient and make `tf_matrix` an np.matrix as well.
bow_counts = bow_matrix.toarray()
doc_totals = bow_counts.sum(axis=1, keepdims=True)  # shape (n_docs, 1)

# A document with no in-vocabulary tokens has a total of zero. Mask those rows
# so they stay all-zero instead of producing NaN and a RuntimeWarning.
tf_matrix = np.divide(
    bow_counts,
    doc_totals,
    out=np.zeros(bow_counts.shape, dtype=float),
    where=doc_totals > 0,
)
tf_df = pd.DataFrame(tf_matrix, columns=vectorizer.get_feature_names_out())
print("\nNormalized Term Frequency (TF):\n", tf_df)


# --- TF-IDF weighting ---
# Fit a TfidfVectorizer with scikit-learn's default settings on the same
# corpus and transform it in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Build a labelled, dense table of the weights for display.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(tfidf_dense, columns=tfidf_terms)
print("\nTF-IDF:\n", tfidf_df)


# --- Word2Vec embeddings ---
# Lower-case each document and split it into tokens with NLTK; Word2Vec
# expects an iterable of token lists rather than raw strings.
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# Train a small model: 10-dimensional vectors, a context window of 3 tokens,
# and min_count=1 so that words seen only once in this tiny corpus are kept.
# Reproducibility: gensim documents that a deterministic run needs a fixed
# seed AND a single worker thread, because multi-threaded training introduces
# ordering jitter from OS thread scheduling. The seed is pinned explicitly
# (1 is gensim's documented default) and workers is reduced from 4 to 1.
# NOTE(review): per the gensim docs, reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=10,
    window=3,
    min_count=1,
    seed=1,
    workers=1,
)

# Map every vocabulary word to its learned embedding for convenient lookup.
word_vectors = {word: word2vec_model.wv[word] for word in word2vec_model.wv.index_to_key}

# .get() with a fallback avoids a KeyError if the word is not in the vocabulary.
print("\nWord2Vec Vector for 'machine':\n", word_vectors.get('machine', "Not in vocabulary"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



Bag-of-Words (BoW):
    amazing  crucial  deep  embeddings  improve  is  language  learning  \
0        1        0     0           0        0   1         0         1   
1        0        0     1           0        0   1         0         2   
2        0        0     0           0        0   0         1         1   
3        0        0     0           1        1   0         0         0   
4        0        1     0           0        0   1         0         0   

   machine  models  natural  nlp  of  part  processing  text  uses  word  
0        1       0        0    0   0     0           0     0     0     0  
1        1       0        0    0   1     1           0     0     0     0  
2        1       0        1    0   0     0           1     0     1     0  
3        0       1        0    1   0     0           0     0     0     1  
4        0       0        0    1   1     1           1     1     0     0  

Normalized Term Frequency (TF):
    amazing   crucial      deep  embeddings  impro