**Assignment No. 2**: Apply the bag-of-words approach (count occurrence and normalized count occurrence) and TF-IDF to the data, then create word embeddings using Word2Vec.

In [None]:
# Install necessary libraries if not installed
!pip install --upgrade gensim scikit-learn pandas numpy

Collecting numpy
  Using cached numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)


In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
# Sample dataset: List of documents
# A small in-memory corpus of five short English sentences; each list element
# is treated as one document by the preprocessing step that follows.
documents = [
    "I love natural language processing and machine learning.",
    "Machine learning is a fascinating field.",
    "Deep learning and NLP are closely related.",
    "Natural language processing is fun to learn.",
    "I enjoy working with NLP and machine learning projects."
]

In [None]:
# Preprocessing function (without NLTK)
def preprocess_text(text, stop_words=None):
    """Normalize a raw document into a space-joined string of content tokens.

    Steps:
      1. Lowercase the text.
      2. Delete apostrophes so contractions stay one token ("don't" -> "dont").
      3. Replace every remaining character that is not a-z or whitespace with
         a space. Using a space (not deleting) keeps words that were joined by
         punctuation or digits apart: "machine-learning" -> "machine learning"
         rather than "machinelearning".
      4. Split on whitespace and drop stop words.

    Args:
        text: Raw document string.
        stop_words: Optional collection of lowercase words to discard. When
            omitted, the ENGLISH_STOP_WORDS set imported at the top of the
            notebook is used.

    Returns:
        The surviving tokens joined by single spaces; an empty string if no
        token survives.
    """
    if stop_words is None:
        # Resolved lazily so callers can supply their own list.
        stop_words = ENGLISH_STOP_WORDS

    text = text.lower()  # Convert to lowercase
    text = re.sub(r"['’]", '', text)  # Keep contractions as a single token
    text = re.sub(r'[^a-z\s]', ' ', text)  # Punctuation/numbers become separators
    tokens = [word for word in text.split() if word not in stop_words]  # Tokenize + remove stopwords
    return " ".join(tokens)

# Clean every raw document, keeping the original document order.
preprocessed_docs = list(map(preprocess_text, documents))
print("Preprocessed Documents:\n", preprocessed_docs)

Preprocessed Documents:
 ['love natural language processing machine learning', 'machine learning fascinating field', 'deep learning nlp closely related', 'natural language processing fun learn', 'enjoy working nlp machine learning projects']


In [None]:
# ---- Bag of Words (BoW) ----
# Count occurrence: one row per document, one column per vocabulary term,
# each cell holding how many times the term appears in that document.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_docs)
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nBag-of-Words Matrix:\n", bow_df)

# Normalized count occurrence: divide each row by the document's total token
# count so every non-empty row sums to 1 (relative term frequency).
# A document with no remaining tokens has a total of 0; dividing by 1 instead
# keeps its row at all zeros rather than producing NaN.
doc_token_totals = bow_df.sum(axis=1).replace(0, 1)
bow_normalized_df = bow_df.div(doc_token_totals, axis=0)
print("\nNormalized Bag-of-Words Matrix:\n", bow_normalized_df)



Bag-of-Words Matrix:
    closely  deep  enjoy  fascinating  field  fun  language  learn  learning  \
0        0     0      0            0      0    0         1      0         1   
1        0     0      0            1      1    0         0      0         1   
2        1     1      0            0      0    0         0      0         1   
3        0     0      0            0      0    1         1      1         0   
4        0     0      1            0      0    0         0      0         1   

   love  machine  natural  nlp  processing  projects  related  working  
0     1        1        1    0           1         0        0        0  
1     0        1        0    0           0         0        0        0  
2     0        0        0    1           0         0        1        0  
3     0        0        1    0           1         0        0        0  
4     0        1        0    1           0         1        0        1  


In [None]:
# ---- TF-IDF (Term Frequency - Inverse Document Frequency) ----
# Fit a TF-IDF vectorizer with default settings on the cleaned corpus and
# show the resulting weights as a table labelled by vocabulary term.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\nTF-IDF Matrix:\n", tfidf_df)


TF-IDF Matrix:
     closely      deep     enjoy  fascinating     field      fun  language  \
0  0.000000  0.000000  0.000000     0.000000  0.000000  0.00000  0.418378   
1  0.000000  0.000000  0.000000     0.601285  0.601285  0.00000  0.000000   
2  0.501992  0.501992  0.000000     0.000000  0.000000  0.00000  0.000000   
3  0.000000  0.000000  0.000000     0.000000  0.000000  0.50298  0.405801   
4  0.000000  0.000000  0.475822     0.000000  0.000000  0.00000  0.000000   

     learn  learning      love   machine   natural       nlp  processing  \
0  0.00000  0.292153  0.518569  0.347292  0.418378  0.000000    0.418378   
1  0.00000  0.338754  0.000000  0.402688  0.000000  0.000000    0.000000   
2  0.00000  0.282814  0.000000  0.000000  0.000000  0.405004    0.000000   
3  0.50298  0.000000  0.000000  0.000000  0.405801  0.000000    0.405801   
4  0.00000  0.268070  0.000000  0.318664  0.000000  0.383890    0.000000   

   projects   related   working  
0  0.000000  0.000000  0.0000

In [None]:
# ---- Word2Vec Embeddings ----
# Word2Vec expects each sentence as a list of tokens, so split the cleaned
# strings back into words.
tokenized_docs = [doc.split() for doc in preprocessed_docs]  # Tokenization

# Training is stochastic. A fixed seed plus a single worker thread removes the
# random initialisation and thread-scheduling jitter, so "Restart & Run All"
# yields the same vectors within a session.
# NOTE(review): per the gensim docs, reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED - confirm if needed.
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # max distance between target and context word
    min_count=1,      # keep every word; the corpus is tiny
    workers=1,
    seed=42,
)


In [None]:
# Look up trained vectors through the model's keyed-vector store; each lookup
# is guarded so a word missing from the vocabulary is skipped silently.
keyed_vectors = word2vec_model.wv

# Example: Get word embedding for 'learning'
if 'learning' in keyed_vectors:
    print("\nWord2Vec Embedding for 'learning':\n", keyed_vectors['learning'])

# Example: Find most similar words to 'nlp'
if 'nlp' in keyed_vectors:
    print("\nMost Similar Words to 'nlp':\n", keyed_vectors.most_similar('nlp'))


Word2Vec Embedding for 'learning':
 [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7