In [None]:
# Import basic libraries
import numpy as np
import pandas as pd

# Import sklearn tools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import Word2Vec
from gensim.models import Word2Vec

# Import tokenizer (nltk is imported once here; no re-import further down)
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data used by word_tokenize.
# 'punkt' is the Punkt tokenizer model; 'punkt_tab' holds the tokenizer
# tables that were reported missing without it.
# NOTE(review): which of the two packages is actually required depends on the
# installed NLTK version — requesting both keeps the notebook portable.
nltk.download('punkt')

# Kept as the final expression so the cell still displays the download status
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Toy corpus: four short English sentences, one string per document
documents = [
    "Natural language processing is fun",
    "I love learning natural language processing",
    "Word embeddings are useful in NLP",
    "Bag of words and TF IDF are basic NLP techniques",
]

# Echo the corpus as the cell output
documents


['Natural language processing is fun',
 'I love learning natural language processing',
 'Word embeddings are useful in NLP',
 'Bag of words and TF IDF are basic NLP techniques']

In [3]:
# Bag-of-Words: learn the vocabulary from the corpus and count term
# occurrences per document in a single fit_transform pass
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(documents)

# Densify the count matrix and label each column with its vocabulary term
# so the table is readable (one row per document)
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

bow_df


Unnamed: 0,and,are,bag,basic,embeddings,fun,idf,in,is,language,...,love,natural,nlp,of,processing,techniques,tf,useful,word,words
0,0,0,0,0,0,1,0,0,1,1,...,0,1,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,1,0,0,0,0,0
2,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,1,0
3,1,1,1,1,0,0,1,0,0,0,...,0,0,1,1,0,1,1,0,0,1


In [4]:
# Turn raw counts into per-document relative frequencies:
# every row is divided by that row's total count, so each row sums to 1
normalized_bow = bow_df.div(
    bow_df.sum(axis="columns"),  # total count per document (row-wise sum)
    axis="index",                # align the totals with the rows
)

normalized_bow


Unnamed: 0,and,are,bag,basic,embeddings,fun,idf,in,is,language,...,love,natural,nlp,of,processing,techniques,tf,useful,word,words
0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.2,...,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.2,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
2,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0
3,0.1,0.1,0.1,0.1,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.0,0.1


In [5]:
# TF-IDF: fit the vectorizer on the corpus and compute the weighted
# document-term matrix in one fit_transform pass
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Densify the weights and label each column with its vocabulary term
# (one row per document)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

tfidf_df


Unnamed: 0,and,are,bag,basic,embeddings,fun,idf,in,is,language,...,love,natural,nlp,of,processing,techniques,tf,useful,word,words
0,0.0,0.0,0.0,0.0,0.0,0.508672,0.0,0.0,0.508672,0.401043,...,0.0,0.401043,0.0,0.0,0.401043,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401043,...,0.508672,0.401043,0.0,0.0,0.401043,0.0,0.0,0.0,0.0,0.0
2,0.0,0.344315,0.0,0.0,0.436719,0.0,0.0,0.436719,0.0,0.0,...,0.0,0.0,0.344315,0.0,0.0,0.0,0.0,0.436719,0.436719,0.0
3,0.328919,0.259324,0.328919,0.328919,0.0,0.0,0.328919,0.0,0.0,0.0,...,0.0,0.0,0.259324,0.328919,0.0,0.328919,0.328919,0.0,0.0,0.328919


In [9]:
# Lower-case every document, then split it into word tokens;
# the result is one token list per document
tokenized_docs = [
    word_tokenize(lowered_text)
    for lowered_text in map(str.lower, documents)
]

tokenized_docs


[['natural', 'language', 'processing', 'is', 'fun'],
 ['i', 'love', 'learning', 'natural', 'language', 'processing'],
 ['word', 'embeddings', 'are', 'useful', 'in', 'nlp'],
 ['bag',
  'of',
  'words',
  'and',
  'tf',
  'idf',
  'are',
  'basic',
  'nlp',
  'techniques']]

In [10]:
# Train Word2Vec model on the tokenized corpus.
# Training is stochastic: without a fixed seed, and with several worker
# threads, the learned vectors (and the similarity ranking derived from them)
# change on every run. A fixed seed plus a single worker keeps the notebook
# reproducible under "Restart & Run All".
# NOTE(review): gensim's docs additionally mention PYTHONHASHSEED for full
# reproducibility across interpreter launches — confirm if exact values matter.
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,   # Size of word embedding
    window=5,         # Context window size
    min_count=1,      # Include all words
    workers=1,        # Single thread: avoids run-to-run jitter from thread scheduling
    seed=42           # Fixed RNG seed so the embeddings are repeatable
)


In [11]:
# Look up the learned embedding vector for a single vocabulary word
word2vec_model.wv.get_vector('language')


array([-0.01723938,  0.00733148,  0.01037977,  0.01148388,  0.01493384,
       -0.01233535,  0.00221123,  0.01209456, -0.0056801 , -0.01234705,
       -0.00082045, -0.0167379 , -0.01120002,  0.01420908,  0.00670508,
        0.01445134,  0.01360049,  0.01506148, -0.00757831, -0.00112361,
        0.00469675, -0.00903806,  0.01677746, -0.01971633,  0.01352928,
        0.00582883, -0.00986566,  0.00879638, -0.00347915,  0.01342277,
        0.0199297 , -0.00872489, -0.00119868, -0.01139127,  0.00770164,
        0.00557325,  0.01378215,  0.01220219,  0.01907699,  0.01854683,
        0.01579614, -0.01397901, -0.01831173, -0.00071151, -0.00619968,
        0.01578863,  0.01187715, -0.00309133,  0.00302193,  0.00358008],
      dtype=float32)

In [12]:
# Rank the vocabulary by cosine similarity to 'language' and show the
# ten closest words (ten is the library default, spelled out here)
word2vec_model.wv.most_similar('language', topn=10)


[('in', 0.1960817128419876),
 ('natural', 0.16563552618026733),
 ('i', 0.15517635643482208),
 ('embeddings', 0.14385901391506195),
 ('tf', 0.13940520584583282),
 ('are', 0.12670199573040009),
 ('fun', 0.1211962178349495),
 ('basic', 0.10519503057003021),
 ('learning', 0.08872983604669571),
 ('of', 0.03227848559617996)]