In [None]:
# Goals of this notebook:
#   1. Build Bag-of-Words features (raw term counts and L1-normalized term counts).
#   2. Compute a TF-IDF representation of the same documents.
#   3. Train Word2Vec word embeddings on the tokenized documents.


In [None]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [None]:
# Fetch the tokenizer data that nltk's word_tokenize() needs at runtime.
# Older NLTK releases load the pickled 'punkt' models, whereas NLTK 3.9+ looks up
# the 'punkt_tab' resource instead and raises LookupError if only 'punkt' is
# installed. Requesting both keeps the tokenization cell working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Toy corpus: three short sentences, each treated as one document.
documents = [
    "AI is transforming the future",
    "Machine learning is a part of AI",
    "Natural language processing enables AI systems",
]

# Single-column frame; the vectorizer and tokenizer cells read the 'Text' column.
df = pd.DataFrame({'Text': documents})

In [None]:
# Bag-of-Words (raw counts): learn the vocabulary from the 'Text' column and
# produce a sparse document-term matrix of per-document term counts.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df['Text'])

# Densify for display: one row per document, one column per vocabulary term.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
print("Bag-of-Words (Count Occurrence):", bow_df, sep="\n")

In [None]:
# Bag-of-Words (normalized counts): rescale each document's row of the count
# matrix by its L1 norm, turning raw counts into per-document proportions.
bow_terms = count_vectorizer.get_feature_names_out()
normalized_bow = normalize(bow_matrix, norm='l1', axis=1)

# Densify for display, reusing the vocabulary of the fitted count vectorizer.
normalized_bow_df = pd.DataFrame(data=normalized_bow.toarray(), columns=bow_terms)
print("\nBag-of-Words (Normalized Count):", normalized_bow_df, sep="\n")

In [None]:
# TF-IDF: fit a separate vectorizer on the same 'Text' column so each term is
# weighted by its in-document frequency and its rarity across the documents.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Text'])

# Densify for display: one row per document, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\nTF-IDF Representation:", tfidf_df, sep="\n")

In [None]:
# Prepare Word2Vec input: lowercase every document, then split it into word
# tokens. The result is a list of token lists, one inner list per document.
tokenized_text = list(map(word_tokenize, map(str.lower, df['Text'])))

In [None]:
# Train Word2Vec Model
# Word2Vec training is stochastic, so without a fixed seed the embeddings change
# on every run. gensim additionally requires a single worker thread for a
# deterministic result, because multi-threaded training introduces ordering
# jitter from OS thread scheduling. With a three-sentence corpus, one worker
# costs nothing.
# NOTE: reproducibility across separate interpreter launches also depends on
# PYTHONHASHSEED being fixed (per the gensim documentation).
word2vec_model = Word2Vec(
    sentences=tokenized_text,  # list of token lists, one per document
    vector_size=100,           # dimensionality of each word vector
    window=5,                  # max distance between target and context word
    min_count=1,               # keep every token; the corpus is tiny
    workers=1,                 # single thread -> deterministic training order
    seed=42,                   # fixed RNG seed for repeatable embeddings
)

In [None]:
# Show the learned vector for one sample word. The query is lowercase because
# the documents were lowercased before tokenization.
word = 'ai'
embeddings = word2vec_model.wv

if word not in embeddings:
    # Guard against querying a token the model never saw.
    print(f"Word '{word}' not found in vocabulary")
else:
    print(f"\nWord2Vec embedding for '{word}':")
    print(embeddings[word])