In [None]:
# Goal 1: build bag-of-words features (raw term counts and L1-normalized term counts).
# Goal 2: compute TF-IDF features on the same documents.
# Goal 3: train Word2Vec embeddings on the tokenized documents.

In [None]:
!pip install numpy pandas scikit-learn gensim



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# Download NLTK's "punkt" tokenizer package into the local nltk_data directory.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Toy corpus used by every later cell; swap in your own list of strings
# (one string per document) to run the notebook on real data.
documents = [
    "AI is transforming the future.",
    "Machine learning is a part of AI.",
    "Natural language processing is useful for AI applications.",
]

In [None]:
# Wrap the corpus in a one-column DataFrame; later cells read the documents
# back through df["Text"].
df = pd.DataFrame(
    data=documents,
    columns=["Text"],
)

In [None]:
# Bag-of-words: learn the vocabulary from the "Text" column and count how often
# each vocabulary term occurs in each document. The result is a sparse
# document-by-term matrix (it is densified with .toarray() for display later).
vectorizer = CountVectorizer()
bow_corpus = df["Text"]
bow_matrix = vectorizer.fit_transform(bow_corpus)

In [None]:
# Densify the count matrix and label each column with its vocabulary term so
# the raw counts can be read as a table (one row per document).
bow_terms = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=bow_terms)
print("Bag-of-Words (Raw Count):", bow_df, sep="\n")

Bag-of-Words (Raw Count):
   ai  applications  for  future  is  language  learning  machine  natural  \
0   1             0    0       1   1         0         0        0        0   
1   1             0    0       0   1         0         1        1        0   
2   1             1    1       0   1         1         0        0        1   

   of  part  processing  the  transforming  useful  
0   0     0           0    1             1       0  
1   1     1           0    0             0       0  
2   0     0           1    0             0       1  


In [None]:
from sklearn.preprocessing import normalize

In [None]:
# L1-normalize along axis=1: each document row is divided by the sum of its
# counts, so the row's entries add up to 1 (relative term frequencies).
normalized_bow = normalize(bow_matrix, norm="l1", axis=1)

# Expand the sparse result to a dense array before wrapping it in a DataFrame;
# this conversion was introduced to resolve a shape mismatch.
normalized_bow_dense = normalized_bow.toarray()
normalized_bow_df = pd.DataFrame(
    data=normalized_bow_dense,
    columns=vectorizer.get_feature_names_out(),
)
print("\nBag-of-Words (Normalized Count):", normalized_bow_df, sep="\n")


Bag-of-Words (Normalized Count):
         ai  applications    for  future        is  language  learning  \
0  0.200000         0.000  0.000     0.2  0.200000     0.000  0.000000   
1  0.166667         0.000  0.000     0.0  0.166667     0.000  0.166667   
2  0.125000         0.125  0.125     0.0  0.125000     0.125  0.000000   

    machine  natural        of      part  processing  the  transforming  \
0  0.000000    0.000  0.000000  0.000000       0.000  0.2           0.2   
1  0.166667    0.000  0.166667  0.166667       0.000  0.0           0.0   
2  0.000000    0.125  0.000000  0.000000       0.125  0.0           0.0   

   useful  
0   0.000  
1   0.000  
2   0.125  


In [None]:
# TF-IDF: build a vectorizer with default settings, learn the vocabulary and
# term weights from the "Text" column, and produce the weighted
# document-by-term matrix in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_corpus = df["Text"]
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_corpus)

In [None]:
# Densify the TF-IDF matrix and label each column with its vocabulary term so
# the weights can be read as a table (one row per document).
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)
print("\nTF-IDF Representation:", tfidf_df, sep="\n")


TF-IDF Representation:
         ai  applications       for   future        is  language  learning  \
0  0.307144      0.000000  0.000000  0.52004  0.307144  0.000000  0.000000   
1  0.272499      0.000000  0.000000  0.00000  0.272499  0.000000  0.461381   
2  0.228215      0.386401  0.386401  0.00000  0.228215  0.386401  0.000000   

    machine   natural        of      part  processing      the  transforming  \
0  0.000000  0.000000  0.000000  0.000000    0.000000  0.52004       0.52004   
1  0.461381  0.000000  0.461381  0.461381    0.000000  0.00000       0.00000   
2  0.000000  0.386401  0.000000  0.000000    0.386401  0.00000       0.00000   

     useful  
0  0.000000  
1  0.000000  
2  0.386401  


In [None]:
# Download NLTK's "punkt_tab" tokenizer package as well.
# NOTE(review): presumably required by word_tokenize in the installed NLTK
# release - confirm against the NLTK version in use.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Lowercase each document and split it into tokens with NLTK. The resulting
# list of token lists is the training input for the Word2Vec cell.
tokenized_text = []
for sentence in df["Text"]:
    tokenized_text.append(word_tokenize(sentence.lower()))

In [None]:
# Train Word2Vec model
# Reproducibility: gensim documents that a deterministic run requires a single
# worker thread, so workers=1 is used; with a corpus this small, extra threads
# would not speed training up anyway. seed=1 is gensim's default, spelled out
# so the seed is visible and easy to change.
# NOTE(review): gensim's docs also mention fixing PYTHONHASHSEED for identical
# results across interpreter launches - confirm whether that is needed here.
word2vec_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=100,  # length of each learned word vector
    window=5,         # context window size
    min_count=1,      # keep tokens that occur only once
    workers=1,
    seed=1,
)

In [None]:
# Look up the learned embedding for one word. The query is lowercase because
# the documents were lowercased before tokenization.
word = "ai"
keyed_vectors = word2vec_model.wv
if word not in keyed_vectors:
    print(f"\nWord '{word}' not found in Word2Vec vocabulary.")
else:
    print(f"\nWord2Vec vector for '{word}':")
    print(keyed_vectors[word])


Word2Vec vector for 'ai':
[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-