In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Read and clean document
with open("t1.txt", "r") as f:
    doc1 = f.read().lower()
l_doc1 = re.sub(r"[^a-zA-Z0-9]", " ", doc1).split()
print(l_doc1)

['coffee', 'trees', 'are', 'pruned', 'short', 'to', 'conserve', 'their', 'energy', 'and', 'aid', 'in', 'harvesting', 'but', 'can', 'grow', 'to', 'more', 'than', '30', 'feet', '9', 'meters', 'high', 'each', 'tree', 'is', 'covered', 'with', 'green', 'waxy', 'leaves', 'growing', 'opposite', 'each', 'other', 'in', 'pairs', 'coffee', 'cherries', 'grow', 'along', 'the', 'branches', 'because', 'it', 'grows', 'in', 'a', 'continuous', 'cycle', 'it', 's', 'not', 'unusual', 'to', 'see', 'flowers', 'green', 'fruit', 'and', 'ripe', 'fruit', 'simultaneously', 'on', 'a', 'single', 'tree', 'all', 'commercially', 'grown', 'coffee', 'is', 'from', 'a', 'region', 'of', 'the', 'world', 'called', 'the', 'coffee', 'belt', 'the', 'trees', 'grow', 'best', 'in', 'rich', 'soil', 'with', 'mild', 'temperatures', 'frequent', 'rain', 'and', 'shaded', 'sun']


In [2]:
# Bag of Words calculation
def calculateBOW(words):
    return {word: words.count(word) for word in set(words)}

bow1 = calculateBOW(l_doc1)
print("Bag of Words Calculation: \n")
print(bow1,"\n")

Bag of Words Calculation: 

{'along': 1, 'are': 1, 'waxy': 1, 'soil': 1, 'continuous': 1, 'belt': 1, 'high': 1, 'shaded': 1, 'covered': 1, 'rain': 1, 'rich': 1, 'harvesting': 1, 'the': 4, 'their': 1, 'coffee': 4, 'frequent': 1, 'than': 1, 'growing': 1, 'leaves': 1, 'opposite': 1, 'and': 3, 'unusual': 1, 'see': 1, 'fruit': 2, 'mild': 1, '9': 1, 'conserve': 1, '30': 1, 'it': 2, 'pairs': 1, 'grow': 3, 'to': 3, 'tree': 2, 'commercially': 1, 'sun': 1, 'single': 1, 'best': 1, 'cherries': 1, 'each': 2, 'branches': 1, 'short': 1, 'ripe': 1, 'simultaneously': 1, 'all': 1, 'from': 1, 'energy': 1, 'because': 1, 'is': 2, 'world': 1, 's': 1, 'grown': 1, 'but': 1, 'called': 1, 'temperatures': 1, 'feet': 1, 'grows': 1, 'flowers': 1, 'pruned': 1, 'with': 2, 'in': 4, 'of': 1, 'on': 1, 'a': 3, 'can': 1, 'meters': 1, 'other': 1, 'cycle': 1, 'trees': 2, 'more': 1, 'aid': 1, 'green': 2, 'not': 1, 'region': 1} 



In [3]:
print("Bag of Words DataFrame: \n")
df_bow = pd.DataFrame([bow1])
print(df_bow,"\n")

Bag of Words DataFrame: 

   along  are  waxy  soil  continuous  belt  high  shaded  covered  rain  ...  \
0      1    1     1     1           1     1     1       1        1     1  ...   

   can  meters  other  cycle  trees  more  aid  green  not  region  
0    1       1      1      1      2     1    1      2    1       1  

[1 rows x 73 columns] 



In [4]:
# Using sklearn CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([doc1])
df_bow_sklearn = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Print output
print(X.toarray(),"\n")

[[1 1 1 1 3 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 2 1 1 1 1 1 2 2 3 1 1 1 1 1 4 2
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 3 2 2 1 1 2 1]] 



In [5]:
# NLTK tokenization and word count
nltk.download('punkt')
dataset = [re.sub(r"\W", ' ', sent) for sent in nltk.sent_tokenize(doc1)]
word2count = {}
for sentence in dataset:
    for word in nltk.word_tokenize(sentence):
        word2count[word] = word2count.get(word, 0) + 1
        

print(word2count)

{'coffee': 4, 'trees': 2, 'are': 1, 'pruned': 1, 'short': 1, 'to': 3, 'conserve': 1, 'their': 1, 'energy': 1, 'and': 3, 'aid': 1, 'in': 4, 'harvesting': 1, 'but': 1, 'can': 1, 'grow': 3, 'more': 1, 'than': 1, '30': 1, 'feet': 1, '9': 1, 'meters': 1, 'high': 1, 'each': 2, 'tree': 2, 'is': 2, 'covered': 1, 'with': 2, 'green': 2, 'waxy': 1, 'leaves': 1, 'growing': 1, 'opposite': 1, 'other': 1, 'pairs': 1, 'cherries': 1, 'along': 1, 'the': 4, 'branches': 1, 'because': 1, 'it': 1, 'grows': 1, 'a': 3, 'continuous': 1, 'cycle': 1, 'itâ': 1, 's': 1, 'not': 1, 'unusual': 1, 'see': 1, 'flowers': 1, 'fruit': 2, 'ripe': 1, 'simultaneously': 1, 'on': 1, 'single': 1, 'all': 1, 'commercially': 1, 'grown': 1, 'from': 1, 'region': 1, 'of': 1, 'world': 1, 'called': 1, 'belt': 1, 'best': 1, 'rich': 1, 'soil': 1, 'mild': 1, 'temperatures': 1, 'frequent': 1, 'rain': 1, 'shaded': 1, 'sun': 1}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Neha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
#Prepare word embeddings
vocab_size = len(set(l_doc1))     #unique vocabulary size
embed_dim = 10      #each word will be represented as a vector of 10 numbers

#dictionary comprehension that creates a mapping of each unique word to a unique index.
word_to_ix = {word: i for i, word in enumerate(set(l_doc1))}

print(word_to_ix)

embeddings = np.random.random_sample((vocab_size, embed_dim))

#Context-target pairs for CBOW model
data = [([l_doc1[i - 2], l_doc1[i - 1], l_doc1[i + 1], l_doc1[i + 2]], l_doc1[i]) 
        for i in range(2, len(l_doc1) - 2)]

{'along': 0, 'are': 1, 'waxy': 2, 'soil': 3, 'continuous': 4, 'belt': 5, 'high': 6, 'shaded': 7, 'covered': 8, 'rain': 9, 'rich': 10, 'harvesting': 11, 'the': 12, 'their': 13, 'coffee': 14, 'frequent': 15, 'than': 16, 'growing': 17, 'leaves': 18, 'opposite': 19, 'and': 20, 'unusual': 21, 'see': 22, 'fruit': 23, 'mild': 24, '9': 25, 'conserve': 26, '30': 27, 'it': 28, 'pairs': 29, 'grow': 30, 'to': 31, 'tree': 32, 'commercially': 33, 'sun': 34, 'single': 35, 'best': 36, 'cherries': 37, 'each': 38, 'branches': 39, 'short': 40, 'ripe': 41, 'simultaneously': 42, 'all': 43, 'from': 44, 'energy': 45, 'because': 46, 'is': 47, 'world': 48, 's': 49, 'grown': 50, 'but': 51, 'called': 52, 'temperatures': 53, 'feet': 54, 'grows': 55, 'flowers': 56, 'pruned': 57, 'with': 58, 'in': 59, 'of': 60, 'on': 61, 'a': 62, 'can': 63, 'meters': 64, 'other': 65, 'cycle': 66, 'trees': 67, 'more': 68, 'aid': 69, 'green': 70, 'not': 71, 'region': 72}


In [7]:
#Print example context-target pairs and embeddings
print(data)
print(embeddings)

[(['coffee', 'trees', 'pruned', 'short'], 'are'), (['trees', 'are', 'short', 'to'], 'pruned'), (['are', 'pruned', 'to', 'conserve'], 'short'), (['pruned', 'short', 'conserve', 'their'], 'to'), (['short', 'to', 'their', 'energy'], 'conserve'), (['to', 'conserve', 'energy', 'and'], 'their'), (['conserve', 'their', 'and', 'aid'], 'energy'), (['their', 'energy', 'aid', 'in'], 'and'), (['energy', 'and', 'in', 'harvesting'], 'aid'), (['and', 'aid', 'harvesting', 'but'], 'in'), (['aid', 'in', 'but', 'can'], 'harvesting'), (['in', 'harvesting', 'can', 'grow'], 'but'), (['harvesting', 'but', 'grow', 'to'], 'can'), (['but', 'can', 'to', 'more'], 'grow'), (['can', 'grow', 'more', 'than'], 'to'), (['grow', 'to', 'than', '30'], 'more'), (['to', 'more', '30', 'feet'], 'than'), (['more', 'than', 'feet', '9'], '30'), (['than', '30', '9', 'meters'], 'feet'), (['30', 'feet', 'meters', 'high'], '9'), (['feet', '9', 'high', 'each'], 'meters'), (['9', 'meters', 'each', 'tree'], 'high'), (['meters', 'high',

In [8]:
# Convert context words to indices
context_indices = [[word_to_ix[word] for word in context] for context, _ in data]
target_indices = [word_to_ix[target] for _, target in data]

print(context_indices)
print(target_indices)

[[14, 67, 57, 40], [67, 1, 40, 31], [1, 57, 31, 26], [57, 40, 26, 13], [40, 31, 13, 45], [31, 26, 45, 20], [26, 13, 20, 69], [13, 45, 69, 59], [45, 20, 59, 11], [20, 69, 11, 51], [69, 59, 51, 63], [59, 11, 63, 30], [11, 51, 30, 31], [51, 63, 31, 68], [63, 30, 68, 16], [30, 31, 16, 27], [31, 68, 27, 54], [68, 16, 54, 25], [16, 27, 25, 64], [27, 54, 64, 6], [54, 25, 6, 38], [25, 64, 38, 32], [64, 6, 32, 47], [6, 38, 47, 8], [38, 32, 8, 58], [32, 47, 58, 70], [47, 8, 70, 2], [8, 58, 2, 18], [58, 70, 18, 17], [70, 2, 17, 19], [2, 18, 19, 38], [18, 17, 38, 65], [17, 19, 65, 59], [19, 38, 59, 29], [38, 65, 29, 14], [65, 59, 14, 37], [59, 29, 37, 30], [29, 14, 30, 0], [14, 37, 0, 12], [37, 30, 12, 39], [30, 0, 39, 46], [0, 12, 46, 28], [12, 39, 28, 55], [39, 46, 55, 59], [46, 28, 59, 62], [28, 55, 62, 4], [55, 59, 4, 66], [59, 62, 66, 28], [62, 4, 28, 49], [4, 66, 49, 71], [66, 28, 71, 21], [28, 49, 21, 31], [49, 71, 31, 22], [71, 21, 22, 56], [21, 31, 56, 70], [31, 22, 70, 23], [22, 56, 23, 

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten

# Convert to numpy arrays
X_train = np.array(context_indices)
y_train = np.array(target_indices)

# Define the CBOW model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=4))  # 4 context words
model.add(Flatten())
model.add(Dense(vocab_size, activation='softmax'))  # Output layer with softmax activation for multi-class classification

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, verbose=1)

# Example context for prediction
example_context = ['coffee', 'fruit', 'rain', 'waxy']
example_context_indices = np.array([[word_to_ix[word] for word in example_context]])

# Predict
predicted = model.predict(example_context_indices)
predicted_word_index = np.argmax(predicted, axis=-1)
predicted_word = list(word_to_ix.keys())[predicted_word_index[0]]
print(f"Predicted word: {predicted_word}")

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.0000e+00 - loss: 4.2942
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 4.2871 
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0106 - loss: 4.2822     
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0106 - loss: 4.2751     
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0539 - loss: 4.2690 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0958 - loss: 4.2635 
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1682 - loss: 4.2576 
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1828 - loss: 4.2518 
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━