# Neural Network with Continuous Bag of Words (CBOW)

For details see: https://www.kaggle.com/code/alincijov/nlp-starter-continuous-bag-of-words-cbow

## Libraries and settings

In [None]:
# Libraries
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Ignore warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which may also hide numerical warnings raised during training -- consider
# narrowing the filter to the specific warning that is known to be noise.
import warnings
warnings.filterwarnings('ignore')

# Show the current working directory; 'text.txt' is opened relative to it further below
print('Current working directory:', os.getcwd())


## Data

### Sentence

Source: https://en.wikipedia.org/wiki/Currywurst

In [None]:
# Read the whole text file into a single string.
# The encoding is passed explicitly because the default used by open() depends on
# the platform, so the same file could be decoded differently on another machine.
# NOTE(review): assumes 'text.txt' is stored as UTF-8 -- confirm for the data file.
with open('text.txt', 'r', encoding='utf-8') as file:

    # Read the contents of the file
    sentences = file.read()

# Print the file content
print(sentences)

### Wordcloud

In [None]:
# Function to create a word cloud
def wordCloud_generator(data, title=None):
    """Render a word cloud for the given text with matplotlib.

    Parameters
    ----------
    data : str or iterable of str
        Either one text string, or several strings (e.g. sentences or tokens)
        that are joined with single spaces before the cloud is generated.
    title : str, optional
        Title shown above the image.
    """
    # A plain string must not go through " ".join(): joining a string iterates
    # over its characters and would put a space between every single letter.
    text = data if isinstance(data, str) else " ".join(data)

    wordcloud = WordCloud(height=300,
                          width=600,
                          background_color='white',
                          min_font_size=8
                         ).generate(text)

    # Plot the WordCloud image
    plt.figure(figsize=(6, 4), facecolor=None)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title(title, fontsize=16)
    plt.show()
    
# Create the word cloud from the raw text (this runs before the cleaning step below)
wordCloud_generator(sentences, title="Word cloud")

### Clean Data

In [None]:
# Replace every run of characters other than ASCII letters and digits with one space
sentences = re.sub('[^A-Za-z0-9]+', ' ', sentences)

# Remove 1 letter words.
# Word boundaries are zero-width, so neighbouring one-letter words are all removed.
# A pattern that consumes the separating spaces would skip every second word in a
# run such as "a b c".
sentences = re.sub(r'\b\w\b', ' ', sentences)

# Collapse the whitespace left behind by the removals (also trims both ends)
sentences = ' '.join(sentences.split())

# Lower all characters
sentences = sentences.lower()

## Vocabulary

In [None]:
# Split the cleaned text into tokens and derive the vocabulary
words = sentences.split()

# Sort the unique tokens: iterating a plain set of strings can yield a different
# order in every Python process (string hash randomization), which would give the
# words different indices -- and therefore different embeddings -- on each kernel
# restart even though the NumPy seed is fixed.
vocab = sorted(set(words))

vocab_size = len(vocab)   # number of distinct tokens
embed_dim = 10            # number of values per word embedding
context_size = 2          # used below to size theta: 2 * context_size * embed_dim rows

print(vocab)

## Implementation

### Dictionaries

In [None]:
# Lookup tables between vocabulary words and their integer indices
ix_to_word = dict(enumerate(vocab))
word_to_ix = {token: position for position, token in ix_to_word.items()}

print(word_to_ix)

### Data bags

In [None]:
# Create data bags with ([context], target) as the basis for modeling.
# The window width comes from 'context_size' so that the number of context words
# always matches the 2 * context_size * embed_dim rows of the weight matrix.
data = []

for i in range(context_size, len(words) - context_size):
    # context_size words to the left and context_size words to the right of the target
    context = words[i - context_size:i] + words[i + 1:i + context_size + 1]
    target = words[i]
    data.append((context, target))

# Show first entries in 'data'
data[:20]

### Word embeddings

In [None]:
# Draw one random embedding vector per vocabulary word (seed fixed for repeatable values)
np.random.seed(42)
embeddings = np.random.random_sample(size=(vocab_size, embed_dim))

# Show first embeddings
print(embeddings[:5])

# Number of array dimensions
print('\nDimensions of np.ndarray:', np.ndim(embeddings))

# Shape of the array: (vocab_size, embed_dim)
print('\nShape of np.ndarray:', np.shape(embeddings))


### Linear Model

In [None]:
# Function to perform linear transformation of word embeddings
def linear(m, theta):
    """Return the matrix product of the input ``m`` and the weights ``theta``."""
    return m.dot(theta)

### Log softmax + NLLLoss = Cross Entropy

In [None]:
# Building blocks of the loss used to measure the model performance during training
def log_softmax(x):
    """Return the logarithm of the softmax of ``x``, taken along the last axis.

    The result is computed as ``shifted - log(sum(exp(shifted)))`` rather than
    ``log(exp(x) / sum(exp(x)))``: the latter underflows to ``log(0) = -inf``
    as soon as one entry is much smaller than the maximum.

    Normalisation is done per row (last axis). For a 1-D input or a single-row
    2-D input this equals normalising over all elements.

    Parameters
    ----------
    x : array_like
        Scores (logits).

    Returns
    -------
    numpy.ndarray
        Log-probabilities with the same shape as ``x``.
    """
    x = np.asarray(x)
    # 0-d input has no last axis; reduce over everything in that case
    axis = -1 if x.ndim else None

    # Subtracting the maximum does not change the softmax but keeps exp() bounded
    shifted = x - np.max(x, axis=axis, keepdims=True)

    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

def NLLLoss(logs, targets):
    """Average negative log-likelihood of the target classes.

    ``logs`` holds one row of log-probabilities per sample and ``targets`` the
    column index to pick from each row. The picked values are summed, divided
    by their count and negated.
    """
    row_ids = np.arange(len(targets))
    picked = logs[row_ids, targets]

    return -(picked.sum() / len(picked))

def log_softmax_crossentropy_with_logits(logits, target):
    """Gradient of the batch-averaged cross-entropy loss with respect to ``logits``.

    Parameters
    ----------
    logits : numpy.ndarray
        2-D array with one row of scores per sample.
    target : array_like of int
        Index of the correct class for every row of ``logits``.

    Returns
    -------
    numpy.ndarray
        ``(softmax(logits) - one_hot(target)) / number_of_rows``, same shape
        as ``logits``.
    """
    one_hot = np.zeros_like(logits)
    one_hot[np.arange(len(logits)), target] = 1

    # Subtracting each row's maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing to inf (inf / inf would produce nan gradients).
    exps = np.exp(logits - logits.max(axis=-1, keepdims=True))
    softmax = exps / exps.sum(axis=-1, keepdims=True)

    return (softmax - one_hot) / logits.shape[0]

### Forward function

In [None]:
# Function to provide a forward pass through a neural network
def forward(context_idxs, theta):
    """Run one forward pass for a single context.

    The embeddings of the context words are looked up and flattened into one
    row, multiplied with ``theta`` and passed through ``log_softmax``.

    Returns the tuple ``(flattened embeddings, linear output, log-softmax output)``.
    """
    flat_context = embeddings[context_idxs].reshape(1, -1)
    scores = linear(flat_context, theta)

    return flat_context, scores, log_softmax(scores)

### Backward function

In [None]:
# Function to provide a backward pass through a neural network
def backward(preds, theta, target_idxs):
    """Return the gradient used to update the weight matrix.

    ``preds`` is a 3-tuple whose first element is the layer input and whose
    second element is the linear output; the third element is not needed here.
    ``theta`` is accepted but not used by this function.
    """
    layer_input, scores, _ = preds
    grad_scores = log_softmax_crossentropy_with_logits(scores, target_idxs)

    return layer_input.T.dot(grad_scores)

### Optimize function

In [None]:
# Function for parameter (theta) optimization
def optimize(theta, grad, lr=0.03):
    """Apply one gradient-descent step and return ``theta``.

    The step ``lr * grad`` is subtracted with an augmented assignment, so an
    array passed as ``theta`` is modified in place.
    """
    step = lr * grad
    theta -= step
    return theta

## Training

In [None]:
# Train the weight matrix 'theta' with one gradient step per sample.
# Only 'theta' is updated in this loop; 'embeddings' is read by forward() but never changed.
theta = np.random.uniform(low=-1, high=1, size=(2 * context_size * embed_dim, vocab_size))

# Maps epoch number -> list holding one loss value per training sample
epoch_losses = {}

for epoch in range(100):

    losses = []

    for context, target in data:
        # Translate the words into their integer indices
        context_idxs = np.array([word_to_ix[w] for w in context])
        target_idxs = np.array([word_to_ix[target]])

        # Forward pass; the last element of 'preds' holds the log-softmax output
        preds = forward(context_idxs, theta)
        loss = NLLLoss(preds[-1], target_idxs)
        losses.append(loss)

        # Backward pass followed by the parameter update
        grad = backward(preds, theta, target_idxs)
        theta = optimize(theta, grad, lr=0.03)

    epoch_losses[epoch] = losses
    

## Analyze

### Plot loss/epoch

In [None]:
# Plot the mean loss of every epoch.
# The epochs are taken from the keys of 'epoch_losses', so the plot follows the
# number of epochs that were actually trained. Averaging over all samples of an
# epoch shows the training progress of the whole data set, not just of the first sample.
ix = sorted(epoch_losses)
mean_losses = [sum(epoch_losses[i]) / len(epoch_losses[i]) for i in ix]

fig = plt.figure(figsize=(6,4))
fig.suptitle('Epoch/Losses', fontsize=14)
plt.plot(ix, mean_losses, c='orange')
plt.xlabel('Epochs', fontsize=10)
plt.ylabel('Losses', fontsize=10)
plt.show()

### Predict target (center) word

In [None]:
# Function to predict the target word
def predict(words):
    """Return the vocabulary word with the highest log-softmax score for the given context words."""
    idxs = np.array([word_to_ix[token] for token in words])
    log_probs = forward(idxs, theta)[-1]

    return ix_to_word[np.argmax(log_probs)]

# Predict the target word for a few example contexts
# (two words before and two words after the word to be predicted)
example_contexts = (
    ['according', 'to', 'belief', 'the'],
    ['the', 'post', 'war', 'ii'],
    ['originated', 'in', 'in', 'the'],
    ['the', 'currywurst', 'in', 'berlin'],
)

for example in example_contexts:
    print(predict(example))

### Predict all targets of all data bags and show accuracy

In [None]:
# Calculate accuracy
def accuracy():
    """Return the share of entries in ``data`` whose target word is predicted correctly."""
    misses = sum(1 for context, target in data if predict(context) != target)

    return 1 - (misses / len(data))

print(f'{accuracy():.4f}')

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
# Footer: print basic information about the runtime environment of this run
import os
import platform
import socket  # NOTE(review): not used anywhere in this cell
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())  # value of os.name, in upper case
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))  # timestamp of this run
print('Python Version:', python_version())
print('-----------------------------------')