# TRAINING WORD2VEC FROM SCRATCH

## 1. Data Preparation

In [1]:
import re
import numpy as np
import import_ipynb
import pandas as pd
from utils import get_batches
from nltk.tokenize import TweetTokenizer

importing Jupyter notebook from utils.ipynb


In [2]:
df = pd.read_csv('npr.csv')

In [3]:
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


### Data Cleaning

In [4]:
def data_cleaning(text):
    
    ### Lowercasing ###
    text = text.lower()
    
    ### Using correct apostrophe ###
    apostrophe = re.compile(r"’")
    text = apostrophe.sub(r"'", text)
    
    ### Removing hashtags (#brexit etc) 
    hashtag_words = re.compile(r"#[A-Za-z0-9]+")
    text = hashtag_words.sub(r"", text)
    
    ### Converting punctuation into fullstop ###
    text =  re.sub(r'[,!?;-]', '.', text)
    text = re.sub(r"\.+", '.', text)
    
    text = re.sub(r"[^a-z. ]+", "", text)
    
    text = TweetTokenizer().tokenize(text)
    
    return text

df['Article'] = df['Article'].map(lambda x: data_cleaning(x)) 

In [5]:
df.head()

Unnamed: 0,Article
0,"[in, the, washington, of, ., even, when, the, ..."
1,"[donald, trump, has, used, twitter, his, prefe..."
2,"[donald, trump, is, unabashedly, praising, rus..."
3,"[updated, at, p, ., m, ., et, ., russian, pres..."
4,"[from, photography, ., illustration, and, vide..."


### Corpus Creation

In [6]:
def create_corpus(df):
    
    corpus = []
    
    for x in df['Article']:
        for i in x:
            corpus.append(i)
             
    return corpus

In [7]:
corpus = create_corpus(df)

In [8]:
len(corpus)

10173847

In [9]:
vocab = sorted(set(corpus))

In [10]:
len(vocab)

99915

### Mapping words to indices and indices to words

In [11]:
def get_dict(list_of_tokens):
    
    words = sorted(list(set(list_of_tokens)))
    n = len(words)
    idx = 0
    
    word2Ind = {}
    Ind2word = {}
    for k in words:
        word2Ind[k] = idx
        Ind2word[idx] = k
        idx += 1
    
    return word2Ind, Ind2word

In [12]:
word2Ind, Ind2word = get_dict(corpus)

V = len(word2Ind)
print("Vocabulary Size: ", V)

Vocabulary Size:  99915


## 2. Model Training

### Input Layer --> Hidden Layer --> Output Layer

You will now initialize two matrices and two vectors. 
- The first matrix ($W_1$) is of dimension $N \times V$, where $V$ is the number of words in your vocabulary and $N$ is the dimension of your word vector.
- The second matrix ($W_2$) is of dimension $V \times N$. 
- Vector $b_1$ has dimensions $N\times 1$
- Vector $b_2$ has dimensions  $V\times 1$. 
- $b_1$ and $b_2$ are the bias vectors of the linear layers from matrices $W_1$ and $W_2$

In [13]:
def initialize_model(N,V, random_seed=1):
    
    np.random.seed(random_seed)
    
    W1 = np.random.rand(N,V)
    W2 = np.random.rand(V,N)
    b1 = np.random.rand(N,1)
    b2 = np.random.rand(V,1)
    
    return W1, W2, b1, b2

### Softmax Function

$$ \text{softmax}(z_i) = \frac{e^{z_i} }{\sum_{i=0}^{V-1} e^{z_i} }  \tag{5} $$

In [14]:
def softmax(z):
    
    e_z = np.exp(z)
    yhat = e_z/np.sum(e_z,axis=0)
    
    return yhat

### Forward Propagation

Implement the forward propagation $z$. <br>

\begin{align}
 h &= W_1 \  X + b_1  \tag{1} \\
 a &= ReLU(h)  \tag{2} \\
 z &= W_2 \  a + b_2   \tag{3} \\
\end{align}

For that, you will use as activation the Rectified Linear Unit (ReLU) given by:

$$f(h)=\max (0,h) \tag{6}$$

In [15]:
def forward_prop(x, W1, W2, b1, b2):
    
    h = np.dot(W1,x) + b1
    
    h = np.maximum(0,h)
    
    z = np.dot(W2,h) + b2
    
    return z, h

### Cost function

- Implementing the *cross-entropy* cost function.

In [16]:
def compute_cost(y, yhat, batch_size):
    
    logprobs = np.multiply(np.log(yhat),y) + np.multiply(np.log(1-yhat), 1 - y)
    cost = -1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

### Backpropagation

In [17]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    
    l1 = np.dot(W2.T,(yhat-y))
    l1 = np.maximum(0,l1)
    
    grad_W1 = (1/batch_size)*np.dot(l1,x.T)
    grad_W2 = (1/batch_size)*np.dot(yhat-y,h.T)
    grad_b1 = np.sum((1/batch_size)*np.dot(l1,x.T), axis=1, keepdims=True)
    grad_b2 = np.sum((1/batch_size)*np.dot(yhat-y,h.T), axis=1, keepdims=True)
    
    return grad_W1, grad_W2, grad_b1, grad_b2

### Gradient Descent

In [18]:
def gradient_descent(data, word2Ind, N, V, epochs, alpha=0.03):
    
    W1, W2, b1, b2 = initialize_model(N,V, random_seed=282)
    batch_size = 128
    iters = 0
    C = 2
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        # Get z and h
        z, h = forward_prop(x,W1,W2,b1,b2)
        # Get yhat
        yhat = softmax(z)
        # Get cost
        cost = compute_cost(y, yhat, batch_size)
        if ( (iters+1) % 10 == 0):
            print(f"Epochs: {iters + 1} cost: {cost:.6f}")
        # Get gradients
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)
        
        # Update weights and biases
        W1 -= alpha*grad_W1
        W2 -= alpha*grad_W2
        b1 -= alpha*grad_b1
        b2 -= alpha*grad_b2
        
        iters += 1 
        if iters == epochs: 
            break
        if iters % 100 == 0:
            alpha *= 0.66
            
    return W1, W2, b1, b2

### Training

In [19]:
%%time
C = 2
N = 100
word2Ind, Ind2word = get_dict(corpus)
V = len(word2Ind)
epochs = 200
print("Call gradient_descent")
W1, W2, b1, b2 = gradient_descent(corpus, word2Ind, N, V, epochs)

Call gradient_descent
Epochs: 10 cost: 0.035759
Epochs: 20 cost: 0.016522
Epochs: 30 cost: 0.010792
Epochs: 40 cost: 0.008024
Epochs: 50 cost: 0.006389
Epochs: 60 cost: 0.005310
Epochs: 70 cost: 0.004543
Epochs: 80 cost: 0.003970
Epochs: 90 cost: 0.003526
Epochs: 100 cost: 0.003172
Epochs: 110 cost: 0.002965
Epochs: 120 cost: 0.002792
Epochs: 130 cost: 0.002638
Epochs: 140 cost: 0.002501
Epochs: 150 cost: 0.002377
Epochs: 160 cost: 0.002264
Epochs: 170 cost: 0.002162
Epochs: 180 cost: 0.002069
Epochs: 190 cost: 0.001983
Epochs: 200 cost: 0.001905
Wall time: 11min 9s
