In [None]:
import numpy as np
import pandas as pd
import re, random
from tqdm import tqdm
import time

# Initialize Training Data

In [None]:
# with open('copperfield.txt') as f:
#     raw_text = f.read()

In [None]:
# Load the actor table, keep only the columns used in this notebook, and drop
# rows with no text (the `text` column is joined into one corpus string later).
x = pd.read_csv('data/actors_with_text.csv')[['name', 'iso', 'entity_type', 'sources', 'text', 'keywords']]
x = x.dropna(subset=['text'])
# The preview goes last: Jupyter only renders the final expression of a cell,
# so a `head()` call in the middle of the cell is silently discarded.
x.head()

In [None]:
# Concatenate every document into one corpus string, separated by single spaces.
raw_text = ' '.join(x['text'].tolist())

In [None]:
# Peek at the first 1000 characters of the corpus before cleaning.
raw_text[:1000]

#### Clean text

In [None]:
# Normalise the corpus to lowercase letters, digits, hyphens and single spaces.
raw_text = raw_text.lower() # lowercase
# Drop every character that is not a letter, digit, whitespace or hyphen.
raw_text = re.sub(r'[^a-zA-Z\d\s-]', '', raw_text)
# Collapse whitespace *after* stripping characters: removing a free-standing
# symbol (e.g. "a & b") leaves two adjacent spaces, which would otherwise
# become empty-string tokens when the text is split on ' '.
raw_text = re.sub(r'\s+', ' ', raw_text) # remove extra spacing
raw_text = raw_text.strip()

In [None]:
# Peek at the same 1000-character prefix after cleaning.
raw_text[:1000]

#### Generate vocabulary and unigram probabilities

In [None]:
# Tokenise on whitespace. `split()` with no argument never produces empty
# strings, whereas `split(' ')` emits '' for every doubled/leading/trailing
# space and turns an empty corpus into [''] (a bogus one-word vocabulary).
# Note: from here on `raw_text` is a list of tokens, no longer a string, so
# this cell cannot be re-run without re-running the cleaning cell first.
raw_text = raw_text.split()
# raw_text = raw_text[:100000]
len(raw_text)

In [None]:
# Build the word <-> id lookups and the unigram probabilities in one pass.
# Ids are assigned in order of first appearance, starting at 0.
vocab_id = {}
id_vocab = {}
word_counts = {}
index = 0
for word in raw_text:
    if word not in vocab_id:
        vocab_id[word] = index
        id_vocab[index] = word
        index += 1
    # Count with exact integers; dividing once at the end avoids the rounding
    # error that accumulates when 1/len(raw_text) is added once per token.
    word_counts[word] = word_counts.get(word, 0) + 1

n_tokens = len(raw_text)
# An empty corpus yields an empty dict (no division is performed).
vocab_probabilities = {word: count / n_tokens for word, count in word_counts.items()}

len(vocab_id), len(vocab_id) == len(id_vocab)

In [None]:
# Spot-check the unigram probabilities of two words
# (raises KeyError if either word does not occur in the corpus).
vocab_probabilities['the'], vocab_probabilities['it']

#### Create a version of the text with word indices. 

In [None]:
# Map every token of the corpus to its vocabulary id, preserving order.
text = list(map(vocab_id.__getitem__, raw_text))

In [None]:
# First ten word ids of the encoded corpus.
text[:10]

#### Create the training set
keys = word index.      
values = dictionary where keys are the context word (j) and values are the number of times that word appears in (i)'s context.    
Counts are weighted by -1 for negative examples.

In [None]:
# training_data: word id -> {context word id -> signed co-occurrence count}.
training_data = {}
# window_size: number of tokens on each side of a word that count as its context.
window_size = 4
# neg_pos_ratio: negative samples drawn per positive co-occurrence of a word.
neg_pos_ratio = 1

In [None]:
# positive examples
for text_i, word_i in enumerate(text):
    if word_i not in training_data:
        training_data[word_i] = {}
        
    start_window = max(0, text_i - window_size)
    end_window = min(len(text), text_i + window_size + 1)
        
    for text_j in range(start_window, end_window):
        word_j = text[text_j]
        if text_i != text_j:
            training_data[word_i][word_j] = training_data[word_i].get(word_j, 0) + 1

In [None]:
# Sanity check on word id 3: its surface form, how many distinct context words
# it has, and whether every vocabulary entry received a training_data row.
id_vocab[3], len(training_data[3]), len(training_data) == len(vocab_id)

In [None]:
# negative_examples
for word_i in tqdm(training_data.keys()):
    
    found = 0
    positive_samples = sum(training_data[word_i].values())
    
    while found < neg_pos_ratio * positive_samples:
        neg_i = random.choice(text)
        if (neg_i not in training_data[word_i]) or (training_data[word_i][neg_i] < 0):
            training_data[word_i][neg_i] = training_data[word_i].get(neg_i, 0) - 1
            found += 1

In [None]:
# With neg_pos_ratio = 1 each word receives as many -1 draws as it has positive
# co-occurrences, so the signed counts of word id 3 are expected to sum to 0.
sum(training_data[3].values()), sum(training_data[3].values()) == 0

In [None]:
def get_data():
    """Build a fresh signed co-occurrence table with newly drawn negatives.

    Reads the module-level ``text`` (sequence of word ids), ``window_size`` and
    ``neg_pos_ratio``.

    Returns:
        dict: ``{word_id: {context_id: count}}``. A positive count is the number
        of times ``context_id`` occurs within ``window_size`` positions of
        ``word_id``. A negative count is minus the number of times
        ``context_id`` was drawn as a random negative sample for ``word_id``.
    """
    training_data = {}

    # positive examples
    for text_i, word_i in enumerate(text):
        contexts = training_data.setdefault(word_i, {})

        start_window = max(0, text_i - window_size)
        end_window = min(len(text), text_i + window_size + 1)

        for text_j in range(start_window, end_window):
            if text_i != text_j:
                word_j = text[text_j]
                contexts[word_j] = contexts.get(word_j, 0) + 1

    # negative examples
    # Every distinct id in `text` became a key above, so this is the vocab size.
    vocab_size = len(training_data)
    for word_i, contexts in training_data.items():
        positive_samples = sum(contexts.values())

        # A draw is only accepted if it is not a positive context of word_i.
        # If every vocabulary word is a positive context, no draw can ever be
        # accepted; skip the word rather than spin in the loop forever.
        n_positive_contexts = sum(1 for count in contexts.values() if count > 0)
        if n_positive_contexts >= vocab_size:
            continue

        found = 0
        while found < neg_pos_ratio * positive_samples:
            neg_i = random.choice(text)
            # Accept unseen words, or words already used as negatives.
            if (neg_i not in contexts) or (contexts[neg_i] < 0):
                contexts[neg_i] = contexts.get(neg_i, 0) - 1
                found += 1

    return training_data

# Similarity

In [None]:
def cosine_similarity(i, j):
    """Compare the ``u`` embeddings of two words.

    Args:
        i, j: objects exposing ``mean_u`` (column vector) and ``covariance_u``
            (square matrix of matching size).

    Returns:
        tuple[float, float]: ``(m_y, var_y)`` where ``m_y`` is the cosine of
        the angle between the two mean vectors and ``var_y`` is the variance of
        the (unnormalised) inner product of the two embeddings, treating them
        as independent with the given means and covariances.
    """
    mean_i, mean_j = i.mean_u, j.mean_u
    cov_i, cov_j = i.covariance_u, j.covariance_u

    # Cosine of the means.
    m_y = np.matmul(mean_i.T, mean_j)
    m_y = m_y / (np.linalg.norm(mean_i) * np.linalg.norm(mean_j))

    # Var(x^T y) = tr(S_x S_y) + mu_y^T S_x mu_y + mu_x^T S_y mu_x for
    # independent x, y: each mean is weighted by the *other* word's covariance.
    var_y = np.trace(np.matmul(cov_i, cov_j))
    var_y = var_y + np.matmul(np.matmul(mean_j.T, cov_i), mean_j)
    var_y = var_y + np.matmul(np.matmul(mean_i.T, cov_j), mean_i)
    # The variance is intentionally left unnormalised (not divided by the norms).

    # `.item()` extracts the single element explicitly; calling float() on a
    # (1, 1) array is deprecated in recent NumPy releases.
    return float(np.asarray(m_y).item()), float(np.asarray(var_y).item())

In [None]:
# cosine_similarity(words[1], words[4])

In [None]:
def most_similar(i, prnt=None):
    """Rank every other word by similarity to ``words[i]``.

    Args:
        i: index into the module-level ``words`` list.
        prnt: if truthy, print the query word's ``text`` followed by the top
            ``prnt`` entries and return ``None``; otherwise return the full
            ranking.

    Returns:
        list of ``(text, cosine_similarity(...))`` tuples sorted by the first
        component of the similarity, highest first, or ``None`` when printing.
    """
    anchor = words[i]
    if prnt:
        print(anchor.text)

    ranked = sorted(
        ((other.text, cosine_similarity(anchor, other)) for other in words if anchor != other),
        key=lambda entry: entry[1][0],
        reverse=True,
    )

    if not prnt:
        return ranked

    for entry in ranked[:prnt]:
        print(entry)

In [None]:
# most_similar(2)[:10]

# Training
#### Initialization stage

In [None]:
class WordData:
    """Per-word state for the ``u`` and ``v`` embeddings.

    For each of the two embeddings the object stores a mean column vector, a
    covariance matrix, the running statistics ``P`` (whose inverse gives the
    covariance) and ``R`` (``mean = covariance @ R``), and the ``*_new``
    accumulators that the training loop fills before each update.

    Args:
        text: label stored verbatim on the instance (used when printing).
        m: embedding dimensionality; all vectors are ``(m, 1)`` and all
            matrices ``(m, m)``.
    """

    def __init__(self, text, m=50):
        self.text = text
        self.vector_size = m

        # Means start random, covariances start as the identity.
        self.mean_u = np.random.randn(m,1)
        self.mean_v = np.random.randn(m,1)
        self.covariance_u = np.identity(m)
        self.covariance_v = np.identity(m)

        # Running P statistics and their per-epoch accumulators.
        self.P_u = np.identity(m)
        self.P_v = np.identity(m)
        self.P_u_new = np.zeros((m,m))
        self.P_v_new = np.zeros((m,m))

        # Running R statistics and their per-epoch accumulators.
        self.R_u = np.zeros((m,1))
        self.R_v = np.zeros((m,1))
        self.R_u_new = np.zeros((m,1))
        self.R_v_new = np.zeros((m,1))

    def u_parameter_update(self, beta):
        """Blend the accumulated ``u`` statistics in and refresh mean/covariance.

        Args:
            beta: weight of the newly accumulated statistics; the previous
                running values receive ``1 - beta``.

        Raises:
            numpy.linalg.LinAlgError: if the blended ``P_u`` is singular.
        """
        size = self.vector_size

        # update.
        self.R_u = beta * self.R_u_new + (1 - beta) * self.R_u
        self.P_u = beta * self.P_u_new + (1 - beta) * self.P_u

        # u: the mean uses the full inverse; only afterwards is the stored
        # covariance reduced to its diagonal.
        self.covariance_u = np.linalg.inv(self.P_u)
        self.mean_u = np.matmul(self.covariance_u, self.R_u)
        self.covariance_u = np.diag(np.diagonal(self.covariance_u))

        # clear new values. Sized from the instance, not from a global `m`,
        # so the shapes always match the ones created in __init__.
        self.R_u_new = np.zeros((size,1))
        self.P_u_new = np.zeros((size,size))

    def v_parameter_update(self, beta):
        """Blend the accumulated ``v`` statistics in and refresh mean/covariance.

        Args:
            beta: weight of the newly accumulated statistics; the previous
                running values receive ``1 - beta``.

        Raises:
            numpy.linalg.LinAlgError: if the blended ``P_v`` is singular.
        """
        size = self.vector_size

        # update.
        self.R_v = beta * self.R_v_new + (1 - beta) * self.R_v
        self.P_v = beta * self.P_v_new + (1 - beta) * self.P_v

        # v: the mean uses the full inverse; only afterwards is the stored
        # covariance reduced to its diagonal.
        self.covariance_v = np.linalg.inv(self.P_v)
        self.mean_v = np.matmul(self.covariance_v, self.R_v)
        self.covariance_v = np.diag(np.diagonal(self.covariance_v))

        # clear new values. Sized from the instance, not from a global `m`.
        self.R_v_new = np.zeros((size,1))
        self.P_v_new = np.zeros((size,size))

In [None]:
# Hyperparameters for the training loop.
# m: embedding dimensionality passed to every WordData.
m = 40
# tau: scalar coefficient, immediately expanded to the (m, m) matrix tau * I
# that the training loop adds into the P_*_new accumulators.
tau = 0.0
tau = tau * np.identity(m)
# gamma: exponent of the decay beta = (epoch - n_without_update) ** (-gamma).
gamma = 0.7
# n_without_update: beta stays at 1 while epoch <= n_without_update.
n_without_update = 5

# One WordData per vocabulary entry, created in dict order.
# NOTE(review): iterating .items() passes (word, id) tuples as `text`, so
# printed labels are tuples - confirm this is intended rather than the bare word.
words = [WordData(v, m=m) for v in vocab_id.items()]

In [None]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy."""
    exp_neg_x = np.exp(-1 * x)
    return 1 / (1 + exp_neg_x)

#### Training loop

In [None]:
# Main training loop: 500 passes over the co-occurrence table. Each pass
# accumulates new P/R statistics for every word and blends them into the
# running values with weight beta.
for epoch in range(500):
    # e accumulates, over all words, the norm of (R_u_new - R_u) measured just
    # before each word is updated; its per-word average is printed per epoch.
    e = 0
    # Rebuilt every epoch, so the random negative samples are redrawn.
    training_data = get_data()
    # beta is 1 while epoch <= n_without_update, then decays as a power of the
    # number of epochs elapsed since then.
    beta = 1
    if epoch > n_without_update: beta = (epoch-n_without_update) ** (-1 * gamma)
    # i: word id; j_dict: {context word id -> signed count d}.
    for i,j_dict in tqdm(training_data.items()):

        wi = words[i]
        
        # Diagonal variances of wi as (m, 1) columns, then variance + mean^2
        # element-wise for the u and v embeddings.
        var_wiu = np.expand_dims(np.diagonal(wi.covariance_u), axis=1)
        var_wiv = np.expand_dims(np.diagonal(wi.covariance_v), axis=1)
        xi_ui = ((var_wiu) + np.square(wi.mean_u))
        xi_vi = ((var_wiv) + np.square(wi.mean_v))

        # d > 0 for observed co-occurrences, d < 0 for negative samples.
        for j, d in j_dict.items():
            wj = words[j]

            # for u
            # xi = sqrt( sum_k (var_u_i + mean_u_i^2)_k * (var_v_j + mean_v_j^2)_k )
            var_wjv = np.expand_dims(np.diagonal(wj.covariance_v), axis=1)
            xi = np.matmul(xi_ui.T, (var_wjv + np.square(wj.mean_v)))
            xi = np.sqrt(xi)
            # lambda(xi) = (sigmoid(xi) - 0.5) / (2 * xi).
            # NOTE(review): looks like the Jaakkola-Jordan variational
            # coefficient - confirm. Divides by zero if xi == 0.
            lambda_xi = (0.5 / xi) * (sigmoid(xi) - 0.5)

            # covariance + mean mean^T of wj's v embedding.
            eq = wj.covariance_v + np.matmul(wj.mean_v, wj.mean_v.T)
            # P uses |d| (sign-independent); R keeps the sign of d.
            wi.P_u_new += abs(d) * (2 * lambda_xi * eq + tau)
            wi.R_u_new += 0.5 * d * wj.mean_v

            # for v
            # Same computation with the roles of u and v swapped.
            var_wju = np.expand_dims(np.diagonal(wj.covariance_u), axis=1)
            xi = np.matmul(xi_vi.T, (var_wju + np.square(wj.mean_u)))
            xi = np.sqrt(xi)
            lambda_xi = (0.5 / xi) * (sigmoid(xi) - 0.5)
            
            eq = wj.covariance_u + np.matmul(wj.mean_u, wj.mean_u.T)
            wi.P_v_new += abs(d) * (2 * lambda_xi * eq + tau)
            wi.R_v_new += 0.5 * d * wj.mean_u

        # Measure the change before the update overwrites R_u and clears R_u_new.
        e += np.linalg.norm(wi.R_u_new - wi.R_u)
        # wi is updated immediately, so words processed later in this epoch
        # already see wi's refreshed mean and covariance.
        wi.u_parameter_update(beta)
        wi.v_parameter_update(beta)
        
    # Per-epoch progress: average change, then the 5 nearest neighbours of word id 2.
    print(e / len(words))
    most_similar(2, prnt=5)
    # Short pause between epochs; presumably to keep printed output and the
    # tqdm bar from interleaving - confirm.
    time.sleep(0.5)

In [None]:
# Inspect the learned u mean vector of the word with id 1000.
words[1000].mean_u

In [None]:
# Look up the ids of two words of interest (KeyError if either is not in the vocabulary).
vocab_id['reduce'], vocab_id['china']

In [None]:
# Print the label of word id 2760 followed by its 20 most similar words.
# The id is hard-coded, so it refers to a different word if the corpus changes.
most_similar(2760, prnt=20)