In [1]:
from itertools import starmap
import nltk
import re
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

ModuleNotFoundError: No module named 'nltk'

In [None]:
def parse(corpus):
    """Split every document of ``corpus`` into word tokens.

    Each element of ``corpus`` is handed to ``nltk.word_tokenize``; the
    per-document results are collected in a list, in input order.
    """
    return [nltk.word_tokenize(document) for document in corpus]

def tokenize(corpus_parsed):
    """Replace every token by an integer id and regroup the ids per document.

    Parameters
    ----------
    corpus_parsed : sequence of sequences
        One sequence of tokens per document.

    Returns
    -------
    vocabularies : numpy.ndarray
        The unique tokens, in the sorted order produced by ``np.unique``.
    corpus_tokenized : list of numpy.ndarray
        One array per document; each entry is the index of the
        corresponding token in ``vocabularies``.
    """
    flat_tokens = np.hstack(corpus_parsed)
    vocabularies, token_ids = np.unique(flat_tokens, return_inverse=True)
    document_lengths = [len(document) for document in corpus_parsed]
    # Cut the flat id array at the end of every document except the last one.
    boundaries = np.cumsum(document_lengths)[:-1]
    return vocabularies, np.split(token_ids, boundaries)

def yield_io(corpus_tokenized, n_vocabularies, context_size):
    """Yield one-hot (center word, context words) training pairs.

    Parameters
    ----------
    corpus_tokenized : iterable of integer sequences
        One sequence of vocabulary indices per document.
    n_vocabularies : int
        Vocabulary size; the length of every one-hot vector.
    context_size : int
        Number of neighbours taken on EACH side of the center word.

    Yields
    ------
    center_word : numpy.ndarray, shape (n_vocabularies,)
        One-hot encoding of the center word.
    context_words : numpy.ndarray, shape (k, n_vocabularies)
        One-hot encodings of the neighbours, left to right. ``k`` is at most
        ``2 * context_size`` and smaller near the document boundaries.
    """
    identity = np.eye(n_vocabularies)
    # A window holds the center word plus `context_size` words on each side.
    # (A width of 2 * context_size would silently drop the right-most neighbour.)
    window = 2 * context_size + 1
    # NaN padding marks positions that fall outside the document.
    padding = np.full(context_size, np.nan)
    for words in corpus_tokenized:
        words = np.asarray(words)
        if words.size == 0:
            # Nothing to emit; np.vstack would fail on an empty list of windows.
            continue
        padded = np.hstack([padding, words, padding])
        # Build a real list: np.vstack does not accept lazy iterators.
        windows = np.vstack([padded[start:start + window] for start in range(len(words))])
        # Drop the middle column, which is the center word itself.
        contexts = np.delete(windows, context_size, axis=1)
        for word, context, inside in zip(words, contexts, ~np.isnan(contexts)):
            center_word = identity[word]
            context_words = identity[context[inside].astype(int)]
            yield center_word, context_words

def initialize(n_embeddings, n_vocabularies, random_state=None):
    """Create the two randomly initialised weight matrices.

    Re-seeds NumPy's global random generator with ``random_state`` and then
    draws both matrices uniformly from [0, 1).

    Returns
    -------
    tuple of numpy.ndarray
        ``(W1, W2)`` with shapes ``(n_vocabularies, n_embeddings)`` and
        ``(n_embeddings, n_vocabularies)``.
    """
    np.random.seed(random_state)
    input_weights = np.random.rand(n_vocabularies, n_embeddings)
    output_weights = np.random.rand(n_embeddings, n_vocabularies)
    return input_weights, output_weights

def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The maximum of ``x`` is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    exponentials = np.exp(x - np.max(x))
    return exponentials / np.sum(exponentials)

def skipgram(center_word, context_words, W1, W2, loss, learning_rate):
    """Perform one gradient-descent step of the skip-gram model.

    Parameters
    ----------
    center_word : numpy.ndarray, shape (V,)
        One-hot vector of the center word.
    context_words : numpy.ndarray, shape (C, V)
        One-hot vectors of the C context words.
    W1 : numpy.ndarray, shape (V, M)
        Input weights; updated IN PLACE.
    W2 : numpy.ndarray, shape (M, V)
        Output weights; updated IN PLACE.
    loss : float
        Running loss to which this sample's loss is added.
    learning_rate : float
        Step size of the update.

    Returns
    -------
    tuple
        ``(W1, W2, loss)``: the same (mutated) weight arrays and the
        updated running loss.
    """
    x, eta = center_word, learning_rate
    h = np.dot(W1.T, x)  # hidden activation
    u = np.dot(W2.T, h)  # one score per vocabulary entry
    y = softmax(u)
    # Prediction error summed over all context words.
    e = np.sum(y - context_words, axis=0)
    dW2 = np.outer(h, e)
    # Computed before W2 is modified, so it uses the pre-update weights.
    dW1 = np.outer(x, np.dot(W2, e))
    W1 -= eta * dW1
    W2 -= eta * dW2
    # Stable log-sum-exp: shifting by the maximum keeps np.exp from
    # overflowing to inf for large scores (the same trick softmax uses).
    u_max = np.max(u)
    log_sum_exp = u_max + np.log(np.sum(np.exp(u - u_max)))
    loss += -np.sum(u.dot(context_words.T)) + len(context_words) * log_sum_exp
    return W1, W2, loss

def predict(W1, W2):
    """Return a function that runs the forward pass with the given weights.

    The returned callable takes an input vector ``x`` and gives the softmax
    of ``W2.T @ (W1.T @ x)``.
    """
    def forward(x):
        hidden = np.dot(W1.T, x)
        scores = np.dot(W2.T, hidden)
        return softmax(scores)

    return forward

In [None]:
# Training corpus: a single document made of two paragraphs.
# A quoted string literal cannot span physical lines, so each paragraph is its
# own literal; adjacent literals are concatenated into one string by Python.
corpus = [
    "Donald John Trump (born June 14, 1946) is the 45th and current President of the United States, in office since January 20, 2017. Before entering politics, he was a businessman and television personality. Trump was born and raised in the New York City borough of Queens, and received an economics degree from the Wharton School of the University of Pennsylvania. He took charge of his family's real estate business in 1971, renamed it The Trump Organization, and expanded it into Manhattan. The company built or renovated several skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, including licensing his name for real estate and consumer products. He managed the company until his 2017 inauguration. He co-authored several books, including The Art of the Deal. He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015, and he produced and hosted the reality television show The Apprentice from 2003 to 2015. According to March 2018 estimates by Forbes, he is the world's 766th richest person, with a net worth of US$3.1 billion. Trump entered the 2016 presidential race as a Republican and defeated sixteen opponents in the primaries. Commentators described his political positions as populist, protectionist, and nationalist. His campaign received extensive free media coverage; many of his public statements were controversial or false. Trump was elected president in a surprise victory over Democratic nominee Hillary Clinton. He became the oldest and wealthiest person ever to assume the presidency, the first without prior military or government service, and the fifth to have won the election while losing the popular vote. His election and policies have sparked numerous protests. During his presidency, Trump ordered a travel ban on citizens from several Muslim-majority countries, citing security concerns; a revised version of the ban was implemented after legal challenges. "
    "He signed tax cut legislation, rescinded the individual insurance mandate provision of the Affordable Care Act, and opened the Arctic Refuge for oil drilling. He enacted a partial repeal of the Dodd-Frank Act that had imposed stricter constraints on banks in the aftermath of the 2008 financial crisis. In foreign policy, he pursued his America First agenda: he withdrew the U.S. from the Trans-Pacific Partnership trade negotiations, the Paris Agreement on climate change, and the Iran nuclear deal. He recognized Jerusalem as the capital of Israel. He imposed import tariffs on various goods from China, Canada, Mexico and the European Union. After Trump dismissed FBI Director James Comey, the Justice Department appointed Robert Mueller as Special Counsel to investigate coordination or links between the Trump campaign and the Russian government in its election interference. Trump has repeatedly denied accusations of collusion and obstruction of justice, calling the investigation a politically motivated 'witch hunt'."
]

In [None]:
# Stop-word list; left empty here, and no other cell shown in this notebook reads it.
stop_words = []

In [None]:
# Split the raw documents into word tokens, then encode every token as an
# integer index into the array of unique tokens.
corpus_parsed = parse(corpus)
vocabularies, corpus_tokenized = tokenize(corpus_parsed)
# Vocabulary size; passed below as the one-hot length and weight-matrix dimension.
V = len(vocabularies)

In [None]:
# Hyperparameters of the training run.
N_EMBEDDINGS = 200    # length of each word vector (hidden-layer width)
N_EPOCHS = 64         # full passes over the corpus
CONTEXT_SIZE = 3      # neighbours taken on each side of the center word
LEARNING_RATE = 0.01  # gradient-descent step size
RANDOM_STATE = 42     # seed for the weight initialisation

# perf_counter() is the high-resolution clock meant for measuring elapsed
# time; time.clock() no longer exists as of Python 3.8.
start = time.perf_counter()
W1, W2 = initialize(N_EMBEDDINGS, V, RANDOM_STATE)
# Collect [epoch, loss] rows in a plain list and build the DataFrame once at
# the end, so `loss_vs_epoch` only ever refers to the DataFrame.
loss_records = []
for epoch in range(1, N_EPOCHS + 1):
    loss = 0.
    for center_word, context_words in yield_io(corpus_tokenized, V, CONTEXT_SIZE):
        W1, W2, loss = skipgram(center_word, context_words, W1, W2, loss, LEARNING_RATE)
    loss_records.append([epoch, loss])
loss_vs_epoch = pd.DataFrame(loss_records, columns=['epoch', 'loss']).set_index('epoch')
print(time.perf_counter() - start)

In [None]:
# Plot the summed training loss of each epoch against the epoch number.
loss_vs_epoch.plot()
# Optional: uncomment to draw the epoch axis on a logarithmic scale.
# plt.xscale('log')

In [None]:
# Map every vocabulary entry to its column of W2 (one row of W2.T per word).
# NOTE(review): skip-gram embeddings are often read from the input matrix
# (rows of W1) instead; confirm that the output weights are intended here.
word_vectors = {word: vector for word, vector in zip(vocabularies, W2.T)}

In [None]:
# Display the vector stored for the token 'Trump'.
word_vectors['Trump']