# 引入Libraries & 必要Function

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from scipy import linalg  #提供線代函式
from collections import defaultdict


def sigmoid(z):
    """Apply the logistic function 1 / (1 + exp(-z)) to ``z``.

    ``z`` may be a scalar or a NumPy array; ``np.exp`` handles arrays
    element-wise, so the result has the same shape as the input.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator


def get_idx(words, word2Ind):
    """Return the vocabulary index of every word in ``words``, in order.

    Input:
        words: iterable of words to look up
        word2Ind: dictionary mapping a word to its index
    Output:
        list with one index per input word (duplicates are kept)
    Raises:
        KeyError: if a word is not a key of ``word2Ind``
    """
    # A comprehension builds the list in one pass instead of re-creating it
    # with ``idx = idx + [...]`` on every iteration.
    return [word2Ind[word] for word in words]


def pack_idx_with_frequency(context_words, word2Ind):
    """Pair each context word's vocabulary index with its count in the window.

    Input:
        context_words: sequence of words forming one context window
        word2Ind: dictionary mapping a word to its index
    Output:
        list of ``(index, frequency)`` tuples, one per entry of
        ``context_words`` (a word occurring twice yields two identical tuples)
    """
    # Count how often each word occurs inside this window.
    counts = {}
    for token in context_words:
        counts[token] = counts.get(token, 0) + 1

    indices = get_idx(context_words, word2Ind)
    return [(index, counts[token])
            for index, token in zip(indices, context_words)]


def get_vectors(data, word2Ind, V, C):
    """Endlessly generate (context, centre) training vectors from a corpus.

    Input:
        data: list of tokens (a list is required because the two context
              slices are concatenated with ``+``)
        word2Ind: dictionary mapping a word to its index
        V: vocabulary size, i.e. the length of the generated vectors
        C: number of context words taken on each side of the centre word
    Yields:
        x: length-V vector holding, for every context word, its frequency in
           the window divided by the number of context words
        y: length-V one-hot vector of the centre word

    The generator never terminates: once the end of ``data`` is reached the
    position wraps around to 0. Windows that touch either end of ``data`` are
    simply shorter.
    """
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        # Clamp the left edge at 0: a negative slice start would be counted
        # from the end of the list and silently yield an empty left context
        # for positions 0 < i < C after the wrap-around.
        left_start = max(0, i - C)
        context_words = data[left_start:i] + data[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq / num_ctx_words
        yield x, y
        i += 1
        if i >= len(data):
            print('i is being set to 0')
            i = 0


def get_batches(data, word2Ind, V, C, batch_size):
    """Group the examples produced by ``get_vectors`` into mini-batches.

    Input:
        data: list of tokens
        word2Ind: dictionary mapping a word to its index
        V: vocabulary size
        C: number of context words on each side of the centre word
        batch_size: number of examples per batch (must be >= 1)
    Yields:
        (batch_x, batch_y): two arrays of shape (V, batch_size) in which every
        column is one example
    Raises:
        ValueError: if ``batch_size`` is smaller than 1

    Like ``get_vectors``, this generator never terminates on its own.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        # Add exactly one example per iteration; the previous ``while`` loop
        # filled the whole batch with copies of the same example.
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            # Start a fresh batch so the next yield holds new examples.
            batch_x = []
            batch_y = []


def compute_pca(data, n_components=2):
    """Project the rows of ``data`` onto their leading principal components.

    Input:
        data: array-like of dimension (m, n) where each row corresponds to a
              word vector; it is NOT modified
        n_components: number of components you want to keep
    Output:
        X_reduced: array of dimension (m, n_components) holding the
                   mean-centred data expressed in the basis of the
                   ``n_components`` eigenvectors with the largest eigenvalues
    Raises:
        ValueError: if ``data`` is not two-dimensional
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (m, n)")

    # Mean-centre into a new array: an in-place ``-=`` would overwrite the
    # caller's matrix and fails outright for integer arrays.
    centered = data - data.mean(axis=0)
    # Covariance between the columns (features).
    R = np.cov(centered, rowvar=False)
    # 'eigh' rather than 'eig' since R is symmetric.
    evals, evecs = linalg.eigh(R)
    # Indices of the eigenvalues in decreasing order.
    order = np.argsort(evals)[::-1]
    # Keep the eigenvectors that belong to the largest eigenvalues.
    top_evecs = evecs[:, order[:n_components]]
    return np.dot(centered, top_evecs)


def get_dict(data):
    """Build the two-way vocabulary lookup for a tokenised corpus.

    Input:
        data: iterable of hashable, mutually comparable tokens
    Output:
        word2Ind: dictionary mapping each distinct token to its index, where
                  indices follow the sorted order of the distinct tokens
        Ind2word: dictionary mapping each index back to its token
    """
    vocabulary = sorted(set(data))
    word2Ind = {token: position for position, token in enumerate(vocabulary)}
    Ind2word = dict(enumerate(vocabulary))
    return word2Ind, Ind2word


In [None]:
# Load shakespeare.txt (the file must be available in the working directory).
import re
# Read the Shakespeare text used as the corpus. The context manager closes
# the file handle as soon as the text has been read.
with open('shakespeare.txt') as corpus_file:
    data = corpus_file.read()
# Replace the listed punctuation marks with a full stop.
data = re.sub(r'[,!?;-]', '.', data)
# Split the text into tokens.
data = nltk.word_tokenize(data)
# Keep only alphabetic tokens and full stops, lower-casing every token.
data = [token.lower() for token in data if token.isalpha() or token == '.']
print("Number of tokens:", len(data), '\n', data[:15])

Number of tokens: 60933 
 ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [None]:
# Build the word <-> index lookup tables for the vocabulary and record the
# vocabulary size V.
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5772


# 訓練模型

In [None]:
np.random.seed(1)
# Initialise the model parameters
def initialize_model(N, V):
    """Create randomly initialised weights and biases for the network.

    Inputs:
        N: dimension of the hidden vector
        V: dimension of the vocabulary
    Outputs:
        W1: array of shape (N, V)
        W2: array of shape (V, N)
        b1: array of shape (N, 1)
        b2: array of shape (V, 1)

    All values are drawn with ``np.random.rand`` (uniform on [0, 1)) from
    NumPy's global random state, in the order W1, W2, b1, b2.
    """
    shapes = ((N, V), (V, N), (N, 1), (V, 1))
    # The list is built left to right, so the draw order matches the order
    # of ``shapes``.
    W1, W2, b1, b2 = [np.random.rand(*shape) for shape in shapes]
    return W1, W2, b1, b2

def softmax(x):
    """Row-wise softmax of a 2-D score matrix, returned transposed.

    Input:
        x: array of shape (rows, classes); softmax is taken over axis 1
    Output:
        s: array of shape (classes, rows) whose columns each sum to 1
    """
    # Subtracting the row maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    # keepdims=True keeps the row sums as a column so the division broadcasts
    # per row; without it the shapes do not line up for non-square input.
    row_sums = np.sum(exp_scores, axis=1, keepdims=True)
    return (exp_scores / row_sums).T

def forward_prop(x, W1, W2, b1, b2):
    '''
    Run one forward pass through the two-layer network.

    Inputs:
        x: average one hot vector for the context, one example per column
        W1, W2, b1, b2:  matrices and biases to be learned
     Outputs:
        z:  output score vector, transposed so each row is one example
        h:  output of first hidden layer (after ReLU), transposed so each
            row is one example
    '''

    h = np.dot(W1, x) + b1
    # ReLU activation: keep positive values and replace negative ones with 0.
    # np.maximum works element-wise; the builtin max() raises on arrays.
    h = np.maximum(0, h)
    z = np.dot(W2, h) + b2

    return z.T, h.T

# Cost Function
try:
    from scipy.misc import logsumexp
except ImportError:
    from scipy.special import logsumexp
def compute_cost(z, C, y, yhat, batch_size):
    """Compute the batch cost from scores, targets and predictions.

    Inputs:
        z: score array; log-sum-exp is taken over its last axis
        C: scalar multiplier applied (as 2*C) to the log-sum-exp term
        y: target array, same shape as ``yhat``
        yhat: predicted probabilities
        batch_size: divisor applied to the summed cost
    Output:
        cost: (-sum(y * log(yhat)) + sum(2 * C * logsumexp(z))) / batch_size
    """
    log_partition = logsumexp(z, axis=-1, keepdims=True)
    cross_entropy_total = -np.sum(y * np.log(yhat))
    partition_total = np.sum(2.0 * C * log_partition)
    return (cross_entropy_total + partition_total) / batch_size

In [None]:
def back_prop(x, z, y, h, W1, W2, b1, b2, batch_size, m):
    '''
    Compute the parameter gradients for one batch.

    Inputs:
        x:  average one hot vector for the context, one example per column
        z:  prediction array with the same shape as y
        y:  target array
        h:  hidden-layer output, used as the right operand of
            np.dot(z - y, h)
        W1, W2, b1, b2:  matrices and biases (only W2 is read)
        batch_size: batch size
        m:  number of context words
     Outputs:
        grad_W1, grad_W2, grad_b1, grad_b2:  gradients of matrices and biases,
        each scaled by 1 / (batch_size * m)
    '''
    scale = 1.0 / (batch_size * m)
    output_error = z - y

    # Error propagated back through W2, with negative entries set to zero.
    hidden_error = np.dot(W2.T, output_error)
    hidden_error = np.where(hidden_error < 0, 0.0, hidden_error)

    grad_W1 = scale * np.dot(hidden_error, x.T)
    grad_W2 = scale * np.dot(output_error, h)
    grad_b1 = scale * np.sum(hidden_error, axis=1, keepdims=True)
    grad_b2 = scale * np.sum(output_error, axis=1, keepdims=True)

    return grad_W1, grad_W2, grad_b1, grad_b2

In [None]:
def gradient_descent(data, word2Ind, C, N, V, num_iters, alpha=0.03,
                     batch_size=128):
    '''
    Train the model with mini-batch gradient descent.

      Inputs:
        data:      text (list of tokens)
        word2Ind:  words to Indices
        C:         context window
        N:         dimension of hidden vector
        V:         dimension of vocabulary
        num_iters: number of iterations; if it is <= 0 the freshly
                   initialised parameters are returned untrained
        alpha:     learning rate, multiplied by 0.66 after every 100
                   iterations
        batch_size: number of examples per mini-batch
     Outputs:
        W1, W2, b1, b2:  updated matrices and biases

    The cost of every iteration is printed.
    '''
    W1, W2, b1, b2 = initialize_model(N, V)  # W1=(N,V) and W2=(V,N)
    # The batch generator is endless and the loop below only stops when the
    # iteration counter equals num_iters, so a non-positive num_iters would
    # never terminate. Return early instead.
    if num_iters <= 0:
        return W1, W2, b1, b2

    m = 2 * C
    iters = 0
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        z, h = forward_prop(x, W1, W2, b1, b2)
        yhat = softmax(z)
        cost = compute_cost(z, C, y, yhat, batch_size)
        print('iters', iters + 1 , '   cost',cost)
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(
            x, yhat, y, h, W1, W2, b1, b2, batch_size, m)
        W1 = W1 - alpha * grad_W1
        W2 = W2 - alpha * grad_W2
        b1 = b1 - alpha * grad_b1
        b2 = b2 - alpha * grad_b2
        iters += 1
        if iters == num_iters:
            break
        if iters % 100 == 0:
            alpha *= 0.66

    return W1, W2, b1, b2

In [None]:
# Pick a handful of words and collect their learned embeddings for plotting.
# W1 and W2 are expected to already exist at module level when this runs.
from matplotlib import pyplot
# IPython magic (not plain Python): sets the inline figure format to SVG.
%config InlineBackend.figure_format = 'svg'
words = ['king', 'queen','lord','man', 'woman','dog','horse',
         'rich','happy','sad']

# Average W1 (transposed) and W2 element-wise to get one embedding per row.
embs = (W1.T + W2)/2.0
 
# Look up the row index of each chosen word and gather those rows into X.
idx = [word2Ind[word] for word in words]
X = embs[idx, :]
print(X.shape, idx)  # X.shape: (number of words, embedding dimension)

In [None]:
# Reduce the selected embeddings to two principal components and draw each
# word as a labelled point (component 0 on the x-axis, component 1 on the
# y-axis).
result = compute_pca(X, 2)
xs, ys = result[:, 0], result[:, 1]
pyplot.scatter(xs, ys)
for word, px, py in zip(words, xs, ys):
    pyplot.annotate(word, xy=(px, py))
pyplot.show()

In [None]:
# Reduce the selected embeddings to four principal components and plot the
# component at index 3 (x-axis) against the component at index 1 (y-axis),
# labelling every point with its word.
result = compute_pca(X, 4)
xs, ys = result[:, 3], result[:, 1]
pyplot.scatter(xs, ys)
for word, px, py in zip(words, xs, ys):
    pyplot.annotate(word, xy=(px, py))
pyplot.show()