In [2]:
# Collect 10 sentences as a corpus 
# Make a word vectors of this corpus

corpus = ['king is a strong man', 
          'queen is a wise woman', 
          'boy is a young man',
          'girl is a young woman',
          'prince is a young king',
          'princess is a young queen',
          'man is strong', 
          'woman is pretty',
          'prince is a boy will be king',
          'princess is a girl will be queen']

In [3]:
# Remove Stop word. The words which are more frequent in sentence are called stop word, i.e. is,a,the,will

def remove_stop_words(corpus):
    stop_words = ['is','a','will','be']
    results = []
    #print(corpus)
    for sentence in corpus:
        words_in_sentence = sentence.split(' ')
        for stp_w in stop_words:
            if stp_w in words_in_sentence:
                words_in_sentence.remove(stp_w)
        results.append(" ".join(words_in_sentence))
        
    return results


In [12]:
corpus = remove_stop_words(corpus)
corpus

# make a word list with unique value
word_list = []
for sentence in corpus:
    for word in sentence.split(' '):
        word_list.append(word)

word_list = set(word_list) 
#word_list

In [20]:
# Generation of data.

word2int = {}
for i,word in enumerate(word_list):
    word2int[word] = i
    
#word2int

# generate multidimentional array. it contains word of each sentence in a array.
sentences = []
for sentence in corpus:
    sentences.append(sentence.split())

# Findout neighbors of window size 2.
WINDOW_SIZE = 2
data = []
for sentence in sentences:    
    for idx, word in enumerate(sentence):
#         print(idx,word)
#         print(sentence[max(idx - WINDOW_SIZE, 0) : min(idx + WINDOW_SIZE, len(sentence)) + 1])
        for neighbor in sentence[max(idx - WINDOW_SIZE, 0) : min(idx + WINDOW_SIZE, len(sentence)) + 1] : 
            if neighbor != word:
                data.append([word, neighbor])
#data

In [21]:
#preparing Dataframe using panda
import pandas as pd

# for text in corpus:
#     print(text)

df = pd.DataFrame(data, columns = ['input', 'label'])
df

Unnamed: 0,input,label
0,king,strong
1,king,man
2,strong,king
3,strong,man
4,man,king
5,man,strong
6,queen,wise
7,queen,woman
8,wise,queen
9,wise,woman


In [28]:
# Gererate one hot encoding for every piece of data (input, label)
import tensorflow as tf
import numpy as np

ONE_HOT_ENCODE = len(word_list)

# function for generating one hot encoding
def one_hot_encoding(word_index):
    one_hot_encoding = np.zeros(ONE_HOT_ENCODE)
    one_hot_encoding[word_index] = 1 #place 1 for the specified word
    return one_hot_encoding

X = [] # input array
Y = [] # target array

for x,y in zip(df['input'],df['label']):
    X.append(one_hot_encoding(word2int[x]))
    Y.append(one_hot_encoding(word2int[y]))
    
# convert them to numpy arrays
X_train = np.asarray(X)
Y_train = np.asarray(Y)

# making placeholders for X_train and Y_train
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_ENCODE))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_ENCODE))

# word embedding will be 2 dimension for 2d visualization
EMBEDDING_DIM = 2 

# hidden layer: which represents word vector eventually
W1 = tf.Variable(tf.random_normal([ONE_HOT_ENCODE, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1])) #bias
hidden_layer = tf.add(tf.matmul(x,W1), b1)

# output layer
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_ENCODE]))
b2 = tf.Variable(tf.random_normal([1]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_layer, W2), b2))

# loss function: cross entropy
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=[1]))

# training operation
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)


In [29]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

iteration = 20000
for i in range(iteration):
    # input is X_train which is one hot encoded word
    # label is Y_train which is one hot encoded neighbor word
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    if i % 3000 == 0:
        print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

iteration 0 loss is :  3.51227
iteration 3000 loss is :  1.81209
iteration 6000 loss is :  1.77548
iteration 9000 loss is :  1.75915
iteration 12000 loss is :  1.7484
iteration 15000 loss is :  1.74068
iteration 18000 loss is :  1.73473


In [34]:
# Now the hidden layer (W1 + b1) is actually the word look up table
vectors = sess.run(W1 + b1)
print(vectors)

[[-1.36793506  1.22014272]
 [ 0.33782911  0.23423308]
 [-0.01701438  0.24115127]
 [ 0.69317311  0.83706433]
 [-0.90382135 -0.01469946]
 [ 1.28661215  2.06791425]
 [ 4.20249462  1.68647325]
 [-0.69227517  0.19573742]
 [ 3.42684937  4.19658613]
 [ 2.51872969  5.39394712]
 [-5.12695503  3.30944538]
 [-2.08761883  0.78458595]]


In [32]:
w2v_df = pd.DataFrame(vectors, columns = ['x1', 'x2'])
w2v_df['word'] = word_list
w2v_df = w2v_df[['word', 'x1', 'x2']]
w2v_df

Unnamed: 0,word,x1,x2
0,boy,-1.367935,1.220143
1,woman,0.337829,0.234233
2,young,-0.017014,0.241151
3,queen,0.693173,0.837064
4,man,-0.903821,-0.014699
5,girl,1.286612,2.067914
6,pretty,4.202495,1.686473
7,king,-0.692275,0.195737
8,wise,3.426849,4.196586
9,princess,2.51873,5.393947


In [None]:
#word vector in 2d chart. Graphically showing how words are similar to each other
