<a href="https://colab.research.google.com/github/nalika/NLP/blob/master/word_embedding_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embedding Example

In [0]:
"""
This is a implementation of Word2Vec using numpy. Uncomment the print functions to see Word2Vec in action! Also remember to change the number of epochs and set training_data to training_data[0] to avoid flooding your terminal. A Google Sheet implementation of Word2Vec is also available here - https://docs.google.com/spreadsheets/d/1mgf82Ue7MmQixMm2ZqnT1oWUucj6pEcd2wDs_JgHmco/edit?usp=sharing

Have fun learning!

Author: Derek Chia
Email: derek@derekchia.com
"""

import numpy as np
from collections import defaultdict

## Randomly initialise
"""getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
		[-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
		[-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
		[0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
		[0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
		[0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
		[0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
		[0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
		[0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]

getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
		[-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
		[-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
		[0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
		[-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
		[0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
		[-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
		[-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
		[-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
		[0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]
"""


## Randomly initialise
getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744],
		[-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685],
		[-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401],
		[0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879],
		[0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377],
		[0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088],
		[0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687],
		[0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344],
		[0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817]]

getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
		[-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
		[-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
        [0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
		[-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
		[0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
		[-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757]]


class word2vec():

	def __init__(self):
		self.n = settings['n']
		self.lr = settings['learning_rate']
		self.epochs = settings['epochs']
		self.window = settings['window_size']

	def generate_training_data(self, settings, corpus):
		# Find unique word counts using dictonary
		word_counts = defaultdict(int)
		for row in corpus:
			for word in row:
				word_counts[word] += 1
		#########################################################################################################################################################
		# print(word_counts)																																	#
		# # defaultdict(<class 'int'>, {'natural': 1, 'language': 1, 'processing': 1, 'and': 2, 'machine': 1, 'learning': 1, 'is': 1, 'fun': 1, 'exciting': 1})	#
		#########################################################################################################################################################

		## How many unique words in vocab? 9
		self.v_count = len(word_counts.keys())
		#########################
		# print(self.v_count)	#
		# 9						#
		#########################

		# Generate Lookup Dictionaries (vocab)
		self.words_list = list(word_counts.keys())
		#################################################################################################
		# print(self.words_list)																		#
		# ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'exciting']	#
		#################################################################################################
		
		# Generate word:index
		self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
		#############################################################################################################################
		# print(self.word_index)																									#
		# # {'natural': 0, 'language': 1, 'processing': 2, 'and': 3, 'machine': 4, 'learning': 5, 'is': 6, 'fun': 7, 'exciting': 8}	#
		#############################################################################################################################

		# Generate index:word
		self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
		#############################################################################################################################
		# print(self.index_word)																									#
		# {0: 'natural', 1: 'language', 2: 'processing', 3: 'and', 4: 'machine', 5: 'learning', 6: 'is', 7: 'fun', 8: 'exciting'}	#
		#############################################################################################################################

		training_data = []

		# Cycle through each sentence in corpus
		for sentence in corpus:
			sent_len = len(sentence)

			# Cycle through each word in sentence
			for i, word in enumerate(sentence):
				# Convert target word to one-hot
				w_target = self.word2onehot(sentence[i])

				# Cycle through context window
				w_context = []

				# Note: window_size 2 will have range of 5 values
				for j in range(i - self.window, i + self.window+1):
					# Criteria for context word 
					# 1. Target word cannot be context word (j != i)
					# 2. Index must be greater or equal than 0 (j >= 0) - if not list index out of range
					# 3. Index must be less or equal than length of sentence (j <= sent_len-1) - if not list index out of range 
					if j != i and j <= sent_len-1 and j >= 0:
						# Append the one-hot representation of word to w_context
						w_context.append(self.word2onehot(sentence[j]))
						# print(sentence[i], sentence[j]) 
						#########################
						# Example:				#
						# natural language		#
						# natural processing	#
						# language natural		#
						# language processing	#
						# language append 		#
						#########################
						
				# training_data contains a one-hot representation of the target word and context words
				#################################################################################################
				# Example:																						#
				# [Target] natural, [Context] language, [Context] processing									#
				# print(training_data)																			#
				# [[[1, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]]]]	#
				#################################################################################################
				training_data.append([w_target, w_context])

		return np.array(training_data)

	def word2onehot(self, word):
		# word_vec - initialise a blank vector
		word_vec = [0 for i in range(0, self.v_count)] # Alternative - np.zeros(self.v_count)
		#############################
		# print(word_vec)			#
		# [0, 0, 0, 0, 0, 0, 0, 0]	#
		#############################

		# Get ID of word from word_index
		word_index = self.word_index[word]

		# Change value from 0 to 1 according to ID of the word
		word_vec[word_index] = 1

		return word_vec

	def train(self, training_data):
		# Initialising weight matrices
		# np.random.uniform(HIGH, LOW, OUTPUT_SHAPE)
		# https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.random.uniform.html
		self.w1 = np.array(getW1)
		self.w2 = np.array(getW2)
		# self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
		# self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
		
		# Cycle through each epoch
		for i in range(self.epochs):
			# Intialise loss to 0
			self.loss = 0
			# Cycle through each training sample
			# w_t = vector for target word, w_c = vectors for context words
			for w_t, w_c in training_data:
				# Forward pass
				# 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
				y_pred, h, u = self.forward_pass(w_t)
				#########################################
				# print("Vector for target word:", w_t)	#
				# print("W1-before backprop", self.w1)	#
				# print("W2-before backprop", self.w2)	#
				#########################################

				# Calculate error
				# 1. For a target word, calculate difference between y_pred and each of the context words
				# 2. Sum up the differences using np.sum to give us the error for this particular target word
				EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
				#########################
				# print("Error", EI)	#
				#########################

				# Backpropagation
				# We use SGD to backpropagate errors - calculate loss on the output layer 
				self.backprop(EI, h, w_t)
				#########################################
				#print("W1-after backprop", self.w1)	#
				#print("W2-after backprop", self.w2)	#
				#########################################

				# Calculate loss
				# There are 2 parts to the loss function
				# Part 1: -ve sum of all the output +
				# Part 2: length of context words * log of sum for all elements (exponential-ed) in the output layer before softmax (u)
				# Note: word.index(1) returns the index in the context word vector with value 1
				# Note: u[word.index(1)] returns the value of the output layer before softmax
				self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
				
				#############################################################
				# Break if you want to see weights after first target word 	#
				# break 													#
				#############################################################
			print('Epoch:', i, "Loss:", self.loss)

	def forward_pass(self, x):
		# x is one-hot vector for target word, shape - 9x1
		# Run through first matrix (w1) to get hidden layer - 10x9 dot 9x1 gives us 10x1
		h = np.dot(x, self.w1)
		# Dot product hidden layer with second matrix (w2) - 9x10 dot 10x1 gives us 9x1
		u = np.dot(h, self.w2)
		# Run 1x9 through softmax to force each element to range of [0, 1] - 1x8
		y_c = self.softmax(u)
		return y_c, h, u

	def softmax(self, x):
		e_x = np.exp(x - np.max(x))
		return e_x / e_x.sum(axis=0)

	def backprop(self, e, h, x):
		# https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
		# Column vector EI represents row-wise sum of prediction errors across each context word for the current center word
		# Going backwards, we need to take derivative of E with respect of w2
		# h - shape 10x1, e - shape 9x1, dl_dw2 - shape 10x9
		# x - shape 9x1, w2 - 10x9, e.T - 9x1
		dl_dw2 = np.outer(h, e)
		dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
		########################################
		# print('Delta for w2', dl_dw2)			#
		# print('Hidden layer', h)				#
		# print('np.dot', np.dot(self.w2, e.T))	#
		# print('Delta for w1', dl_dw1)			#
		#########################################

		# Update weights
		self.w1 = self.w1 - (self.lr * dl_dw1)
		self.w2 = self.w2 - (self.lr * dl_dw2)

	# Get vector from word
	def word_vec(self, word):
		w_index = self.word_index[word]
		v_w = self.w1[w_index]
		return v_w

	# Input vector, returns nearest word(s)
	def vec_sim(self, word, top_n):
		v_w1 = self.word_vec(word)
		word_sim = {}

		for i in range(self.v_count):
			# Find the similary score for each word in vocab
			v_w2 = self.w1[i]
			theta_sum = np.dot(v_w1, v_w2)
			theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
			theta = theta_sum / theta_den

			word = self.index_word[i]
			word_sim[word] = theta

		words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

		for word, sim in words_sorted[:top_n]:
			print(word, sim)

#####################################################################

In [0]:
# importing modules 
import requests

from lxml import etree 

import bs4 as bs
import urllib.request
import re



scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scraped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text


In [0]:
# Removing Square Brackets and Extra Spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)

In [0]:
# Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

In [0]:
import nltk
nltk.download('punkt')

# Converting Text To Sentences
sentence_list = nltk.sent_tokenize(article_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
text = "natural language processing and machine learning is fun and exciting"

#text = article_text

# Note the .lower() as upper and lowercase does not matter in our implementation
# [['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']]
corpus = [[word.lower() for word in text.split()]]

In [0]:
len(corpus)

1

In [0]:
corpus

[['natural',
  'language',
  'processing',
  'and',
  'machine',
  'learning',
  'is',
  'fun',
  'and',
  'exciting']]

In [0]:
settings = {
	'window_size': 2,	# context window +- center word
	'n': 7,		# dimensions of word embeddings, also refer to size of hidden layer
	'epochs': 50,		# number of training epochs
	'learning_rate': 0.01	# learning rate
}

In [0]:
# Initialise object
w2v = word2vec()
# Numpy ndarray with one-hot representation for [target_word, context_words]
# target_word is the input 
# context_word is the output
training_data = w2v.generate_training_data(settings, corpus)

In [0]:
np.asanyarray(training_data[0][0]).shape

(4053,)

In [0]:
training_data[0][1:]

array([list([[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]])],
      dtype=object)

In [0]:
np.asanyarray(training_data[0][1:]).shape

(1,)

In [0]:
# Training
w2v.train(training_data)



Epoch: 0 Loss: 81.26139545860659
Epoch: 1 Loss: 80.16189979840644
Epoch: 2 Loss: 79.1560195789214
Epoch: 3 Loss: 78.23076176494702
Epoch: 4 Loss: 77.37568808932373
Epoch: 5 Loss: 76.58227832114267
Epoch: 6 Loss: 75.84347845900334
Epoch: 7 Loss: 75.15337394201013
Epoch: 8 Loss: 74.5069490142869
Epoch: 9 Loss: 73.89990657372726
Epoch: 10 Loss: 73.32853124268237
Epoch: 11 Loss: 72.78958384483869
Epoch: 12 Loss: 72.2802190609857
Epoch: 13 Loss: 71.79792044020037
Epoch: 14 Loss: 71.34044857955139
Epoch: 15 Loss: 70.90579941726183
Epoch: 16 Loss: 70.4921703789483
Epoch: 17 Loss: 70.09793268266111
Epoch: 18 Loss: 69.72160851735995
Epoch: 19 Loss: 69.36185210875492
Epoch: 20 Loss: 69.01743390828315
Epoch: 21 Loss: 68.68722730739334
Epoch: 22 Loss: 68.37019740554226
Epoch: 23 Loss: 68.0653914570859
Epoch: 24 Loss: 67.77193069718068
Epoch: 25 Loss: 67.48900330536094
Epoch: 26 Loss: 67.21585831159366
Epoch: 27 Loss: 66.95180028623874
Epoch: 28 Loss: 66.69618468461677
Epoch: 29 Loss: 66.4484137404

In [0]:
# Get vector for word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)



machine [ 0.81745182 -0.97754696  0.45881187  0.20378167 -0.47598899 -0.73740533
  0.4403721 ]


In [0]:
# Find similar words
w2v.vec_sim("machine", 5)

machine 1.0
fun 0.864375673602599
is 0.7280950417829436
natural 0.5093532632621883
and 0.49518851046012796


Credits to https://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/


---



Preparing the text data

In [0]:
url = 'http://mattmahoney.net/dc/'

In [0]:
import os
import urllib

def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

In [0]:
# Read the data into a list of strings.
import zipfile
import tensorflow as tf

def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split() 
        # Note: 
        # 1. TensorFlow function as_str which ensures that the text is created as a string data-type.
        # 2. split() - to create a list with all the words in the text file, separated by white-space characters. 
    return data

In [0]:
filename = maybe_download('text8.zip', url, 31344016)

Found and verified text8.zip


In [0]:
vocabulary = read_data(filename)
print(vocabulary[:7])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


As you can observe, the returned vocabulary data contains a list of plain English words, ordered as they are in the sentences of the original extracted text file.  Now that we have all the words extracted in a list, we have to do some further processing to enable us to create our skip-gram batch data.  These further steps are:

1.    Extract the top 10,000 most common words to include in our embedding vector
2.  Gather together all the unique words and index them with a unique integer value – this is what is required to create an equivalent one-hot type input for the word.  We’ll use a dictionary to do this
3. Loop through every word in the dataset (vocabulary variable) and assign it to the unique integer word identified, created in Step 2 above.  This will allow easy lookup / processing of the word data stream



In [0]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    """setting up a “counter” list, which will store the number of times a word 
    is found within the data-set.  Because we are restricting our vocabulary to 
    only 10,000 words, any words not within the top 10,000 most common words 
    will be marked with an “UNK” designation, standing for “unknown”. """
    dictionary = dict() # creates a dictionary
    for word, _ in count:
        dictionary[word] = len(dictionary)
    """ value assigned to each unique word key is simply an increasing integer 
    count of the size of the dictionary.  So, for instance, the most common word 
    will receive the value 1, the second most common the value 2, the third most 
    common word the value 3, and so on (the integer 0 is assigned to the ‘UNK’ 
    words).   This step creates a unique integer value for each word within the
    vocabulary """    
        
    data = list() # list called data 
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    """ A list called data is created, which will be the same length as words 
    but instead of being a list of individual words, it will instead be a list 
    of integers – with each word now being represented by the unique integer 
    that was assigned to this word in dictionary.  So, for the first sentence of
    our data-set [‘anarchism’, ‘originated’, ‘as’, ‘a’, ‘term’, ‘of’, ‘abuse’], 
    now looks like this in the data variable: [5242, 3083, 12, 6, 195, 2, 3136]"""
        
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    """creates a dictionary called reverse_dictionary that allows us to look up 
    a word based on its unique integer identifier, rather than looking up the 
    identifier based on the word i.e. the original dictionary.  """
    
    return data, count, dictionary, reversed_dictionary

In [0]:
data_index = 0
# generate batch data
def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    context = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window input_word skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # input word at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # this is the input word
            context[i * num_skips + j, 0] = buffer[target]  # these are the context words
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, context

In [0]:
import collections
buffer = collections.deque(maxlen=span)
for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

In [0]:
for i in range(batch_size // num_skips):
    target = skip_window  # input word at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
        while target in targets_to_avoid:
            target = random.randint(0, span - 1)
        targets_to_avoid.append(target)
        batch[i * num_skips + j] = buffer[skip_window]  # this is the input word
        context[i * num_skips + j, 0] = buffer[target]  # these are the context words
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

In [0]:
import numpy as np

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [0]:
vocabulary_size=7
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a context.

In [0]:
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [0]:
# Look up embeddings for inputs.
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [0]:
import math
# Construct the variables for the softmax
weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                          stddev=1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([vocabulary_size]))
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

In [0]:
# convert train_context to a one-hot format
train_one_hot = tf.one_hot(train_context, vocabulary_size)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, 
    labels=train_one_hot))
# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

Setting up our data is now to create a data set comprising of our input words and associated grams, which can be used to train our Word2Vec embedding system. 

### Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset
https://github.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/blob/master/README.md

In [0]:
import pandas as pd

Data from URL

In [0]:
URL_Tr = 'https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv'
#URL_Te = 'https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/test.tsv'

In [0]:
# load test data
train = pd.read_csv(URL_Tr, sep='\t')
#test = pd.read_csv(URL_Te, sep='\t')

Step 2: Analysing data: Dataframe analysis

In [0]:
#train.head()

In [0]:
#test.head()

In [0]:
# Checking on the dimension
print(train.shape, "\n" )

(156060, 4) 



**NOTE**: Notice the dimension of the train and test data

In [0]:
# Null entry checking
print ("\t",train.isnull().values.any(), "\n\t" )

	 False 
	


In [0]:
# Number of unique sentences in the training / testing dataset
print (len(train.groupby('SentenceId').nunique()))

8529


Let's create a dataset with only full sentences. Exploring data this way will gives us cleaner graphs that aren't biased toward longer sentences. We can also add a label for the sentiment value to increased readability.

In [0]:
#Create df of full sentences
fullSent = train.loc[train.groupby('SentenceId')['PhraseId'].idxmin()]

#Change sentiment to increase readability
fullSent['sentiment_label'] = ''
Sentiment_Label = ['Negative', 'Somewhat Negative', 
                  'Neutral', 'Somewhat Positive', 'Positive']
for sent, label in enumerate(Sentiment_Label):
    fullSent.loc[train.Sentiment == sent, 'sentiment_label'] = label
    
fullSent.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,sentiment_label
0,1,1,A series of escapades demonstrating the adage ...,1,Somewhat Negative
63,64,2,"This quiet , introspective and entertaining in...",4,Positive
81,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,Somewhat Negative
116,117,4,A positively thrilling combination of ethnogra...,3,Somewhat Positive
156,157,5,Aggressive self-glorification and a manipulati...,1,Somewhat Negative


**Step 2**: Data Preprocessing

In [0]:
## Helper Functions

from nltk.tokenize import TreebankWordTokenizer

from nltk.stem import WordNetLemmatizer

from nltk.stem import PorterStemmer

treebank_tokenizer = TreebankWordTokenizer() # We use the treebank tokenizer.

lem = WordNetLemmatizer() # lemmatization.

stemmer = PorterStemmer() # Stemming.

def tokenize(x):
    return ' '.join(treebank_tokenizer.tokenize(x))

def lemmatize(x):
    return ' '.join([lem.lemmatize(s) for s in x.split(' ')])

def stem(x):
    return ' '.join([stemmer.stem(s) for s in x.split(' ')])

In [0]:
## All labled data
#sentences = fullSent['Phrase'] # our text data
#sentiment = fullSent['Sentiment']


## Extended labeld data
sentences = np.array(train['Phrase'])
sentiment = np.array(train['Sentiment'])

In [0]:
# Tokenizer (object that will split a sentence to a set of tokens )
from keras.preprocessing.text import Tokenizer

# Creating a tokenizer
tokenizer = Tokenizer(lower=True)

# Building word indices
tokenizer.fit_on_texts(sentences)

In [0]:
sentences[0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [0]:
sentences[2]

'A series'

In [0]:
## Tokenizing sentences
X = tokenizer.texts_to_sequences(sentences)

print(len(X[0]), len(X[100]), len(X[200]), len(X[1000]), len(X[8529-1]))


35 9 7 2 1


---
**Problem**: Variable token lenths

---
**Solution:** Zero padding



---



In [0]:
# Creating a reverse dictionary
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 
tokens = list(map(sequence_to_text, X))


In [0]:
tokens[0][0:10]

['a',
 'series',
 'of',
 'escapades',
 'demonstrating',
 'the',
 'adage',
 'that',
 'what',
 'is']

In [0]:
tokens[2]

['a', 'series']

In [0]:
# Sequences padding.
from keras.preprocessing.sequence import TimeseriesGenerator , pad_sequences

X = pad_sequences(X, padding='post') #Returns x: Numpy array with shape (len(sequences), maxlen)

In [0]:
X.shape

(156060, 49)

In [0]:
X[0]

array([    2,   323,     3, 14150,  6028,     1,  6586,     9,    52,
           8,    46,    13,     1,  2976,     8,   177,    46,    13,
           1, 10913,    65,     3,    78,   668, 10117,    19,   576,
           3,    78,  2123,     5,    57,     3,     2,    42,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)

In [0]:
# Vocabulary Size.
vocab_size = len(tokenizer.word_index) + 1


print("Vocab size : ", vocab_size)
print("X's shape : " , X.shape)

Vocab size :  15289
X's shape :  (156060, 49)


In [0]:
# Construct the target.
Y = sentiment
print("Target size : ",Y.shape)

Target size :  (156060,)


### Step 3: Train/Test Splitting

In [0]:
# build train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                    test_size=0.2, 
                                                    random_state=4)

In [0]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((124848, 49), (124848,), (31212, 49), (31212,))

In [0]:
# one hot encode
from keras.utils import to_categorical
Y_train = to_categorical(Y_train)
Y_test = to_categorical(Y_test)

print("Y_train's shape : ", Y_train.shape)
print("Y_test's shape : " , Y_test.shape)

Y_train's shape :  (124848, 5)
Y_test's shape :  (31212, 5)


### **Step 4**: Model Building and Traning

In [0]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten 
from keras.layers import Activation, Conv1D, GlobalMaxPooling1D
from keras import optimizers

# Define model
def baseline_cnn_model(X , Y , embed_dim , vocab_size, epochs = 10, 
                       batch_size = 32, compiler='SGD'):
    
    # training and testing set :
    length  = X.shape[0] 
    target_shape = Y.shape[1]
    
    # create model
    model = Sequential()
    
    #Embeding layer
    model.add(Embedding(vocab_size, embed_dim,input_length = X.shape[1]))    
    
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(target_shape, activation='softmax'))
    model.summary()  
    
    
    # compile the model
    model.compile(optimizer=compiler, loss='categorical_crossentropy', 
                  metrics=['acc',f1_m,precision_m, recall_m])


    return model


In [0]:
## Helper Functions
from keras import backend as K
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [0]:
# Training Hyper Parameters
lr = 1e-3
batch_size = 128
num_epochs = 1000
decay=1e-4
embed_dim = 32


adm = optimizers.Adam(lr = lr , decay = decay)
sgd = optimizers.SGD(lr = lr , nesterov=True, momentum=0.7, decay=decay) # schedule_decay=0.0004)
Nadam = optimizers.Nadam(lr = lr , beta_1=0.9, beta_2=0.999, epsilon=1e-08)#, schedule_decay=0.0004)
    

model = baseline_cnn_model(X_train, Y_train, embed_dim = embed_dim, 
                           vocab_size = vocab_size, 
                           epochs = num_epochs , batch_size = batch_size, 
                           compiler = Nadam)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 49, 32)            486848    
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 47, 64)            6208      
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 23, 64)            0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 21, 128)           24704     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 10, 128)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 1280)              0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 1280)              0         
__________

In [0]:
# fit model
model.fit(X_train, Y_train, batch_size=batch_size,
          epochs=num_epochs,verbose=1,validation_split=0.2)

Train on 99878 samples, validate on 24970 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 6

<keras.callbacks.History at 0x7fc09c8d9198>

In [0]:
# test_tfidf
test_tfidf_mat = test_tfidf.toarray()

# reshape from [samples, sequence] into [samples, sequence, features]
fea_vec_dim = test_tfidf_mat.shape[1]
n_class = 5
print(fea_vec_dim, n_class)

X_test = test_tfidf_mat.reshape((test_tfidf_mat.shape[0], test_tfidf_mat.shape[1], 1))
X_test.shape



3443 5


(1706, 3443, 1)

In [0]:
# evaluate the model
import numpy as np
def print_metrics(accuracy, f1_score, precision, recall):  
    print('SIMPLE CNN MODEL PERFORMANCE')
    print('Accuracy:', np.round(accuracy, 4))
    print('Precision:', np.round(precision, 4))
    print('Recall:', np.round(recall, 4))
    print('F1 Score:', np.round(f1_score, 4))
    print('\n')


loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, Y_test, verbose=0)
print_metrics(accuracy, f1_score, precision, recall)

SIMPLE CNN MODEL PERFORMANCE
Accuracy: 0.5709
Precision: 0.5994
Recall: 0.5368
F1 Score: 0.5658




# **Wrod Embedding:** 
 
*   Even if we already use an Embedding Layer to avoid sparsity and reduce dimensionality, a pre-trained word embedding can be very interesting since it already contains information from thousands of lines of various text.
*   We can see it as a type of transfer learning, which can be very useful since we don’t have much data.






In [0]:
def preprocess(x):
    tk = treebank_tokenizer.tokenize(x)
    tk = [s.lower() for s in tk]
    return tk

In [0]:
sentences = sentences.apply(preprocess)

In [0]:
def vectorize(sentences, B):
    
    lens = sentences.apply(lambda x:len(x))
    ebds = []
    tsteps = max(lens)
    
    X = np.zeros((len(sentences) , tsteps , B))
    
    for i in range(len(sentences)):
        
        words = sentences[i]
        k = tsteps -1
        
        for word in words[::-1] :
            try:
                X[i , k] = model[word]
                k = k - 1
            except:
                lens[i] = lens[i] - 1
                
    ntsteps = max(lens)
    
    return X[: , (tsteps - ntsteps): ]

In [0]:
sentences = sentences.reset_index(drop = True)
X = vectorize(sentences , 100)

In [0]:
X = vectorize(sentences , 100)
Y = sentiment

print("X's shape :" , X.shape)
print("Y's shape :" , Y.shape)

X's shape : (8529, 0, 100)
Y's shape : (8529,)
