In [79]:
import numpy as np
import matplotlib.pyplot as plt
import json
from nltk.tokenize import sent_tokenize, word_tokenize
import collections
import itertools
from sklearn.manifold import TSNE
import random

In [80]:
input_file = '../../reviews_Movies_and_TV.json'
# input_file = 'try.json'

# Load the data
sentences = []
counter = 0
with open(input_file, 'r') as f:
    for line in f:
        if counter > 50000:
            break
        # add each sentence as a list of words to the sentences list, but each line of the json object is a document containing multiple sentences
        # sentences.append(word_tokenize(json.loads(line)['reviewText']))
        doc_sentences = sent_tokenize(json.loads(line)['reviewText'])
        # sentences.append([word_tokenize(sentence) for sentence in doc_sentences])
        for sentence in doc_sentences:
            sentences.append([word.lower() for word in word_tokenize(sentence)])
        counter += 1
        

print('Number of sentences: {}'.format(len(sentences)))
print(sentences[0])

# for sentence in sentences:
#     print(sentence)

Number of sentences: 315157
['this', 'has', 'some', 'great', 'tips', 'as', 'always', 'and', 'is', 'helping', 'me', 'to', 'complete', 'my', 'good', 'eats', 'collection', '.']


In [81]:
# form the vocabulary
# Flatten the list of sentences into a single list of words
words = itertools.chain.from_iterable(sentences)

# Create a Counter object to count the frequency of each word
word_counter = collections.Counter(words)

# Extract the unique words from the Counter object to form the vocabulary
min_freq = 5
# vocabulary = set(word_counter.keys())
# vocabulary = set(word for word, count in word_counter.items() if count >= min_freq)
# add the word if it occurs more than min_freq times, else add <unk> token
vocabulary = set(word if count >= min_freq else '<unk>' for word, count in word_counter.items())

# add the <pad> token
vocabulary.add('<pad>')

# Print the size of the vocabulary
print('Vocabulary size: {}'.format(len(vocabulary)))

# Create a dictionary to map each word to an index
word2idx = {word: idx for idx, word in enumerate(vocabulary)}

# Create a dictionary to map each index to a word
idx2word = {idx: word for idx, word in enumerate(vocabulary)}

# print the 10 most common words
print('The 10 most common words are: ')
print(word_counter.most_common(10))

Vocabulary size: 27522
The 10 most common words are: 
[('the', 343600), (',', 278064), ('.', 272598), ('and', 181342), ('a', 160983), ('of', 154338), ('to', 138051), ('is', 124059), ('it', 108617), ('i', 103332)]


In [82]:
# prepare the data for training
window_size = 2
sliding_window_size = window_size*2 + 1
num_neg_samples_per_context = 3

vocab_indices = list(word2idx.values())
vocab_size = len(vocab_indices)

# create data with X being indices of the context words and the target word, and y being 0 or 1 based on whether the target word is correct for the context words
# also add negative samples
def create_data_with_negative_sampling(sentences, word2idx, window_size, num_neg_samples_per_context):
    X = []
    y = []
    # counter = 0
    for sentence in sentences:
        for i in range(len(sentence)):
            # a list of indices of context words and the target word
            # if it goes out of bounds, add <pad> tokens            
            context_words = sentence[max(0, i-window_size):i] + sentence[i+1:min(len(sentence), i+window_size+1)]
            target_word = sentence[i]
            # if the any of the words are not in the vocabulary, replace it with <unk>
            context_words = [word if word in word2idx else '<unk>' for word in context_words]
            target_word = target_word if target_word in word2idx else '<unk>'
            data_point = [word2idx[context_word] for context_word in context_words]
            # if the size of the data point is less than the sliding window size, add <pad> tokens
            # if len(data_point) < sliding_window_size:
            data_point += [word2idx['<pad>']]*(sliding_window_size-len(data_point)-1)
            data_point.append(word2idx[target_word])

            # add this to X and y
            X.append(data_point)
            y.append(1)

            # add negative samples
            for _ in range(num_neg_samples_per_context):
                # generate a random index between 0 and vocab_size
                negative_word = random.randint(0, vocab_size+1)
                X.append(data_point[:-1] + [negative_word])                
                y.append(0)
        # counter += 1
        # print(counter)
    return X, y    

    
X, y = create_data_with_negative_sampling(sentences, word2idx, window_size, num_neg_samples_per_context)



In [83]:

X = np.array(X)
y = np.array(y)

# save the data to a file so that it can be loaded later
# np.savez('data.npz', X=X, y=y)

# load the data from the file
def load_data(filename):
    data = np.load(filename)
    X = data['X']
    y = data['y']
    return X, y  

In [84]:
print('Number of data points: {}'.format(len(X)))
print('Number of labels: {}'.format(len(y)))

# print(vocab_indices)
print('index of <unk> is: {}'.format(word2idx['<unk>']))
print('index of <pad> is: {}'.format(word2idx['<pad>']))

for i in range (50):
    print('{}   {}'.format(X[i], y[i]))

Number of data points: 27932812
Number of labels: 27932812
index of <unk> is: 15693
index of <pad> is: 18046
[17492  5566 18046 18046 24029]   1
[17492  5566 18046 18046  3402]   0
[17492  5566 18046 18046 18963]   0
[17492  5566 18046 18046 24333]   0
[24029  5566 19275 18046 17492]   1
[24029  5566 19275 18046 18987]   0
[24029  5566 19275 18046  6081]   0
[24029  5566 19275 18046 22028]   0
[24029 17492 19275 14370  5566]   1
[24029 17492 19275 14370 13480]   0
[24029 17492 19275 14370 22244]   0
[24029 17492 19275 14370  2323]   0
[17492  5566 14370 16306 19275]   1
[17492  5566 14370 16306 13318]   0
[17492  5566 14370 16306 21800]   0
[17492  5566 14370 16306 24455]   0
[ 5566 19275 16306 22765 14370]   1
[ 5566 19275 16306 22765 11729]   0
[ 5566 19275 16306 22765 19231]   0
[ 5566 19275 16306 22765 25433]   0
[19275 14370 22765  7565 16306]   1
[19275 14370 22765  7565  6089]   0
[19275 14370 22765  7565 18565]   0
[19275 14370 22765  7565 15216]   0
[14370 16306  7565 10043 22

In [85]:
# cbow with negative sampling
# hyperparameters
embedding_size = 100
epochs = 100
learning_rate = 0.01
batch_size = 64

# initialize the weights
# embedding matrix
embeddings = np.random.uniform(-1, 1, (len(vocabulary), embedding_size))

# use the same embedding matrix for both context and target


# sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))