In [32]:
from nltk import tokenize
import math
import numpy as np
import collections

In [18]:
import pandas as pd
data = pd.read_csv('slate.csv')
 
corpus = data['text'].tolist()

In [19]:

 
data["tokens"]= [tokenize.word_tokenize(x) for x in corpus]
 
words = [y for x in data["tokens"] for y in x]
 
word_freq = collections.Counter(words)
 
print(word_freq.most_common(30))

[('the', 266007), ('of', 115973), ('-', 114156), ('to', 107951), ('a', 100993), ('and', 96375), ('in', 74561), ('that', 64448), ('is', 51590), ('it', 38175), ('for', 38016), ('not', 35118), ('on', 30197), ('as', 28392), ('with', 26804), ('he', 23882), ('i', 23831), ('are', 23022), ('but', 22382), ('by', 22304), ('was', 22019), ('his', 21901), (':', 21822), ('have', 19955), ('be', 19609), ('this', 18631), ('an', 18128), ('you', 17305), ('at', 16468), ('from', 15870)]


In [20]:
class Word:
    def __init__(self, word):
        self.word = word
        self.count = 0

In [95]:

vocabulary = {}
word_count = 0

for line in data['tokens']:
    for token in line:
        if token not in vocabulary:
            vocabulary[token]=Word(token)
        vocabulary[token].count += 1
        word_count += 1

        if word_count % 1000000 == 0:
            print("\processed %d words" % word_count)

    word_count += 2

print('Total words in corpus: %d' % word_count)
print('Vocabulary size: %d' % len(vocabulary))
    

\processed 1000000 words
\processed 2000000 words
\processed 3000000 words
\processed 4000000 words
Total words in corpus: 4868432
Vocabulary size: 73651


In [110]:
dim = 100
win = 10
start_alpha = 0.05
neg = 10
min_count = 5

In [99]:
# truncate dictionary and map rare words to <unk> token
truncated = []
truncated.append(Word('<unk>'))
unk_hash = 0

count_unk = 0
for k, token in vocabulary.items():
    if token.count < min_count:
        count_unk += 1
        truncated[unk_hash].count += token.count
    else:
        truncated.append(token)

truncated.sort(key=lambda token : token.count, reverse=True)

id2token = {}
for i, token in enumerate(truncated):
    id2token[token.word] = i

vocabulary_items = truncated
id2token = id2token
vocab_size = len(id2token)
print('Unknown vocab size:', count_unk)
print('Truncated vocab size: %d' % vocab_size)

Unknown vocab size: 42155
Truncated vocab size: 31497


In [85]:
# Create table of probabilities for negative sampling

exponent = 0.75
normlization_factor = sum([math.pow(t.count, exponent) for t in vocabulary]) # Normalizing constant

table_size = int(1e8) # Length of the unigram table
table = np.zeros(table_size, dtype=np.int)

p = 0 # Cumulative probability
i = 0
for j, unigram in enumerate(vocabulary):
    p += float(math.pow(unigram.count, exponent))/normlization_factor
    while i < table_size and float(i) / table_size < p:
        table[i] = j
        i += 1
        
def sample(table,count):
    indices = np.random.randint(low=0, high=len(table), size=count)
    return [table[i] for i in indices]

In [101]:
import struct

# Sigmoid Function
def sigmoid(z):
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))
    
# Init syn0 with uniform distribution on the interval [-0.5, 0.5]/dim
tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
syn0 = np.ctypeslib.as_ctypes(tmp)
syn0 = np.array(syn0)

tmp = np.zeros(shape=(vocab_size, dim))
syn1 = np.ctypeslib.as_ctypes(tmp)
syn1 = np.array(syn1)

In [114]:
current_sent = 0
truncated_vocabulary = [x.word for x in vocabulary_items]
corpus = data['tokens'].tolist()

while current_sent < data.count()[0]:
    line = corpus[current_sent]
    sent = [vocab_hash[token] if token in truncated_vocabulary else vocab_hash['<unk>'] 
            for token in line]

    for sent_pos, token in enumerate(sent):
        
        current_win = np.random.randint(low=1, high=win+1)
        context_start = max(sent_pos - current_win, 0)
        context_end = min(sent_pos + current_win + 1, len(sent))
        context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end]

        for context_word in context:
            embed = np.zeros(dim)
            classifiers = [(token, 1)] + [(target, 0) for target in table[np.random.randint(len(table), size=neg)]]
            for target, label in classifiers:
                z = np.dot(syn0[context_word], syn1[target])
                p = sigmoid(z)
                g = start_alpha * (label - p)
                embed += g * syn1[target] 
                syn1[target] += g * syn0[context_word] 
            syn0[context_word] += embed

        word_count += 1
    current_sent += 1

    if current_sent % 2000 == 0:
        print("\rReading sentence %d" % current_sent)


embedding = dict(zip(truncated_vocabulary,syn0))
print("Trained embeddings")

# Save embedding

fo = open("word2vec", 'w+')
for token, vector in zip(truncated_vocabulary, syn0):
    fo.write('%s ' % token)
    for s in vector:
        fo.write(('%f '% s))
    fo.write('\n')
fo.close()

print(syn0.shape)

KeyboardInterrupt: 

In [72]:
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity([embedding['king']],[embedding['queen']]))
print(cosine_similarity([embedding['man']],[embedding['kettle']]))

list