In [109]:
import numpy as np
import pandas as pd
import re
import nltk
import keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# nltk.download()

In [126]:
def load_raw_data(path='./data/raw.csv'):
    """Read the raw CSV dataset into a 2-D NumPy array.

    The file is parsed without a header row (the first line is treated as
    data) and with the double quote as quoting character, so quoted fields
    may contain commas.

    Args:
        path: Location of the CSV file. Defaults to './data/raw.csv', so
            calling the function without arguments reads the same file as
            before the parameter existed.

    Returns:
        numpy.ndarray with one row per CSV record and one column per field.
    """
    return pd.read_csv(path, quotechar='"', header=None).to_numpy()


def create_label_map(raw_data):
    """Encode the label column of ``raw_data`` as consecutive integer ids.

    Ids are handed out in order of first appearance: the first distinct
    label becomes 0, the next new one 1, and so on.

    Args:
        raw_data: Sequence of rows (e.g. a 2-D array); the label is read
            from index 1 of every row.

    Returns:
        A ``(Y, y_map)`` tuple where ``Y`` is an integer NumPy array holding
        the id of each row's label and ``y_map`` maps each id back to the
        original label value.
    """
    label_to_id = {}
    encoded = []
    for row in raw_data:
        # setdefault only stores len(label_to_id) -- the next free id -- the
        # first time a label is seen; later occurrences return the stored id.
        encoded.append(label_to_id.setdefault(row[1], len(label_to_id)))

    y_map = {label_id: label for label, label_id in label_to_id.items()}

    # dtype=int keeps the result an integer array even when there are no rows.
    return (np.array(encoded, dtype=int), y_map)


In [127]:
# Read the whole dataset once. Later cells take column 0 as the input text
# (fed to the tokenizer) and column 1 as the label (see create_label_map).
raw_data = load_raw_data()

In [128]:
# Hyper-parameters shared with the model cell: tokenizer vocabulary cap and
# the width of each embedding vector.
vocab_size=10000
embedding_dim = 16

# Column 0 of the raw array is the text; the tokenizer vocabulary is fitted
# on every row (training and validation rows alike).
X = raw_data[:, 0]
tokenizer = Tokenizer(num_words=vocab_size, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(X)

# X is rebound twice here: raw strings -> lists of word ids -> a padded
# array in which every row has the same length.
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)

# Y holds one integer class id per row; y_map translates ids back to labels.
Y, y_map = create_label_map(raw_data)

# Despite its name, batch_size is the total number of samples, not a
# mini-batch size.
batch_size = len(X)
train_size = int(0.8 * batch_size)
val_size = batch_size - train_size

# Positional 80/20 split: the first train_size rows train, the rest validate.
# NOTE(review): rows are not shuffled first -- if raw.csv is ordered (e.g. by
# label) the validation set will not be representative; confirm the file order.
train_X = X[:train_size]
train_Y = Y[:train_size]
val_X = X[train_size:]
val_Y = Y[train_size:]


In [131]:
# Text classifier: embed every token, flatten the per-token vectors into one
# feature vector and score each class with a softmax output layer.
num_classes = len(y_map)  # one output unit per distinct label id in Y

model = keras.Sequential()

model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=len(X[0])))
model.add(keras.layers.Flatten())
# Classification head: without it the network would emit the raw flattened
# embeddings, which cannot be compared against one class id per sample.
model.add(keras.layers.Dense(num_classes, activation='softmax'))

# train_Y / val_Y contain integer class ids (0 .. num_classes - 1), which is
# the target format sparse_categorical_crossentropy expects; it also covers
# the two-class case.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_X, train_Y, epochs=3, validation_data=(val_X, val_Y))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [150]:
# Dump the trained embedding matrix as two aligned TSV files: vecs.tsv holds
# one tab-separated vector per line, meta.tsv the matching word per line.
e = model.layers[0]
w = e.get_weights()[0]

import io

index_to_word = {v:k for k, v in tokenizer.word_index.items()}

# Word ids start at 1 (row 0 of w belongs to no word), so the highest id is
# len(index_to_word). The matrix only has rows for ids below w.shape[0], so
# stop at whichever limit comes first.
last_word_num = min(len(index_to_word), w.shape[0] - 1)

# The with-block closes both files even if a lookup or write fails midway.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num + 1):
        word = index_to_word[word_num]
        embeddings = w[word_num]
        out_m.write(word + '\n')
        out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')

8487