In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
docs = ['this is a test ', 'this is also a test', 'why and I still testing this', 'ok now the test is over']


class SkipGram:
    def __init__(self, max_len=5, tokenizer=None):
        self.max_len = max_len
        self.input_len = 2
        self.tokenizer = tokenizer
        self.words = None
        self.probs = None
        self.n_words = None
        if tokenizer:
            self._setup_words()
        pass

    def _setup_words(self):
        self.n_words = sum(self.tokenizer.word_counts.values())
        self.words = np.array(list(self.tokenizer.word_counts.keys()))
        self.probs = np.array(list(self.tokenizer.word_counts.values()))/self.n_words

    def fit(self, docs):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(docs)
        self._setup_words()

    def _text_to_padded_sequences(self, docs):
        seqs = self.tokenizer.texts_to_sequences(docs)
        seqs_padded = pad_sequences(seqs, maxlen=self.max_len, padding='post', truncating='post')
        return seqs_padded

    def _random_choice_int(self, v):
        new_word = np.random.choice(self.words, 1, p=self.probs)[0]
        output = self.tokenizer.word_index[new_word]
        if v == output:
            new_word = np.random.choice(self.words, 1, p=self.probs)[0]
            output = self.tokenizer.word_index[new_word]
        return output

    def _missing_word_gen(self, docs):
        seqs = self._text_to_padded_sequences(docs)
        for seq in seqs:
            seq = np.array(seq)
            if sum(seq) != 0:
                for i in range(self.input_len, self.max_len):
                    new_seq = np.delete(seq, i)
                    x = np.array([new_seq, new_seq])
                    word_index = seq[i]
                    new_word_index = self._random_choice_int(word_index)
                    context = np.array([[word_index], [new_word_index]])
                    label = np.array([[1], [0]])
                    yield x, context, label

    def array_to_texts(self, x):
        return list(s.tokenizer.sequences_to_texts_generator(x))

    def transform(self, docs):
        array_list = list(self._missing_word_gen(docs))
        x = np.vstack([v[0] for v in array_list])
        context = np.vstack([v[1] for v in array_list])
        labels = np.vstack([v[2] for v in array_list]).flatten()
        return x, context, labels

docs = ['this is a test', 'this is still a test', 'why am I still testing', '']
s = SkipGram()
s.fit(docs)
x, context, labels = s.transform(docs)
list(zip(s.array_to_texts(x), s.array_to_texts(context), labels ))

[('this is test', 'a', 1),
 ('this is test', 'why', 0),
 ('this is a', 'test', 1),
 ('this is a', 'why', 0),
 ('this is a test', '', 1),
 ('this is a test', 'a', 0),
 ('this is a test', 'still', 1),
 ('this is a test', 'test', 0),
 ('this is still test', 'a', 1),
 ('this is still test', 'test', 0),
 ('this is still a', 'test', 1),
 ('this is still a', 'still', 0),
 ('why am still testing', 'i', 1),
 ('why am still testing', 'a', 0),
 ('why am i testing', 'still', 1),
 ('why am i testing', 'still', 0),
 ('why am i still', 'testing', 1),
 ('why am i still', 'is', 0)]

In [21]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/kamillamagna/ICD-10-CSV/master/codes.csv', header=None)
df.head()
docs = df.iloc[:, 4]


In [53]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,A00,0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...","Cholera due to Vibrio cholerae 01, biovar chol...",Cholera
1,A00,1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor",Cholera
2,A00,9,A009,"Cholera, unspecified","Cholera, unspecified",Cholera
3,A010,0,A0100,"Typhoid fever, unspecified","Typhoid fever, unspecified",Typhoid fever
4,A010,1,A0101,Typhoid meningitis,Typhoid meningitis,Typhoid fever


In [23]:
max_len = 6
s = SkipGram(max_len=max_len)
s.fit(docs)
x, context, labels = s.transform(docs)

In [27]:
list(zip(s.array_to_texts(x[0:10,:]), s.array_to_texts(context[0:10,:]), labels[0:10] ))

[('cholera due vibrio cholerae 01', 'to', 1),
 ('cholera due vibrio cholerae 01', 'routine', 0),
 ('cholera due to cholerae 01', 'vibrio', 1),
 ('cholera due to cholerae 01', 'ulceration', 0),
 ('cholera due to vibrio 01', 'cholerae', 1),
 ('cholera due to vibrio 01', 'abuse', 0),
 ('cholera due to vibrio cholerae', '01', 1),
 ('cholera due to vibrio cholerae', 'pick', 0),
 ('cholera due vibrio cholerae 01', 'to', 1),
 ('cholera due vibrio cholerae 01', 'of', 0)]

In [24]:
x.shape, context.shape, labels.shape

((573632, 5), (573632, 1), (573632,))

In [34]:
len(s.tokenizer.word_index)


7122

In [51]:

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams

input_len = x.shape[1]
vocab_size = len(s.tokenizer.word_index)

dim_embedddings =10

# inputs
w_inputs = Input(shape=(input_len, ), dtype='int32')
w = Embedding(vocab_size, dim_embedddings)(w_inputs)

# context
c_inputs = Input(shape=(1, ), dtype='int32')
c  = Embedding(vocab_size, dim_embedddings)(c_inputs)
o = Dot(axes=2)([w, c])
o = Reshape((1,input_len), input_shape=(input_len, 1))(o)
o = Flatten()(o)
o = Dense(1)(o)
o = Activation('sigmoid')(o)

model = Model(inputs=[w_inputs, c_inputs], outputs=o)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
# fit the model


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 5)            0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, 5, 10)        71220       input_19[0][0]                   
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 1, 10)        71220       input_20[0][0]                   
__________________________________________________________________________________________________
dot_8 (Dot

In [52]:
model.fit([x, context], labels)

Epoch 1/1


<keras.callbacks.History at 0x1b1dd737a90>