In [34]:
import pprint
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from keras.models import Sequential
from keras.layers import Dense
from gensim.models import Word2Vec

import re
# import gensim.models.keyedvectors as word2vec
from gensim.test.utils import datapath
from gensim.models import KeyedVectors


In [9]:
def text_preprocessing(
    text: str,
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_“~+''',
    stop_words=['and', 'a', 'is', 'the', 'in', 'be', 'will']
    )->list:
    """
    Turn one raw line of text into a list of cleaned, lower-case tokens.

    Steps, in order: delete punctuation characters, drop every word that
    contains a digit, collapse whitespace, lower-case, split on spaces and
    discard stop words.

    Parameters
    ----------
    text : str
        The raw line to clean.
    punctuations : str or iterable of single characters
        Characters deleted from ``text`` (they are removed, not replaced by a
        space, so ``"pretrain-finetune"`` becomes ``"pretrainfinetune"``).
    stop_words : container of str
        Lower-case words to drop from the result. Only read, never mutated.

    Returns
    -------
    list of str
        The remaining tokens in their original order; ``[]`` for empty input.
    """
    # Delete all punctuation in a single pass instead of one full
    # str.replace() scan per punctuation occurrence.
    strip_table = {ord(ch): None for ch in punctuations if len(ch) == 1}
    text = text.translate(strip_table)

    # Removing words that have numbers in them. This consumes every digit in
    # the text, so no separate "remove digits" pass is needed afterwards.
    text = re.sub(r'\w*\d\w*', '', text)

    # Cleaning the whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Setting every word to lower
    text = text.lower()

    # Splitting into tokens, dropping empty strings (empty input yields [''])
    # and stop words in one pass.
    return [tok for tok in text.split(' ') if tok and tok not in stop_words]

In [10]:
def preprocess(dataset: list)->list:
    """
    Clean every line of ``dataset`` with ``text_preprocessing``.

    Returns a new list holding, for each input line and in the same order,
    whatever ``text_preprocessing`` returned for that line.
    """
    return [text_preprocessing(line) for line in dataset]

In [13]:
# Read the corpus, one entry per line of the file. The context manager closes
# the handle as soon as the lines are in memory, even if reading fails.
with open('data/arxiv_3.txt', 'r') as file1:
    dataset = file1.readlines()

# One list of cleaned tokens per input line.
processed_dataset = preprocess(dataset)

In [20]:
# Show the cleaned corpus: preprocess() returned one token list per input line.
processed_dataset

[['xlnet',
  'generalized',
  'autoregressive',
  'pretraining',
  'language',
  'understanding'],
 ['modeling',
  'bidirectional',
  'contexts',
  'denoising',
  'autoencoding',
  'pretraining',
  'bert',
  'autoregressive',
  'language',
  'modeling'],
 ['masks', 'bert', 'pretrainfinetune', 'discrepancy'],
 ['xlnet',
  'generalized',
  'autoregressive',
  'pretraining',
  'learning',
  'bidirectional',
  'contexts',
  'bert',
  'autoregressive',
  'formulation'],
 ['xlnet', 'ransformerxl', 'autoregressive', 'model', 'into', 'pretraining'],
 ['xlnet',
  'outperforms',
  'bert',
  'question',
  'answering',
  'natural',
  'language',
  'inference',
  'sentiment',
  'analysis',
  'document',
  'ranking'],
 ['bert', 'transformer', 'stack', 'of', 'lstms'],
 ['roberta', 'robustly', 'optimized', 'bert', 'bert'],
 ['bert',
  'undertrained',
  'stateoftheart',
  'results',
  'glue',
  'race',
  'squad',
  'bert',
  'undertrained',
  'improved',
  'training',
  'bert',
  'models',
  'roberta',

In [19]:
# Raw string literal: the filter set contains a backslash, and in a non-raw
# literal '\,' is an invalid escape sequence (a warning on current Pythons).
# The resulting string value is unchanged.
tokenizer = Tokenizer(filters=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~+''')
# Build the word -> integer index from the already tokenised corpus ...
tokenizer.fit_on_texts(processed_dataset)
# ... and encode every token list as a list of those integer ids.
corpus = tokenizer.texts_to_sequences(processed_dataset)

In [22]:
# Vocabulary size: width of the one-hot vectors and of the softmax output.
# NOTE(review): the +1 presumably accounts for Keras' word_index starting at 1
# (index 0 unused) — confirm against the Tokenizer documentation.
V = len(tokenizer.word_index) + 1
# Width of the hidden layer, i.e. the dimension of the exported word vectors.
dim = 200
# Window size handed to skipgrams() when generating training pairs.
window_size = 5


In [41]:
# Sanity check: vocabulary size (number of indexed words + 1).
print(V)

773


In [23]:
# Skip-gram network: one-hot input of width V -> linear hidden layer of width
# `dim` -> softmax over the V vocabulary entries. The first layer's kernel
# (model.get_weights()[0]) is what later gets exported as the word vectors.
model = Sequential()
# `units` is the Keras 2 name for the layer width; the Keras 1 spelling
# `output_dim` is deprecated and triggers an "update your call" warning.
model.add(Dense(units=dim, input_dim=V))
# Inside a Sequential model the input width of later layers is inferred.
model.add(Dense(units=V, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               154800    
_________________________________________________________________
dense_2 (Dense)              (None, 773)               155373    
Total params: 310,173
Trainable params: 310,173
Non-trainable params: 0
_________________________________________________________________


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
def generate_data(corpus, window_size, V):
    """
    Yield one-hot training batches, one batch per sequence in ``corpus``.

    For every sequence the positive skip-gram pairs are generated (no negative
    samples, shuffled); sequences that produce no pairs are skipped. Each
    yielded item is a tuple ``(X, y)`` where both members are the
    ``to_categorical`` encoding, with ``V`` classes, of the first and second
    elements of the pairs respectively.
    """
    for sequence in corpus:
        pairs, _ = skipgrams(sequence, V, window_size, negative_samples=0, shuffle=True)
        if not pairs:
            # Nothing to train on for this sequence.
            continue
        first_ids, second_ids = zip(*pairs)
        yield np_utils.to_categorical(first_ids, V), np_utils.to_categorical(second_ids, V)

# Train for 10 passes over the corpus. Every batch holds the skip-gram pairs of
# a single corpus entry (see generate_data), and `loss` is the SUM of the
# per-batch losses within the epoch, not their mean.
for epoch in range(10):
    loss = 0.
    for x, y in generate_data(corpus, window_size, V):
        loss += model.train_on_batch(x, y)

    # Epoch index followed by the accumulated loss for that epoch.
    print(epoch, loss)


0 1405.0363674163818
1 1324.953682899475
2 1292.7048888206482
3 1268.710914850235
4 1245.85511136055
5 1224.3833136558533
6 1204.7064981460571
7 1186.9092469215393
8 1170.7286331653595
9 1155.7856698036194


In [25]:
# Export the learned embeddings in the word2vec text format: a header line
# "<word count> <dimension>", then one "<word> <v1> ... <vN>" line per word.
# Fetch the weights first so a failure here does not leave a truncated file.
vectors = model.get_weights()[0]
# Write UTF-8 explicitly: gensim's load_word2vec_format decodes the file as
# UTF-8 by default, while open() would otherwise use the platform encoding.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # word_index holds V - 1 entries (row 0 of `vectors` has no word).
    f.write('{} {}\n'.format(V - 1, dim))
    for word, i in tokenizer.word_index.items():
        components = ' '.join(map(str, vectors[i, :]))
        f.write('{} {}\n'.format(word, components))

In [40]:
# Load the vectors exported to vectors.txt (text, not binary, format) and list
# the nearest neighbours of "bert" as returned by most_similar().
w2v = KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)
bert_neighbours = w2v.most_similar(positive=['bert'])
pprint.pprint(bert_neighbours)

[('bidirectional', 0.6121548414230347),
 ('inference', 0.5848471522331238),
 ('language', 0.5712134838104248),
 ('natural', 0.568121075630188),
 ('pretrained', 0.5610018968582153),
 ('pretraining', 0.5552810430526733),
 ('processing', 0.5532826781272888),
 ('autoregressive', 0.5525839328765869),
 ('nlp', 0.5367773771286011),
 ('ranking', 0.5365626215934753)]


In [None]:
# NOTE(review): datapath() comes from gensim.test.utils, so this presumably
# loads a fixture shipped with gensim's test suite rather than the vectors.txt
# written above; the cell was never executed — confirm it is still needed.
wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)  # C text format