In [1]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split

try:
  stop_words = nltk.corpus.stopwords.words("english")
except:
  nltk.download('stopwords')
  stop_words = nltk.corpus.stopwords.words("english")

import numpy as np
import tensorflow as tf
import torch
from transformers import ElectraConfig, ElectraTokenizerFast, TFElectraForPreTraining


INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [None]:
#  Get the files from google drive
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Get english train data file
file_id = '1m3Ax9Z8OHMU-7FqraKc-ddI3YQ7yY_Q6'  # file id on the Google Drive
downloaded = FileIO("en.trial.complete.json", 'w')
request = drive_service.files().get_media(fileId=file_id)
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  status, done = downloader.next_chunk()
  print("Download {}%.".format(int(status.progress() * 100)))

In [2]:
en_df = pd.read_json("trial-data_all/en.trial.complete.json")
en_df.head()

Unnamed: 0,id,word,pos,gloss,example,type,counts,f_rnk,concrete,polysemous,sgns,char,electra
0,en.trial.1,beautiful,adjective,Pleasant ; clear .,"It 's beautiful outside , let 's go for a walk .",synonym/antonym-based,124908,706,0,0,"[1.393769145, 0.7516670227000001, -2.581333160...","[0.295645088, 0.098426342, 0.0463486575, 0.016...","[0.0800914839, -0.1875839084, -0.0411579385000..."
1,en.trial.2,cocktail,noun,A mixture of other substances or things .,a cocktail of illegal drugs,hypernym-based,4187,13245,1,0,"[2.0872907639, 0.2617726326, 0.668431639700000...","[0.3878918886, 0.1971583217, -0.44026631120000...","[-1.4771454334, -0.4742421806, 0.0847439319, -..."
2,en.trial.3,institutionalized,adjective,Having been established as an institution .,It is very difficult to get bureaucracies to a...,paraphrastic,961,35934,0,0,"[0.7893871069, -0.43510755900000003, 0.8553860...","[-0.0519028902, 0.2257766128, -0.1839749813, 0...","[-1.1030955315, -0.9046602845, 0.1503403783, -..."
3,en.trial.4,menial,noun,"A servant , especially a domestic servant .","The world was awake to the 2nd of May , but Ma...",hypernym-based,517,53267,1,1,"[0.1222261563, 0.1572209597, 0.5396134257, -0....","[-0.3667449057, -0.1431699395, -0.0671329796, ...","[-1.6584062576, -0.24498166140000002, 0.150174..."
4,en.trial.5,seek,verb,To try to find ; to look for ; to search for .,"Not long ago , it was difficult to produce pho...",paraphrastic,25195,3212,0,0,"[1.1894155741, 1.3668279648000001, -1.61634504...","[0.6137102246, 0.5464909673, -0.0161557049, 9....","[-0.5474479198000001, -0.0880863219, 0.0784259..."


In [14]:
len(en_df.electra[0])

256

In [3]:
def clean(gloss):
  tokenizer = nltk.RegexpTokenizer(r"\w+")
  cleaned = tokenizer.tokenize(gloss)
  cleaned = list(set([word.lower() for word in cleaned]))
  # stop words should be replaced with x most frequent words
  cleaned = [word for word in cleaned if not word in stop_words]
  return cleaned

gloss_lists = en_df.gloss.apply(clean)
gloss_lists

0                                      [clear, pleasant]
1                          [things, substances, mixture]
2                             [established, institution]
3                        [servant, domestic, especially]
4                              [search, find, look, try]
                             ...                        
195                        [cells, animal, plant, color]
196                                [seeming, appearance]
197       [person, travel, permission, vehicle, proceed]
198                      [moving, much, sitting, around]
199    [part, achievement, forming, sentence, heraldi...
Name: gloss, Length: 200, dtype: object

In [4]:
voc = []
voc_1d = []
for i in range(len(gloss_lists)):
  for j in range(len(gloss_lists[i])):
    if not gloss_lists[i][j] in voc_1d and not gloss_lists[i][j] in en_df.word:
      voc_1d.append(gloss_lists[i][j])
  if not en_df.word[i] in voc_1d:
    voc_1d.append(en_df.word[i])
  line = gloss_lists[i][:]
  line.insert(0, en_df.word[i])
  voc.append(line)

voc_size = len(voc_1d)
print(voc[:5], "\n", voc_size)

[['beautiful', 'clear', 'pleasant'], ['cocktail', 'things', 'substances', 'mixture'], ['institutionalized', 'established', 'institution'], ['menial', 'servant', 'domestic', 'especially'], ['seek', 'search', 'find', 'look', 'try']] 
 941


In [91]:
filename = "vocab.txt"
with open(filename, "w") as f:
    for word in voc_1d:
        f.write(word)
        f.write('\n')
    for word in ['[UNK]\n', '[CLS]\n', '[SEP]\n', '[MASK]\n', '[PAD]']:
        f.write(word)
voc_size += 5

In [14]:
voc2 = []
for v in voc:
    voc2.append(" ".join(v))
voc2[:5]

['beautiful clear pleasant',
 'cocktail things substances mixture',
 'institutionalized established institution',
 'menial servant domestic especially',
 'seek search find look try']

In [15]:
filename = "train.txt"
indices = np.random.choice(200, 200, replace = False)
with open(filename, "w") as f:
    for i in indices[:-51]:
        f.write(" ".join(voc2[i]))
        f.write('\n')
    f.write(" ".join(voc2[indices[-51]]))

In [16]:
filename = "test.txt"
with open(filename, "w") as f:
    for i in indices[-50:-1]:
        f.write(" ".join(voc2[i]))
        f.write('\n')
    f.write(" ".join(voc2[indices[-1]]))

In [17]:
tokenizer = ElectraTokenizerFast("vocab.txt", tokenize_chinese_chars = False)
config = ElectraConfig(
    vocab_size = voc_size,
    embedding_size = 256,
    num_hidden_layers = 4)
electra = TFElectraForPreTraining(config)

In [18]:
compiler = tf.keras.optimizers.Adam()
loss = tf.keras.losses.BinaryCrossentropy(from_logits = True)
electra.compile(compiler, loss)

In [29]:
input_ids = tf.constant(tokenizer.encode(voc2[0]))
tokens = tokenizer.convert_ids_to_tokens(input_ids)
tokens

['[CLS]', 'beautiful', 'clear', 'pleasant', '[SEP]']

In [46]:
inputs = tokenizer.encode(voc2[0])
inputs

[942, 2, 1, 0, 943]

In [47]:
inputs = tokenizer.encode_plus(voc2[0])
electra.fit([942, 2, 1, 0, 943])

TypeError: in user code:

    /usr/local/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:855 train_function  *
        return step_function(self, iterator)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /usr/local/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    /usr/local/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:838 run_step  **
        outputs = model.train_step(data)
    /usr/local/lib/python3.9/site-packages/transformers/modeling_tf_utils.py:792 train_step
        if y is None and "labels" in x:

    TypeError: argument of type 'Tensor' is not iterable


In [30]:
input_ids = tf.constant(tokenizer.encode(voc2[0]))
tokens = tokenizer.convert_ids_to_tokens(input_ids)
electra(input_ids, tokens)

# get accuracy
# embeddings = electra.get_input_embeddings()

ValueError: not enough values to unpack (expected 2, got 1)

In [11]:
import logging

from simpletransformers.language_modeling import LanguageModelingModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_file = "train.txt"
test_file = "test.txt"

model = LanguageModelingModel(
    "electra",
    None,
    evaluate_during_training = True,
    output_hidden_states = True,
    args={"reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 1,
    "dataset_type": "simple",
    "vocab_size": voc_size,
    "generator_config": {
        "embedding_size": 256,
        "hidden_size": 256,
        "num_hidden_layers": 3,
    },
    "discriminator_config": {
        "embedding_size": 256,
        "hidden_size": 256,
    }
    },
    train_files=train_file,
    use_cuda=False
)

# Train the model
model.train_model(train_file, eval_file=test_file)

# Evaluate the model
result = model.eval_model(test_file)

INFO:simpletransformers.language_modeling.language_modeling_model: Training of None tokenizer complete. Saved to outputs/.
INFO:simpletransformers.language_modeling.language_modeling_model: Training language model from scratch
INFO:simpletransformers.language_modeling.language_modeling_utils: Creating features from dataset file at cache_dir/
100%|██████████| 150/150 [00:13<00:00, 11.35it/s]
100%|██████████| 49/49 [00:00<00:00, 113110.01it/s]
INFO:simpletransformers.language_modeling.language_modeling_utils: Saving features into cached file cache_dir/electra_cached_lm_126_train.txt
INFO:simpletransformers.language_modeling.language_modeling_model: Training started
Epochs 0/1. Running Loss:   35.2150: 100%|██████████| 7/7 [00:07<00:00,  1.13s/it]
Epoch 1 of 1: 100%|██████████| 1/1 [00:08<00:00,  8.13s/it]
INFO:simpletransformers.language_modeling.language_modeling_model: Training of electra model complete. Saved to outputs/.


AttributeError: 'LanguageModelingModel' object has no attribute 'predict'

In [12]:
model.s

{'eval_loss': 33.712968826293945, 'perplexity': tensor(4.3788e+14)}