In [2]:
# bring pip itself up to date before installing the notebook's dependencies
!pip install -q --upgrade pip

## installing a pinned transformers release (3.0.2) with pip's "2020-resolver" feature flag switched on
## NOTE(review): pip was upgraded to its newest release on the line above -- confirm that release still accepts this flag
!pip install --use-feature=2020-resolver -q transformers==3.0.2
# imported right after installation; the imports cell further down imports it again
import transformers

## installing Google Translator package
## NOTE(review): no version is pinned here, so the installed googletrans release may differ between runs
!pip install -q googletrans

[K     |████████████████████████████████| 1.5MB 7.7MB/s 
[K     |████████████████████████████████| 769 kB 9.2 MB/s 
[K     |████████████████████████████████| 883 kB 16.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 30.8 MB/s 
[K     |████████████████████████████████| 3.0 MB 45.2 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 55 kB 3.3 MB/s 
[K     |████████████████████████████████| 981 kB 12.8 MB/s 
[K     |████████████████████████████████| 42 kB 1.6 MB/s 
[K     |████████████████████████████████| 53 kB 2.4 MB/s 
[K     |████████████████████████████████| 65 kB 3.9 MB/s 
[K     |████████████████████████████████| 98 kB 7.4 MB/s 
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
  Building wheel for contextvars (setup.py) ... [?25l[?25hdone


In [3]:
import gc
import os
import random
import transformers
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K

from googletrans import Translator
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel

# report the versions of the two libraries the rest of the notebook is built on
print("TensorFlow version:", tf.__version__)
print("Transformers version:", transformers.__version__)

# silence every Python warning from this point on
warnings.filterwarnings(action = "ignore")

TensorFlow version: 2.3.0
Transformers version: 3.0.2


In [None]:
def encode_text(df, tokenizer, max_len, padding):
    """
    Tokenize the "text" column of a DataFrame in one batch.

    Parameters
    ----------
    df : object indexable by "text" whose column exposes ``.values.tolist()``
        (a pandas DataFrame in this notebook).
    tokenizer : object exposing ``batch_encode_plus``.
    max_len : forwarded to the tokenizer as ``max_length``.
    padding : forwarded to the tokenizer as ``pad_to_max_length``.

    Returns
    -------
    Whatever ``tokenizer.batch_encode_plus`` returns for the batch.
    """
    # the tokenizer is handed a plain Python list, one entry per row
    sentences = df["text"].values.tolist()

    tokenizer_options = {
        "pad_to_max_length": padding,
        "max_length": max_len,
    }

    return tokenizer.batch_encode_plus(sentences, **tokenizer_options)


def get_tf_dataset(X, y, auto, labelled = True, repeat = False, shuffle = False, batch_size = 128):
    """
    Wrap encoded inputs (and labels, when requested) in a batched tf.data.Dataset.

    Parameters
    ----------
    X : mapping with an "input_ids" entry; only that entry is used.
    y : labels paired with the inputs; ignored when ``labelled`` is False.
    auto : buffer size handed to ``prefetch``.
    labelled : when True the dataset yields (input_ids, y) pairs, otherwise input_ids only.
    repeat : when True the dataset is repeated indefinitely.
    shuffle : when True the dataset is shuffled with a buffer of 2048 elements.
    batch_size : number of elements per batch.

    Returns
    -------
    The batched, prefetching dataset.
    """
    token_ids = X["input_ids"]
    tensors = (token_ids, y) if labelled else token_ids
    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    # optional stages keep their fixed order: repeat first, then shuffle
    if repeat:
        dataset = dataset.repeat()
    if shuffle:
        dataset = dataset.shuffle(2048)

    return dataset.batch(batch_size).prefetch(auto)

In [1]:
def build_model(model_name, max_len, learning_rate, metrics):
    """
    Build and compile a 3-class classifier on top of a pretrained transformer.

    Parameters
    ----------
    model_name : name or path passed to ``TFAutoModel.from_pretrained``.
    max_len : length of the ``input_ids`` sequence the model accepts.
    learning_rate : learning rate for the Adam optimizer. A numeric string
        such as "1e-5" is converted to float; any other value is passed
        through to Adam unchanged.
    metrics : metrics passed to ``model.compile``.

    Returns
    -------
    A compiled Keras ``Model`` mapping ``input_ids`` to 3 softmax probabilities.
    """
    # a caller in this notebook passes the rate as the string "1e-5";
    # normalise it so the optimizer always receives a number
    if isinstance(learning_rate, str):
        learning_rate = float(learning_rate)

    # defining encoded inputs
    input_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_ids")

    # defining transformer model embeddings (element [0] of the transformer output)
    transformer_model = TFAutoModel.from_pretrained(model_name)
    transformer_embeddings = transformer_model(input_ids)[0]

    # defining output layer on the embedding at sequence position 0
    # (presumably the [CLS] token -- confirm for the chosen model)
    output_values = Dense(3, activation = "softmax")(transformer_embeddings[:, 0, :])

    # defining model
    model = Model(inputs = input_ids, outputs = output_values)
    opt = Adam(learning_rate = learning_rate)

    # the output layer already applies softmax, so the loss must treat the
    # predictions as probabilities; from_logits = True would apply softmax twice
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False)

    model.compile(optimizer = opt, loss = loss, metrics = metrics)

    return model

In [None]:
def predict_test(model_name, test_data, weights_path = "/content/drive/MyDrive/model.h5", max_len = 128):
    """
    Classify a single piece of text with the saved model weights.

    Parameters
    ----------
    model_name : name or path used for both the tokenizer and the transformer.
    test_data : the text to classify (a single string).
    weights_path : file the trained weights are loaded from.
    max_len : sequence length used for tokenization and for the model input.

    Returns
    -------
    The index (int) of the highest-scoring class; it is also printed.
    """
    ## wrapping the single text in a one-row DataFrame, the shape encode_text expects
    df_test = pd.DataFrame({"text": [test_data]}, columns = ["text"])

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    X_test_encoded = encode_text(df = df_test, tokenizer = tokenizer, max_len = max_len, padding = True)

    # Build the model; the learning rate is a float (it was previously the
    # string "1e-5") and only matters for compile, not for inference
    model = build_model(model_name, max_len, 1e-5, ["sparse_categorical_accuracy"])
    model.load_weights(weights_path)

    # y is unused for an unlabelled dataset; -1 is just a placeholder
    ds_test = get_tf_dataset(
        X_test_encoded,
        y = -1,
        auto = tf.data.experimental.AUTOTUNE,
        labelled = False,
        batch_size = 64
    )

    # one row of class scores per input text; there is exactly one text here
    class_scores = model.predict(ds_test)
    predicted_class = int(np.argmax(class_scores, axis = -1)[0])
    print(predicted_class)

    return predicted_class
    

In [None]:
import logging
import os

# Ask TensorFlow's native logging to hide everything below fatal errors.
# NOTE(review): the imports cell earlier in this notebook already imports
# TensorFlow, so on a top-to-bottom run this variable is set after that
# import -- confirm it still takes effect, or move it above the first import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

logging.basicConfig(level=logging.ERROR)
# the Python-side TensorFlow logger can be quietened after import
tf.get_logger().setLevel(logging.ERROR)

# predict_test prints the predicted class index; keeping the result in a
# variable avoids a second, duplicate display of it as the cell output
predicted_label = predict_test("bert-large-cased","Still, as I urged our leaving Ireland with such inquietude and impatience, my father thought it best to yield.")