<a href="https://colab.research.google.com/github/nhanphanvan/Transformer/blob/main/NMT_Demo_Use_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Package

In [None]:
import torch

# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# GPU-only diagnostics: device name plus current memory figures for GPU 0,
# converted from bytes to GB (1024**3) and rounded to one decimal place.
if device.type == 'cuda':
    gpu_index = 0
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(gpu_index))
    print('Memory Usage:')
    allocated_gb = round(torch.cuda.memory_allocated(gpu_index)/bytes_per_gb, 1)
    print('Allocated:', allocated_gb, 'GB')
    reserved_gb = round(torch.cuda.memory_reserved(gpu_index)/bytes_per_gb, 1)
    print('Cached:   ', reserved_gb, 'GB')

In [None]:
# Fetch the project sources. The generated setup.py imports from the
# `Transformer` package directory that this clone creates.
!git clone https://github.com/nhanphanvan/Transformer.git

In [None]:
# Streamlit is imported by app.py and used to launch the demo UI.
!pip install streamlit

In [None]:
# `transformers` provides AutoTokenizer/AutoConfig used in setup.py
# (-q keeps pip's output quiet).
!pip -q install transformers
# Optional: pyngrok is not used anywhere in this notebook as written.
# !pip install pyngrok

In [None]:
# FAISS is presumably the index backend behind the Datastore files loaded in
# setup.py — TODO confirm against Transformer.application.NMT.
# ### for cpu (the CPU build is published on PyPI as `faiss-cpu`)
# !apt install libomp-dev
# !pip install faiss-cpu
# ### for gpu
!pip install faiss-gpu

In [None]:
import torch

In [None]:
# Local folder the downloaded artifacts should be moved into.
# Download the files listed below, place them there, and edit this path.
PATH = "./"

# Google Drive share links for the trained artifacts (reference only; nothing
# in this notebook downloads them automatically).
# Whole shared folder:
folder = "https://drive.google.com/drive/folders/1HkRLj9iTdUi1pPUk_hU0fXH2BERAsCXf?usp=sharing"

# Individual files:
best_bert = "https://drive.google.com/file/d/1a5-iSc08WdpZmIWmQezBTKSpWI3RoU17/view?usp=sharing"
long_dataset_70000_index = "https://drive.google.com/file/d/1H0WgrRJxmYuZcw3qoYEd_tGv22lUkvWx/view?usp=sharing"
vals = "https://drive.google.com/file/d/1fBBtd7eYbk8VGk-cy5pMH32oXrqPJQE1/view?usp=sharing"
medical_dataset_70000_index = "https://drive.google.com/file/d/1FlKCWtemEUfWDEggMD5_2guxtEXEOBVh/view?usp=sharing"
medical_vals = "https://drive.google.com/file/d/1cciP8LLqUlYddYuGPbxZdOD-VGmsTQdn/view?usp=sharing"

In [None]:
# from google.colab import drive

# drive.mount('/content/gdrive')

### Machine Translation Demo

In [None]:
%%writefile setup.py
import torch

# Folder containing the downloaded checkpoint, index and value files.
# Edit this to match where you placed them.
FOLDER_PATH = "./"

# First CUDA device when available, CPU otherwise.
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# --- Vocabulary sizes (presumably those of the source/target tokenizers) ---
SRC_VOCAB_SIZE = 28996
TGT_VOCAB_SIZE = 64001

# --- Model architecture ---
HIDDEN_SIZE = 768
NUM_ENCODER_LAYERS = 12
NUM_DECODER_LAYERS = 12
NUM_ATTENTION_HEADS = 12
FEEDFORWARD_SIZE = 3072
DROPOUT = 0.1
ACTIVATION = 'gelu'
LAYER_NORM_EPS = 1e-12

# --- Special token ids, source side ---
SRC_UNK_ID = 100
SRC_PADDING_ID = 0
SRC_BOS_ID = 101
SRC_EOS_ID = 102

# --- Special token ids, target side ---
TGT_UNK_ID = 3
TGT_PADDING_ID = 1
TGT_BOS_ID = 0
TGT_EOS_ID = 2

# --- Remaining configuration switches ---
NORM_FIRST = True
MAX_SEQUENCE_LENGTH = 1024
BATCH_SIZE = 10
BERT_EMBEDDING = True
OUTPUT_HIDDEN_STATES = True
APPLY_LAYER_NORM = True

############################################################################
from transformers import AutoTokenizer, AutoConfig

# Hugging Face model ids for the source and target sides.
src_model_id, tgt_model_id = 'bert-base-cased', 'vinai/phobert-base'


def _load_config_and_tokenizer(model_id):
    """Return ``(config, tokenizer)`` for *model_id*.

    The tokenizer's ``model_max_length`` is set to ``MAX_SEQUENCE_LENGTH``.
    Only the config and tokenizer are fetched; the pretrained model weights
    themselves are not loaded here.
    """
    config = AutoConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.model_max_length = MAX_SEQUENCE_LENGTH
    return config, tokenizer


src_config, src_tokenizer = _load_config_and_tokenizer(src_model_id)
tgt_config, tgt_tokenizer = _load_config_and_tokenizer(tgt_model_id)

############################################################################
from Transformer.modules.config import TransformerConfig
from Transformer.modules.transformer import Transformer
from Transformer.modules.embedding import PositionalEmbedding, TransformerEmbedding
from Transformer.modules.seq2seq_transformer import Seq2SeqTransformer

import os

# Hyper-parameters forwarded as keyword arguments to TransformerConfig.
kwargs = {
    'src_vocab_size': SRC_VOCAB_SIZE,
    'tgt_vocab_size': TGT_VOCAB_SIZE,
    'hidden_size': HIDDEN_SIZE,
    'num_encoder_layers': NUM_ENCODER_LAYERS,
    'num_decoder_layers': NUM_DECODER_LAYERS,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'feedforward_size': FEEDFORWARD_SIZE,
    'dropout': DROPOUT,
    'activation': ACTIVATION,
    'layer_norm_eps': LAYER_NORM_EPS,
    'src_padding_id': SRC_PADDING_ID,
    'tgt_padding_id': TGT_PADDING_ID,
    'norm_first': NORM_FIRST,
    'max_sequence_length': MAX_SEQUENCE_LENGTH,
    'bert_embedding': BERT_EMBEDDING,
    'output_hidden_states': OUTPUT_HIDDEN_STATES,
    'apply_layer_norm': APPLY_LAYER_NORM,
    'device': DEVICE,
    'dtype': torch.float32
}

config = TransformerConfig(**kwargs)
transformer = Seq2SeqTransformer(config=config)
transformer = transformer.to(DEVICE)

# os.path.join works whether or not FOLDER_PATH ends with a separator.
checkpoint_path = os.path.join(FOLDER_PATH, 'best-NMT.pt')
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved
# from a GPU run still loads when only a CPU is available.
transformer.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
# NOTE(review): no .eval() call is visible in this file; confirm that
# NMTModel/TranslateMachine switch the model to eval mode before inference,
# otherwise dropout stays active.
##########################################################################
from Transformer.application.NMT import Datastore, DatastoreBuilder, NMTModel, TranslateMachine, CustomDataset, calculate_bleu_score
import numpy as np

import os

# Index file + value array for the general-domain datastore.
# os.path.join works whether or not FOLDER_PATH ends with a separator.
load_path_1 = os.path.join(FOLDER_PATH, 'long_dataset_70000_index')
val_path_1 = os.path.join(FOLDER_PATH, 'vals.npy')

# Index file + value array for the medical-domain datastore.
load_path_2 = os.path.join(FOLDER_PATH, 'medical_dataset_70000_index')
val_path_2 = os.path.join(FOLDER_PATH, 'medical_vals.npy')

# One NMT model is shared by both translate machines.
nmt_model = NMTModel(SRC_BOS_ID, SRC_EOS_ID, TGT_BOS_ID, TGT_EOS_ID, src_tokenizer, tgt_tokenizer, config, transformer)

# The datastores are loaded from pre-built index files. The commented lines
# below are the author's recipe for building one from scratch.
# datastore_builder = DatastoreBuilder(nmt_model, DEVICE)
# embeddings_results, vals = datastore_builder.batch_create_features_file(long_src_path, long_src_path, batch_size=20, end_index=70000)
# data_store.build_datastore(embeddings_results)

# The first argument was the literal 768; it is tied to HIDDEN_SIZE here so it
# cannot drift from the model configuration.
# NOTE(review): assumes this argument is the embedding dimension — confirm
# against Datastore's signature.
general_data_store = Datastore(HIDDEN_SIZE, size_value_array=TGT_VOCAB_SIZE, num_centroid=128, nprobe=32, load_file=load_path_1)
medical_data_store = Datastore(HIDDEN_SIZE, size_value_array=TGT_VOCAB_SIZE, num_centroid=128, nprobe=32, load_file=load_path_2)

general_vals = np.load(val_path_1)
medical_vals = np.load(val_path_2)

# These two objects are what app.py imports.
general_translate_machine = TranslateMachine(nmt_model, general_data_store, general_vals, device=DEVICE)
medical_translate_machine = TranslateMachine(nmt_model, medical_data_store, medical_vals, device=DEVICE)

print('Done')

In [None]:
%%writefile app.py
from setup import general_translate_machine, medical_translate_machine
import streamlit as st

def _clean_translation(sentence):
    """Trim surrounding whitespace and turn '_' into spaces for display.

    NOTE(review): the underscores presumably come from the target tokenizer's
    word segmentation — confirm.
    """
    return sentence.strip().replace('_', ' ')


# Sidebar: the chosen domain decides which translate machine is used.
st.sidebar.subheader('Select your domain below.')
domain = st.sidebar.selectbox("Domain",['General', 'Medical'])


st.title('Simple English ➡️ Vietnamese Translation App')
# NOTE(review): this blurb cites the T5 paper, but setup.py builds a custom
# Seq2SeqTransformer — confirm the description is what you want users to see.
st.write('This is a simple machine translation app that will translate\
         your English input text into Vietnamese language\
         by leveraging a pre-trained [Text-To-Text Transfer Transformers](https://arxiv.org/abs/1910.10683) model.')

st.subheader('Input Text')
text = st.text_area(' ', height=200)

# Skip empty *and* whitespace-only input so blank text never triggers the two
# beam searches below.
if text.strip():

    translate_machine = medical_translate_machine if domain == 'Medical' else general_translate_machine

    # Translate twice so both variants can be shown: first without the
    # datastore, then with it.
    translated_sentence_pure = _clean_translation(
        translate_machine.beam_translate(text, num_knns=64, use_datastore=False)
    )
    translated_sentence = _clean_translation(
        translate_machine.beam_translate(text, num_knns=64)
    )

    st.subheader('Translated Text (Use Datastore)')
    st.write(translated_sentence)
    st.subheader('Translated Text (Not Use Datastore)')
    st.write(translated_sentence_pure)

Writing app.py


In [None]:
# use this if you are using colab
# Starts the Streamlit app in the background (`&`), then runs localtunnel
# against port 8501 (presumably the port Streamlit serves on — confirm) so the
# app can be reached from outside the Colab VM.
!streamlit run app.py & npx localtunnel --port 8501

In [None]:
# # use this if you are running on a local machine
# !streamlit run app.py