# Translite - A Low Resource Modular Multi-Lingual NLP Translator

## Section 1 - Configuration
This is where the datasets are prepared and configured. You can add as many languages as you'd like. The main requirement is that each dataset must be hosted on Kaggle in one of the following formats:
  - CSV
  - Excel
  - JSON
  - Parquet

This can be checked by looking at the **Data Explorer** on the Kaggle page for your dataset and seeing if the files have the extension ".json", ".csv", ".xlsx", or ".parquet".

#### 1.1 - Install Requirements

In [1]:
!pip install ipywidgets pandas kaggle

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


#### 1.2 - Load/Create Config

Here you will add your datasets for all of the languages you'd like to translate to English. For each one you will need to add the Kaggle dataset name (in `owner/dataset-name` form), the name of the column where the English versions are stored, the name of the column where the non-English versions are stored, and the actual name of the language. It is suggested that you use the export button to save your config when you're done so that you can quickly import it again when you return. A sample config with French and Spanish is available as well.

In [2]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
from google.colab import files

# Empty configuration table: one row per dataset/language pair the user adds.
config_columns = [
    "Kaggle Dataset Name",
    "Language Name",
    "English Column Name",
    "Other Language Column Name",
]
df = pd.DataFrame(columns=config_columns)

def display_table(df):
    """Redraw the config editor: text fields, the button row, then the table.

    The cell output is cleared first so repeated calls replace the previous
    rendering instead of stacking below it.

    Args:
        df: The configuration table to show under the controls.
    """
    clear_output(wait=True)

    input_fields = (ds_uri, lang, eng_col, natv_col)
    display(*input_fields)

    button_row = widgets.HBox((add_button, import_button, export_button, clear_button))
    display(button_row)

    display(df)

def add_row(ds_uri, lang, eng_col, natv_col):
    """Append one dataset entry to the global config table and redraw it.

    Args:
        ds_uri: Value for the "Kaggle Dataset Name" column.
        lang: Value for the "Language Name" column.
        eng_col: Value for the "English Column Name" column.
        natv_col: Value for the "Other Language Column Name" column.
    """
    global df
    values = [ds_uri, lang, eng_col, natv_col]
    appended = pd.DataFrame([values], columns=list(df.columns))
    df = pd.concat([df, appended], ignore_index=True)
    display_table(df)

def _config_text_field(label):
    """Build one 500px-wide text input whose label is shown at full width."""
    return widgets.Text(
        description=label,
        layout=widgets.Layout(width="500px"),
        style=dict(description_width='initial'),
    )


# One text input per column of the config table.
ds_uri = _config_text_field("Kaggle Dataset Name:")
lang = _config_text_field("Language Name:")
eng_col = _config_text_field("English Column Name:")
natv_col = _config_text_field("Other Language Column Name:")

# Action buttons; their click handlers are attached further down.
add_button = widgets.Button(description="Add Row")
import_button = widgets.Button(description="Import Dataset")
export_button = widgets.Button(description="Export Dataset")
clear_button = widgets.Button(description="Clear Table")

def on_import_button_click(b):
    """Replace the config table with one loaded from an uploaded JSON file.

    Opens the Colab upload dialog and reads the first uploaded file with
    ``pd.read_json``. If nothing was uploaded (for example the dialog was
    cancelled) the current table is left untouched.

    Args:
        b: The button that was clicked (unused).
    """
    global df
    uploaded = files.upload()
    if not uploaded:
        # Nothing to import: avoid StopIteration from next(iter({})).
        return
    filename = next(iter(uploaded))
    df = pd.read_json(filename)
    display_table(df)

def on_export_button_click(b):
    """Write the config table to ``translite_datasets.json`` and download it.

    The table is serialised with pandas' default column-oriented layout,
    which is what the import button reads back with ``pd.read_json``.
    ``index=False`` is deliberately not passed: pandas documents it as
    unsupported for the column orientation, and older releases raise
    ``ValueError`` for that combination.

    Args:
        b: The button that was clicked (unused).
    """
    df.to_json("translite_datasets.json")
    files.download("translite_datasets.json")

def on_add_button_click(b):
    """Add the current field values as a new table row, then blank the fields.

    Args:
        b: The button that was clicked (unused).
    """
    fields = (ds_uri, lang, eng_col, natv_col)
    add_row(*[field.value for field in fields])
    for field in fields:
        field.value = ""

def on_clear_button_click(b):
    """Reset the global config table to an empty one and redraw it.

    Args:
        b: The button that was clicked (unused).
    """
    global df
    empty_columns = [
        "Kaggle Dataset Name",
        "Language Name",
        "English Column Name",
        "Other Language Column Name",
    ]
    df = pd.DataFrame(columns=empty_columns)
    display_table(df)

# Attach each button to its click handler, then draw the editor once.
for button, handler in (
    (add_button, on_add_button_click),
    (import_button, on_import_button_click),
    (export_button, on_export_button_click),
    (clear_button, on_clear_button_click),
):
    button.on_click(handler)

display_table(df)


Text(value='', description='Kaggle Dataset Name:', layout=Layout(width='500px'), style=DescriptionStyle(descri…

Text(value='', description='Language Name:', layout=Layout(width='500px'), style=DescriptionStyle(description_…

Text(value='', description='English Column Name:', layout=Layout(width='500px'), style=DescriptionStyle(descri…

Text(value='', description='Other Language Column Name:', layout=Layout(width='500px'), style=DescriptionStyle…

HBox(children=(Button(description='Add Row', style=ButtonStyle()), Button(description='Import Dataset', style=…

Unnamed: 0,Kaggle Dataset Name,Language Name,English Column Name,Other Language Column Name


##### Load Sample (Optional)
If you just want to test this script and don't want to curate a list of languages yourself please use this feature to load a sample dataset.

In [3]:
import json
import pandas as pd
from google.colab import files

# Sample config (French + Spanish) in the same column-oriented JSON layout
# that the export button produces. It is a raw string because the JSON text
# contains "\/" escapes, which are not valid Python escape sequences and
# trigger a warning in a normal string literal.
raw_config = r'{"Kaggle Dataset Name":{"0":"devicharith\/language-translation-englishfrench","1":"lonnieqin\/englishspanish-translation-dataset"},"Language Name":{"0":"French","1":"Spanish"},"English Column Name":{"0":"English words\/sentences","1":"english"},"Other Language Column Name":{"0":"French words\/sentences","1":"spanish"}}'
df = pd.DataFrame(json.loads(raw_config))
# index=False is not supported for the default column orientation (older
# pandas raises ValueError), so the default is used.
df.to_json("translite_datasets.json")
files.download("translite_datasets.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### 1.3 - Connect to Kaggle API
Datasets for this product need to be provided from Kaggle. This means that if you need to use custom language translation datasets, you must first host them on https://kaggle.com. You need a Kaggle API key as well. To get one, go to your account settings page on Kaggle and click "Create New API Token". This will create a file called `kaggle.json` that you will upload here.



In [4]:
from google.colab import files
# Prompt for a file upload. The shell commands below expect the uploaded
# file to be available as kaggle.json in the working directory.
files.upload()
# Copy the token to ~/.config/kaggle/kaggle.json and restrict it to
# owner read/write.
!mkdir -p ~/.config/kaggle
!cp kaggle.json ~/.config/kaggle/
!chmod 600 ~/.config/kaggle/kaggle.json

# Imported only after the token is in place.
# NOTE(review): presumably the client reads the token copied above — confirm.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

Saving kaggle.json to kaggle.json


#### 1.4 - Load Datasets from Config
This is where you'll test out your config and ensure that you can properly load datasets from it.

In [5]:
import os
import pandas as pd

from collections.abc import Callable as function

allowed_types: dict[str, function] = {
    'csv': pd.read_csv,
    'json': pd.read_json,
    'xlsx': pd.read_excel,
    'parquet': pd.read_parquet
}

id_counter: int = 0
dfs = []
classes: dict[int, str] = {}

for _, dataset in df.iterrows():
    print(f"Loading {dataset['Language Name']}")
    dataset_name: str = dataset["Kaggle Dataset Name"]
    language_name: str = dataset["Language Name"]

    # Get and prep datasets from kaggle
    !kaggle datasets download {dataset_name} -d {language_name}
    !unzip {dataset_name.split('/')[-1]}.zip -d {language_name}

    files: list[tuple[str, function]] = [(f, allowed_types.get(f.split('.')[-1]))
    for f in os.listdir(language_name)]

    curr = pd.DataFrame(columns=[
        "class", "english", "native"
    ])

    # Look for all files in the dir that are readable as datasets and
    # reformat them


    for f, fn in files:
      if fn:
        tmp = fn(f"{language_name}/{f}")
        tmp = tmp.rename(columns={
            dataset["English Column Name"]: "english",
            dataset["Other Language Column Name"]: "native"
        })
        tmp["class"] = language_name
        tmp["class_int"] = id_counter
        classes[f"{id_counter}"] = language_name
        curr = pd.concat([curr, tmp], ignore_index=True)
    id_counter += 1

    dfs.append(curr)

combined = pd.concat(dfs, ignore_index=True)
combined.to_csv("translite.csv", index=False)

Loading French
Dataset URL: https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench
License(s): CC0-1.0
Archive:  language-translation-englishfrench.zip
  inflating: French/eng_-french.csv  
Loading Spanish
Dataset URL: https://www.kaggle.com/datasets/lonnieqin/englishspanish-translation-dataset
License(s): unknown
Archive:  englishspanish-translation-dataset.zip
  inflating: Spanish/data.csv        


#### 1.5 - Prepare Datasets for Classification
This is where datasets will be prepared for classification model training.

##### 1.5.1 - Prepare Train Test Splits

In [6]:
# Fraction of the shuffled rows used for training; the rest is held out.
TRAIN_SIZE = 0.85

print("Filling embedding reference map...")
# Shuffle all rows so both splits mix the languages.
combined = combined.sample(frac=1)

split_at = int(len(combined) * TRAIN_SIZE)
native_text = combined['native']
class_ids = combined['class_int']

# Inputs are the non-English sentences; targets are the integer class ids.
train_in = native_text[:split_at].tolist()
train_out = class_ids[:split_at].tolist()
test_in = native_text[split_at:].tolist()
test_out = class_ids[split_at:].tolist()

Filling embedding reference map...


## Section 2 - Models


#### 2.1 - Classifier Training

##### 2.1.1 - Training

In [7]:
# https://developer.ibm.com/tutorials/awb-classifying-data-multinomial-naive-bayes-algorithm/
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Language classifier: character 1- to 3-gram TF-IDF features fed into a
# multinomial naive Bayes model.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
naive_bayes = MultinomialNB(alpha=0.1)

model = Pipeline([
    ('embeddings', vectorizer),
    ('classifier', naive_bayes),
])
model.fit(train_in, train_out)

# Second name for the fitted pipeline; the export cell pickles it under this name.
classifier_model = model


##### 2.1.2 - Validation

In [8]:
# Accuracy on the held-out split: share of predictions equal to the label.
predictions = model.predict(test_in)
n_correct = sum(predictions == test_out)
acc = n_correct / len(test_out)
print(f"Accuracy: {acc}")

Accuracy: 0.9983705983524939


##### 2.1.3 - Manual Test

In [9]:
# Classify one typed sentence and print the matching language name.
user_text = input('>>> ')
predicted_id = int(model.predict([user_text]).item())
print(classes[f"{predicted_id}"])

>>> je ne sais pas
French


#### 2.2 - Translator Training

##### 2.2.1 - Import Requirements

In [10]:
from __future__ import unicode_literals, print_function, division
from io import open
from pandas import DataFrame
from typing import Union
import unicodedata
import re
import random
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### 2.2.1 - DataLoader
This is how we will convert our datasets into a usable format for translation.

##### 2.2.2.1 - Config

In [11]:
# Reserved vocabulary indices; they match the "SOS"/"EOS" entries that every
# Lang starts with in index2word.
SOS_token = 0
EOS_token = 1
# Sentence-length cap: filterPair keeps pairs with fewer words than this, and
# it is the width of the padded id arrays built in get_dataloader.
MAX_LENGTH = 10

###### 2.2.2.1 - Embeddings

In [12]:
class Lang:
    """Vocabulary for one language: word <-> index maps plus word counts.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name: str):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts after the two reserved markers.
        self.n_words = 2

    def addSentence(self, sentence: str):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word: str):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

def cleanStr(s, lang: "Lang | None" = None):
    """Normalise a sentence and optionally drop out-of-vocabulary words.

    The text is lower-cased, stripped of accents (combining marks are removed
    after NFD decomposition), has a space inserted before ``.``, ``!`` and
    ``?``, and has every run of characters other than ASCII letters, ``!``
    and ``?`` collapsed into a single space.

    Args:
        s: Raw sentence.
        lang: Optional vocabulary. When given, every word that is not in
            ``lang.word2index`` is removed so the result can be indexed
            without a KeyError.

    Returns:
        The cleaned sentence with surrounding whitespace removed.
    """
    s = ''.join(
        c for c in unicodedata.normalize('NFD', s.lower().strip())
        if unicodedata.category(c) != 'Mn'
    )
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)

    if lang is not None:
        # Filter into a new list: rebinding the loop variable (as the old
        # code did) never removed unknown words from the sentence.
        known_words = [word for word in s.split() if word in lang.word2index]
        s = ' '.join(known_words)

    return s.strip()

def filterPair(p):
    """Return True when both sentences of the pair are short enough.

    Args:
        p: Sequence whose first two items are sentences; each must split on
            single spaces into fewer than MAX_LENGTH pieces.
    """
    return all(len(p[side].split(' ')) < MAX_LENGTH for side in (0, 1))

def loadLangs(lang: str, df: DataFrame) -> "tuple[Lang, Lang, list[list[str]]]":
    """Build the vocabularies and cleaned sentence pairs for one language.

    Args:
        lang: Language name; only rows whose "class" column equals it are used.
        df: Table with "class", "native" and "english" columns.

    Returns:
        ``(input_lang, output_lang, pairs)``: the vocabulary of the native
        sentences, the vocabulary of the English sentences, and the list of
        ``[native, english]`` cleaned pairs that passed ``filterPair``.
    """
    # Select the language's rows once instead of testing every row in Python,
    # and skip rows with a missing sentence (cleanStr cannot handle NaN).
    rows = df.loc[df["class"] == lang, ["native", "english"]].dropna()

    pairs = [
        [cleanStr(native), cleanStr(english)]
        for native, english in zip(rows["native"], rows["english"])
    ]
    pairs = [pair for pair in pairs if filterPair(pair)]

    input_lang, output_lang = Lang(lang), Lang('English')
    for native, english in pairs:
        input_lang.addSentence(native)
        output_lang.addSentence(english)

    return input_lang, output_lang, pairs

###### 2.2.2.2 - DataLoader Definition

In [13]:
def indexesFromSentence(lang, sentence):
    """Translate a single-space-separated sentence into vocabulary indices.

    Raises:
        KeyError: If a word is missing from ``lang.word2index``.
    """
    vocab = lang.word2index
    indexes = []
    for word in sentence.split(' '):
        indexes.append(vocab[word])
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode a sentence as a (1, n_tokens) long tensor ending in EOS_token."""
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    encoded = torch.tensor(token_ids, dtype=torch.long, device=device)
    return encoded.view(1, -1)

def tensorsFromPair(pair):
    """Encode a (source, target) pair with the global input/output vocabularies."""
    source_sentence, target_sentence = pair[0], pair[1]
    input_tensor = tensorFromSentence(input_lang, source_sentence)
    target_tensor = tensorFromSentence(output_lang, target_sentence)
    return (input_tensor, target_tensor)

def get_dataloader(lang, batch_size):
    """Build the vocabularies and a shuffled training DataLoader for a language.

    Every sentence becomes a row of MAX_LENGTH ids: its word indices, then
    EOS_token, then zeros for the remaining positions.

    Args:
        lang: Language name to select from the global ``combined`` table.
        batch_size: Batch size of the returned DataLoader.

    Returns:
        ``(input_lang, output_lang, train_dataloader)``.
    """
    input_lang, output_lang, pairs = loadLangs(lang, combined)

    shape = (len(pairs), MAX_LENGTH)
    input_ids = np.zeros(shape, dtype=np.int32)
    target_ids = np.zeros(shape, dtype=np.int32)

    for row, (source, target) in enumerate(pairs):
        source_row = indexesFromSentence(input_lang, source) + [EOS_token]
        target_row = indexesFromSentence(output_lang, target) + [EOS_token]
        input_ids[row, :len(source_row)] = source_row
        target_ids[row, :len(target_row)] = target_row

    dataset = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
    return input_lang, output_lang, loader

##### 2.2.2 - Model Definition

In [14]:
class Encoder(nn.Module):
    """Embedding + single GRU encoder over batch-first token id sequences."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """
        Args:
            input_size: Number of rows in the embedding table (vocabulary size).
            hidden_size: Width of the embeddings and of the GRU state.
            dropout_p: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Return the GRU's ``(output, hidden)`` for the given token ids."""
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        return self.gru(embedded)

In [15]:
class Decoder(nn.Module):
    """Plain GRU decoder (no attention) that emits MAX_LENGTH steps."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: Width of the embeddings and of the GRU state.
            output_size: Number of target-vocabulary entries to score.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode MAX_LENGTH steps starting from SOS_token.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dim 0) and
                the device are read here.
            encoder_hidden: Initial hidden state for the GRU.
            target_tensor: Optional target ids. When given, step ``i`` feeds
                ``target_tensor[:, i]`` as the next input (teacher forcing);
                otherwise the highest-scoring token of the previous step is
                fed back.

        Returns:
            ``(log_probs, hidden, None)``. The third item is always None and
            keeps the return shape parallel to AttentionDecoder.
        """
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.full(
            (batch_size, 1), SOS_token,
            dtype=torch.long, device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        # Decode exactly MAX_LENGTH steps, like AttentionDecoder. The previous
        # hard-coded 20 indexed past the end of a MAX_LENGTH-wide target.
        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                _, topi = decoder_output.topk(1)
                # Detach so the fed-back token carries no gradient history.
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step; returns ``(vocabulary scores, new hidden)``."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [16]:
class BahdanauAttention(nn.Module):
    """Additive attention: scores = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return ``(context, weights)`` for the query over the keys.

        ``weights`` is the softmax of the scores over the key positions and
        ``context`` is the correspondingly weighted sum of ``keys``.
        """
        projected = torch.tanh(self.Wa(query) + self.Ua(keys))
        scores = self.Va(projected)
        # Move the key positions to the last axis: (batch, keys, 1) -> (batch, 1, keys).
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

In [17]:
class AttentionDecoder(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """
        Args:
            hidden_size: Width of the embeddings and of the GRU state.
            output_size: Number of target-vocabulary entries to score.
            dropout_p: Dropout probability applied to the input embeddings.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU input is the token embedding concatenated with the context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode MAX_LENGTH steps starting from SOS_token.

        When ``target_tensor`` is given, its column ``i`` is fed as the input
        after step ``i`` (teacher forcing); otherwise the top-scoring token of
        the previous step is fed back.

        Returns:
            ``(log_probs, hidden, attentions)`` with the per-step outputs and
            attention weights concatenated along dim 1.
        """
        n_sequences = encoder_outputs.size(0)
        step_input = torch.empty(n_sequences, 1, dtype=torch.long, device=device).fill_(SOS_token)
        state = encoder_hidden
        step_outputs = []
        step_attentions = []
        teacher_forcing = target_tensor is not None

        for step in range(MAX_LENGTH):
            scores, state, attn_weights = self.forward_step(step_input, state, encoder_outputs)
            step_outputs.append(scores)
            step_attentions.append(attn_weights)

            if teacher_forcing:
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                _, best = scores.topk(1)
                step_input = best.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_outputs, dim=1), dim=-1)
        attentions = torch.cat(step_attentions, dim=1)
        return log_probs, state, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one step; returns ``(vocabulary scores, new hidden, attention weights)``."""
        embedded = self.dropout(self.embedding(input))

        # The hidden state, with its first two axes swapped, is the query.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((embedded, context), dim=2)
        gru_output, hidden = self.gru(gru_input, hidden)
        return self.out(gru_output), hidden, attn_weights

##### 2.2.3 - Training

###### 2.2.3.1 - Per Model Training
This is the script that each model will need to run to train for each language.

In [18]:
def train_epoch(
    dataloader,
    encoder,
    decoder,
    encoder_optimizer,
    decoder_optimizer,
    criterion):
    """Run one optimisation pass over ``dataloader``.

    Each batch is encoded, decoded with the targets passed to the decoder,
    scored with ``criterion`` over the flattened steps, and used for one step
    of both optimizers.

    Returns:
        The mean of the per-batch losses.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    running_loss = 0

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten (batch, steps, vocab) and (batch, steps) for the criterion.
        vocab_size = decoder_outputs.size(-1)
        loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def train(train_dataloader,
          encoder,
          decoder,
          n_epochs,
          learning_rate=0.001,
          print_every=100):
    """Train an encoder/decoder pair with Adam and NLL loss.

    Args:
        train_dataloader: Batches of ``(input_tensor, target_tensor)``.
        encoder: Encoder module, updated in place.
        decoder: Decoder module, updated in place.
        n_epochs: Number of passes over ``train_dataloader``.
        learning_rate: Learning rate for both Adam optimizers.
        print_every: Report the mean epoch loss every this many epochs.

    Returns:
        The mean loss of every epoch, in order. Earlier versions returned
        None and discarded this history.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    # Make sure dropout is active even if the modules were left in eval mode.
    encoder.train()
    decoder.train()

    epoch_losses = []
    window_loss = 0  # sum of epoch losses since the last progress line

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        epoch_losses.append(loss)
        window_loss += loss

        if epoch % print_every == 0:
            print_loss_avg = window_loss / print_every
            window_loss = 0
            print(f"Epoch: {epoch}, Progress: {epoch / n_epochs * 100:.2f}, Loss: {print_loss_avg}")

    return epoch_losses

###### 2.2.3.2 - Train All

In [20]:
# Trained artefacts per language: (input_lang, output_lang, encoder, decoder).
models = {}
hidden_size = 128
batch_size = 4096 # A100 - 40G
n_epochs = 80

# The loop variable is deliberately not called `lang`: that name is the
# "Language Name" text widget of the config editor, and overwriting it with a
# string breaks the editor's buttons.
for lang_name in classes.values():
    print(f"Training {lang_name}...")
    input_lang, output_lang, train_dataloader = get_dataloader(lang_name, batch_size)
    encoder = Encoder(input_lang.n_words, hidden_size).to(device)
    decoder = AttentionDecoder(hidden_size, output_lang.n_words).to(device)
    train(train_dataloader, encoder, decoder, n_epochs, print_every=10)
    models[lang_name] = (input_lang, output_lang, encoder, decoder)

Training French...
Epoch: 10, Progress: 12.50, Loss: 3.417725846346687
Epoch: 20, Progress: 25.00, Loss: 2.058082409466014
Epoch: 30, Progress: 37.50, Loss: 1.4055474211187922
Epoch: 40, Progress: 50.00, Loss: 1.037968441668679
Epoch: 50, Progress: 62.50, Loss: 0.8254792032872927
Epoch: 60, Progress: 75.00, Loss: 0.6900376367218353
Epoch: 70, Progress: 87.50, Loss: 0.5963052206179675
Epoch: 80, Progress: 100.00, Loss: 0.5276131424833747
Training Spanish...
Epoch: 10, Progress: 12.50, Loss: 3.832967051506042
Epoch: 20, Progress: 25.00, Loss: 2.6776475725173947
Epoch: 30, Progress: 37.50, Loss: 2.0272846779823306
Epoch: 40, Progress: 50.00, Loss: 1.5271002430915832
Epoch: 50, Progress: 62.50, Loss: 1.1956762280464173
Epoch: 60, Progress: 75.00, Loss: 0.9799036798477173
Epoch: 70, Progress: 87.50, Loss: 0.8303933930397033
Epoch: 80, Progress: 100.00, Loss: 0.7212131414413453


##### 2.2.5 - Evaluate

In [21]:
def predict(input, encoder, decoder):
    """Translate one cleaned sentence and print the result.

    Uses the global ``input_lang`` / ``output_lang`` vocabularies. Decoding
    is greedy; words are collected until EOS_token is reached, which is
    printed as ``<EOS>``.

    Args:
        input: Sentence whose words are all in ``input_lang``.
        encoder: Trained encoder.
        decoder: Trained decoder.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, input)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

        # Highest-scoring token at every step.
        decoded_ids = decoder_outputs.topk(1)[1].squeeze()

        decoded_words = []
        for token_id in decoded_ids.tolist():
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

        print('Translated: ', ' '.join(decoded_words))

In [22]:
# Switch to eval mode, then translate one typed sentence with the most
# recently trained language pair.
encoder.eval()
decoder.eval()
raw_sentence = input('>>> ')
input_sentence = cleanStr(raw_sentence, lang=input_lang)
predict(input_sentence, encoder, decoder)

>>> ¿Está bien tu coche?
Translated:  is your car ok ? <EOS>


##### 2.3 - Export Everything as ZIP
This will put all of your trained models and binaries into a usable format to be loaded into your application based on the specs of the pipeline outlined in section 3.

In [25]:
from google.colab import files
import pickle

!rm -rf translite_out
!mkdir translite_out

config = {
    "max_sent_length": MAX_LENGTH,
    "hidden_size": hidden_size,
    "EOS_token": EOS_token,
    "SOS_token": SOS_token
}

for model, (input_lang, output_lang, encoder, decoder) in models.items():
  torch.save(encoder.state_dict(), f"translite_out/{model}_encoder.pt")
  torch.save(decoder.state_dict(), f"translite_out/{model}_decoder.pt")

  with open(f'translite_out/{model}_class.pkl', 'wb') as f:
    pickle.dump(input_lang, f)

  with open(f'translite_out/{model}_class_out.pkl', 'wb') as f:
    pickle.dump(output_lang, f)

with open(f'translite_out/classes.json', 'w') as f:
  json.dump(classes, f)

with open(f'translite_out/config.json', 'w') as f:
  json.dump(config, f)

with open(f'translite_out/classifier.pkl', 'wb') as f:
  pickle.dump(classifier_model, f)

!zip -r translite_out.zip translite_out
files.download("translite_out.zip")

updating: translite_out/ (stored 0%)
updating: translite_out/Spanish_class_out.pkl (deflated 50%)
updating: translite_out/French_decoder.pt (deflated 7%)
updating: translite_out/config.json (deflated 19%)
updating: translite_out/Spanish_encoder.pt (deflated 7%)
updating: translite_out/classifier.pkl (deflated 41%)
updating: translite_out/French_encoder.pt (deflated 7%)
updating: translite_out/Spanish_decoder.pt (deflated 7%)
updating: translite_out/classes.json (deflated 3%)
updating: translite_out/French_class_out.pkl (deflated 50%)
updating: translite_out/Spanish_class.pkl (deflated 53%)
updating: translite_out/French_class.pkl (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Section 3 - Pipeline
This is a mockup of what the expected output pipeline will look like in an application where Translite models are used.

In [26]:
import os
import pickle

def load_translite(folder: str = ''):
    """Load an exported Translite bundle from ``folder``.

    Args:
        folder: Directory holding the exported files. The default ``''``
            means the current working directory.

    Returns:
        ``(classifier_model, translation_models, classes, config)`` where
        ``translation_models`` maps each language name to
        ``(input_lang, output_lang, encoder, decoder)``. The encoder and
        decoder are returned in eval mode.

    Raises:
        FileNotFoundError: If any expected file is missing; the message names
            the file.
    """
    import json  # local import: this cell only imports os and pickle

    def _require(filename):
        # Resolve a bundle file and fail early with the original message.
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            raise FileNotFoundError(f"{filename} not found")
        return path

    config_path = _require("config.json")
    classes_path = _require("classes.json")
    classifier_path = _require("classifier.pkl")

    with open(config_path, "r") as f:
        config = json.load(f)

    with open(classes_path, "r") as f:
        classes = json.load(f)

    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # bundles that you exported yourself or otherwise trust.
    with open(classifier_path, "rb") as f:
        classifier_model = pickle.load(f)

    # Build the networks with the size they were exported with, not with
    # whatever `hidden_size` happens to be defined in the current session.
    model_hidden_size = config["hidden_size"]

    translation_models = {}
    for class_name in classes.values():
        encoder_path = _require(f"{class_name}_encoder.pt")
        decoder_path = _require(f"{class_name}_decoder.pt")
        input_lang_path = _require(f"{class_name}_class.pkl")
        output_lang_path = _require(f"{class_name}_class_out.pkl")

        with open(input_lang_path, "rb") as f:
            input_lang = pickle.load(f)
        with open(output_lang_path, "rb") as f:
            output_lang = pickle.load(f)

        encoder = Encoder(input_lang.n_words, model_hidden_size).to(device)
        decoder = AttentionDecoder(model_hidden_size, output_lang.n_words).to(device)
        # map_location lets weights saved on a GPU load on a CPU-only host.
        encoder.load_state_dict(torch.load(encoder_path, map_location=device))
        decoder.load_state_dict(torch.load(decoder_path, map_location=device))
        # The bundle is loaded for inference, so switch dropout off.
        encoder.eval()
        decoder.eval()
        translation_models[class_name] = (input_lang, output_lang, encoder, decoder)

    return classifier_model, translation_models, classes, config

# Smoke-test the loader against the translite_out folder.
load_translite("translite_out")


(Pipeline(steps=[('embeddings',
                  TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                 ('classifier', MultinomialNB(alpha=0.1))]),
 {'French': (<__main__.Lang at 0x7d2543f5dfd0>,
   <__main__.Lang at 0x7d25241a2fd0>,
   Encoder(
     (embedding): Embedding(19370, 128)
     (gru): GRU(128, 128, batch_first=True)
     (dropout): Dropout(p=0.1, inplace=False)
   ),
   AttentionDecoder(
     (embedding): Embedding(11604, 128)
     (attention): BahdanauAttention(
       (Wa): Linear(in_features=128, out_features=128, bias=True)
       (Ua): Linear(in_features=128, out_features=128, bias=True)
       (Va): Linear(in_features=128, out_features=1, bias=True)
     )
     (gru): GRU(256, 128, batch_first=True)
     (out): Linear(in_features=128, out_features=11604, bias=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )),
  'Spanish': (<__main__.Lang at 0x7d2542ba6f50>,
   <__main__.Lang at 0x7d2543697590>,
   Encoder(
     (embedding): Embedding(21447, 1