# Python to Scala Transpiler using Neural Machine Translation
This project demonstrates how to convert simple Python expressions into Scala expressions. For example, consider the following Python expression.
```python
[a(x, z) for x in l if b(x, y)]
```
Within the context of this work, the equivalent Scala expression is
```scala
l.filter(x => b(x, y)).map(x => a(x, z))
```

[Neural machine translation (NMT)](https://en.wikipedia.org/wiki/Neural_machine_translation)
is applied using a [recurrent neural network](https://en.wikipedia.org/wiki/Recurrent_neural_network)
to implement this transpiler. The RNN is created by adapting the excellent work of Zafarali Ahmed in
[keras-attention](https://github.com/datalogue/keras-attention),
which was originally developed to convert dates from varied human-readable formats to a machine-readable format.

See README.md for more details on this work and on how to reproduce and extend it.

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.utils.training_utils import multi_gpu_model


Using TensorFlow backend.


In [2]:
# include companion project keras-attention libraries
import sys
sys.path.append('../keras-attention')

from data.reader import Vocabulary, Data
from models.NMT import simpleNMT
from utils.metrics import all_acc

# Load Data
See README.md for details on generating the data.

In [3]:
# Each row of the pipe-separated file is a (python, scala) pair. The columns are
# renamed to input/output, which is how the rest of the notebook refers to them.
expressions = (
    pd.read_csv('data/expressions', sep='|', names=['python', 'scala'])
    .rename(columns={'python': 'input', 'scala': 'output'})
)
print(len(expressions))

expressions.sample(8)

400000


Unnamed: 0,input,output
163704,"[z(j) for j in m(g(t),[d for d in t if j(d)])]","m(g(t),t.filter(j)).map(z)"
216160,"[m(f,b) for f in c]","c.map(f => m(f,b))"
317981,"[j(k) for k in q(y(q,v),[q(v,f(z(g(k),v(q)),j(...","q(y(q,v),q.filter(v => v).map(v => q(v,f(z(g(k..."
194187,[b for b in c if b(b)],c.filter(b)
266994,"[p(b) for b in o(i,x(b)) if b]","o(i,x(b)).filter(b => b).map(p)"
55682,"e(u,h(u))","e(u,h(u))"
119073,"[f for f in k([u(r) for r in l],u(l)) if f]","k(l.map(u),u(l)).filter(f => f)"
298186,"[c(r(j),u) for u in z]","z.map(u => c(r(j),u))"


## Determine Padding Size

In [4]:
# Length, in characters, of the longest input (Python) expression.
max_input_length = expressions['input'].str.len().max()
max_input_length

278

In [5]:
# Length, in characters, of the longest output (Scala) expression.
max_output_length = expressions['output'].str.len().max()
max_output_length

282

In [6]:
# Common pad length for both sides: the longer of the two maxima plus 16 extra
# positions. NOTE(review): the reason for choosing 16 is not documented here.
padding = max(max_input_length, max_output_length) + 16
padding

298

In [7]:
class ExpressionVocabulary(Vocabulary):
    """Hacked class to expose the vocabulary using an API required by keras-attention
       for data in this project.

       The mapping is supplied in memory; the parent constructor is not called,
       and the attributes below are set directly instead.
    """

    def __init__(self, vocabulary, padding):
        """Store the token mapping and derive its inverse.

        :param vocabulary: dict mapping each token (character or special
            marker) to its integer id
        :param padding: pad length, stored as ``self.padding`` for the
            inherited ``Vocabulary`` methods
        """
        self.vocabulary = vocabulary
        self.padding = padding
        self.reverse_vocabulary = {v: k for k, v in self.vocabulary.items()}

    @classmethod
    def from_strings(cls, strs, padding):
        """Build a vocabulary from every distinct character found in ``strs``.

        Characters receive ids in sorted order, followed by the special
        tokens ``'<unk>'`` and ``'<eot>'``.

        :param strs: iterable of strings to collect characters from
        :param padding: pad length forwarded to the constructor
        :return: a new instance of ``cls``
        """
        chars = sorted({c for s in strs for c in s})
        # The terminal marker must be spelled '<eot>': that is the token the
        # display helper show_example_ml_application looks for, and
        # (NOTE(review): confirm in keras-attention data/reader.py) the token
        # Vocabulary.string_to_int appends to every encoded string. It was
        # previously registered as '<eof>', so the end-of-text marker was not
        # in the vocabulary under the expected name.
        chars = chars + ['<unk>', '<eot>']
        return cls({c: i for i, c in enumerate(chars)}, padding)

In [8]:
# Character vocabulary over all input expressions, using the common pad length.
input_vocab = ExpressionVocabulary.from_strings(expressions['input'], padding)
input_vocab.size()

34

In [9]:
# Character vocabulary over all output expressions, using the common pad length.
output_vocab = ExpressionVocabulary.from_strings(expressions['output'], padding)
output_vocab.size()

35

In [10]:
class ExpressionData(Data):
    """Adapter that presents in-memory expression pairs through the
       keras-attention ``Data`` API.

       The parent constructor is not invoked; the attributes are assigned
       directly from the arguments.
    """

    def __init__(self, inputs, targets, input_vocabulary, output_vocabulary):
        """Keep both vocabularies and materialise the two sequences as lists.

        :param inputs: iterable of input strings
        :param targets: iterable of target strings
        :param input_vocabulary: vocabulary for the input side
        :param output_vocabulary: vocabulary for the target side
        """
        self.input_vocabulary, self.output_vocabulary = input_vocabulary, output_vocabulary
        self.inputs, self.targets = list(inputs), list(targets)
    

In [11]:
# Number of leading rows used for training (80%); the remaining rows form the
# validation set. The split is by row position, with no shuffling.
training_size = int(0.8 * len(expressions))
training_size

320000

In [12]:
# Training set: the first `training_size` rows, encoded via transform().
training = ExpressionData(
    inputs=expressions['input'].iloc[:training_size],
    targets=expressions['output'].iloc[:training_size],
    input_vocabulary=input_vocab,
    output_vocabulary=output_vocab,
)
training.transform()

In [13]:
# Validation set: every row from `training_size` onward, encoded via transform().
validation = ExpressionData(
    inputs=expressions['input'].iloc[training_size:],
    targets=expressions['output'].iloc[training_size:],
    input_vocabulary=input_vocab,
    output_vocabulary=output_vocab,
)
validation.transform()

## Create the NMT model using keras-attention

In [14]:
# Sequence length comes from `padding`; the character and label counts come
# from the sizes of the input and output vocabularies.
model = simpleNMT(
    pad_length=padding,
    n_chars=input_vocab.size(),
    n_labels=output_vocab.size(),
    embedding_learnable=False,
    encoder_units=512,
    decoder_units=512,
    trainable=True,
    return_probabilities=False,
)

inputs shape: (?, ?, 1024)


In [15]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 298)               0         
_________________________________________________________________
OneHot (Embedding)           (None, 298, 34)           1156      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 298, 1024)         2240512   
_________________________________________________________________
attention_decoder_1 (Attenti (None, 298, 35)           3781356   
Total params: 6,023,024
Trainable params: 6,021,868
Non-trainable params: 1,156
_________________________________________________________________


In [16]:
# `all_acc` is the extra metric imported from keras-attention's utils.metrics.
# NOTE(review): presumably whole-sequence accuracy; confirm in utils/metrics.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', all_acc],
)

## Model Training

In [17]:
def train_cycle(batch_size=10, steps_per_epoch=100, validation_steps=100, epochs=5):
    """Fit the global ``model`` for one short cycle, checkpointing its weights.

    The defaults reproduce the schedule that used to be hard-coded, so calling
    ``train_cycle()`` with no arguments behaves as before.

    A fresh ``ModelCheckpoint`` is created on every call.
    NOTE(review): "best" ``val_loss`` tracking therefore presumably restarts
    each cycle, and file names may repeat across cycles because the epoch
    counter restarts too. Confirm against the Keras callback documentation.

    :param batch_size: batch size requested from the training and validation
        generators
    :param steps_per_epoch: number of training batches per epoch
    :param validation_steps: number of validation batches per epoch
    :param epochs: number of epochs to run in this cycle
    """
    import os  # function-scope import keeps this cell self-contained

    weights_path = "./data/weights/NMT.{epoch:02d}-{val_loss:.2f}.hdf5"
    # Create the checkpoint directory up front; otherwise writing the weights
    # file fails on a fresh checkout where ./data/weights does not exist yet.
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)

    cp = ModelCheckpoint(weights_path,
                         monitor='val_loss',
                         verbose=0,
                         save_best_only=True,
                         save_weights_only=True,
                         mode='auto')

    model.fit_generator(generator=training.generator(batch_size),
                        steps_per_epoch=steps_per_epoch,
                        validation_data=validation.generator(batch_size),
                        validation_steps=validation_steps,
                        callbacks=[cp],
                        workers=1,
                        verbose=1,
                        epochs=epochs)

## Show Examples of Model on Validation Input

In [18]:
def encode(input_string):
    """Encode ``input_string`` with ``input_vocab`` and wrap it as a batch of one."""
    token_ids = input_vocab.string_to_int(input_string)
    return np.array([token_ids])

def apply_model(input_string):
    """Run the global ``model`` on one input string.

    Takes the arg-max over the last axis of the single prediction and returns
    whatever ``output_vocab.int_to_string`` produces for those ids.
    """
    batch = encode(input_string)
    scores = model.predict(batch)[0]
    best_ids = np.argmax(scores, axis=-1)
    return output_vocab.int_to_string(best_ids)
    
def show_example_ml_application(input_string):
    """Print ``input_string`` next to the model output, rewriting special tokens.

    A ``'<eot>'`` token is shown as ``'|'`` and ``'<unk>'`` tokens are dropped;
    every other token is kept unchanged.
    """
    display_as = {'<eot>': '|', '<unk>': ''}
    tokens = apply_model(input_string)
    pretty = ''.join(display_as.get(tok, tok) for tok in tokens)
    print(repr(input_string), '->', repr(pretty))
    

# Ten validation inputs sampled once (unseeded, so they vary between runs) and
# reused by show_examples() on every call.
examples = list(expressions['input'].iloc[training_size:].sample(10))

def show_examples():
    """Print the model's current output for every string in ``examples``."""
    for sample_input in examples:
        show_example_ml_application(sample_input)

# Train and Show Examples Loop

In [None]:
# Alternate between one training cycle and printing the model's current output
# for the fixed examples, with a separator line after each cycle.
# NOTE(review): the 100000-cycle bound is presumably "run until interrupted".
for i in range(100000):
    print('cycle', i)
    train_cycle()
    show_examples()
    print('-'*80)

cycle 0
Epoch 1/5