# Python to Scala Comprehension Transpiler using Neural Machine Translation
This project demonstrates how to convert simple Python expressions into Scala expressions. For example, consider the following Python expression.
```python
[a(x, z) for x in l if b(x, y)]
```
Within the context of this work, the equivalent Scala expressions is
```scala
l.filter(x => b(x, y)).map(x => a(x, z))
```

[Neural machine translation (NMT)](https://en.wikipedia.org/wiki/Neural_machine_translation)
is applied using a [recurrent neural network](https://en.wikipedia.org/wiki/Recurrent_neural_network)
to implement this transpiler. The RNN is created by adapting the excellent work of Zafarali Ahmed in
[keras-attention](https://github.com/datalogue/keras-attention),
which was originally developed to convert dates from varied human readable format to machine format.

See README.md for more details on this work and reproducing and extending it.

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.utils.training_utils import multi_gpu_model


Using TensorFlow backend.


In [2]:
# include companion project keras-attention libraries
import sys
sys.path.append('../keras-attention')

from data.reader import Vocabulary, Data
from models.NMT import simpleNMT
from utils.metrics import all_acc

# Load Data
See details in README.md for generating data

In [3]:
expressions = pd.read_csv('data/expressions', sep='|', names=['python', 'scala'])
expressions = expressions.rename(columns={'python':'input', 'scala':'output'})
print(len(expressions))

# Can't use the full set due to memory limits
expressions = expressions.sample(200 * 1000)

expressions.sample(5)

1000000


Unnamed: 0,input,output
434039,[x(n) for n in o],o.map(n => x(n))
828401,[n(o) for o in d(k) if o],d(k).filter(o => o).map(o => n(o))
462296,"[g(w) for w in p(c,c(c))]","p(c,c(c)).map(w => g(w))"
358923,"[d for d in z if j(d,k)]","z.filter(d => j(d,k))"
375576,"[d for d in t(g,v) if r(d,f)]","t(g,v).filter(d => r(d,f))"


## Determine Padding Size

In [4]:
max_input_length = expressions['input'].str.len().max()
max_input_length

241

In [5]:
max_output_length = expressions['output'].str.len().max()
max_output_length

242

In [6]:
padding = max(max_input_length, max_output_length) + 16
padding

258

In [7]:
class ExpressionVocabulary(Vocabulary):
    """Hacked class to expose the vecabularly using an API required by keras-attention
       for data in this project.
    """
    
    def __init__(self, vocabulary, padding):
        self.vocabulary = vocabulary
        self.padding = padding
        self.reverse_vocabulary = {v: k for k, v in self.vocabulary.items()}
        
    @classmethod
    def from_strings(cls, strs, padding):
        strs = list(strs)
        chars = sorted({c for s in strs for c in s})
        chars = chars + ['<unk>', '<eof>']
        return cls({c:i for i,c in enumerate(chars)}, padding)

In [8]:
input_vocab = ExpressionVocabulary.from_strings(expressions['input'], padding)
input_vocab.size()

34

In [9]:
output_vocab = ExpressionVocabulary.from_strings(expressions['output'], padding)
output_vocab.size()

35

In [10]:
class ExpressionData(Data):
    """Hacked class to expose the expressions using an API required by keras-attention
       for data in this project.
    """
    
    def __init__(self, inputs, targets, input_vocabulary, output_vocabulary):
        self.inputs = list(inputs)
        self.targets = list(targets)
        self.input_vocabulary = input_vocabulary
        self.output_vocabulary = output_vocabulary
    

In [11]:
training_size = int(0.8 * len(expressions))
training_size

160000

In [12]:
training = ExpressionData(
    expressions['input'].iloc[:training_size],
    expressions['output'].iloc[:training_size],
    input_vocab,
    output_vocab
)
training.transform()

In [13]:
validation = ExpressionData(
    expressions['input'].iloc[training_size:],
    expressions['output'].iloc[training_size:],
    input_vocab,
    output_vocab
)
validation.transform()

## Create the NMT model using keras-attention

In [14]:
model = simpleNMT(pad_length=padding,
              n_chars=input_vocab.size(),
              n_labels=output_vocab.size(),
              embedding_learnable=False,
              encoder_units=512,
              decoder_units=512,
              trainable=True,
              return_probabilities=False)

inputs shape: (?, ?, 1024)


In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 258)               0         
_________________________________________________________________
OneHot (Embedding)           (None, 258, 34)           1156      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 258, 1024)         2240512   
_________________________________________________________________
attention_decoder_1 (Attenti (None, 258, 35)           3781356   
Total params: 6,023,024
Trainable params: 6,021,868
Non-trainable params: 1,156
_________________________________________________________________


In [16]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', all_acc])

## Model Training

In [17]:
def train_cycle():
    cp = ModelCheckpoint("./data/weights/NMT.{epoch:02d}-{val_loss:.2f}.hdf5",
                         monitor='val_loss',
                         verbose=0,
                         save_best_only=True,
                         save_weights_only=True,
                         mode='auto')

    batch_size = 16
    model.fit_generator(generator=training.generator(batch_size),
                        steps_per_epoch=100,
                        validation_data=validation.generator(batch_size),
                        validation_steps=100,
                        callbacks=[cp],
                        workers=1,
                        verbose=1,
                        epochs=5)

## Show Examples of Model on Validation Input

In [18]:
def encode(input_string):
    "Convert an input string in an array of numbers, on which modeling can be applied"
    return np.array([input_vocab.string_to_int(input_string)])

def apply_model(input_string):
    "Run the model on a single input string"
    full_prediction = model.predict(encode(input_string))
    prediction = np.argmax(full_prediction[0], axis=-1)
    return output_vocab.int_to_string(prediction)
    
def show_example_ml_application(input_string):
    "Interpret the terminal and padding characters in raw results"
    results = apply_model(input_string)
    pretty = ''.join('|' if x=='<eot>' else ('' if x == '<unk>' else x)
                     for x in results)
    print(repr(input_string), '->', repr(pretty))
    

examples = list(expressions['input'].iloc[training_size:].sample(10))

def show_examples():
    for example in examples:
        show_example_ml_application(example)

# Train and Show Examples Loop

In [None]:
while True:
    train_cycle()
    show_examples()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[y(d) for d in n]' -> 'm.map(p => >())'
'[b(b) for b in l if j(b)]' -> 't(f).ter(e => => >).))p( '
'x' -> ''
'[a for a in g if a]' -> 'f.filter(e => )'
'[q(m) for m in u(h(m),u)]' -> 'a(m).)ap(. =p = => )()))'
'[a for a in u(e,n(w(y),h(a))) if a]' -> 'j(f(.),))e)())(r)e)(( => =  )))))'
'[k for k in j if t(k)]' -> 'f.filter(e => =)'
'[o for o in r([h for h in z(a) if h],u(a)) if o]' -> 't(f(lterter(r => e  > () ))))))))))))))))))))))))))'
'k(m(x,u(c,k)),x)' -> '((m(,((((,))))'
'[l(h(n(y,s),a),n) for n in c(e,u)]' -> 'a(m,.),),,)ap((((((((((((())))'
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[y(d) for d in n]' -> 'm.map(n => n(d))'
'[b(b) for b in l if j(b)]' -> 'j(fi.ter(e => t(j)).map(v => v(v))'
'x' -> 'x'
'[a for a in g if a]' -> 'a.filter(v => v)'
'[q(m) for m in u(h(m),u)]' -> 'm(c,n(,)).mapmp => > =()))'
'[a for a in u(e,n(w(y),h(a))) if a]' -> 'n(c(n(,)(.)i)).filter(e => => ))'
'[k for k in j if t(k)]' -> 'r.filter(t => t