# Python to Scala Transpiler using Neural Machine Translation
This project demonstrates how to convert simple Python expressions into Scala expressions. For example, consider the following Python expression.
```python
[a(x, z) for x in l if b(x, y)]
```
Within the context of this work, the equivalent Scala expression is
```scala
l.filter(x => b(x, y)).map(x => a(x, z))
```

[Neural machine translation (NMT)](https://en.wikipedia.org/wiki/Neural_machine_translation)
is applied using a [recurrent neural network](https://en.wikipedia.org/wiki/Recurrent_neural_network)
to implement this transpiler. The RNN is created by adapting the excellent work of Zafarali Ahmed in
[keras-attention](https://github.com/datalogue/keras-attention),
which was originally developed to convert dates from varied human-readable formats to a machine-readable format.

For a discussion of results, see the corresponding blog post, [A Python to Scala transpiler using neural machine translation (NMT)](https://medium.com/@matthagy/a-python-to-scala-transpiler-using-neural-machine-translation-nmt-90d4d02afa70).

See README.md for more details on this work, and for instructions on reproducing and extending it.

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.utils.training_utils import multi_gpu_model


Using TensorFlow backend.


In [2]:
# include companion project keras-attention libraries
import sys
sys.path.append('../keras-attention')

from data.reader import Vocabulary, Data
from models.NMT import simpleNMT
from utils.metrics import all_acc

# Load Data
See README.md for details on generating the data.

In [3]:
# Each line of data/expressions holds one pair separated by '|': the Python
# expression first, then its Scala translation. The columns are renamed to the
# generic input/output names that the rest of the notebook indexes by.
expressions = (
    pd.read_csv('data/expressions', sep='|', names=['python', 'scala'])
    .rename(columns={'python': 'input', 'scala': 'output'})
)
print(len(expressions))

expressions.sample(8)

400000


Unnamed: 0,input,output
163704,"[z(j) for j in m(g(t),[d for d in t if j(d)])]","m(g(t),t.filter(j)).map(z)"
216160,"[m(f,b) for f in c]","c.map(f => m(f,b))"
317981,"[j(k) for k in q(y(q,v),[q(v,f(z(g(k),v(q)),j(...","q(y(q,v),q.filter(v => v).map(v => q(v,f(z(g(k..."
194187,[b for b in c if b(b)],c.filter(b)
266994,"[p(b) for b in o(i,x(b)) if b]","o(i,x(b)).filter(b => b).map(p)"
55682,"e(u,h(u))","e(u,h(u))"
119073,"[f for f in k([u(r) for r in l],u(l)) if f]","k(l.map(u),u(l)).filter(f => f)"
298186,"[c(r(j),u) for u in z]","z.map(u => c(r(j),u))"


## Determine Padding Size

In [4]:
# Length, in characters, of the longest input expression.
max_input_length = expressions['input'].str.len().max()
max_input_length

278

In [5]:
# Length, in characters, of the longest output expression.
max_output_length = expressions['output'].str.len().max()
max_output_length

282

In [6]:
# A single sequence length shared by both vocabularies and the model: the
# longer of the two maxima plus 16 extra positions.
# NOTE(review): the reason for the 16-position margin is not stated here —
# presumably headroom for an end marker; confirm.
padding = max(max_input_length, max_output_length) + 16
padding

298

In [7]:
class ExpressionVocabulary(Vocabulary):
    """Character-level vocabulary built in memory for the expression data.

    Adapter that sets ``vocabulary``, ``padding`` and ``reverse_vocabulary``
    directly so that the keras-attention ``Vocabulary`` API can be used with
    the data in this project. The base-class initializer is not invoked.
    """

    def __init__(self, vocabulary, padding):
        """Store the symbol -> index mapping, its inverse, and the padding length."""
        self.vocabulary = vocabulary
        self.padding = padding
        # Inverse table: index -> symbol.
        self.reverse_vocabulary = dict(
            (index, symbol) for symbol, index in self.vocabulary.items()
        )

    @classmethod
    def from_strings(cls, strs, padding):
        """Build a vocabulary from every distinct character found in *strs*.

        Characters are indexed in sorted order, followed by the two special
        symbols ``'<unk>'`` and ``'<eof>'``.
        """
        # NOTE(review): show_example_ml_application compares against '<eot>',
        # not '<eof>' — confirm which end marker the base class expects.
        symbols = set()
        for text in strs:
            symbols.update(text)
        ordered = sorted(symbols)
        ordered.extend(['<unk>', '<eof>'])
        mapping = dict(zip(ordered, range(len(ordered))))
        return cls(mapping, padding)

In [8]:
# Vocabulary over the characters that occur in the input expressions.
input_vocab = ExpressionVocabulary.from_strings(expressions['input'], padding)
input_vocab.size()

34

In [9]:
# Vocabulary over the characters that occur in the output expressions.
output_vocab = ExpressionVocabulary.from_strings(expressions['output'], padding)
output_vocab.size()

35

In [10]:
class ExpressionData(Data):
    """In-memory dataset of (input, target) expression pairs.

    Adapter that sets ``inputs``, ``targets`` and the two vocabularies directly
    so that the keras-attention ``Data`` API can be applied to the expressions
    in this project. The base-class initializer is not invoked.
    """

    def __init__(self, inputs, targets, input_vocabulary, output_vocabulary):
        """Keep list copies of both sequences alongside their vocabularies."""
        # Materialize the iterables up front; they are stored as plain lists.
        sources, expected = list(inputs), list(targets)
        self.input_vocabulary = input_vocabulary
        self.output_vocabulary = output_vocabulary
        self.inputs = sources
        self.targets = expected
    

In [11]:
# Row index of the train/validation cut: the first 80% of the rows are used
# for training and the remaining rows for validation.
training_size = int(0.8 * len(expressions))
training_size

320000

In [12]:
# Training split: the rows before the training_size cut-off.
training = ExpressionData(
    inputs=expressions['input'].iloc[:training_size],
    targets=expressions['output'].iloc[:training_size],
    input_vocabulary=input_vocab,
    output_vocabulary=output_vocab,
)
# NOTE(review): transform() is inherited from keras-attention's Data;
# presumably it encodes the stored strings via the vocabularies — confirm.
training.transform()

In [13]:
# Validation split: the rows from the training_size cut-off onwards.
validation = ExpressionData(
    inputs=expressions['input'].iloc[training_size:],
    targets=expressions['output'].iloc[training_size:],
    input_vocabulary=input_vocab,
    output_vocabulary=output_vocab,
)
# NOTE(review): transform() is inherited from keras-attention's Data;
# presumably it encodes the stored strings via the vocabularies — confirm.
validation.transform()

## Create the NMT model using keras-attention

In [14]:
# Build the network with keras-attention's simpleNMT. The sequence length is
# the shared padding value, and the input/output alphabet sizes come from the
# two vocabularies.
# NOTE(review): the exact meaning of embedding_learnable, trainable and
# return_probabilities is defined by simpleNMT, not here — check its signature
# before changing them.
model = simpleNMT(pad_length=padding,
              n_chars=input_vocab.size(),
              n_labels=output_vocab.size(),
              embedding_learnable=False,
              encoder_units=512,
              decoder_units=512,
              trainable=True,
              return_probabilities=False)

inputs shape: (?, ?, 1024)


In [15]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 298)               0         
_________________________________________________________________
OneHot (Embedding)           (None, 298, 34)           1156      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 298, 1024)         2240512   
_________________________________________________________________
attention_decoder_1 (Attenti (None, 298, 35)           3781356   
Total params: 6,023,024
Trainable params: 6,021,868
Non-trainable params: 1,156
_________________________________________________________________


In [16]:
# Configure training: Adam optimizer with categorical cross-entropy loss.
# all_acc is the extra metric imported from keras-attention's utils.metrics.
# NOTE(review): presumably all_acc scores a sequence as correct only when every
# position matches — confirm against its definition.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', all_acc])

## Model Training

In [17]:
def train_cycle(batch_size=10, steps_per_epoch=100, validation_steps=100, epochs=5):
    """Run one short round of training on the module-level ``model``.

    Batches are drawn from the ``training`` and ``validation`` generators, and
    a ``ModelCheckpoint`` callback monitoring ``val_loss`` writes weight files
    under ``./data/weights``. That directory is created if it does not exist.

    :param batch_size: number of examples requested from each generator batch
    :param steps_per_epoch: training batches per epoch
    :param validation_steps: validation batches evaluated per epoch
    :param epochs: number of epochs in this round

    The defaults are the values that were previously hard-coded, so
    ``train_cycle()`` trains with the same settings as before.
    """
    weights_path = "./data/weights/NMT.{epoch:02d}-{val_loss:.2f}.hdf5"
    # Make sure the target directory exists before the callback first tries to
    # write into it; otherwise the first save fails on a fresh checkout.
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)

    cp = ModelCheckpoint(weights_path,
                         monitor='val_loss',
                         verbose=0,
                         save_best_only=True,
                         save_weights_only=True,
                         mode='auto')

    model.fit_generator(generator=training.generator(batch_size),
                        steps_per_epoch=steps_per_epoch,
                        validation_data=validation.generator(batch_size),
                        validation_steps=validation_steps,
                        callbacks=[cp],
                        workers=1,
                        verbose=1,
                        epochs=epochs)

## Show Examples of Model on Validation Input

In [18]:
def encode(input_string):
    """Encode *input_string* with ``input_vocab`` and wrap it as a batch of one.

    The encoded sequence becomes the single row of the returned array, which
    is the form ``apply_model`` passes to ``model.predict``.
    """
    encoded = input_vocab.string_to_int(input_string)
    return np.array([encoded])

def apply_model(input_string):
    """Translate a single input string with the module-level ``model``.

    Takes the first (and only) prediction of the one-element batch, picks the
    highest-scoring index along the last axis, and returns what
    ``output_vocab.int_to_string`` produces for those indices.
    """
    batch = encode(input_string)
    first_prediction = model.predict(batch)[0]
    best_indices = np.argmax(first_prediction, axis=-1)
    return output_vocab.int_to_string(best_indices)
    
def show_example_ml_application(input_string):
    """Print *input_string* next to a readable rendering of the model output.

    In the rendering, an end marker is drawn as ``'|'``, ``'<unk>'`` symbols
    are dropped, and every other symbol is kept unchanged.
    """
    results = apply_model(input_string)
    # ExpressionVocabulary.from_strings registers the end marker as '<eof>',
    # so a test for '<eot>' alone can never match a symbol from that
    # vocabulary. Both spellings are accepted here.
    # NOTE(review): assumes int_to_string only yields symbols from the
    # vocabulary's reverse table — confirm against keras-attention.
    terminal_markers = ('<eot>', '<eof>')
    pretty = ''.join('|' if x in terminal_markers else ('' if x == '<unk>' else x)
                     for x in results)
    print(repr(input_string), '->', repr(pretty))
    

# Pick ten validation inputs once, so that show_examples prints the same
# expressions every time it is called.
examples = list(expressions['input'].iloc[training_size:].sample(10))

def show_examples():
    """Print the model's current translation of each fixed example input."""
    for expression in examples:
        show_example_ml_application(expression)

# Train and Show Examples Loop

In [19]:
# Alternate between a training round and printing the example translations.
# The iteration count is large enough that the loop is effectively open-ended;
# the recorded run below was stopped with a KeyboardInterrupt.
for i in range(100000):
    print('cycle', i)
    train_cycle()
    show_examples()
    print('-'*80)

cycle 0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'c(f(.)er(. (>(. (p(.((p))))))'
'[d for d in p(e,e) if t(s,d)]' -> 'c(f).tel)er(>(. ()'
'[l(n) for n in o]' -> 'a.map('
'[p for p in n if b(p)]' -> 'f.filter(e ('
'[f(q,e(p,r)) for q in q(k(q),g(r(q),z)) if j(z(z,z),q)]' -> 'c(f(.(,)(.)())))()(((((((((((()))))))))'
'[v(r) for r in m(d,y) if l(r)]' -> 'c(f).ter(e (>(.))))'
'[a(e,c) for e in b]' -> 'a.ma.((p(p))'
'[j(i,v([o(e) for e in z(n)],u(k(l,x(j)),n))) for i in e(t) if c(i)]' -> 'c(f(.(,)(.)())))()(((((((((((((())))))))'
'[i(c,z) for z in t if z]' -> 'c(m).ter(. (>(p)))))'
'[p(p) for p in z(a(x(y),h(y,p)),y)]' -> 'c(f(.)il).)(>(. (p(.)()))))'
--------------------------------------------------------------------------------
cycle 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'd(f(l)er(e => p(=(((((()))))'
'[d for d in p(e,e) if t(s,d)]' -> '(f(l)ifilter(e '
'[l(n) for n in o]' -> 'm.m

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'k(k)l(k (> => (((((((((((((((,)))))))'
'[d for d in p(e,e) if t(s,d)]' -> 'p(p,p),)).ter(e => t())'
'[l(n) for n in o]' -> 'o.map(n)'
'[p for p in n if b(p)]' -> 'b.filter(b)'
'[f(q,e(p,r)) for q in q(k(q),g(r(q),z)) if j(z(z,z),q)]' -> 'k(k(q(q)((),))))))              ((((((((()))))'
'[v(r) for r in m(d,y) if l(r)]' -> 'd(d(d),)).filter(l).map('
'[a(e,c) for e in b]' -> 'b.map(e => e(e,c),'
'[j(i,v([o(e) for e in z(n)],u(k(l,x(j)),n))) for i in e(t) if c(i)]' -> 'n(n(l(e)(e)=)((((((((((((((((((((((((((((((((((((((((()))))))))'
'[i(c,z) for z in t if z]' -> 'z.filter(z => z).map(z => c(c,))'
'[p(p) for p in z(a(x(y),h(y,p)),y)]' -> 'a(a(a(,)(y),y).mapmppp'
--------------------------------------------------------------------------------
cycle 6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'p.f,lfirte)=> '
'[d for d in p(e,e) if t(s,d)]' -> 'o.fi.fir(e)'
'

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'l(k).map(s => h(d(k(k(k(k)))))))))'
'[d for d in p(e,e) if t(s,d)]' -> 'p(e,e)efilter(d => t(d,d))'
'[l(n) for n in o]' -> 'o.map(a)'
'[p for p in n if b(p)]' -> 'n.filter(b)'
'[f(q,e(p,r)) for q in q(k(q),g(r(q),z)) if j(z(z,z),q)]' -> 'q(q(q(qqqqqq)))))zr()(zr() ( (( ((()()))))))))))))))'
'[v(r) for r in m(d,y) if l(r)]' -> 'm(d,y).filter(r).map(v)'
'[a(e,c) for e in b]' -> 'b.map(e => a(a,c))'
'[j(i,v([o(e) for e in z(n)],u(k(l,x(j)),n))) for i in e(t) if c(i)]' -> 't(i).)i)(er(> =( ,((j(((()())))))))))))))))))))))))))))'
'[i(c,z) for z in t if z]' -> 't.filter(z => z).map(z => c(c,c))'
'[p(p) for p in z(a(x(y),h(y,p)),y)]' -> 'z(a(y(,(,y),y)y))p)pp(p(pp'
--------------------------------------------------------------------------------
cycle 11
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'l(k).map(s => h(d(s(k(,(,,,,,)))))'
'[d for d in p(e,e) if t(s,

'[p(p) for p in z(a(x(y),h(y,p)),y)]' -> 'a(a(y),),y,y),y),y).map(p)'
--------------------------------------------------------------------------------
cycle 15
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
'[h(d(s(k),f(y(k),w)),s) for s in l(k)]' -> 'k(k).map(k => h(d(s(k)))))))))))))'
'[d for d in p(e,e) if t(s,d)]' -> 'e(e,e).filter(d => t(t,d))'
'[l(n) for n in o]' -> 'o.map(l)'
'[p for p in n if b(p)]' -> 'n.filter(b)'
'[f(q,e(p,r)) for q in q(k(q),g(r(q),z)) if j(z(z,z),q)]' -> 'q(k(q),q)q))))))).)))).)))).)))).)))).))))())))())))()))'
'[v(r) for r in m(d,y) if l(r)]' -> 'd(d,y).filter(r).map(v)'
'[a(e,c) for e in b]' -> 'b.map(e => a(a,c))'
'[j(i,v([o(e) for e in z(n)],u(k(l,x(j)),n))) for i in e(t) if c(i)]' -> 'n(n).),l(u())))))))))))))))))))))))))))))))))())()())())))'
'[i(c,z) for z in t if z]' -> 't.filter(z => z).map(z => i(c,z))'
'[p(p) for p in z(a(x(y),h(y,p)),y)]' -> 'a(a(y),y,y,y)).))p(p)pp'
--------------------------------------------------------------------------

KeyboardInterrupt: 

In [36]:
# Draw 200 rows from the validation part of the data for evaluation.
# NOTE(review): the .copy() is presumably there so that the results column
# added later is written to an independent frame — confirm.
evaluation = expressions.iloc[training_size:].sample(200).copy()
evaluation.sample(5)

Unnamed: 0,input,output
358683,"[a(k,u(w,p(z(k),k))) for k in i(x) if y(k)]","i(x).filter(y).map(k => a(k,u(w,p(z(k),k))))"
330020,o,o
333910,"[f(q,e(y(a),s(q))) for q in t if x(q)]","t.filter(x).map(q => f(q,e(y(a),s(q))))"
334352,g,g
384249,q(r),q(r)


In [37]:
def apply_convert(input_string):
    """Translate *input_string* and return the output as a plain string.

    The symbols returned by ``apply_model`` are cut at the first ``'<unk>'``
    and joined. If the output contains no ``'<unk>'`` at all, the whole output
    is joined instead of raising.
    """
    results = apply_model(input_string)
    try:
        terminate = results.index('<unk>')
    except ValueError:
        # No '<unk>' anywhere in the output: keep every symbol rather than
        # letting one such prediction abort an entire evaluation run.
        terminate = len(results)
    return ''.join(results[:terminate])

# Translate every evaluation input (tqdm shows progress) and store the
# translations in a new results column.
evaluation['results'] = [apply_convert(input_string) for input_string in tqdm(evaluation['input'])]
evaluation.sample(5)

100%|██████████| 200/200 [01:41<00:00,  1.98it/s]


Unnamed: 0,input,output,results
376389,"[o for o in o(n) if r(z(b(o),a),o)]","o(n).filter(o => r(z(b(o),a),o))","o(n).filter(o => r(r(o(o),)),o))"
334148,t,t,t
328131,"[e for e in r(v,i(f,q(e))) if y(e)]","r(v,i(f,q(e))).filter(y)","r(v,f(f)l(())).filter(e)"
386355,"[p for p in q(s,q(p,a(f))) if h(p)]","q(s,q(p,a(f))).filter(h)","q(s,q(q,a(f))).filter(f)"
393946,[m(b) for b in d(c) if r(b)],d(c).filter(r).map(m),d(c).filter(r).map(m)


In [38]:
# Fraction of evaluation rows whose translation equals the expected output exactly.
np.mean(evaluation['output'] == evaluation['results'])

0.565

In [41]:
# Temporarily raise the column display width to 200 characters so that long
# expressions are not abbreviated, then show ten random evaluation rows.
# display is not imported in this file; it is expected to come from the
# notebook environment.
with pd.option_context('display.max_colwidth', 200):
    display(evaluation.rename(columns={'output':'expected'}).sample(10))

Unnamed: 0,input,expected,results
398497,"[c(q(l,d),h) for h in l if h]","l.filter(h => h).map(h => c(q(l,d),h))","l.filter(h => h).map(h => h(c,d),)))"
381279,[r for r in t if r],t.filter(r => r),t.filter(r => r)
320977,"[x(l) for l in w(k,g)]","w(k,g).map(x)","w(k,g).map(x)"
367582,"[s(n,b(m,f)) for n in e(y(p(s),o(w(s),n)),s) if u(n)]","e(y(p(s),o(w(s),n)),s).filter(u).map(n => s(n,b(m,f)))","e(y(s(s))s),((s(s(),)),filter(er(r .> )(n (> ((((("
367794,"[z(j,e(w)) for j in h]","h.map(j => z(j,e(w)))","h.map(j => z(j,e)e)))"
384249,q(r),q(r),q(r)
364872,"[x(f) for f in s(r(d(f),l),f)]","s(r(d(f),l),f).map(x)","s(r(f)f),lf,lter(l => f(f,a)).ma"
328018,b,b,b
333045,[n for n in p if n],p.filter(n => n),p.filter(n => n)
373145,"[i(h) for h in t(i,a(h))]","t(i,a(h)).map(i)","t(i,a(h)).map(i)"


For a discussion of results, see the corresponding blog post, [A Python to Scala transpiler using neural machine translation (NMT)](https://medium.com/@matthagy/a-python-to-scala-transpiler-using-neural-machine-translation-nmt-90d4d02afa70).