In [1]:
library(keras)
library(data.table)

In [2]:
## Training hyperparameters.
batch_size <- 64      # Batch size for training.
epochs <- 100         # Number of epochs to train for.
latent_dim <- 256     # Latent dimensionality of the encoding space.
num_samples <- 10000  # Number of samples to train on.

In [3]:
## Location of the tab-separated data file on disk.
data_path <- "fra.txt"
## Read at most `num_samples` rows; the first line is data, not a header.
text <- fread(
  data_path,
  sep = "\t",
  header = FALSE,
  nrows = num_samples
)

In [4]:
## Preview the first rows of the loaded table.
head(text)

V1,V2,V3
<chr>,<chr>,<chr>
Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)
Hi.,Salut.,CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)
Run!,Cours !,CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)
Run!,Courez !,CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)
Who?,Qui ?,CC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)


In [5]:
## Vectorize the data: column 1 holds the input sentences, column 2 the
## target sentences.
input_texts <- text[[1]]
## Wrap every target sentence in a leading tab and a trailing newline
## (presumably the start-/end-of-sequence markers -- confirm against the
## inference code).
target_texts <- paste0("\t", text[[2]], "\n")

In [6]:
## Preview the first input sentences.
head(input_texts)

In [7]:
## Preview the first target sentences (each wrapped in tab ... newline).
head(target_texts)

In [8]:
## Break every sentence into a character vector holding one character per
## element. Both corpora go through the same helper.
split_chars <- function(sentence) {
  strsplit(sentence, split = "")[[1]]
}
input_texts <- lapply(input_texts, split_chars)
target_texts <- lapply(target_texts, split_chars)

In [9]:
## Check the storage type of `input_texts` after the per-sentence split.
typeof(input_texts)

In [11]:
## Character vocabularies: flatten all sequences, keep each distinct
## character once, and sort the result.
input_characters <- input_texts %>%
  unlist() %>%
  unique() %>%
  sort()
target_characters <- target_texts %>%
  unlist() %>%
  unique() %>%
  sort()

head(input_characters)
head(target_characters)

In [12]:
## Vocabulary sizes: number of distinct characters on the input side and on
## the target side. They set the width of the one-hot vectors.
num_encoder_tokens <- length(input_characters)
num_decoder_tokens <- length(target_characters)

In [15]:
## Length (in characters) of the longest input and target sequence.
## `lengths()` always returns an integer vector, unlike `sapply(x, length)`,
## whose return type depends on its input (e.g. a list for empty input).
max_encoder_seq_length <- max(lengths(input_texts))
max_decoder_seq_length <- max(lengths(target_texts))

In [16]:
## Report the dataset statistics, one "label value" line per entry.
dataset_stats <- list(
  'Number of samples:' = length(input_texts),
  'Number of unique input tokens:' = num_encoder_tokens,
  'Number of unique output tokens:' = num_decoder_tokens,
  'Max sequence length for inputs:' = max_encoder_seq_length,
  'Max sequence length for outputs:' = max_decoder_seq_length
)
for (stat_label in names(dataset_stats)) {
  cat(stat_label, dataset_stats[[stat_label]], '\n')
}

Number of samples: 10000 
Number of unique input tokens: 71 
Number of unique output tokens: 93 
Max sequence length for inputs: 16 
Max sequence length for outputs: 59 


In [17]:
## Lookup table from input character (name) to its 1-based position in the
## vocabulary (value).
## `seq_along()` yields an empty index for an empty vocabulary, whereas
## `1:length(x)` would yield c(1, 0).
input_token_index <- seq_along(input_characters)
names(input_token_index) <- input_characters

input_token_index

names(input_token_index)

In [18]:
## Lookup table from target character (name) to its 1-based position in the
## vocabulary (value).
## `seq_along()` yields an empty index for an empty vocabulary, whereas
## `1:length(x)` would yield c(1, 0).
target_token_index <- seq_along(target_characters)
names(target_token_index) <- target_characters

In [19]:
# Create the containers: zero-filled 3-D arrays indexed as
# (sample, timestep, token) that the one-hot encoding step fills in.

encoder_input_data <- array(
  0, dim = c(length(input_texts), max_encoder_seq_length, num_encoder_tokens))
decoder_input_data <- array(
  0, dim = c(length(input_texts), max_decoder_seq_length, num_decoder_tokens))
decoder_target_data <- array(
  0, dim = c(length(input_texts), max_decoder_seq_length, num_decoder_tokens))

# Show a compact structural summary rather than printing the arrays in full:
# each holds samples x timesteps x tokens elements, far too many to display.
str(encoder_input_data)

str(decoder_input_data)

str(decoder_target_data)

In [22]:
## Confirm the shapes of the three pre-allocated arrays
## (samples, timesteps, tokens).
dim(encoder_input_data)
dim(decoder_input_data)
dim(decoder_target_data)

In [None]:
# Dimension 1: 10000 = number of samples
# Dimension 2: 16 = maximum input sequence length (encoder_input_data)

In [23]:
## Build the one-hot matrix of a character sequence against a vocabulary.
##
## chars:      character vector, one element per character of the sequence.
## vocabulary: character vector of the distinct characters (column order).
## Returns an integer matrix with one row per element of `chars` and one
## column per vocabulary entry (columns named after the vocabulary).
## Unlike `sapply()`, `outer()` always returns a matrix, so one-character and
## empty sequences keep their dimensions and `nrow()` is never NULL.
one_hot_matrix <- function(chars, vocabulary) {
  m <- outer(chars, vocabulary, "==") + 0L  # logical -> integer 0/1
  colnames(m) <- vocabulary
  m
}

## Fill the pre-allocated arrays sample by sample. Rows beyond the length of
## a sequence stay all-zero (padding).
for (i in seq_along(input_texts)) {
  d1 <- one_hot_matrix(input_texts[[i]], input_characters)
  encoder_input_data[i, seq_len(nrow(d1)), ] <- d1
  d2 <- one_hot_matrix(target_texts[[i]], target_characters)
  decoder_input_data[i, seq_len(nrow(d2)), ] <- d2
  ## The target is the decoder input shifted one step ahead: drop the first
  ## character of the target sequence.
  d3 <- one_hot_matrix(target_texts[[i]][-1], target_characters)
  decoder_target_data[i, seq_len(nrow(d3)), ] <- d3
}

In [25]:
## Inspect the first rows of `d1`, the input one-hot matrix left over from
## the last loop iteration (rows: characters, columns: input vocabulary).
head(d1)

Unnamed: 0,-,",",:,!,?,.,',&,%,⋯,u,U,v,V,w,W,x,y,Y,z
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,1,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [26]:
## Inspect the first rows of `d2`, the target one-hot matrix left over from
## the last loop iteration (rows: characters, columns: target vocabulary).
head(d2)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,-,",",:,!,⋯,u,U,ù,û,v,V,x,y,Y,z
1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1


In [28]:
## Encoder input: sequences of unspecified length (NULL) whose timesteps are
## vectors of width `num_encoder_tokens`.
encoder_inputs <- layer_input(shape = list(NULL, num_encoder_tokens))
encoder_inputs

Tensor("input_1:0", shape=(None, None, 71), dtype=float32)

In [29]:
## Encoder LSTM with `latent_dim` units. `return_state = TRUE` makes the
## layer return its states in addition to its output.
encoder <- layer_lstm(
  units = latent_dim,
  return_state = TRUE
)
encoder

<tensorflow.python.keras.layers.recurrent.LSTM>

In [30]:
## Apply the encoder LSTM to the encoder input tensor.
encoder_results <- encoder(encoder_inputs)
encoder_results

[[1]]
Tensor("lstm/Identity:0", shape=(None, 256), dtype=float32)

[[2]]
Tensor("lstm/Identity_1:0", shape=(None, 256), dtype=float32)

[[3]]
Tensor("lstm/Identity_2:0", shape=(None, 256), dtype=float32)


In [31]:
## Keep only the two state tensors: `encoder_results` has three elements, and
## dropping the first one (the encoder output) leaves elements 2 and 3.
encoder_states <- encoder_results[-1]
encoder_states

[[1]]
Tensor("lstm/Identity_1:0", shape=(None, 256), dtype=float32)

[[2]]
Tensor("lstm/Identity_2:0", shape=(None, 256), dtype=float32)


In [32]:
## Decoder input: sequences of unspecified length (NULL) whose timesteps are
## vectors of width `num_decoder_tokens`. The decoder is later initialised
## with `encoder_states`.
decoder_inputs <- layer_input(shape = list(NULL, num_decoder_tokens))
decoder_inputs

Tensor("input_2:0", shape=(None, None, 93), dtype=float32)

In [33]:
## Decoder LSTM: returns the full output sequence and its internal states.
## The training model ignores the returned states; they are kept for use at
## inference time.
decoder_lstm <- layer_lstm(
  units = latent_dim,
  return_sequences = TRUE,
  return_state = TRUE,
  stateful = FALSE
)
decoder_lstm

<tensorflow.python.keras.layers.recurrent.LSTM>

In [34]:
## Run the decoder over its inputs, starting from the encoder's final states.
decoder_results <- decoder_lstm(
  decoder_inputs,
  initial_state = encoder_states
)
decoder_results

[[1]]
Tensor("lstm_1/Identity:0", shape=(None, None, 256), dtype=float32)

[[2]]
Tensor("lstm_1/Identity_1:0", shape=(None, 256), dtype=float32)

[[3]]
Tensor("lstm_1/Identity_2:0", shape=(None, 256), dtype=float32)


In [35]:
## Output projection: a softmax over the `num_decoder_tokens` target
## characters.
decoder_dense <- layer_dense(
  units = num_decoder_tokens,
  activation = 'softmax'
)
decoder_dense

<tensorflow.python.keras.layers.core.Dense>

In [36]:
## Feed the decoder's output sequence (first element of `decoder_results`)
## through the softmax projection.
decoder_outputs <- decoder_results[[1]] %>% decoder_dense
decoder_outputs

Tensor("dense/Identity:0", shape=(None, None, 93), dtype=float32)

In [37]:
## Training model: maps `encoder_input_data` and `decoder_input_data` to
## `decoder_target_data`.
model <- keras_model(
  inputs = list(encoder_inputs, decoder_inputs),
  outputs = decoder_outputs
)

In [None]:
## Compile the model with the RMSprop optimizer and categorical
## cross-entropy loss.
compile(model, optimizer = 'rmsprop', loss = 'categorical_crossentropy')

## Train, holding out 20% of the samples for validation.
fit(
  model,
  x = list(encoder_input_data, decoder_input_data),
  y = decoder_target_data,
  batch_size = batch_size,
  epochs = epochs,
  validation_split = 0.2
)

## Persist the full model and, separately, its weights.
save_model_hdf5(model, 's2s.h5')
save_model_weights_hdf5(model, 's2s-wt.h5')