In [None]:
import warnings

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, Dense, MaxPooling1D, Flatten, Dropout, Embedding, Activation
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import SGD, Adam, Adadelta

In [None]:
# Keep only the sequence column and its label.
# NOTE(review): `metagenomics_dataset` is not defined in the visible cells;
# presumably it is loaded earlier in the notebook -- confirm it survives
# Restart & Run All.
df = pd.DataFrame(
    data=metagenomics_dataset,
    columns=['dna', 'class'],
)
df

Unnamed: 0,dna,class
27353,TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAG...,0.0
7327,CCTGTTTGCTCCCCACGCTTTCGAGCCTCAGCGTCAGTTACAGACC...,1.0
31147,TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAG...,0.0
10707,CCTGTTTGCTCCCCACGCTTTCGAGCCTCAGCGTCAGTTACAGTCC...,1.0
25197,CCTGTTTGCTCCCCACGCTTTCGCACCTGAGCGTCAGTCTTCGTCC...,0.0
...,...,...
7716,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCA...,1.0
7345,CCTGTTTGCCCCCCACGCTTTCGTGCCTCAGTGTCAGTTACAGTCC...,1.0
10546,TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCA...,1.0
17055,CCTGTTTGCTCCCCACGCTTTCGCACCTGAGCGTCAGTCTTCGTCC...,0.0


In [None]:
def obtain_Kmers(metadata, size=6):
    """Split a sequence into overlapping, lower-cased k-mers.

    A window of ``size`` characters is slid along ``metadata`` one position
    at a time, so a sequence of length n yields ``n - size + 1`` k-mers
    (an empty list when the sequence is shorter than ``size``).

    Args:
        metadata: the sequence to split (anything sliceable whose slices
            support ``.lower()``, e.g. a ``str``).
        size: window length k; defaults to 6.

    Returns:
        A list of the k-mers in left-to-right order.
    """
    kmers = []
    window_count = len(metadata) - size + 1
    for start in range(window_count):
        stop = start + size
        kmers.append(metadata[start:stop].lower())
    return kmers
# Build the list of overlapping k-mers for every sequence.
# Mapping over the 'dna' column directly replaces the row-wise
# DataFrame.apply(axis=1), which built a full row object per record only to
# read a single field from it.
df['K_words'] = df['dna'].map(obtain_Kmers)
# Working frame without the raw sequence string: the label column plus the
# k-mer lists.
data = df.drop(columns='dna')

In [None]:
# Inspect the per-sequence k-mer lists produced in the previous cell.
data.loc[:, 'K_words']

27353    [tacgta, acgtag, cgtagg, gtaggt, taggtc, aggtc...
7327     [cctgtt, ctgttt, tgtttg, gtttgc, tttgct, ttgct...
31147    [tacgta, acgtag, cgtagg, gtaggg, tagggg, agggg...
10707    [cctgtt, ctgttt, tgtttg, gtttgc, tttgct, ttgct...
25197    [cctgtt, ctgttt, tgtttg, gtttgc, tttgct, ttgct...
                               ...                        
7716     [tacgga, acggag, cggagg, ggaggg, gagggt, agggt...
7345     [cctgtt, ctgttt, tgtttg, gtttgc, tttgcc, ttgcc...
10546    [tacgga, acggag, cggagg, ggaggg, gagggt, agggt...
17055    [cctgtt, ctgttt, tgtttg, gtttgc, tttgct, ttgct...
7191     [cctgtt, ctgttt, tgtttg, gtttgc, tttgct, ttgct...
Name: words, Length: 37240, dtype: object

In [None]:
# Turn each list of k-mers into one space-separated "sentence" so the
# sequences can be fed to a text tokenizer.
word_text = [' '.join(kmer_list) for kmer_list in data['K_words']]
# Labels: after dropping 'dna', the first column of `data` is 'class'.
y_data = data.iloc[:, 0].values

In [None]:
# Learn the k-mer vocabulary from the sentences built in the previous cell and
# encode every sentence as a sequence of integer token ids.
# Fix: the sentences live in `word_text`; `conv_text` is not defined in any
# visible cell, so this cell raised NameError on a fresh kernel.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(word_text)
encoded_docs = tokenizer.texts_to_sequences(word_text)
# Length of the longest sentence, counted in k-mers; every encoded sequence is
# padded at the end (padding='post') up to this length.
max_length = max(len(sentence.split()) for sentence in word_text)
X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    np.stack(y_data),
    test_size=0.20,
    random_state=42,
)
# +1 because `word_index` is offset by one here: index 0 is left for padding.
# NOTE(review): assumes Tokenizer ids start at 1 -- confirm against Keras docs.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# 1-D CNN over the k-mer token sequences:
# embedding -> convolution -> max-pooling -> flatten -> sigmoid output
# (a single unit, trained with binary cross-entropy below).
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length = max_length))
model.add(Conv1D(filters = 16, kernel_size = 5, activation = 'relu'))
model.add(MaxPooling1D(pool_size = 2))
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))

epochs = 100
lrate = 0.01
# NOTE(review): `decay` is computed but never handed to the optimizer, so the
# learning rate stays constant. Kept as-is to preserve training behaviour.
decay = lrate / epochs
# Fix: `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
sgd = SGD(learning_rate = lrate, momentum = 0.9, nesterov = False)
model.compile(loss = 'binary_crossentropy', optimizer = sgd, metrics = ['binary_accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 249, 10)           40970     
                                                                 
 conv1d_1 (Conv1D)           (None, 245, 16)           816       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 122, 16)          0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 1952)              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 1953      
                                                                 
Total params: 43,739
Trainable params: 43,739
Non-trainable params: 0
__________________________________________________

In [None]:
# Train the network. 30% of the training portion is set aside for validation
# (validation_split), separately from the held-out test set.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_split=0.30,
    shuffle=True,
    verbose=2,
)


In [None]:
# Show the `history` attribute of the object returned by model.fit above
# (per the Keras API: the loss/metric values recorded for each epoch,
# including the validation ones).
history.history

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=128,
)
print("test loss, test acc:", results)

# Run the model on the first three test samples.
# NOTE(review): `predictions` is assigned but not displayed in this cell.
predictions = model.predict(x=X_test[:3])