**Installation of Required Packages**

In [1]:
pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/76/02/8b606c4aa92ff61b5eda71d23b499ab1de57d5e818be33f77b01a6f435a8/biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 7.2MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.78


In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD, Adam, Adadelta, RMSprop
from keras.layers import Conv1D, Dense, MaxPooling1D, Flatten, Dropout, SpatialDropout1D
from keras.layers import Embedding, GlobalAveragePooling1D, LSTM, SimpleRNN, GRU, Bidirectional
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from keras.layers import Softmax
from google.colab import drive
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np 



drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pickle
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
    
def getKmers(sequence, size):
    return [sequence[x:x+size].upper() for x in range(len(sequence) - size + 1)]

def prepare_training_dataset(k,MOVE_WINDOW,READ_LEN, JUMP, WHOLE_SEQ, OUTPUTFILEPATH):
    # Any of the following is the selection criteria
    # Subtype with >=N samples
    # Top M subtypes.
    # k is the k-mer size
    # READ_LEN is the contig length for each data sample in the prepared dataset.
    # MOVE_WINDOW = 1 means moving the window every 'JUMP' number of charaters
    # MOVE_WINDOW = 0 does not mode the wondow. No use of JUMP in this case. 
    

    N=25
    M=25
    newnames={'B': 5727, 'C': 2077, '01_AE': 1426, 'A1': 498, '01B': 210, 
                    '02_AG': 168, 'BF1': 143, 'A6': 117, 'A1C': 111, 'G': 96, 'BC': 95, 
                    'A1D': 94, 'AD': 94, 'D': 87, 'F1': 82, 'A1CD': 62, 'CD': 61, 'O': 57,
                    '0107': 57, '01BC': 50, '07_BC': 41, '08_BC': 35, '02A1': 29, 
                    '11_cpx': 25, '35_AD': 22}

    subtype_seqs = {}
    
    type_seqs = {}
    type_texts = {}
    neucleotide_list = ['A','T','C','G']
    for key in newnames.keys(): 
        type_seqs[key] = []

        # You need to CHANGE the SUBTYPES FOLDER LOCATION according to your drive location. 
        subtype = SeqIO.parse(open('/content/drive/MyDrive/ML 472/Data/subtypes/'+key+'.fasta','r'), 'fasta')
        
        for record in subtype:
            chars = set(record.seq)
            new_seq = str(record.seq)
            for c in chars:
                if c not in neucleotide_list:
                    new_seq = new_seq.replace(c,'')

            if MOVE_WINDOW == 0:
                READ_LEN = len(str(new_seq)) if WHOLE_SEQ == 1 else READ_LEN
                NUMBER_OF_READS=(int)(len(str(new_seq))/READ_LEN)
                for i in range(NUMBER_OF_READS):
                    type_seqs[key].append(str(new_seq)[i*READ_LEN:(i+1)*READ_LEN])
            else:
                NUMBER_OF_READS=(int)(len(str(new_seq)) - READ_LEN + 1)
                for i in range(int(NUMBER_OF_READS/JUMP)):
                    type_seqs[key].append(str(new_seq)[i*JUMP:i*JUMP+READ_LEN])
    kmer = k
    type_texts = {}
    for key in newnames.keys():
        type_texts[key] = []
        for i in type_seqs[key]:
            type_texts[key].append(' '.join(getKmers(i, kmer)))

    print('k-mers are built.') 
    merge_texts = []
    labels = []
    count = 0
    for key in newnames.keys(): 
        for row in type_texts[key]:
            merge_texts.append(row)
            labels.append(count)
        count+=1
    labels = np.array(labels)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(merge_texts)

    

    encoded_docs = tokenizer.texts_to_sequences(merge_texts)
    max_length = max([len(s.split()) for s in merge_texts])
    # saving the tokenizer

    TOKENIZERPATH = '/content/drive/MyDrive/ML 472/Data/Models/tokenizer_k'+str(k)+'_readWhole.pickle'#+str(READ_LEN)+'.pickle'
    with open(TOKENIZERPATH, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(max_length, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Max_length: ',max_length)    
    X = pad_sequences(encoded_docs, maxlen = max_length, padding = 'post')
    data_dict = {}

    data_dict['X_train'],data_dict['X_test'],data_dict['y_train'],data_dict['y_test'] = train_test_split(X,labels,
                                                        test_size=0.20,random_state=42)
    #data_dict['y_train']=to_categorical(data_dict['y_train'],25)
    #data_dict['y_test']=to_categorical(data_dict['y_test'],25)

    vocab_size = len(tokenizer.word_index) + 1
    print('Final Vocabulary size is: ', vocab_size)

    # Saving the training set and validation set into pickle file.
    f =open(OUTPUTFILEPATH, 'wb')
    pickle.dump(data_dict, f)
    pickle.dump(vocab_size, f)
    f.close()
    print('X_train, X_test, y_train, y_test, vocab saved in: ',OUTPUTFILEPATH )

# You neeed to specify where you want to save your Output dictionary file, in your drive.
OUTPUTFILEPATH = '/content/drive/MyDrive/ML 472/Data/Final_data_dicts/Final_data_dict_k15_readWhole.pickle'
    
#Function 'prepare_training_dataset' takes 5 arguments: k, MOVE_WINDOW, READ_LEN, JUMP, WHOLE_SEQ, and OUTPUTFILENAME
prepare_training_dataset(15,0,0,400,1,OUTPUTFILEPATH)


k-mers are built.
Max_length:  14811
Final Vocabulary size is:  3901291
X_train, X_test, y_train, y_test, vocab saved in:  /content/drive/MyDrive/ML 472/Data/Final_data_dicts/Final_data_dict_k15_readWhole.pickle


In [None]:
import pickle
from google.colab import drive
drive.mount('/content/drive')
OUTPUTFILEPATH = '/content/drive/MyDrive/Final_data_dict_k15_readWhole.pickle'
# This chunk of code loads the data from the saved dicitonary. 
f=open(OUTPUTFILEPATH, 'rb')
data_dict=pickle.load(f)
vocab_sizeA=pickle.load(f)
f.close()

X_train = data_dict['X_train'] 
y_train = data_dict['y_train']
X_test = data_dict['X_test']
y_test = data_dict['y_test']

print('X_train.shape: ', X_train.shape,'\nX_test.shape: ',X_test.shape, '\ny_train.shape: ',y_train.shape,'\ny_test.shape: ',y_test.shape)
print('Vocabulary size: ', vocab_sizeA)
vocab_size = vocab_sizeA

data_dict = {}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
X_train.shape:  (9171, 14811) 
X_test.shape:  (2293, 14811) 
y_train.shape:  (9171,) 
y_test.shape:  (2293,)
Vocabulary size:  3901291


In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_sample_weight
import warnings
warnings.filterwarnings('ignore')
from keras.models import load_model
from numpy.testing import assert_allclose
import matplotlib.pyplot as plt

# Cecking if GPU is available
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# YOU SHOULD CHANGE THIS PATH WHILE YOU ARE RUNNING FOR DIFFERENT K, READ_LENGTH, etc.
MODELWEIGHTSPATH = "/content/drive/MyDrive/weights.best.hdf5"
#----------------------------------------MODEL------------------------------------------------------------------------
Vector_dim=24
model = Sequential()
model.add(Embedding(vocab_size, Vector_dim, input_length=X_train.shape[1])) #dropout = 0.2 #input_length = max_length

model.add(Bidirectional(CuDNNLSTM(40)))
model.add(Dropout(0.2))

model.add(Dense(30,activation='relu'))
model.add(Dense(25,activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
checkpoint = ModelCheckpoint(MODELWEIGHTSPATH, monitor = 'val_accuracy', verbose = 1, 
                             save_best_only = True, mode = 'max')
print(model.summary())
#--------------------------------------------------------------------------------------------------------------------

# Set Epochs and Batch size before running
epochs = 15
batch_size = 64
y_train = to_categorical(y_train,25)
y_test = to_categorical(y_test,25)


history = model.fit(X_train, y_train, epochs=epochs, 
                    batch_size=batch_size,validation_data=(X_test,y_test),
                    shuffle=True,callbacks=[checkpoint])

def increaseEpochs(MODELWEIGHTSPATH, epochs, batch_size):
    modelpath = MODELWEIGHTSPATH
    new_model = load_model(modelpath)
    #assert_allclose(model.predict(X_train),new_model.predict(X_train),1e-5)

    checkpoint2 = ModelCheckpoint(modelpath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint2]
    history2 = new_model.fit(X_train, y_train, epochs=epochs,validation_data=(X_test,y_test),callbacks=[checkpoint2])
    return new_model, history2

def saveModel(model, MODELPATH):
    model_json = model.to_json()
    with open("/content/drive/MyDrive/model.json", "w") as json_file:
        json_file.write(model_json)
    
    print("Saved model to: ", MODELPATH)
    model.save(MODELPATH)

def plotter(history):
    plt.figure(figsize=(20,15))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss', fontsize = 20)
    plt.ylabel('Loss', fontsize = 20)
    plt.xlabel('Epoch', fontsize = 20)
    plt.legend(['Train', 'Validation'], fontsize = 20)
    plt.show()

    plt.figure(figsize=(20,15))
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_acc'])
    plt.title('Model Accuracy', fontsize = 20)
    plt.ylabel('Accuracy', fontsize = 20)
    plt.xlabel('Epoch', fontsize = 20)
    plt.legend(['Train', 'Validation'], fontsize = 20)
    plt.show()

# IF YOU NEED TO INCREASE EPOCHS TO GET BETTER PERFORMANCE, UNCOMMENT THE FOLLOWING LINE and COPY ALL THE FOLLOWING LINES TO A NEW CELL.  
# model = increaseEpochs(MODELWEIGHTSPATH, 30, 64)

# CHANGE THE FOLLOWING LINK BEFORE RUNNING FOR DIFFERENT K, READ_LENGTHS.
MODELPATH = "/content/drive/MyDrive/kmer_model_k15_readWhole.h5"
saveModel(model,MODELPATH)

#PLOTTING
plotter(history)

In [6]:
from keras.models import load_model
from numpy.testing import assert_allclose
def saveModelFromWeights():  
    MODELWEIGHTSPATH = "/content/drive/MyDrive/ML 472/Data/Models/weights.best_k1_readWhole.hdf5"
    new_model = load_model(MODELWEIGHTSPATH)
    model_json = new_model.to_json()
    with open("/content/drive/MyDrive/ML 472/Data/Models/model_k1_readWhole.json", "w") as json_file:
        json_file.write(model_json)
    MODELPATH = "/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k1_readWhole.h5"
    print("Saved model to: ", MODELPATH)
    new_model.save(MODELPATH)
saveModelFromWeights()

Saved model to:  /content/drive/MyDrive/ML 472/Data/Models/kmer_model_k1_readWhole.h5


In [None]:
from keras.models import load_model
from numpy.testing import assert_allclose
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from keras.preprocessing.sequence import pad_sequences
def getKmers(sequence, size):
    return [sequence[x:x+size].upper() for x in range(len(sequence) - size + 1)]
def process_seq(seq,k,MOVE_WINDOW,READ_LEN,JUMP):
    chars = set(seq)
    nucleotide_list = ['A','C','G','T']
    type_seqs = []
    for c in chars:
        if c not in nucleotide_list:
            seq = seq.replace(c,'')
    if MOVE_WINDOW == 0:
        if READ_LEN==0:
            READ_LEN = len(seq)
        NUMBER_OF_READS=(int)(len(seq)/READ_LEN)
        for i in range(NUMBER_OF_READS):
            type_seqs.append(seq[i*READ_LEN:(i+1)*READ_LEN])
    else:
        NUMBER_OF_READS=(int)(len(seq) - READ_LEN + 1)
        for i in range(int(NUMBER_OF_READS/JUMP)):
            type_seqs.append(seq[i*JUMP:i*JUMP+READ_LEN])
    kmer = k
    type_texts = []
    for i in type_seqs:
        type_texts.append(' '.join(getKmers(i, kmer)))
    return type_texts

def predict(seq):
    newnames={'B': 5727, 'C': 2077, '01_AE': 1426, 'A1': 498, '01B': 210, 
                    '02_AG': 168, 'BF1': 143, 'A6': 117, 'A1C': 111, 'G': 96, 'BC': 95, 
                    'A1D': 94, 'AD': 94, 'D': 87, 'F1': 82, 'A1CD': 62, 'CD': 61, 'O': 57,
                    '0107': 57, '01BC': 50, '07_BC': 41, '08_BC': 35, '02A1': 29, 
                    '11_cpx': 25, '35_AD': 22}
    types = list(newnames.keys())
    
    k = [1,21,15]
    MOVE_WINDOW = [1,0,0]
    READ_LEN = [7500,0,1000]
    JUMP=[400,400]
    NUM_MODELS = len(k)
    saved_models = {
            '117500':'/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k_1_sliding.h5',
            '2100':'/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k21_readWhole.h5',
            '1501000':'/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k15_read1000.h5'
    }
    saved_tokenizers = {
            '117500':'/content/drive/MyDrive/ML 472/Data/Models/tokenizer.pickle',
            '2100':'/content/drive/MyDrive/ML 472/Data/Models/tokenizer_k21_readWhole.pickle',
            '1501000':'/content/drive/MyDrive/ML 472/Data/Models/tokenizer_k15_read1000.pickle'
    }
    predictions = []
    for i in range(NUM_MODELS):
        s = str(k[i])+str(MOVE_WINDOW[i])+str(READ_LEN[i])
        
        #Loading the saved model
        MODEL_PATH = saved_models[s]
        
        #Loading saved tokenizer 
        TOKENIZERPATH = saved_tokenizers[s]
        
        # Preprocessing the newly provided seq
        type_texts = process_seq(seq,k[i],MOVE_WINDOW[i],READ_LEN[i],JUMP[i])
        
        with open(TOKENIZERPATH, 'rb') as handle:
            tokenizer = pickle.load(handle)
            max_length = pickle.load(handle)
        tokenizer.fit_on_texts(type_texts)
        encoded_docs = tokenizer.texts_to_sequences(type_texts)
        max_length = max([len(s.split()) for s in type_texts])
        X = pad_sequences(encoded_docs, maxlen = max_length, padding = 'post')
        print(X.shape)
        loaded_model = load_model(MODEL_PATH)
        y_pred = np.array(loaded_model.predict_classes(X))
        counts = np.bincount(y_pred)
        subtype_ = np.argmax(counts)
        print('k:',k[i],'MOVE_WINDOW:',MOVE_WINDOW[i],'READ_LENGTH:',READ_LEN[i],'\nPred_class:',types[subtype_])
        predictions.append(types[subtype_])
    #print(predictions)
    return predictions[0]
#seq_B = 'GGTCTCTCGTTAGACCAGATTTGAGCCTGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGAGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAGCCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAAGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCAGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAAAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAAATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGAAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAACGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTACCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTCAATACCCCTCCTTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACGTTCTATGTAGATGGGGCAGCTAGCAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAATATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATAAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATACTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACAGACAATGGCAGCAATTTCACCAGTACTACGGTTAAGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGCCAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATCTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACATATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCAACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCACTTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCACAACAAAAGCCTTAGGCATCTCCTATGGCAGAAGAAGCGGAGACAGCGACGAAACCTCCTCAAGGCAGTCAGACTCATCAAGTTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATACAAATAGCAATAGCAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGGGGAATGCTACTAATACCAATAGTAGTAATACCAATAGTAGTAGCGGGGAAATGATGATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATACGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTGAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTGAACCAATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTATCCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACATGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGCGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCACGGTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACATTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAAATATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGGTTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATTTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCA'
#seq_C='TGGAAGGGTTAATTTACTCCAAGAAAAGGCAAGAAATCCTTGATTTGTGGGTCTATCACACACAAGGCTTCTTCCCTGATTGGCAAAACTACACACCGGGACCAGGAGTCAGATACCCACTGACTTTTGGGTGGTGCTTCAAGCTGGTACCAGTTGACCCAAGGGAAGTAGAAGAGGCCAACGAAGGAGAAGACAACTGTTTGCTACACCCTGTGTGCCAGCATGGAATGGAGGATGAACACAGAGAAGTATTAAAGTGGAAGTTTGACAGTCAGCTAGCACGCAGACACATGGCCCGCGAGCTACATCCGGAGTTTTACAAAGACTGCTGACACAGAAGGGACTTTCCGCTGGGACTTTCCACTGGGGCGTTCCAGGAGGTGTGGTCTGGGCGGGACTGGGAGTGGTCAACCCTCAGATGCGGCATATAAGCCGCTGCTTTTCGCTTGTACTGGGTCTCTCTAGGTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTATCTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCTGAGCAGTGTGTGCCCGTCTATTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTGGTAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAGACCAGAGAAGATCCTCTAGACGCAGGACTCGGCTTGCTGAAGTGCACTCGGCAAGAGGCGAGAGCGGCGACTGGTGAGTACGCCAATTTTATTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAATATTAAGAGGGGGAAAATTAGATAAATGGGAAAGAATTAGGTTAAGGCCAGGGGGAAAGAAACACTATATGCTAAAACACCTAGTATGGGCAAGCAGGGAGCTGGAAAGATTCGCACTCAACCCTGGCCTTTTAGAGACAGCAGAAGGCTGTAAACAAATAATAAAACAGCTACAACCAGCTCTTCAGACAGGAACAGAGGAACTTAAATCATTACACAACACAGTAGCAACTCTCTATTGTGTACATGCAGGGATAGAAGTACGAGACACCAAAGAAGCCTTAGACAAGATAGAGGAAGAACAAAACAAAATTCAGCAAAAAACACAACAGGCAAAAGAGGCTGACGGGAAGGTCAGTCAAAATTATCCTATAGTGCAGAATCTCCAAGGGCAAATGGTACACCAGGCCATATCACCTAGAACTTTGAATGCATGGGTAAAAGTAATAGAGGAGAAGGCTTTTAGCCCAGAGGTAATACCCATGTTTACAGCATTATCAGAAGGAGCCACCCCACAAGACTTAAACACCATGTTAAATACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGATACCATCAATGAAGAGGCTGCAGAATGGGATAGATTACATCCAATCCATGCAGGGCCTATTGCACCAGGCCAAATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTAGCCTTCAGGAACAAATAGCATGGATGACAGGTAACCCACCTGTTCCAGTGGGAGACATCTATAAAAGATGGATAATTCTGGGGTTAAATAAAATAGTAAGAATGTATAGCCCTGTTAGCATTTTGGACATAAGACAAGGGCCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTTTAAAACTTTAAGAGCTGAACAAGCTACACAAGATGTAAAAAATTGGATGACAGACACCTTGTTGGTCCAAAATGCGAATCCAGATTGTAAGACCATTTTAAGAGCATTAGGACCAGGGGCTTCATTAGAAGAGATGATGACAGCATGTCAGGGAGTGGGAGGACCTGGCCACAAAGCAAGAGTGTTGGCTGAGGCAATGAGCCAAGCAAACAGTACCATACTGATGCAGAGAAGCAATTTTAAAGGCTCTAAAAGAATTGTTAAATGTTTCAACTGTGGCAAGGAGGGGCACATAGCCAAAAATTGCAGGGCCCCTAGGAAAAAAGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGACTGTACTGAGAGGCAGGCTAATTTTTTAGGGAAAATTTGGCCTTCCCACAAGGGGAGGCCAGGGAATTTCCTCCAGAGCAGACCGGAGCCAACAGCCCCACCAGCAGAGAGCTTCAGGTTCGAGGAGACAACCCCAGCTCCAAAGCAGGAGCCGAAAGACAGGGAACCCTTAACTTCCCTCAAATCACTCTTTGGCAGCGACCTCTTGTCTCAATAAGAGTAGGGGGCCAAATAAAAGAGGCTCTCTTAGACACAGGAGCAGATGATACAGTATTAGAAGAAGTAAATTTGCCAGGAAAATGGAAACCAAAAATGATAGGAGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATCAAATACCTATAGAAATTTGTGGAAAAAAGGCTATAGGTACAGTATTAGTGGGACCCACACCTATCAACATAATTGGAAGAAATATGTTGACTCAGCTTGGATGCACACTAAATTTTCCAATCAGTCCCATTGAAACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAGGTTAAACAATGGCCATTGACAGAAGAGAAAATAAAAGCATTAACAGCAATTTGTGATGAAATGGAGAAGGAAGGAAAAATTACAAAAATTGGGCCTGAAAATCCATATAACACTCCAATATTTGCTATAAAAAAGAAGGACAGTATTAAGTGGAGAAAATTAGTAGATTTCAGGGAACTCAATAAAAGAACTCAAGATTTTTGGGAAGTTCAATTAGGAATACCACACCCAGCAGGGTTAAAAAAGAAAAAATCAGTGACAGTACTGGATGTGGGGGATGCATATTTTTCAGTTCCTTTATATGAAGACTTCAGGAAATATACTGCATTCACCATACCTAGTATAAACAATGAAACACCAGGGATTAGGTATCAATATAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAGAGTAGCATGATAAGAATCTTAGAGCCCTTTAGGGCACAAAATCCAGAAATAGTCATCTATCAATATATGGATGACTTGTATGTAGGATCTGACTTAGAAATAGGGCAACATAGAGCAAAAATAGAGGAGTTAAGAGAACATCTGTTAAAGTGGGGATTTACCACACCAGACAAGAAACATCAGAAGGAACCTCCATTTCTTTGGATGGGGTATGAACTCCATCCTGACAAATGGACAGTACAGCCTATACAGCTGCCAGAAAAGGATAGCTGGACTGTCAATGATATACAGAAGTTAGTGGGAAAATTAAACTGGGCAAGTCAAATTTACCCAGGAATTAAAGTAAGGCAACTTTGTAAACTCCTTAGGGGGGCCAAAGCACTAACAGACATAGTACCACTAACTGAAGAAGCAGAATTAGAATTGGCAGAAAACAGGGAAATTCTAAAAGAACCAGTACATGGAGTATATTATGACCCATCAAAAGACTTGATAGCTGAAATACAGAAACAGGGGCAGGACCAATGGACATATCAAATTTACCAAGAACCATTCAAAAATCTGAAAACAGGGAAGTATGCAAAAAGGAGGACTGCCCACACTAATGATGTAAAACAGTTAACAGAGGCTGTGCAGAAAATAGCCATGGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAGATTACCTATCCAAAAAGAAACATGGGAGACATGGTGGACAGACTATTGGCAAGCCACCTGGATTCCTGAGTGGGAATTTGTTAATACCCCTCCCCTAGTAAAATTATGGTACCAGCTGGAGAAAGATCCCATAGCAGGAGTAGAAACTTTCTATGTAGATGGAGCAGCTAATAGGGAAACTAAGTTAGGAAAAGCAGGGTATGTTACTGACAGAGGAAGGCAGAAAATTGTTTCTCTAACTGAAACCACAAATCAGAAGACTGAGTTGCAAGCAATTTATCTAGCTTTGCAAGATTCAGGATCAGAAGTAAACATAGTAACAGATTCACAGTATGCATTAGGGATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTTAACCAAATAATCGAACAGTTAATAAAAAAGGAAAGGGTCTATCTGTCATGGGTACCAGCACATAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTAAGTAGTGGAATCAGGAAAGTGCTATTTCTAGATGGAATAGATAAAGCTCAAGAAGAGCATGAAAAGTATCACAGCAATTGGAGAGCAATGGCCAGTGACTTTAATCTACCACCCGTAGTAGCAAAAGAAATAGTAGCTAGCTGTGATCAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGGATATGGCAATTAGATTGTACACATTTAGAAGGAAAAATCATCCTGGTAGCAGTCCATGTAGCCAGTGGCTACATAGAAGCAGAGGTTATTCCAGCAGAAACAGGACAAGAAACAGCATACTTTATACTAAAATTAGCAGGAAGATGGCCAGTCAAAGTAATACATACAGACAATGGTAGTAATTTCACCAGTGCTGCAGTCAAGGCAGCCTGTTGGTGGGCAGGTATCCAACAGGAATTTGGGATTCCCTACAATCCCCAAAGTCAGGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGGCAGGTAAGAGATCAAGCTGAGCACCTTAAGACAGCAGTACAAATGGCAGTATTCATTCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAATAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTATAAAAATTCAAAATTTTCGGGTTTATTACAGAGACAGCAGAGACCCCATTTGGAAAGGACCAGCCAAACTACTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAGGTAGTACCACGGAGGAAAGCAAAAATCATTAAGGACTATGGAAAACAGATGGCAGGTGCTGATTGTGTGGCAGGTAGACAGGATGAAGATTAGAACATGGAATAGTTTAGTAAAACACCATATGTATGTTTCAAGGAGAGCTAAAGGATGGTTTTACAGACATCATTATGACAGCAGACATCCAAAAGTAAGTTCAGAAGTACACATCCCATTAGGGGAGGCTAGATTAGTAATAAAAACATATTGGGGGTTGCAAACAGGAGAAAGAGACTGGCATTTGGGTCATGGAGTCTCCATAGAATGGAGATTGAGAAGATATAACACACAAATAGAACCTGGCCTGGCAGACCAGCTAATCCATATGCATTATTTTGATTGTTTTGCAGACTCTGCCATAAGGAAAGCCATATTAGGACACATAGTTATTCCTAGGTGTGACTATCAAGCAGGACATAATAAGGTAGGATCTCTACAATACCTGGCACTGACAGCACTGATAAAACCAAAAAAGATAAAGCCACCTCTGCCTAGTATTAAGAAATTAGTAGAGGATAGATGGAACAATCCCCAGAAGATCAGGGGCCGCAGAGGGAACCATACAATGAATGGACACTAGAGCTTCTAGAGGAACTCAAGCAGGAAGCTGTCAGACACTTTCCTAGACCATGGCTTCATGGCTTAGGACAATATGTCTATGAAACATATGGGGATACTTGGACAGGAGTCGAAGCTATAATAAGACTACTGCAACAACTACTGTTTATTCATTTCAGAATTGGGTGCCAGCATAGCAGAATAGGCATTTTGCGACAGAGAAGAGCAAGAAATGGAGCCAGTAGATCCTAACCTAGAGCCCTGGAACCATCCAGGAAGTCAGCCTAAAACTGCTTGCAATCAATGTTATTGTAAACGCTGTAGCTATCATTGTCTAGTTTGCTTTCAGAAAAAAGGCTTAGGCATTTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAGCGCTCCTCCAAGCAGTGAGGATCATCAAAATCTTATATCAAAGCAGTAAGTATCTGTAATGATAGATTTAGATTATAGGTTAGGAGTAGGAGCATTGATAGTAGCACTAATCATAGCAATAGTTGTGTGGACCATAGTATATATAGAATATAGGAAATTGGTAAGACAAAGCAAAATAAACTGGTTAATTAAAAGAATTAGGGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAGGGGGACACTGAGGAATTATCAACAATGGTGGATATGGGGCGTCTTAGGCTTTTGGATGTTAATGATTTGTAATGGGGGAGGAAACTTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAAGAAGCAAAAACCACTCTACTCTGTGCATCAGATGCCAAAGCATATGAGAGGGAAGTGCATAATGTCTGGGCTACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAATAGTTTTGGGAAATGTAACAGAAAATTTTAACATGTGGAAAAATGACATGGTGGATCAGATGCATGAGGATGTAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTGACCCCACTCTGTGTCACTTTAGAATGTAGAAATGTTAGCAGAAATGTTAGCAGTTATAATACCTACAATGGGAGCGTGGAGGAAATAAAAAATTGCTCTTTCAATGCAACCCCAGAAGTAAGAGATAGGAAGCAGAGAATGTATGCTCTCTTTTATGGACTTGATATAGTACCACTTAATAAGAAGAACTCTAGTGAGAACTCCAGTGAGTATAGATTAATAAATTGTAATACCTCAGCCATAACACAAGCCTGTCCAAAGGTCACTTTTGATCCAATTCCTATACACTATTGTGCTCCGGCTGGTTATGCGATTCTAAAGTGTAATAATAAGACATTCAATGGGACAGGACCATGCAATAATGTTAGTACAGTACAATGTACACATGGAATTAAGCCAGTAGTATCAACTCAACTACTGTTAAATGGTAGCCTAGCAGAAGGAGAGATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTGTGTACAAGACCCAATAATAATACAAGAAAAAGTATAAGGATAGGACCAGGACAAACATTCTATGCAACAGGAGACATAATAGGAGACATAAGACAAGCACATTGTAACATTAGTAGAGATAAATGGAATGAAACTTTACAAAGGGTAGGTAAAAAATTAGCAGAACACTTCCATAATAAGACAATAAAATTTGCATCATCCTCAGGAGGGGACCTAGAAATTACAACACATAGCTTTAATTGTAGAGGAGAATTTTTCTATTGTAATACATCAGGCCTGTTTAATGGTACATACATGCCTACATACATGCCTAATGGTACAGAAAGTAATTCAAACTCAACTATCACAATCCCATGCAGAATAAAGCAAATTATAAACATGTGGCAGGAGGTAGGACGAGCAATGTATGCCCCTCCCATTGCAGGAAACATAACATGTACATCAAATATCACAGGACTACTATTGGTACATGATGGAGGAATAAAGGAAAATGATACAGAGAATAAGACAGAGATATTTAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTGGTAGAAATTAAGCCATTGGGAGTAGCACCCACTGCAGCAAAAAGGAGAGTGGTGGAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTGTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCGGCGTCAATAACGCTGACGGCACAGGCCAGACAATTGTTGTCTGGTATAGTGCAACAGCAAAGCAATTTGCTGAGGGCTATAGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATTAAGCAGCTCCAGACAAGAGTCCTGGCTATAGAGAGATACCTAAAGGATCAACAGCTCCTAGGGATTTGGGGCTGCTCTGGAAAACTCATCTGCACTACTGCTGTACCTTGGAACTCCAGTTGGAGTAACAAAACTCAAAGTGAGATTTGGAATAACATGACCTGGATGCAGTGGGATAGAGAAGTTAGTAATTACACAAACATAATATACAGCTTGCTTGAAGAATCGCAAAACCAGCAGGAAAAAAATGAAAAAGATTTATTAGCATTGGACAGTTGGAAAAATCTATGGAGTTGGTTTGACATAACAAATTGGCTGTGGTATATAAAAATATTCATAATGATAGTAGGAGGCTTGATAGGTTTAAGAATAATTTTTGCTGTGCTCTCTATAGTGAATAGAGTTAGGCAGGGATACTCACCTTTGTCGTTTCAGACCCTTACCCCGAACCCAAGGGGACCCGACAGGCTCGGAAGAATCGAAGAAGAAGGTGGAGAGCAAGACAAAGACAGATCCATTCGATTAGTGAACGGATTCTTAGCACTTGCCTGGGACGATCTACGGAACCTGTGCCTCTTCAGCTACCACCGATTGAGAGACTTCATATCGGTGGCAGCGAGAGTGGTGGAACTTCTGGGACGCAGCAGTTGGGAAGCCCTTAAATATCTGGGAAGTCTTGTGCAGTATTGGGGTCTGGAGCTAAAAAAGAGTGCTATTAGTCTGTTTGATAGCATAGCAATAGTAGTAGCTGAAGGAACAGATAGGATTATAGAATTAGTACAAGGATTTTGTAGAGCTATCCGCAACATACCTACAAGAATAAGACAGGGCTTTGAAGCAGCTTTGCAATAAAATGGGGGGCAAGTGGTCAAAATGCAGCATAGTAGGATGGCCTGCTATAAGAGAGAGAATGAGACGAGCTGAGCCAGCAGCAGAAGGAGTAGGAGCAGCGTCTCAAGACTTAGATAAACATGGAGCACTTACAAGCAGCAACACAGACACCACTAATGCTGATTGTGCTTGGCTGAGAGCACAGGAGGAGGAAGGAGAAGTAGGCTTTCCAGTCACACCTCAGGTGCCTTTAAGACCAATGACTTATAAGAGCGCATTTGATCTCAGCTTCTTTTTAAAAGAAAAGGGGGGACTGGAAGGGTTAATTTACTCTAAGAAAAGGCAAGAAATCCTTGATTTGTGGGTCTATCACACACAAGGCTTCTTCCCTGATTGGCAAAACTACACACCGGGACCAGGAGTCAGATACCCACTGACTTTTGGGTGGTGCTTCAAGCTGGTACCAGTTGACCCAAGGGAAGTAGAAGAGGCCAACGAAGGAGAAGACAACTGTTTGCTACACCCTGTGTGCCAGCATGGAATGGAGGATGAACACAGAGAAGTATTAAAGTGGAAGTTTGACAGTCAGCTAGCACGCAGACACATGGCCCGCGAGCTACATCCGGAGTTTTACAAAGACTGCTGACACAGAAGGGACTTTCCGCTGGGACTTTCCACTGGGGCGTTCCAGGAGGTGTGGTCTGGGCGGGACTGGGAGTGGTCAACCCTCAGATGCGGCATATAAGCCGCTGCTTTTCGCTTGTACTGGGTCTCTCTAGGTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTATCTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCTGAGCAGTGTGTGCCCGTCTATTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGT'
seq_A1CD='TTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCAGCGAACGGTGAGTACGCAAAAAATTTTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAATATTAAGTGGGGGAAAATTAGATGCATGGGAGAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAGATTGAAACATCTAGTATGGGCAAGCAGGGAGCTGGACAGATTTGCACTTAACCCTAGCCTTTTAGAAACAACAGAAGGGTGTCAACAAATAATGGACCAGTTACAACCAGCTCTCAAGACAGGAACAGAAGAACTTAGATCATTATATAACACAGTAGCAACCCTCTGGTGCGTACATAAACGGATAGATGTAAAAGACACCAAGGAAGCTCTAGATAAAATAGAGGAAATACAAAAGAAAAGCAAGCAAAAGGCCCAACAGGCAGCAGCTGACACAGGAAATAGCAGCAATGTCAGCCAGAATTACCCTATAGTGCAAAATGCACAAGGGCAAATGGTACACCAGTCCTTGTCACCTAGGACTTTGAATGCATGGGTGAAAGTAATAGAAGAAAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAATATGATGCTGAACATAGTAGGGGGACACCAGGCAGCTATGCAAATGTTAAAAGATACCATCAATGAGGAAGCTGCAGAATGGGACAGGATACATCCAGTACATGCAGGGCTTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGATATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGCATGGATGACAAGCAATCCACCTATCCCAGTAGGAGACATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTGAGAATGTATAGCCCTGTTAGCATCTTGGATATAAGACAAGGGCCAAAAGAACCCTTCAGAGACTATGTAGATAGGTTCTTTAAAACTCTCAGAGCTGAACAAGCTACACAGGAAGTAAAAAATTGGATGACAGAGACCTTGTTAGTCCAAAATGCGAACCCAGATTGTAAAACTATCTTAAAAGCATTGGGACCAGGGGCTACATTAGAAGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGTCATAAAGCAAGAGTTTTGGCTGAGGCAATGAGCCAAGCAAATGCAAATACTGCTATAATGATGCAGAGAGGCAATTTTAAGGGTCCAAAGAAAATCATTAAGTGTTTCAACTGTGGCAAAGAAGGACACATAGCAAAAAATTGCAGGGCTCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAGGGAAGGACACCAGATGAAAGATTGCACTGAAAGACAGGCTAATTTTTTAGGGAAGATATGGCCTTCCCACAAGGGAAGGCCAGGGAATTTCCTTCAGAGCAGACCAGAACCAACAGCCCCACCAGCAGAGAGCTTCGGGTTTGGAGAAGAGATAACCCCCTCCCAGAAGCAGGAGCAGAAAGACAAGGAACTGTATCCTTTAGCCTCCCTCAAATCACTCTTTGGCAACGACCCCTAGTCAAAGTAAAGATAGGGGGACAGCTAAAAGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATAAATTTGCCAGGAAAATGGAAACCAAAAATGATAGGGGGAATTGGAGGCTTTATCAAAGTAAGACAGTATGATCAAATACTCGTAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATTTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCAATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGGATGGATGGCCCAAGAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAATAGAAATTTGTACAGAGATGGAAAAGGAAGGAAAAATTTCAAGAATTGGGCCTGAAAATCCATACAATACTCCAATATTTGCTATAAAGAAAAAAGACAGTACTAAGTGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCGCATCCAGCGGGCTTGAAAAAGAAAAAATCAGTAACAATACTAGATGTGGGGGACGCATATTTTTCAGTCCCCTTAGATGAAAGCTTTAGAAAGTATACTGCATTCACCATACCTAGTACAAACAATGAGACACCAGGAATCAGGTATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCGGCAATATTTCAGAGTAGCATGACAAAAATCTTAGAGCCCTTTAGATCAAAAAATCCAGACATGATTATCTATCAATACATGGATGACTTGTATGTAGGATCTGATTTAGAAATAGGACAGCATAGAACAAAAATAGAGGAGTTAAGAGCTCATCTATTGAGCTGGGGATTTACTACACCAGACAAAAAGCATCAGAAAGAACCCCCATTTCTGTGGATGGGATATGAACTCCATCCTGACAAGTGGACAGTCCAATCTATAAAACTGCCAGAAAAAGAAAGCTGGACTGTCAATGATATACAGAAATTAGTGGGGAAATTAAATTGGGCAAGCCAAATTTATCCAGGAATTAAAGTAAAACAGTTGTGTAAACTCCTTAGGGGAGCCAAAGCACTAACAGATGTAGTAACATTGACTGAGGAAGCAGAATTAGAATTGGCAGAGAACAGGGAGATTCTAAAAGACCCTGTGCATGGGGTATATTATGACCCATCAAAGGACTTAATAGCAGAAATACAGAAACAAGGGCAAGAACAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTAAAAACAGGGAAGTATGCAAAAAAGAGGTCTGCTCACACTAATGATGTAAAACAATTAGCAGAAGTGGTGCAAAAAGTGGTCATGGAAAGCATAGTAATATGGGGAAAGGCTCCTAAATTTAAATTACCCATACAAAAAGAAACATGGGAAACATGGTGGATGGACTATTGGCAGGCCACCTGGATTCCTGAATGGGAATTTGTCAATACCCCTCCTCTAGTAAAATTATGGTACCAGTTAGAGAAAGACCCCATAATAGGAGCAGAGACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAGCTAGGAAAAGCAGGGTATGTCACTGACAGAGGAAGACAAAAGGTTGTTTCCCTAACTGAGACAACAAATCAAAAGACTGAACTACATGCAATCTATCTAGCCTTGCAGGATTCAGGATCAGAAGTAAACATAGTAACAGACTCACAGTATGCATTAGGAATCATTCAGGCACAACCAGACAGGAGTGAATCAGAGTTAGTCAATCAAATAATAGAGAAGCTAATAGGAAAGGACAAAGTCTACCTGTCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTTCTGGAATCAGGAAAGTGCTATTTTTGGATGGGATAGATAAAGCTCAAGAAGAACATGAAAGGTATCACAGCAATTGGAGAGCAATGGCTAGTGACTTTAATCTGCCACCTGTAATAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGATAAAAGGGGAAGCCATGCATGGACAAGTAGACTGCAGTCCAGGGATATGGCAATTAGATTGCACGCATTTAGAAGGAAAAGTAATTCTGGTAGCAGTCCATGTAGCCAGTGGCTATATAGAAGCAGAAGTTATCCCAGCAGAAACAGGACAGGAGACAGCATACTTTCTACTAAAATTAGCAGGAAGATGGCCAGTAAAAGTAGTACACACAGACAATGGCAGCAATTTCACCAGTGCTGCATTTAAAGCAGCCTGTTGGTGGGCAAGTGTCCAACAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTGGAATCTATGAATAAGGAATTAAAGAAAATCATAGGGCAGGTAAGAGAGCAAGCTGAACACCTTAAGACAGCAGTACAAATGGCAGTATTCATTCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGTAGGGGAAAGAATAATAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAAATTCCGGGTTTATTACAGGGACAGCAGAAATCCAATTTGGAAAGGACCAGCAAAACTACTCTGGAAAGGTGAAGGGGCAGTGGTAATACAGGACAATAGTGATATAAAGGTAGTACCAAGAAGAAAGGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAGGTAGACAGGATGAGGATTGGAACATGAAATAGTCTAGTAAAACATCATATGTATGTCTCAAAGAAAGCTAGAGGTTGGTTTTATAGACATCACTATSAAACCAGGCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAAATAGTAGTAAGAACATATTGGGGTCTACACACAGGAGAAAAAGACTGGCACCTGGGTCATGGGGTCTCCATAGAATGGAGGCTAAGAAAGTATAGCACACAAATAGATCCTGACCCGGCAGACCAACTAATTCACCTGCATTATTTTGACTGTTTTTCAGACTCTGCCATAAGGAAAGCCATATTAGGGCAAGTAGTTAGCCCTAGGTGTGACTATACAGCAGGACATAACAAGGTAGGATCTCTACAATATTTAGCACTGAAAGCATTAGTAACACCAACAAGAGTAAAGCCACCTTTGCCTAGTGTTAGGAAATTAGCAGAGGATAGATGGAGCAAGTCCCAAAAGACCAGGGGCCTCAGAGGGAGCCTTACAATGAATGGATGTTAGATCTGCTAGAAGATCTTAAGCATGAAGCTGTCAGACATTTTCCTAGGCCATGGCTTCATGGATTAGGACAACATATCTATAGCACATATGGGGATACTTGGGAAGGAGTTGAAGCTATAATAAGAATTTTGCAGCAACTACTGTTTGTTCATTTCAGAATCGGGTGCCAACACAGCAGAATAGGCATTATTCGAGGGAGGAGAAGAGTCAGGAATGGATCTAGTAGATCCTAACCTAGAGCCCTGGAATCATCCGGGAAGTCAGCCTACAACTCCTTGTAGCAAGTGTTACTGTAAAAAGTGTTGCTATCATTGCCAGCATTGCTTCATAACGAAAGGCTTAGGCATCTCATATGGCAGGAAGAAGCGGAGACAGCGACGAGGACCTCCTCAGAGCAATAAGGATCATCAAAATCCTGTACAAAAGCAGTAAGTATTAGTAATTAATATATGTAATGCAACCTTTAGAAATCTGTTCAATAGTAGGGCTGATAGTAGCCATAATCCTAGCAATAGTTGCGTGGACTATAGTAGGCATAGAAATTAAGAAATTGCTAAGGCAAAAGAAAATAGACAGGTTAATTGAGAGAATAAGAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGATGGGGATACAGAGGAATTGGCAGCACTTGTTGAGATGGGGAACTATGATCCTGGGGATGATATTAATCTGTAGTGCTGTAGATAAATTGTGGGTTACTGTCTATTATGGGGTACCTGTGTGGAAAGATGCAGAGACCACCCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAAGTGCATAATGTCTGGGCTACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTACTTTTGGGAAATGTGACAGAAGATTTTAACATGTGGAAAAATAACATGGTAGAACAGATGCATACAGATATAATCAGTCTATGGGACCAAAGCCTACAGCCATGTGTAAAGTTAACCCCTCTCTGCGTTACTTTAAATTGTACCAATGTCACTATCACTACCAATGCCACTGACAGTAACAATGCCAGTCTCCAAGACATGGCAAAAGAAATGACAAACTGCTCTTTCAATATGACCACAGAACTAAGGGATAAGAAACAAAGAGTATATTCACTTTTTTATAAACTTGATGTAGTACAAATTAACAGCAATCAAAATAACAGCAGTCAGTATAGATTAATAAATTGTAATACCTCAGCCATTACACAAGCTTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCAGCTGGATTTGCAATTCTAAAATGTAATAATAAGGAGTTCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTACAGTGTACACATGGGATTAAGCCAGTAGTGTCAACTCAATTGTTGTTGAATGGCAGTCTAGCAGAAGAAGAGATAATAGTTAGATCTGAAAATCTCACAAATAATGCTAAAATCATAATAATACAGCTTAATGAGACTGTAAAAATTAATTGTACCAGACCTAACAACAATACAAGGAACAGTATACGTATAGGACCAGGACAAGCATTCTATGCAACAGGTGCCATAACAGGGGATATAAGACAAGCACATTGTAATGTCAGTAGATCAGAATGGAATAAAACTTTACAACAGGTAGCTAAAAAATTAGGAGACCCTCTTAACAAGACAGAAATAATTTTTAAACCACCCTCAGGAGGGGATTTAGAAATTACAACACATAGTTTTAATTGTGGAGGAGAATTTTTCTATTGTAATACATCAGGCCTGTTTAATAGCACTTGGGTAAATGGCAGCAGGGAATCAAATAGCACAGATAATGATACTATAACTCTCCCGTGTAGAATAAAGCAAATTATAAATATGTGGCAGAGAGTAGGACAAGCAATGTATGCCCTTCCCATCCGAGGAGTAATAAGGTGTGAATCAAACATTACAGGATTAATATTAACAAGAGATGGTGGGAATAATACCAGTACAAATGAAACCATCAGACCTGCAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCACTAGGAGTAGCACCCACCAAGGCAAGGAGAAGAGTGGTGGAGAGAGAAAAGAGAGCAGTTGGAATAGGAGCTGTGTTCCTTGGGTTCTTAGGAGCAGCAGGAAGCGCTATGGGCGCAGCGGCAGCAACGCTGACGGTACAGGCCAGGCAATTATTGTCTGGCATAGTGCAACAGCAAAGCAATTTGCTGAAGGCTATAGAGGCTCAACAGCATCTGTTGAAACTCACGGTCTGGGGCATTAAACAGCTCCAGGCAAGAGTCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTAGGAATTTGGGGCTGCTCTGGAAAACTCATCTGCGCCACTAATGTGCCCTGGAACTCTAGTTGGAGTAATAAATCACAGGCAGAAATATGGCAGAATATGACCTGGCTGCAATGGGATAAAGAAATTGACAATTACACACAAATAATATATATGCTGCTTGAAGAACCACAAAACCAGCAGGAAAAAAATGAACAAGACTTATTGGCATTGGACAAGTGGGGAAGTTTGTGGAATTGGTTTGAGATATCAAAATGGCTGTGGTATATAAGAATATTTATAATGATAGTAGGAGGCTTAATAGGATTAAGAATAGTTTTTGCTGTGCTTTCTGTAATAAATAGAGTTAGGCAGGGATACTCACCTCTATCGTTTCAGACCCATACCCCAAACCCAGAGGGAGTCGACAGGCCCGGAAGAATCGAAGAAGAAGGTGGAGAGCAAGGCAGAGACAGATCGATTCGATTAGTCAGCGGATTCTTAGCACTTGCCTGGGACGATCTGAGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTAATCTTGATTGCTGCGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCAATCAAATATCTGTGGAATCTCCTGCAGTATTGGATTCAGGAACTAAAGAATAGTGCTATTAACTTGTTTAATACCATAGCAATAGCAGTAGCTGAGGGAACAGATAGGGTTATAGAAATAGGACAAAGAATTGGTAGGGCTATCCTCAACACACCTAGAAGAATAAGACAGGGCTTGGAAAGGGCTTTGCTATAAAATGGGTGGCAAATGGTTAAAAAGTAGTATAGTAGGATGGCCTGCTGTAAGAGAAAGAATAAGACGAACTGAGCCAGCAGCAGAGGGAGTAGGAGCAGCGTCTCAAGACTTAGATAAATATGGGGCACTGACAAGCAGCAACACAGTCACCAATAATCCTGATTGTGCCTGGCTGGAAGCGCAAAAGGAGGAAGAGGAGGTAGGCTTTCCAGTCAGACCACAAGTACCTTTAAGACCAATGACTTATAAGGCAGCAGTCGATCTCAGCTTCTTTTTAAAAGAAAAGGGGGGACCGGAAGGGTTAATTTACTCTAAGAAAAGGCAAGACATCCTTGATTTGTGGGTCTATAACACACAAGGCTTCTTCCCTGATTGGCAAAACTACACACCAGGACCAGGGACCAGATATCCCCTGACCTTCGGATGGTGCTTCAAGCTAGTGCCAGTTGACCCAAGGGAAGTAGAAGAGGCCAATGAAGGAGAGAACAACTGCTTGCTACACCCTATGAGTCAGCATGGAATAGAGGATGAAGACAAAGAAGTATTAAGGTGGAAGTTTGACAGTCAGCTAGCACGCAGACACATGGCCCGCGAGATGCATCCGGAGTATTACAAAGACTGCTGACACAGGAGTTGCAAAGACTGCTAACACAGGAGTTGCTGACAGGGACTTTCTGCAAGGGACTTTCCAGGGGAGGTGTGGTTTGGGCGGAGTTGGGGAGTGGCTAACCCTCAGATGCTGCATATAAGCAGCTGCTTTTCGCTTGTACTGGGTCTCTCTTGTTAGACCAGATCGAGCCTGGGAGCTCTCTGGCTAGCTAGGGAACCCACTGCTTAA'
predict(seq_A1CD)

(3, 7500)
k: 1 MOVE_WINDOW: 1 READ_LENGTH: 7500 
Pred_class: A1
(1, 8934)
k: 21 MOVE_WINDOW: 0 READ_LENGTH: 0 
Pred_class: BF1


'A1'

In [None]:
pip install anvil-uplink


Collecting anvil-uplink
[?25l  Downloading https://files.pythonhosted.org/packages/a1/e7/4eb5859dd68eab5baf07e91e38eebd0fa7fa87aef4f2ebc5ca00490c8bbc/anvil_uplink-0.3.34-py2.py3-none-any.whl (58kB)
[K     |████████████████████████████████| 61kB 5.4MB/s 
Collecting ws4py
[?25l  Downloading https://files.pythonhosted.org/packages/53/20/4019a739b2eefe9282d3822ef6a225250af964b117356971bd55e274193c/ws4py-0.5.1.tar.gz (51kB)
[K     |████████████████████████████████| 61kB 7.8MB/s 
[?25hCollecting argparse
  Downloading https://files.pythonhosted.org/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25l[?25hdone
  Created wheel for ws4py: filename=ws4py-0.5.1-cp36-none-any.whl size=45216 sha256=1e0ff7b4012df1b9fa8a3d1a05e18fb675cbad5a6069745f4774d07f531b19c7
  Stored in directory: /root/.cache/pip/wheels/a2/6e/4e/8b0ae12fb9b8a05715256952

In [None]:

import anvil.server

anvil.server.connect("GSJPSCRUUB7KWMF5ANAVGKXB-MM7YG5X64D2CE5HT")
newnames={'B': 5727, 'C': 2077, '01_AE': 1426, 'A1': 498, '01B': 210, 
                    '02_AG': 168, 'BF1': 143, 'A6': 117, 'A1C': 111, 'G': 96, 'BC': 95, 
                    'A1D': 94, 'AD': 94, 'D': 87, 'F1': 82, 'A1CD': 62, 'CD': 61, 'O': 57,
                    '0107': 57, '01BC': 50, '07_BC': 41, '08_BC': 35, '02A1': 29, 
                    '11_cpx': 25, '35_AD': 22}

key_list=list(newnames)
##will need to get the subtype label.. later

@anvil.server.callable
##define the predict function above. under this @ line.

def predict(seq):
    newnames={'B': 5727, 'C': 2077, '01_AE': 1426, 'A1': 498, '01B': 210, 
                    '02_AG': 168, 'BF1': 143, 'A6': 117, 'A1C': 111, 'G': 96, 'BC': 95, 
                    'A1D': 94, 'AD': 94, 'D': 87, 'F1': 82, 'A1CD': 62, 'CD': 61, 'O': 57,
                    '0107': 57, '01BC': 50, '07_BC': 41, '08_BC': 35, '02A1': 29, 
                    '11_cpx': 25, '35_AD': 22}
    types = list(newnames.keys())
    k = [1,21,15]
    MOVE_WINDOW = [1,0,0]
    READ_LEN = [7500,0,1000]
    JUMP=[400,400,400]
    NUM_MODELS = len(k)
    saved_models = {
            '117500':'/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k_1_sliding.h5',
            '2100':'/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k21_readWhole.h5',
            '1501000':'/content/drive/MyDrive/ML 472/Data/Models/kmer_model_k15_read1000.h5'
    }
    saved_tokenizers = {
            '117500':'/content/drive/MyDrive/ML 472/Data/Models/tokenizer.pickle',
            '2100':'/content/drive/MyDrive/ML 472/Data/Models/tokenizer_k21_readWhole.pickle',
            '1501000':'/content/drive/MyDrive/ML 472/Data/Models/tokenizer_k15_read1000.pickle'
    }
    predictions = []
    str_pred = ''
    for i in range(NUM_MODELS):
        s = str(k[i])+str(MOVE_WINDOW[i])+str(READ_LEN[i])
        
        #Loading the saved model
        MODEL_PATH = saved_models[s]
        
        #Loading saved tokenizer 
        TOKENIZERPATH = saved_tokenizers[s]
        
        # Preprocessing the newly provided seq
        type_texts = process_seq(seq,k[i],MOVE_WINDOW[i],READ_LEN[i],JUMP[i])
        
        with open(TOKENIZERPATH, 'rb') as handle:
            tokenizer = pickle.load(handle)
            max_length = pickle.load(handle)
        tokenizer.fit_on_texts(type_texts)
        encoded_docs = tokenizer.texts_to_sequences(type_texts)
        max_length = max([len(s.split()) for s in type_texts])
        X = pad_sequences(encoded_docs, maxlen = max_length, padding = 'post')
        print(X.shape)
        loaded_model = load_model(MODEL_PATH)
        y_pred = np.array(loaded_model.predict_classes(X))
        counts = np.bincount(y_pred)
        subtype_ = np.argmax(counts)
        print('k:',k[i],'MOVE_WINDOW:',MOVE_WINDOW[i],'READ_LENGTH:',READ_LEN[i],'\nPred_class:',types[subtype_])
        predictions.append(types[subtype_])
        str_pred+='k:'+str(k[i])+' MOVE_WINDOW:'+str(MOVE_WINDOW[i])+' READ_LENGTH:'+str(READ_LEN[i])+'\nPred_class:'+types[subtype_]+'\n'
    print(predictions)
    return str_pred



In [None]:
anvil.server.wait_forever()

(4, 7500)
k: 1 MOVE_WINDOW: 1 READ_LENGTH: 7500 
Pred_class: B
(1, 9166)
