In [103]:
# Import, change module names
import numpy as np
import pandas as pd

# import the uci Molecular Biology (Promoter Gene Sequences) Data Set
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, names = names)

In [104]:
print (data)

    Class         id                                           Sequence
0       +        S10  \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1       +       AMPC  \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2       +       AROH  \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3       +      DEOP2  \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4       +  LEU1_TRNA  \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
5       +     MALEFG  \taggggcaaggaggatggaaagaggttgccgtataaagaaactag...
6       +       MALK  \t\tcagggggtggaggatttaagccatctcctgatgacgcatagt...
7       +       RECA  \t\ttttctacaaaacacttgatactgtatgagcatacagtataat...
8       +       RPOB  \t\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatc...
9       +   RRNAB_P1  \tttttaaatttcctcttgtcaggccggaataactccctataatgc...
10      +   RRNAB_P2  \tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgc...
11      +  RRNDEX_P2  \tcctgaaattcagggttgactctgaaagaggaaagcgtaatatac...
12      +    RRND_P1  \tgatcaaaaaaatacttgtgcaaaaaattgggatccctata

In [105]:
# Building our Dataset by creating a custom Pandas DataFrame
# Each column in a DataFrame is called a Series. Lets start by making a series for each column.

classes = data.loc[:, 'Class']
print(classes[:5])

0    +
1    +
2    +
3    +
4    +
Name: Class, dtype: object


In [106]:
# generate list of DNA sequences
sequences = list(data.loc[:, 'Sequence'])
dataset = {}

# loop through sequences and split into individual nucleotides
for i, seq in enumerate(sequences):
    
    # split into nucleotides, remove tab characters
    nucleotides = list(seq)
    nucleotides = [x for x in nucleotides if x != '\t']
    
    # append class assignment
    nucleotides.append(classes[i])
    
    # add to dataset
    dataset[i] = nucleotides
    
print(dataset[0])

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [107]:
# turn dataset into pandas DataFrame
dframe = pd.DataFrame(dataset)
print(dframe)

   0   1   2   3   4   5   6   7   8   9   ... 96  97  98  99  100 101 102  \
0    t   t   g   a   t   a   c   t   c   t ...   c   c   t   a   g   c   g   
1    a   g   t   a   c   g   a   t   g   t ...   c   g   a   g   a   c   t   
2    c   c   a   t   g   g   g   t   a   t ...   g   c   t   a   g   t   a   
3    t   t   c   t   a   g   g   c   c   t ...   a   t   g   g   a   c   t   
4    a   a   t   g   t   g   g   t   t   a ...   g   a   a   g   g   a   t   
5    g   t   a   t   a   c   g   a   t   a ...   t   g   c   g   c   a   c   
6    c   c   g   g   a   a   g   c   a   a ...   a   g   c   t   a   t   t   
7    a   c   a   a   t   a   t   a   a   t ...   g   a   g   g   t   g   c   
8    a   t   g   t   t   g   g   a   t   t ...   a   c   a   t   g   g   a   
9    t   g   a   g   a   g   g   a   a   t ...   c   t   a   a   t   c   a   
10   a   a   a   t   a   a   a   a   t   c ...   c   t   c   c   c   c   c   
11   c   c   c   g   c   g   g   c   a   c ...   c   t   g   t  

In [108]:
# switch coumns and rows because data in not in the correct orientation 
df = dframe.transpose()
df.rename(columns = {57: 'Class'}, inplace = True)
print(df)

     0  1  2  3  4  5  6  7  8  9  ...  48 49 50 51 52 53 54 55 56 Class
0    t  a  c  t  a  g  c  a  a  t  ...   g  c  t  t  g  t  c  g  t     +
1    t  g  c  t  a  t  c  c  t  g  ...   c  a  t  c  g  c  c  a  a     +
2    g  t  a  c  t  a  g  a  g  a  ...   c  a  c  c  c  g  g  c  g     +
3    a  a  t  t  g  t  g  a  t  g  ...   a  a  c  a  a  a  c  t  c     +
4    t  c  g  a  t  a  a  t  t  a  ...   c  c  g  t  g  g  t  a  g     +
5    a  g  g  g  g  c  a  a  g  g  ...   c  g  t  t  t  a  g  g  t     +
6    c  a  g  g  g  g  g  t  g  g  ...   a  t  c  a  t  g  a  a  t     +
7    t  t  t  c  t  a  c  a  a  a  ...   a  a  c  a  g  a  a  c  a     +
8    c  g  a  c  t  t  a  a  t  a  ...   a  a  a  t  g  g  t  t  t     +
9    t  t  t  t  a  a  a  t  t  t  ...   c  c  a  c  t  g  a  c  a     +
10   g  c  a  a  a  a  a  t  a  a  ...   c  c  c  g  c  g  c  c  g     +
11   c  c  t  g  a  a  a  t  t  c  ...   c  c  t  c  g  c  g  a  c     +
12   g  a  t  c  a  a  a  a  a  a  ...   c  c  g  t

In [109]:
# look into the data
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,+
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [110]:
# change the data to numerical so that machine learning algorithms caan be applied
numerical_df = pd.get_dummies(df)
print(numerical_df)

     0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c   ...     55_a  55_c  \
0      0    0    0    1    1    0    0    0    0    1   ...        0     0   
1      0    0    0    1    0    0    1    0    0    1   ...        1     0   
2      0    0    1    0    0    0    0    1    1    0   ...        0     1   
3      1    0    0    0    1    0    0    0    0    0   ...        0     0   
4      0    0    0    1    0    1    0    0    0    0   ...        1     0   
5      1    0    0    0    0    0    1    0    0    0   ...        0     0   
6      0    1    0    0    1    0    0    0    0    0   ...        1     0   
7      0    0    0    1    0    0    0    1    0    0   ...        0     1   
8      0    1    0    0    0    0    1    0    1    0   ...        0     0   
9      0    0    0    1    0    0    0    1    0    0   ...        0     1   
10     0    0    1    0    0    1    0    0    1    0   ...        0     1   
11     0    1    0    0    0    1    0    0    0    0   ...     

In [111]:
# drop one class (-) 
df = numerical_df.drop(columns=['Class_-'])
df.rename(columns = {'Class_+': 'Class'}, inplace = True)
print(df)

     0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...    54_t  55_a  \
0      0    0    0    1    1    0    0    0    0    1  ...       0     0   
1      0    0    0    1    0    0    1    0    0    1  ...       0     1   
2      0    0    1    0    0    0    0    1    1    0  ...       0     0   
3      1    0    0    0    1    0    0    0    0    0  ...       0     0   
4      0    0    0    1    0    1    0    0    0    0  ...       1     1   
5      1    0    0    0    0    0    1    0    0    0  ...       0     0   
6      0    1    0    0    1    0    0    0    0    0  ...       0     1   
7      0    0    0    1    0    0    0    1    0    0  ...       0     0   
8      0    1    0    0    0    0    1    0    1    0  ...       1     0   
9      0    0    0    1    0    0    0    1    0    0  ...       0     0   
10     0    0    1    0    0    1    0    0    1    0  ...       0     0   
11     0    1    0    0    0    1    0    0    0    0  ...       0     1   
12     0    

In [112]:
#separate training and testing datasets
from sklearn import model_selection

# Create X and Y datasets
X = np.array(df.drop(['Class'], 1))
y = np.array(df['Class'])

seed = 1

# split data into training and testing datasets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=seed)

In [142]:
# Building the model using 2 algorithms Ada boost and also Neural Net

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

# define scoring method
scoring = 'accuracy'

# Define models 
names = ["Neural Net","SVM Linear", "AdaBoost"]

classifiers = [
     MLPClassifier(alpha=1,batch_size=30),
     SVC(kernel = 'linear',), 
     AdaBoostClassifier(),
]

models = zip(names, classifiers)

# evaluate each model
results = []
names = []

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state = seed)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    print('--------------------------------------------------')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print('--------------------------------------------------')

Neural Net: 0.900000 (0.075000)
--------------------------------------------------
Neural Net
0.8888888888888888
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        17
          1       0.77      1.00      0.87        10

avg / total       0.91      0.89      0.89        27

--------------------------------------------------
SVM Linear: 0.850000 (0.108972)
--------------------------------------------------
SVM Linear
0.9629629629629629
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        17
          1       0.91      1.00      0.95        10

avg / total       0.97      0.96      0.96        27

--------------------------------------------------
AdaBoost: 0.925000 (0.114564)
--------------------------------------------------
AdaBoost
0.8518518518518519
             precision    recall  f1-score   support

          0       1.00      0.76      0.87        17
          1       0.71    