In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class','id','Sequence']
df = pd.read_csv(url, names=names)

In [3]:
df.head()

Unnamed: 0,Class,id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [4]:
classes = df.loc[:,'Class']

In [5]:
sequences = list(df.loc[:, 'Sequence'])
dataset = {}

for i,seq in enumerate(sequences):
    
    nucleotides = list(seq)
    nucleotides = [x for x in nucleotides if x!='\t']
    
    nucleotides.append(classes[i])
    
    dataset[i] = nucleotides
    

In [6]:
df = pd.DataFrame(dataset)

In [7]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,t,t,g,a,t,a,c,t,c,t,...,c,c,t,a,g,c,g,c,c,t
1,a,g,t,a,c,g,a,t,g,t,...,c,g,a,g,a,c,t,g,t,a
2,c,c,a,t,g,g,g,t,a,t,...,g,c,t,a,g,t,a,c,c,a
3,t,t,c,t,a,g,g,c,c,t,...,a,t,g,g,a,c,t,g,g,c
4,a,a,t,g,t,g,g,t,t,a,...,g,a,a,g,g,a,t,a,t,a


In [8]:
df = df.transpose()

In [9]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [10]:
df.rename(columns = {57: 'Class'}, inplace = True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [11]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,+
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [12]:
num_df = pd.get_dummies(df)
num_df.head()

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class_+,Class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [13]:
df = num_df.drop(columns = ['Class_-'])
df.rename(columns = {'Class_+' : 'Class'},inplace = True)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report , accuracy_score

In [19]:
scoring = 'accuracy'

from sklearn.model_selection import train_test_split,KFold,cross_val_score

X = np.array(df.drop(['Class'], 1))
y = np.array(df['Class'])

X_train, X_test , y_train , y_test = train_test_split(X,y)

In [20]:
model = KNeighborsClassifier(n_neighbors = 2)
kfold = KFold(n_splits = 10)
cv_results = cross_val_score(model, X_train , y_train, cv = kfold , scoring = scoring)
print(cv_results.mean())

0.7821428571428571


In [21]:
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.8888888888888888
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        11
           1       0.93      0.88      0.90        16

    accuracy                           0.89        27
   macro avg       0.88      0.89      0.89        27
weighted avg       0.89      0.89      0.89        27



In [22]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
model = DecisionTreeClassifier(max_depth = 7)
kfold = KFold(n_splits = 10)
cv_results = cross_val_score(model, X_train , y_train, cv = kfold , scoring = scoring)
print(cv_results.mean())

0.7071428571428571


In [25]:
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.8148148148148148
              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           1       0.82      0.88      0.85        16

    accuracy                           0.81        27
   macro avg       0.81      0.80      0.81        27
weighted avg       0.81      0.81      0.81        27



In [26]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
model = RandomForestClassifier(n_estimators = 1000,max_depth=7, random_state=0)
kfold = KFold(n_splits = 10)
cv_results = cross_val_score(model, X_train , y_train, cv = kfold , scoring = scoring)
print(cv_results.mean())

0.8982142857142857


In [29]:
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.9259259259259259
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        11
           1       1.00      0.88      0.93        16

    accuracy                           0.93        27
   macro avg       0.92      0.94      0.93        27
weighted avg       0.94      0.93      0.93        27



In [30]:
model = SVC(kernel = 'linear')
kfold = KFold(n_splits = 10)
cv_results = cross_val_score(model, X_train , y_train, cv = kfold , scoring = scoring)
print(cv_results.mean())

0.9375


In [31]:
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.8888888888888888
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        11
           1       1.00      0.81      0.90        16

    accuracy                           0.89        27
   macro avg       0.89      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27



In [47]:
model = SVC(kernel = 'sigmoid', C = 1000.0)
kfold = KFold(n_splits = 10)
cv_results = cross_val_score(model, X_train , y_train, cv = kfold , scoring = scoring)
print(cv_results.mean())

0.9375




In [48]:
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.8888888888888888
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        11
           1       1.00      0.81      0.90        16

    accuracy                           0.89        27
   macro avg       0.89      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27



