https://github.com/sajit9285/DNA-Classification-Project/blob/master/DNA%20Classification%20Code.ipynb \
https://www.kaggle.com/thomasnelson/working-with-dna-sequence-data-for-ml \
https://github.com/SunHaozhe/Kernel-method-DNA-sequence \

In [None]:
!cat /proc/meminfo | grep Mem

MemTotal:       13333552 kB
MemFree:        10712800 kB
MemAvailable:   12380900 kB


In [None]:
import os
import random

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype 
#Importing different classifier from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Base directory, under the Google Drive mount point, that holds the challenge CSV files;
# merge_X_y resolves every file name it is given against this directory.
orig_path = "/content/drive/MyDrive/Kernel_Challenge/kernel_data/"

# Load & Merge Data

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the files under orig_path are readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def merge_X_y(path_X, path_Y, base_path=None):
  """Load a feature CSV and a label CSV and join them into one frame.

  Parameters
  ----------
  path_X : str
      File name of the feature CSV; must contain an ``Id`` column.
  path_Y : str
      File name of the label CSV; must contain ``Id`` and ``Bound`` columns.
  base_path : str, optional
      Directory both file names are resolved against. Defaults to the
      notebook-level ``orig_path`` so existing two-argument calls are unchanged.

  Returns
  -------
  pandas.DataFrame
      Rows of the two files matched on ``Id``, with ``Id`` dropped and
      ``Bound`` renamed to ``target``.
  """
  if base_path is None:
    # Looked up at call time so the notebook-level setting is honoured.
    base_path = orig_path
  # os.path.join works whether or not base_path ends with a path separator.
  X = pd.read_csv(os.path.join(base_path, path_X))
  y = pd.read_csv(os.path.join(base_path, path_Y))
  merged = pd.merge(X, y, on=['Id'])
  return merged.drop('Id', axis=1).rename(columns={'Bound': 'target'})

In [None]:
# Load the three training sets; each pairs a feature file with its label file.
data_0, data_1, data_2 = (
    merge_X_y('Xtr0.csv', 'Ytr0.csv'),
    merge_X_y('Xtr1.csv', 'Ytr1.csv'),
    merge_X_y('Xtr2.csv', 'Ytr2.csv'),
)

# Information

In [None]:
# Summary statistics of data_0; the mean of `target` indicates the class balance (assuming 0/1 labels).
data_0.describe()

In [None]:
# Summary statistics of data_1; the mean of `target` indicates the class balance (assuming 0/1 labels).
data_1.describe()

In [None]:
# Summary statistics of data_2; the mean of `target` indicates the class balance (assuming 0/1 labels).
data_2.describe()

All three datasets seem to be balanced :)

In [None]:
# Record the length of every sequence in each dataset and print the shortest
# and longest one, to check whether the sequences have a fixed length.
seq_length_0 = [len(sequence) for sequence in data_0.seq]
print(f'The minimum sequence of data_0 is {min(seq_length_0)}')
print(f'The maximum sequence of data_0 is {max(seq_length_0)}')

seq_length_1 = [len(sequence) for sequence in data_1.seq]
print(f'The minimum sequence of data_1 is {min(seq_length_1)}')
print(f'The maximum sequence of data_1 is {max(seq_length_1)}')

seq_length_2 = [len(sequence) for sequence in data_2.seq]
print(f'The minimum sequence of data_2 is {min(seq_length_2)}')
print(f'The maximum sequence of data_2 is {max(seq_length_2)}')

The minimum sequence of data_0 is 101
The maximum sequence of data_0 is 101
The minimum sequence of data_1 is 101
The maximum sequence of data_1 is 101
The minimum sequence of data_2 is 101
The maximum sequence of data_2 is 101


# Feature Engineering

## Simple One-Hot Encoding

In [None]:
def transform_data(X):
  """One-hot encode every position of the sequences in ``X.seq``.

  Each sequence is split into single characters, giving one column per
  position. Every column is cast to a categorical over A/C/G/T and the frame
  is expanded with ``pd.get_dummies``, so each position yields four indicator
  columns named ``<position>_<nucleotide>``.

  Returns a DataFrame with one row per sequence.
  """
  nucleotide_dtype = CategoricalDtype(['A', 'C', 'G', 'T'])
  # Keyed by row number with the sequence's characters as values; transposing
  # turns the frame into rows = sequences, columns = positions.
  chars_by_row = {row: list(seq) for row, seq in enumerate(X.seq)}
  per_position = pd.DataFrame(chars_by_row).T
  for position in range(len(per_position.columns)):
    # A fixed category list makes get_dummies emit all four indicator columns
    # even when a nucleotide never occurs at this position.
    per_position[position] = per_position[position].astype(nucleotide_dtype)
  return pd.get_dummies(per_position)

In [None]:
# One-hot encode each of the three datasets independently.
data_0_encoded, data_1_encoded, data_2_encoded = (
    transform_data(data_0),
    transform_data(data_1),
    transform_data(data_2),
)

In [None]:
# Fixed seed so the train/test partition is reproducible.
seed = 1
X_0, y_0 = data_0_encoded.values, data_0['target'].values
# Hold out 25% of dataset 0 for testing.
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    X_0,
    y_0,
    test_size = 0.25,
    random_state = seed,
)

In [None]:
# Metric reported for every candidate model.
scoring = 'accuracy'
# Candidate models; `names` holds the display label for each entry of `Classifiers`.
names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AddaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']
Classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth = 5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1 ),
    MLPClassifier(alpha = 1),
    AdaBoostClassifier(),
    GaussianNB(),
    svm.SVC(kernel = 'linear'),
    svm.SVC(kernel = 'rbf'),
    svm.SVC(kernel = 'sigmoid')
    ]
models = zip(names, Classifiers)

# One splitter is shared by all models so they are scored on identical folds.
# random_state is deliberately not passed: it only has an effect with shuffle=True,
# and scikit-learn >= 0.24 raises a ValueError when it is combined with the default
# shuffle=False. The training rows were already shuffled by train_test_split.
kfold = KFold(n_splits = 10)

# `names` is rebuilt in evaluation order; `models` already holds the label list above.
names = []
result = []
for name, model in models:
    cv_results = cross_val_score(model, X_train_0, y_train_0, cv = kfold, scoring = scoring)
    result.append(cv_results)
    names.append(name)
    # Label: mean accuracy (standard deviation) across the 10 folds.
    msg = "{0}: {1} ({2})".format(name, cv_results.mean(), cv_results.std())
    print(msg)



K Nearest Neighbors: 0.5793333333333334 (0.03776535743538279)




Gaussian Process: 0.5753333333333334 (0.033339999333466645)




Decision Tree: 0.5653333333333334 (0.02933333333333333)




Random Forest: 0.5519999999999998 (0.03525147751040611)




Neural Net: 0.622 (0.02423037579384833)
AddaBoost: 0.6033333333333333 (0.033099177566150426)
Naive Bayes: 0.6086666666666667 (0.021509687739868902)




SVM Linear: 0.5933333333333334 (0.02529822128134705)




SVM RBF: 0.6206666666666666 (0.019425069712444627)




SVM Sigmoid: 0.6053333333333333 (0.018571184369578827)


## Kmers

### CountVectorizer

In [None]:
def getKmers(sequence, size, overlap=1):
    """Return the lower-cased k-mers of ``sequence`` as a list of strings.

    Windows of ``size`` characters start at offsets 0, ``overlap``,
    2 * ``overlap``, ... and only full-length windows are kept. Despite its
    name, ``overlap`` is the step between consecutive window starts. A
    sequence shorter than ``size`` gives an empty list.
    """
    last_start = len(sequence) - size
    kmers = []
    for start in range(0, last_start + 1, overlap):
        kmers.append(sequence[start:start + size].lower())
    return kmers

In [None]:
def transform_data(X, kmer_size=6):
  """Encode every sequence in ``X.seq`` as a vector of k-mer counts.

  Each sequence is cut into overlapping k-mers with ``getKmers``, the k-mers
  are joined into a space-separated "sentence", and a ``CountVectorizer``
  fitted on those sentences counts how often each k-mer occurs.

  Parameters
  ----------
  X : DataFrame-like with a ``seq`` attribute holding the sequences.
  kmer_size : int, default 6
      Length of the k-mers. Keep it >= 2: CountVectorizer's default tokenizer
      ignores single-character tokens.

  Returns
  -------
  numpy.ndarray
      Dense count matrix with one row per sequence. The vocabulary is learned
      from ``X`` itself, so the column layout is specific to this call.
  """
  sentences = [' '.join(getKmers(seq, kmer_size)) for seq in X.seq]
  return CountVectorizer().fit_transform(sentences).toarray()

In [None]:
# Encode each dataset as k-mer counts; every call fits its own vocabulary.
data_0_encoded, data_1_encoded, data_2_encoded = (
    transform_data(data_0),
    transform_data(data_1),
    transform_data(data_2),
)

In [None]:
# Fixed seed so the train/test partition is reproducible.
seed = 42
X_0, y_0 = data_0_encoded, data_0['target'].values
# Hold out 20% of dataset 0 for testing.
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    X_0,
    y_0,
    test_size = 0.20,
    random_state = seed,
)

In [None]:
# Metric reported for every candidate model.
scoring = 'accuracy'
# Candidate models; `names` holds the display label for each entry of `Classifiers`.
names = ['K Nearest Neighbors', 'Gaussian Process', 'Random Forest', 'AddaBoost', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']
Classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    RandomForestClassifier(max_depth = 5, n_estimators = 100, max_features = 1 ),
    AdaBoostClassifier(),
    svm.SVC(kernel = 'linear'),
    svm.SVC(kernel = 'rbf'),
    svm.SVC(kernel = 'sigmoid')
    ]
models = zip(names, Classifiers)

# One splitter is shared by all models so they are scored on identical folds.
# random_state is deliberately not passed: it only has an effect with shuffle=True,
# and scikit-learn >= 0.24 raises a ValueError when it is combined with the default
# shuffle=False. The training rows were already shuffled by train_test_split.
kfold = KFold(n_splits = 10)

# `names` is rebuilt in evaluation order; `models` already holds the label list above.
names = []
result = []
for name, model in models:
    cv_results = cross_val_score(model, X_train_0, y_train_0, cv = kfold, scoring = scoring)
    result.append(cv_results)
    names.append(name)
    # Label: mean accuracy (standard deviation) across the 10 folds.
    msg = "{0}: {1} ({2})".format(name, cv_results.mean(), cv_results.std())
    print(msg)



K Nearest Neighbors: 0.6 (0.04330127018922193)




Gaussian Process: 0.6025 (0.031523800532296226)




Decision Tree: 0.5875 (0.028777161083053348)




Random Forest: 0.5162500000000001 (0.033095505737184325)




Neural Net: 0.606875 (0.028838179987648312)




AddaBoost: 0.6062500000000001 (0.019764235376052368)




Naive Bayes: 0.5587500000000001 (0.03102418411497714)




SVM Linear: 0.5912499999999998 (0.027414640249326654)




SVM RBF: 0.6331249999999999 (0.03412408423679674)




SVM Sigmoid: 0.6268750000000001 (0.03011047367611477)


### TFIDF

In [None]:
def getKmers(sequence, size, overlap=1):
    """Return the lower-cased k-mers of ``sequence`` as a list of strings.

    Windows of ``size`` characters start at offsets 0, ``overlap``,
    2 * ``overlap``, ... and only full-length windows are kept. Despite its
    name, ``overlap`` is the step between consecutive window starts. A
    sequence shorter than ``size`` gives an empty list.
    """
    last_start = len(sequence) - size
    kmers = []
    for start in range(0, last_start + 1, overlap):
        kmers.append(sequence[start:start + size].lower())
    return kmers

In [None]:
def transform_data(X, kmer_size=6):
  """Encode every sequence in ``X.seq`` as a TF-IDF-weighted k-mer vector.

  Each sequence is cut into overlapping k-mers with ``getKmers``, the k-mers
  are joined into a space-separated "sentence", and a ``TfidfVectorizer``
  fitted on those sentences produces the weights.

  Parameters
  ----------
  X : DataFrame-like with a ``seq`` attribute holding the sequences.
  kmer_size : int, default 6
      Length of the k-mers. Keep it >= 2: the vectorizer's default tokenizer
      ignores single-character tokens.

  Returns
  -------
  numpy.ndarray
      Dense matrix with one row per sequence. The vocabulary and the IDF
      weights are learned from ``X`` itself, so they are specific to this call.
  """
  sentences = [' '.join(getKmers(seq, kmer_size)) for seq in X.seq]
  return TfidfVectorizer().fit_transform(sentences).toarray()

In [None]:
# Encode each dataset with TF-IDF over k-mers; every call fits its own vectorizer.
data_0_encoded, data_1_encoded, data_2_encoded = (
    transform_data(data_0),
    transform_data(data_1),
    transform_data(data_2),
)

In [None]:
# Fixed seed so the train/test partition is reproducible.
seed = 42
X_0, y_0 = data_0_encoded, data_0['target'].values
# Hold out 20% of dataset 0 for testing.
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    X_0,
    y_0,
    test_size = 0.20,
    random_state = seed,
)

In [None]:
# Metric reported for every candidate model.
scoring = 'accuracy'
# Candidate models; `names` holds the display label for each entry of `Classifiers`.
names = ['K Nearest Neighbors', 'Gaussian Process', 'Random Forest', 'AddaBoost', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']
Classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    RandomForestClassifier(max_depth = 5, n_estimators = 100, max_features = 1 ),
    AdaBoostClassifier(),
    svm.SVC(kernel = 'linear'),
    svm.SVC(kernel = 'rbf'),
    svm.SVC(kernel = 'sigmoid')
    ]
models = zip(names, Classifiers)

# One splitter is shared by all models so they are scored on identical folds.
# random_state is deliberately not passed: it only has an effect with shuffle=True,
# and scikit-learn >= 0.24 raises a ValueError when it is combined with the default
# shuffle=False. The training rows were already shuffled by train_test_split.
kfold = KFold(n_splits = 10)

# `names` is rebuilt in evaluation order; `models` already holds the label list above.
names = []
result = []
for name, model in models:
    cv_results = cross_val_score(model, X_train_0, y_train_0, cv = kfold, scoring = scoring)
    result.append(cv_results)
    names.append(name)
    # Label: mean accuracy (standard deviation) across the 10 folds.
    msg = "{0}: {1} ({2})".format(name, cv_results.mean(), cv_results.std())
    print(msg)



K Nearest Neighbors: 0.595 (0.020310096011589906)




Gaussian Process: 0.62625 (0.027922884879610838)




Random Forest: 0.52625 (0.030078023538789918)




AddaBoost: 0.59 (0.025186802099512358)




SVM Linear: 0.621875 (0.03345355922768158)




SVM RBF: 0.630625 (0.037754345776347384)




SVM Sigmoid: 0.61875 (0.03601215072721983)


In [None]:
def getKmers(sequence, size, overlap=1):
    """Return the lower-cased k-mers of ``sequence`` as a list of strings.

    Windows of ``size`` characters start at offsets 0, ``overlap``,
    2 * ``overlap``, ... and only full-length windows are kept. Despite its
    name, ``overlap`` is the step between consecutive window starts. A
    sequence shorter than ``size`` gives an empty list.
    """
    last_start = len(sequence) - size
    kmers = []
    for start in range(0, last_start + 1, overlap):
        kmers.append(sequence[start:start + size].lower())
    return kmers

In [None]:
def transform_data(X, kmer_size=6):
  """Encode every sequence in ``X.seq`` as a vector of k-mer counts.

  Each sequence is cut into overlapping k-mers with ``getKmers``, the k-mers
  are joined into a space-separated "sentence", and a ``CountVectorizer``
  fitted on those sentences counts how often each k-mer occurs.

  Parameters
  ----------
  X : DataFrame-like with a ``seq`` attribute holding the sequences.
  kmer_size : int, default 6
      Length of the k-mers. Keep it >= 2: CountVectorizer's default tokenizer
      ignores single-character tokens.

  Returns
  -------
  numpy.ndarray
      Dense count matrix with one row per sequence. The vocabulary is learned
      from ``X`` itself, so the column layout is specific to this call.
  """
  sentences = [' '.join(getKmers(seq, kmer_size)) for seq in X.seq]
  return CountVectorizer().fit_transform(sentences).toarray()

In [None]:
# Encode each dataset as k-mer counts; every call fits its own vocabulary.
data_0_encoded, data_1_encoded, data_2_encoded = (
    transform_data(data_0),
    transform_data(data_1),
    transform_data(data_2),
)

In [None]:
# Fixed seed so the train/test partition is reproducible.
seed = 42
X_1, y_1 = data_1_encoded, data_1['target'].values
# Hold out 20% of dataset 1 for testing.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1,
    y_1,
    test_size = 0.20,
    random_state = seed,
)

In [None]:
# Metric reported for every candidate model.
scoring = 'accuracy'
# Candidate models; `names` holds the display label for each entry of `Classifiers`.
names = ['K Nearest Neighbors', 'Gaussian Process', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']
Classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    svm.SVC(kernel = 'linear'),
    svm.SVC(kernel = 'rbf'),
    svm.SVC(kernel = 'sigmoid')
    ]
models = zip(names, Classifiers)

# One splitter is shared by all models so they are scored on identical folds.
# random_state is deliberately not passed: it only has an effect with shuffle=True,
# and scikit-learn >= 0.24 raises a ValueError when it is combined with the default
# shuffle=False. The training rows were already shuffled by train_test_split.
kfold = KFold(n_splits = 10)

# `names` is rebuilt in evaluation order; `models` already holds the label list above.
names = []
result = []
for name, model in models:
    cv_results = cross_val_score(model, X_train_1, y_train_1, cv = kfold, scoring = scoring)
    result.append(cv_results)
    names.append(name)
    # Label: mean accuracy (standard deviation) across the 10 folds.
    msg = "{0}: {1} ({2})".format(name, cv_results.mean(), cv_results.std())
    print(msg)



K Nearest Neighbors: 0.57875 (0.041570722870789736)




KeyboardInterrupt: ignored