In [2]:
#****************************************
# This Jupyter notebook was created for the project, DNA Classification,
# from the Udemy course, Applied Machine Learning in Healthcare, found at:
#     https://www.udemy.com/course/applied-machine-learning-for-healthcare
#
# The dataset used is a molecular biology E. coli (promoter gene sequence) data set
# located in the UCI repository at
#    https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences
#
#In this project we complete the following activities:
# 1. Fill this in
#
#
#
#
#
#*****************************************************************

# import libraries and check versions
import sys
import numpy   # for data pre-processing
import sklearn # for algorithms and classification reports and metrics
import pandas

# Report the interpreter and library versions used for this run.
# str.format() is called on each template string so that the value replaces
# the '{}' placeholder; passing the built-in format(value) as a second
# argument to print() would leave the literal '{}' in the output.
print('Python: {}'.format(sys.version))
print('Numpy: {}'.format(numpy.__version__))
print('Sklearn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))



Python: {} 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
Numpy: {} 1.16.5
Sklearn: {} 0.21.3
Pandas: {} 0.25.1


In [3]:
# import, change module names
import numpy as np
import pandas as pd

# Column labels for the data file, taken from the file promoters.names. Excerpt:
#   -- class (positive or negative)
#   -- instance name
#   -- 57 sequential nucleotide ("base-pair") positions
names = ['Class', 'id', 'Sequence']

# Location of the UCI molecular biology (promoter gene sequence) data set
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'

# Read the csv into a pandas DataFrame, labelling the columns with `names`
data = pd.read_csv(filepath_or_buffer=url, names=names)





In [4]:
# Look at the first five instances in the data set
print(data.head(5))

  Class         id                                           Sequence
0     +        S10  \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1     +       AMPC  \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2     +       AROH  \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3     +      DEOP2  \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4     +  LEU1_TRNA  \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [5]:
# The preview of the first few data points reveals 2 issues:
#   1. the tab, '\t', from the csv file is being pulled into the data frame
#   2. the id info is unwanted, as it would skew the training results
#
# So the data set is rebuilt as a custom pandas DataFrame.
# (Each column in a DataFrame is called a Series.)

classes = data['Class']
print(classes.head(5))  # first 5 class labels


0    +
1    +
2    +
3    +
4    +
Name: Class, dtype: object


In [6]:
# Collect the DNA sequence strings into a plain Python list
sequences = data['Sequence'].tolist()

# dataset maps each sequence number to its list of single nucleotides,
# with the class label of that sequence as the final element
dataset = {}
for i, seq in enumerate(sequences):
    # drop the tab characters, keep every remaining letter as its own item,
    # then finish the list with the class assignment for this sequence
    nucleotides = [*seq.replace('\t', ''), classes[i]]
    dataset[i] = nucleotides

# Inspect the first entry
print(dataset[0])
    


['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [7]:
# Convert the dict back into a DataFrame; each dict key becomes one column
dframe = pd.DataFrame.from_dict(dataset)
print(dframe)


   0   1   2   3   4   5   6   7   8   9    ... 96  97  98  99  100 101 102  \
0    t   t   g   a   t   a   c   t   c   t  ...   c   c   t   a   g   c   g   
1    a   g   t   a   c   g   a   t   g   t  ...   c   g   a   g   a   c   t   
2    c   c   a   t   g   g   g   t   a   t  ...   g   c   t   a   g   t   a   
3    t   t   c   t   a   g   g   c   c   t  ...   a   t   g   g   a   c   t   
4    a   a   t   g   t   g   g   t   t   a  ...   g   a   a   g   g   a   t   
5    g   t   a   t   a   c   g   a   t   a  ...   t   g   c   g   c   a   c   
6    c   c   g   g   a   a   g   c   a   a  ...   a   g   c   t   a   t   t   
7    a   c   a   a   t   a   t   a   a   t  ...   g   a   g   g   t   g   c   
8    a   t   g   t   t   g   g   a   t   t  ...   a   c   a   t   g   g   a   
9    t   g   a   g   a   g   g   a   a   t  ...   c   t   a   a   t   c   a   
10   a   a   a   t   a   a   a   a   t   c  ...   c   t   c   c   c   c   c   
11   c   c   c   g   c   g   g   c   a   c  ...   c 

In [9]:
# Swap the rows and columns of the DataFrame, then preview the first 5 rows
df = dframe.T
print(df.head(5))


  0  1  2  3  4  5  6  7  8  9   ... 48 49 50 51 52 53 54 55 56 57
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t  +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a  +
2  g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g  +
3  a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c  +
4  t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g  +

[5 rows x 58 columns]


In [10]:
# Show the transposed frame; pandas abbreviates the middle rows and columns
# in the printout and reports the full dimensions on the last line.
print(df)

    0  1  2  3  4  5  6  7  8  9   ... 48 49 50 51 52 53 54 55 56 57
0    t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t  +
1    t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a  +
2    g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g  +
3    a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c  +
4    t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g  +
..  .. .. .. .. .. .. .. .. .. ..  ... .. .. .. .. .. .. .. .. .. ..
101  c  c  t  c  a  a  t  g  g  c  ...  g  a  a  c  t  a  t  a  t  -
102  g  t  a  t  t  c  t  c  a  a  ...  t  c  a  a  c  a  t  t  g  -
103  c  g  c  g  a  c  t  a  c  g  ...  a  a  g  g  c  t  t  c  c  -
104  c  t  c  g  t  c  c  t  c  a  ...  a  g  g  a  g  g  a  a  c  -
105  t  a  a  c  a  t  t  a  a  t  ...  t  c  a  a  g  a  a  c  t  -

[106 rows x 58 columns]


In [11]:
# Interpreting the printout above:
# 106 rows, each containing 57 nucleotides and 1 classification (+ or -)

# Give the last column (label 57) the name 'Class'
df = df.rename(columns={57: 'Class'})
print(df.head(5))  # first 5 rows, to check the rename
# Class = +, means the sequence is a promoter
# Class = -, means the sequence is not a promoter


   0  1  2  3  4  5  6  7  8  9  ... 48 49 50 51 52 53 54 55 56 Class
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t     +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a     +
2  g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g     +
3  a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c     +
4  t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g     +

[5 rows x 58 columns]


In [12]:
# learn a little bit more about our data:
# for these text columns describe() reports count, unique, top and freq per column
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,-
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [14]:
# Interpreting the describe() table above:
#   unique - 4 distinct values at each position/column (2 for Class)
#   top    - the most common value for each position/column
#   freq   - for Class this is 53: 53 promoters and 53 non-promoters (106 rows total)

# Record the value counts of every column (each position, plus Class)
series = [df[name].value_counts() for name in df.columns]

# One row per column of df, then flipped so that the values (t/c/a/g/-/+) are the rows
info = pd.DataFrame(series)
details = info.T
print(details)


      0     1     2     3     4     5     6     7     8     9  ...    48  \
t  38.0  26.0  27.0  26.0  22.0  24.0  30.0  32.0  32.0  28.0  ...  21.0   
c  27.0  22.0  21.0  30.0  19.0  18.0  21.0  20.0  22.0  22.0  ...  36.0   
a  26.0  34.0  30.0  22.0  36.0  42.0  38.0  34.0  33.0  36.0  ...  23.0   
g  15.0  24.0  28.0  28.0  29.0  22.0  17.0  20.0  19.0  20.0  ...  26.0   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   

     49    50    51    52    53    54    55    56  Class  
t  22.0  23.0  33.0  35.0  30.0  23.0  29.0  34.0    NaN  
c  42.0  31.0  32.0  21.0  32.0  29.0  29.0  17.0    NaN  
a  24.0  28.0  27.0  25.0  22.0  26.0  24.0  27.0    NaN  
g  18.0  24.0  14.0  25.0  22.0  28.0  24.0  28.0    NaN  
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  

[6 rows x 58 columns]


In [16]:
# Interpreting the printout above:
# NaN (Not a Number) marks a value that never occurs in that column -
# '+' and '-' appear only in Class, while a/c/g/t appear only in the positions

# Switch to numerical data: pd.get_dummies() turns each column/value pair
# into its own indicator column (e.g. 0_a, 0_c, ..., Class_+, Class_-)
numerical_df = pd.get_dummies(data=df)
numerical_df.head(5)


Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class_+,Class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [18]:
# Only one of the two class indicator columns is kept: drop 'Class_-'
# and rename the remaining 'Class_+' to simply 'Class'
df = (
    numerical_df
    .drop(columns=['Class_-'])
    .rename(columns={'Class_+': 'Class'})
)
print(df.head(5))  # first 5 rows, to check the result


   0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...  54_t  55_a  55_c  \
0    0    0    0    1    1    0    0    0    0    1  ...     0     0     0   
1    0    0    0    1    0    0    1    0    0    1  ...     0     1     0   
2    0    0    1    0    0    0    0    1    1    0  ...     0     0     1   
3    1    0    0    0    1    0    0    0    0    0  ...     0     0     0   
4    0    0    0    1    0    1    0    0    0    0  ...     1     1     0   

   55_g  55_t  56_a  56_c  56_g  56_t  Class  
0     1     0     0     0     0     1      1  
1     0     0     1     0     0     0      1  
2     0     0     0     0     1     0      1  
3     0     1     0     1     0     0      1  
4     0     0     0     0     1     0      1  

[5 rows x 229 columns]


In [25]:
# dimensions of the encoded frame as (rows, columns)
df.shape

(106, 229)