<a href="https://colab.research.google.com/github/pavi-ninjaac/SARS_CoV_2_DNA_Analysis/blob/main/CGR_Classification_HIVvscovid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DNA Classification using CGR

In [2]:
! pip install Bio

Collecting Bio
[?25l  Downloading https://files.pythonhosted.org/packages/83/ec/efbae4632107776961f4a1d3607117a4412b9e798609956c664e9c0157f4/bio-0.3.0-py3-none-any.whl (70kB)
[K     |████████████████████████████████| 71kB 5.5MB/s 
[?25hCollecting biopython>=1.78
[?25l  Downloading https://files.pythonhosted.org/packages/3a/cd/0098eaff841850c01da928c7f509b72fd3e1f51d77b772e24de9e2312471/biopython-1.78-cp37-cp37m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 11.3MB/s 
Installing collected packages: biopython, Bio
Successfully installed Bio-0.3.0 biopython-1.78


In [19]:
from Bio import SeqIO
import numpy as np
import pandas as pd
from collections import defaultdict
import math
import matplotlib.pyplot as plt
from matplotlib import cm

#machine learning 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [5]:
def generate_data(file_list):
  X = []
  y = []
  label = 1 # for hiv
  for i in file_list:
    filepath = '/content/drive/MyDrive/Colab Notebooks/dataFiles/DNA_Genome_Analysis_covid19/'+i+'.fasta'
    records = list(SeqIO.parse(filepath , 'fasta'))

    for index in range(len(records)):
      X.append(str(records[index].seq))
      y.append(label)
    label = 0 #for covid19
  
  data = pd.DataFrame({'DNASequence' : X , 'label' : y})

  return data

In [50]:
file_list = ['HIV' , 'covid19']
data = generate_data(file_list)
data.head(5)

Unnamed: 0,DNASequence,label
0,CCTCAAATCACTCTTTGGCAGCGACCCCTGGTCTCAATAAAAGTAG...,1
1,CCTCAAATCACTCTTTGGCAGCGACCCCTGGTCTCAATAAAAGTAG...,1
2,CCTCAAATCACTCTTTGGCAGCGACCCCTGGTCCCAATAAAAGTAG...,1
3,CCTCAAATCACTCTTTGGCAGCGACCCCTGGTCTCAATAAAAGTAG...,1
4,CCTCAAATCACTCTTTGGCAGCGACCCCTGGTCYCAATAAAAGTAG...,1


In [16]:
#split the sequence in the cgr way
#convert the squenceof strings to k-mers 
def kmer(sequence , size = 7):
  return [sequence[i:i+size].lower() for i in range(len(sequence) - size+1 )]

data['words'] = data.apply(lambda x : kmer(x['DNASequence']) , axis = 1)
data = data.drop('DNASequence' , axis = 1)
data.head()

Unnamed: 0,label,words
0,1,"[cctcaaa, ctcaaat, tcaaatc, caaatca, aaatcac, ..."
1,1,"[cctcaaa, ctcaaat, tcaaatc, caaatca, aaatcac, ..."
2,1,"[cctcaaa, ctcaaat, tcaaatc, caaatca, aaatcac, ..."
3,1,"[cctcaaa, ctcaaat, tcaaatc, caaatca, aaatcac, ..."
4,1,"[cctcaaa, ctcaaat, tcaaatc, caaatca, aaatcac, ..."


In [17]:
#join into  one sentence
data_x = data['words']
for i in range(len(data_x)):
  data_x[i] = ' '.join(data_x[i])
y = np.array(data['label'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [44]:
cv = CountVectorizer( ngram_range = (1,1))
X = cv.fit_transform(data_x).toarray()


In [45]:
X_df = pd.DataFrame(X, columns= cv.get_feature_names())
X_df.head()

Unnamed: 0,aaaaaag,aaaaaat,aaaaaca,aaaaacc,aaaaacg,aaaaact,aaaaaga,aaaaagc,aaaaagg,aaaaagt,aaaaata,aaaaatc,aaaaatg,aaaaatt,aaaacaa,aaaacac,aaaacag,aaaacat,aaaacca,aaaaccc,aaaacct,aaaacgt,aaaacta,aaaactc,aaaactg,aaaactt,aaaagaa,aaaagac,aaaagag,aaaagat,aaaagca,aaaagcc,aaaagct,aaaagga,aaaaggc,aaaaggt,aaaagta,aaaagtc,aaaagtg,aaaagtt,...,ttttgag,ttttgat,ttttgca,ttttgcc,ttttgct,ttttgga,ttttggc,ttttggg,ttttggt,ttttgta,ttttgtc,ttttgtg,ttttgtt,tttttaa,tttttac,tttttag,tttttat,tttttca,tttttcc,tttttcg,tttttct,tttttga,tttttgc,tttttgg,tttttgt,tttttta,ttttttc,ttttttg,ttttttt,yaaaaga,yaaacaa,yataaaa,yatgaag,yatggat,ycaaata,ycaataa,ycctatt,ytccttt,ytgtatg,yttagaa
0,3,0,0,0,0,0,2,0,1,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3,1,0,0,0,0,2,0,1,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,1,0,0,0,0,1,0,1,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0
3,2,1,0,0,0,0,2,0,0,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
4,1,1,0,0,0,0,2,0,1,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [47]:
max(X[0])

3

In [51]:
N = len(data.iloc[0,0])
k=7
total_len_kmer = N-k+1

In [53]:
def propability(row):
  return row/total_len_kmer
X_frequnecy = X_df.apply(lambda row:propability(row) , axis = 1)
X_frequnecy.head()

Unnamed: 0,aaaaaag,aaaaaat,aaaaaca,aaaaacc,aaaaacg,aaaaact,aaaaaga,aaaaagc,aaaaagg,aaaaagt,aaaaata,aaaaatc,aaaaatg,aaaaatt,aaaacaa,aaaacac,aaaacag,aaaacat,aaaacca,aaaaccc,aaaacct,aaaacgt,aaaacta,aaaactc,aaaactg,aaaactt,aaaagaa,aaaagac,aaaagag,aaaagat,aaaagca,aaaagcc,aaaagct,aaaagga,aaaaggc,aaaaggt,aaaagta,aaaagtc,aaaagtg,aaaagtt,...,ttttgag,ttttgat,ttttgca,ttttgcc,ttttgct,ttttgga,ttttggc,ttttggg,ttttggt,ttttgta,ttttgtc,ttttgtg,ttttgtt,tttttaa,tttttac,tttttag,tttttat,tttttca,tttttcc,tttttcg,tttttct,tttttga,tttttgc,tttttgg,tttttgt,tttttta,ttttttc,ttttttg,ttttttt,yaaaaga,yaaacaa,yataaaa,yatgaag,yatggat,ycaaata,ycaataa,ycctatt,ytccttt,ytgtatg,yttagaa
0,0.002933,0.0,0.0,0.0,0.0,0.0,0.001955,0.0,0.000978,0.0,0.0,0.000978,0.000978,0.001955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002933,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.002933,0.000978,0.0,0.0,0.0,0.0,0.001955,0.0,0.000978,0.0,0.0,0.001955,0.000978,0.001955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002933,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.001955,0.000978,0.0,0.0,0.0,0.0,0.000978,0.0,0.000978,0.0,0.0,0.001955,0.000978,0.001955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002933,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.000978,0.0
3,0.001955,0.000978,0.0,0.0,0.0,0.0,0.001955,0.0,0.0,0.0,0.0,0.001955,0.000978,0.001955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002933,0.0,0.000978,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.000978
4,0.000978,0.000978,0.0,0.0,0.0,0.0,0.001955,0.0,0.000978,0.0,0.0,0.001955,0.000978,0.001955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001955,0.0,0.001955,0.0,0.000978,0.0,0.0,0.000978,0.0,0.0,0.000978,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0,0.0,0.000978,0.0,0.0,0.0,0.0
