# Preprocessing

In [None]:
# Field names applied, in order, to the '|'-separated parts of a header line.
key_names = [
    'name',
    'accno',
    'date',
    'sloc',
    'eloc',
]
# Filled by the loader cell with one dict per record.
listrec = list()

In [None]:
# Read the file two lines at a time: a '|'-separated header line followed by
# its sequence line. Neither line is stripped, so both keep their newline.
with open("S_protein.fasta") as fasta:
    for header in fasta:
        # Pull the sequence line from the same iterator; raises StopIteration
        # if a header has no following line.
        sequence = next(fasta)
        listrec.append(dict(zip(key_names, header.split('|')), sprot=sequence))

In [None]:
# Order the records in place by their 'date' string (plain string comparison).
listrec.sort(key=lambda record: record['date'])

In [None]:
# Write the records back out in their current order, header then sequence.
# NOTE(review): no newline is written between 'eloc' and 'sprot'; this assumes
# 'eloc' still carries the header line's trailing newline -- confirm.
with open('sProt_sorted.fasta','w') as log:
    for rec in listrec:
        parts = [rec[field] for field in ('name', 'accno', 'date', 'sloc', 'eloc', 'sprot')]
        log.write('{}|{}|{}|{}|{}{}'.format(*parts))

# Date Analysis

In [None]:
# Header field names, in the order they appear between '|' separators.
key_names = [
    'name',
    'accno',
    'date',
    'sloc',
    'eloc',
]
# Start from an empty record list before reloading.
listrec = list()

In [None]:
# Load header/sequence line pairs from the sorted file into listrec.
# Lines are stored unstripped, so 'sprot' ends with its newline.
with open("sProt_sorted.fasta") as fasta:
    for header in fasta:
        sequence = next(fasta)  # the line right after the header
        listrec.append(dict(zip(key_names, header.split('|')), sprot=sequence))

In [None]:
from collections import Counter

# Count how many records carry each distinct 'date' value.
c = Counter(item["date"] for item in listrec)

print(c)
# Number of distinct dates (shown as the cell result in a notebook).
len(c)

# Reference S Gene

In [None]:
# lenRec: length of the first record's stored sequence line;
# nRec: number of records loaded.
lenRec, nRec = len(listrec[0]['sprot']), len(listrec)

In [None]:
# Consensus sequence accumulator; starts empty.
refGen = ""

In [None]:
# For every column except the last one of the stored line, count A/T/G/C
# across all records and append the most frequent base to refGen.
for i in range(lenRec-1):
    tally = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
    for rec in listrec:
        base = rec['sprot'][i]
        # Any character other than A/T/G/C is ignored.
        if base in tally:
            tally[base] += 1
    print(i, tally['A'], tally['T'], tally['G'], tally['C'])
    # max() returns the first maximal item, so ties resolve in A, T, G, C order.
    refGen += max('ATGC', key=tally.get)

In [None]:
# Bare expression: shows the accumulated consensus string as the cell result.
refGen

In [None]:
# Load the reference sequence: the first line of refgen.txt, kept exactly as
# read (including any trailing newline). The `with` block closes the handle,
# which the previous bare open() never did.
with open('refgen.txt', 'r') as file1:
    # Indexing [0] raises IndexError on an empty file instead of silently
    # continuing with an empty reference.
    refgen = file1.readlines()[0]
refgen

# Finding Base Substitutions using Ref Genome

In [None]:
# For each record, count the positions where its sequence differs from refgen
# and write one 'name,accno,date,count' line to nBS.txt.
j = 0
with open('nBS.txt','w') as log:
    for rec in listrec:
        seq = rec['sprot']
        # The last index of refgen is deliberately left out of the comparison.
        nBS = sum(1 for pos in range(len(refgen)-1) if seq[pos] != refgen[pos])
        j += 1
        log.write('{},{},{},{}\n'.format(rec['name'], rec['accno'], rec['date'], nBS))

# Finding Base Substitutions using the first gene

In [None]:
# Running record counter, incremented and printed by the comparison loop below.
j = 0
# Use the first loaded record's sequence line as the reference.
refgen = listrec[0]['sprot']

In [None]:
# Print, for each record, a running index and the number of positions where
# its sequence differs from refgen (last index of refgen excluded).
for rec in listrec:
    seq = rec['sprot']
    nBS = sum(1 for pos in range(len(refgen)-1) if seq[pos] != refgen[pos])
    j += 1
    print(j,nBS)

# nBS Analysis

In [None]:
# Column names for the comma-separated lines of nBS.txt.
key_names = [
    'name',
    'accno',
    'date',
    'nBS',
]
# Start from an empty record list.
listrec = list()

In [None]:
# Turn each comma-separated line of nBS.txt into a dict keyed by key_names.
# Lines are not stripped, so the last field keeps its newline.
with open("nBS.txt") as table:
    listrec.extend(dict(zip(key_names, row.split(','))) for row in table)

In [None]:
# Drop trailing whitespace (the line's newline) from every 'nBS' value.
for entry in listrec:
    entry['nBS'] = entry['nBS'].rstrip()

In [None]:
from collections import Counter

# Count how many records share each 'nBS' value (values are still strings).
c = Counter(item["nBS"] for item in listrec)

print(c)
# Number of distinct nBS values (shown as the cell result in a notebook).
len(c)

# Creating Protein Sequences

In [None]:
# Codon -> one-letter amino-acid lookup; '*' is the value for TAA, TAG and TGA.
# Laid out one row per two-base prefix.
gencode = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}

In [None]:
# Reload the sorted FASTA from scratch: reset the record list, then read
# header/sequence line pairs. Lines are stored unstripped.
key_names = [
    'name',
    'accno',
    'date',
    'sloc',
    'eloc',
]
listrec = list()
with open("sProt_sorted.fasta") as fasta:
    for header in fasta:
        sequence = next(fasta)  # the line right after the header
        listrec.append(dict(zip(key_names, header.split('|')), sprot=sequence))

In [None]:
# Translate every record's nucleotide sequence codon-by-codon with `gencode`
# and write 'name|accno|date' plus the protein string to proteinSeq.fasta.
#
# Fix: the loop body used to start with `rec = listrec[0]`, which overwrote
# the loop variable so every entry written was the first record.
with open('proteinSeq.fasta','w') as log:
    for rec in listrec:
        seq = rec['sprot']
        residues = []
        # len(seq)-1 leaves out the final character of the stored line
        # (the sequence line is stored with its trailing newline).
        for i in range(0, len(seq)-1, 3):
            # Slicing, unlike indexing i+1 / i+2, cannot raise IndexError on a
            # trailing partial codon; the short slice is simply not in gencode.
            codon = seq[i:i+3]
            # NOTE(review): codons missing from gencode are skipped, which
            # shortens pSeq and shifts later residues relative to i//3 --
            # confirm this is acceptable for the downstream comparison.
            if codon in gencode:
                residues.append(gencode[codon])
        pSeq = ''.join(residues)
        log.write('{}|{}|{}\n{}\n'.format(rec['name'], rec['accno'], rec['date'], pSeq))

# Finding number of silent, missense and nonsense mutations

In [None]:
# Walk the nucleotide and protein FASTA files in lockstep and build one
# record per entry holding the header fields plus both stripped sequences.
key_names = [
    'name',
    'accno',
    'date',
    'sloc',
    'eloc',
]
listrec = list()

with open("sProt_sorted.fasta") as file1, open("proteinSeq.fasta") as file2:
    # zip() consumes one header line from each file per iteration; the
    # protein header (meta2) is read only to stay in step and is not used.
    for meta1, meta2 in zip(file1, file2):
        gene_line = next(file1)
        protein_line = next(file2)
        rec = dict(zip(key_names, meta1.split('|')))
        rec.update(sprot=gene_line.strip(), pSeq=protein_line.strip())
        listrec.append(rec)

In [None]:
# Take the first record as the reference for both gene and protein.
reference = listrec[0]
refgen = reference['sprot']
refprot = reference['pSeq']

In [None]:
#testing code
# Classify each nucleotide difference between record 1 and the reference by
# its effect on the residue at index i//3:
#   silent   - residue unchanged
#   nonsense - residue became '*' where the reference has something else
#   missense - any other residue change
# Fixes: (1) the nonsense test used to sit after `==` / `!=`, which between
# them match every case, so nNon could never be incremented; (2) the sequences
# in this section are stripped, so the old `len(refgen)-1` bound skipped the
# final base.
rec = listrec[1]
nSil = 0
nMis = 0
nNon = 0
for i in range(len(refgen)):
    if rec['sprot'][i] != refgen[i]:
        k = i//3
        if rec['pSeq'][k] == refprot[k]:
            nSil += 1
        elif rec['pSeq'][k] == '*':
            # The residues differ here, so refprot[k] is necessarily not '*'.
            nNon += 1
        else:
            nMis += 1
print(nSil, nMis, nNon)

In [None]:
# For every record, classify each nucleotide difference from the reference by
# its effect on the residue at index i//3 and write
# 'name,accno,date,nSil,nMis,nNon' to mutations.txt:
#   silent   - residue unchanged
#   nonsense - residue became '*' where the reference has something else
#   missense - any other residue change
# Fixes: (1) the nonsense test used to sit after `==` / `!=`, which between
# them match every case, so nNon was always 0; (2) the sequences in this
# section are stripped, so the old `len(refgen)-1` bound skipped the final
# base; (3) the unused counter `j` is gone.
with open('mutations.txt','w') as log:
    for rec in listrec:
        nSil = 0
        nMis = 0
        nNon = 0
        for i in range(len(refgen)):
            if rec['sprot'][i] != refgen[i]:
                k = i//3
                if rec['pSeq'][k] == refprot[k]:
                    nSil += 1
                elif rec['pSeq'][k] == '*':
                    # The residues differ here, so refprot[k] is not '*'.
                    nNon += 1
                else:
                    nMis += 1
        log.write('{},{},{},{},{},{}\n'.format(rec['name'], rec['accno'], rec['date'], nSil, nMis, nNon))

# Total number of Sil, Mis and Non mutations

In [None]:
# Column names for the comma-separated lines of mutations.txt.
key_names = [
    'name',
    'accno',
    'date',
    'nSil',
    'nMis',
    'nNon',
]
# Start from an empty record list.
listrec = list()

In [None]:
# Turn each comma-separated line of mutations.txt into a dict keyed by
# key_names. Lines are not stripped, so the last field keeps its newline.
with open("mutations.txt") as table:
    listrec.extend(dict(zip(key_names, row.split(','))) for row in table)

In [None]:
# Drop trailing whitespace (the line's newline) from every 'nNon' value.
for entry in listrec:
    entry['nNon'] = entry['nNon'].rstrip()

In [None]:
# Sum each mutation count over all records (values are stored as strings,
# hence the int() conversion).
tSil = sum(int(entry['nSil']) for entry in listrec)
tMis = sum(int(entry['nMis']) for entry in listrec)
tNon = sum(int(entry['nNon']) for entry in listrec)

In [None]:
# Report the summed nSil, nMis and nNon counts, space-separated on one line.
print(tSil, tMis, tNon)

# Visual Representation of Mutations

In [None]:
# Reload the sorted FASTA from scratch: reset the record list, then read
# header/sequence line pairs. Lines are stored unstripped.
key_names = [
    'name',
    'accno',
    'date',
    'sloc',
    'eloc',
]
listrec = list()
with open("sProt_sorted.fasta") as fasta:
    for header in fasta:
        sequence = next(fasta)  # the line right after the header
        listrec.append(dict(zip(key_names, header.split('|')), sprot=sequence))

In [None]:
# Load the reference sequence: the first line of refgen.txt, kept exactly as
# read (including any trailing newline). The `with` block closes the handle,
# which the previous bare open() never did.
with open('refgen.txt', 'r') as file1:
    # Indexing [0] raises IndexError on an empty file instead of silently
    # continuing with an empty reference.
    refgen = file1.readlines()[0]

In [None]:
# For each record write a header line and a masked sequence in which every
# position equal to the reference is shown as '.' and every differing
# position shows the record's own character.
with open('visualMut.txt','w') as log:
    for rec in listrec:
        seq = rec['sprot']
        # The last index of refgen is deliberately left out of the comparison.
        cleanrec = ''.join(
            seq[pos] if seq[pos] != refgen[pos] else '.'
            for pos in range(len(refgen)-1)
        )
        log.write('{},{},{}\n{}\n'.format(rec['name'], rec['accno'], rec['date'], cleanrec))

# Average Monthly Mutations

In [None]:
# Reload nBS.txt from scratch: one dict per comma-separated line, then strip
# the trailing newline from each 'nBS' value.
key_names = [
    'name',
    'accno',
    'date',
    'nBS',
]
listrec = list()
with open("nBS.txt") as table:
    listrec.extend(dict(zip(key_names, row.split(','))) for row in table)
for entry in listrec:
    entry['nBS'] = entry['nBS'].rstrip()


In [None]:
from itertools import groupby
import statistics
def _month_key(rec):
    """Return the first seven characters of the record's 'date' field."""
    # presumably 'YYYY-MM' for ISO-style dates -- confirm against the data
    return rec['date'][:7]


# Print the mean nBS per month key.
# groupby() only merges *adjacent* items with equal keys, so sort by the same
# key first; otherwise a month whose records are not contiguous would be
# reported several times. sorted() is stable, so input that is already in
# order produces exactly the same output as before.
for k, v in groupby(sorted(listrec, key=_month_key), key=_month_key):
    print ('date:'+k+'-01','nbs:'+str(statistics.mean(int(d['nBS']) for d in v)))

# Mutations in Functional Domains