<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li></ul></div>

# Covid 19 Notebook

<strong>Goals and notes</strong>

In [11]:
%%bash

pwd
ls

/home/ruggm/BioInformatics/000_Research/Covid19
Covid_Notebook.ipynb
sequences.fasta


## Libraries

In [12]:
# Basic Packages
import numpy as np
import pandas as pd

# Ploting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Bio Libraries
from Bio import SeqIO

## Exploratory Data Analysis

In [13]:
# Initialize dictionary
sequence_dict = dict()
protein_set = set()
with open("sequences.fasta", "rU") as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence_dict[record.id] = {}
        sequence_dict[record.id]["name"] = record.id
        sequence_dict[record.id]["length"] = len(record.seq)
        sequence_dict[record.id]["sequence"] = record.seq
        
        # Spliting up the pipes in the description     
        sequence_dict[record.id]["description"] = record.description.split("|")[1]
        sequence_dict[record.id]["type"] = record.description.split("|")[3]
        sequence_dict[record.id]["species"] = record.description.split("|")[4]
        sequence_dict[record.id]["country"] = record.description.split("|")[6]
        
        # These sequences are very much out of alignment, but I want to look at protein to 
        # consider phylogenetics this line below builds an amino acid translation from DNA sequences.
        sequence_dict[record.id]["amino_list"] = str(record.seq.translate()).split("*")
        
        # I am curious about the proteins used in each variant. Making set to explore values
        protein_set |= set(sequence_dict[record.id]["amino_list"])        

  with open("sequences.fasta", "rU") as fasta_file:


In [14]:
# There are a lot of entries here, so we need to figure out a way to categorize them:
protein_list = list(protein_set)

# Make a sorted list of proteins
protein_list.sort(key=lambda item: (-len(item), item))

In [27]:
count = 0
protein_dict = dict()

for index in range(len(protein_list)):
    if protein_list[index][0:5] == protein_list[index - 1][0:5]:
        count += 1
    else:
        count = 0
        
    protein_dict[protein_list[index]] = dict()
    
    size = len(protein_list[index]) 
    if size > 30: 
        print(size, index)
    #if size > 30:
    # Here we are going to make two different classifications, family and variant
    protein_dict[protein_list[index]]["variant"] = "{}-{}-{}".format(protein_list[index][0:5], size, count)
    protein_dict[protein_list[index]]["family"] = "{}-{}".format(protein_list[index][0:5], size)

4447 0
4447 1
4444 2
4441 3
4409 4
4409 5
4409 6
4409 7
4409 8
4409 9
4409 10
4409 11
4409 12
4409 13
4409 14
4409 15
4409 16
4409 17
4409 18
4409 19
4409 20
4409 21
4409 22
4409 23
4409 24
4409 25
4409 26
4409 27
4409 28
4409 29
4409 30
4409 31
4409 32
4409 33
4409 34
4409 35
4409 36
4409 37
4409 38
4409 39
4409 40
4409 41
4409 42
4409 43
4409 44
4409 45
4409 46
4409 47
4409 48
4409 49
4409 50
4409 51
4409 52
4409 53
4409 54
4409 55
4409 56
4409 57
4409 58
4409 59
4409 60
4409 61
4408 62
4407 63
4407 64
4407 65
4406 66
4406 67
4406 68
4406 69
2702 70
2701 71
2701 72
2701 73
2701 74
2701 75
2701 76
2701 77
2701 78
2701 79
2701 80
2701 81
2701 82
2701 83
2701 84
2701 85
2701 86
2701 87
2701 88
2701 89
2701 90
2701 91
2701 92
2701 93
2701 94
2701 95
2701 96
2701 97
2701 98
2701 99
2701 100
2701 101
2701 102
2701 103
2701 104
2701 105
2701 106
2701 107
2701 108
2701 109
2701 110
2701 111
2701 112
2701 113
2701 114
2701 115
2701 116
2701 117
2701 118
2701 119
2701 120
2701 121
2701 122
270

In [26]:
# This next step can be done in pandas, but it is quicker to iterate through the dict
for record in sequence_dict:  
    for entry in protein_set:
            sequence_dict[record][protein_dict[entry]["variant"]] = sequence_dict[record]["amino_list"].count(entry)

In [7]:
df = pd.DataFrame.from_dict(sequence_dict).T

In [8]:
df = df.apply(pd.to_numeric, errors='ignore')



In [32]:
df.dtypes

name           object
length          int64
sequence       object
description    object
type           object
                ...  
CVKLI-32-14     int64
FIT-3-0         int64
LLHCF-22-0      int64
QCCFS-10-0      int64
DGEPC-46-4      int64
Length: 2796, dtype: object

In [33]:
df

Unnamed: 0,name,length,sequence,description,type,species,country,amino_list,-0-0,LPR-3-0,...,XSTFD-7-0,MLVII-7-0,CLCRF-8-0,RSL-3-0,LMTRV-16-0,CVKLI-32-14,FIT-3-0,LLHCF-22-0,QCCFS-10-0,DGEPC-46-4
NC_045512,NC_045512,29903,"(A, T, T, A, A, A, G, G, T, T, T, A, T, A, C, ...",Severe acute respiratory syndrome coronavirus ...,refseq,complete,Severe acute respiratory syndrome-related coro...,"[IKGLYLPR, QTNQLSISCRSVL, TNFKICVAVTRLHA, CTHA...",69,1,...,0,0,1,0,1,0,0,0,0,0
MT350236,MT350236,29865,"(A, C, T, T, T, C, G, A, T, C, T, C, T, T, G, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[TFDLL, ICSLNEL, NLCGCHSAACLVHSRSIINN, LLSLTGH...",54,0,...,0,0,0,1,0,0,1,0,1,0
MT350237,MT350237,29866,"(A, A, C, T, T, T, C, G, A, T, C, T, C, T, T, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[NFRSLVDLFSKRTLKSVWLSLGCMLSALTQYN, , LITVVDRTR...",14,0,...,0,0,0,0,0,0,0,0,0,0
MT350238,MT350238,29826,"(C, C, A, A, C, T, T, T, C, G, A, T, C, T, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[PTFDLL, ICSLNEL, NLCGCHSAACLVHSRSIINN, LLSLTG...",55,0,...,0,0,0,1,0,0,1,0,1,0
MT350239,MT350239,29897,"(T, G, T, T, T, A, T, A, C, C, T, T, C, C, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[CLYLPRXQTNQLSISCRSVL, TNFKICVAVTRLHA, CTHAV, ...",69,1,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MN988713,MN988713,29882,"(A, T, T, A, A, A, G, G, T, T, T, A, T, A, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[IKGLYLPR, QTNQLSISCRSVL, TNFKICVAVTRLHA, CTHA...",69,1,...,0,0,1,0,1,0,0,0,0,0
MN938384,MN938384,29838,"(C, A, A, C, C, A, A, C, T, T, T, C, G, A, T, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,China,"[QPTFDLL, ICSLNEL, NLCGCHSAACLVHSRSIINN, LLSLT...",54,0,...,0,0,0,1,0,0,1,0,1,0
MN975262,MN975262,29891,"(A, T, T, A, A, A, G, G, T, T, T, A, T, A, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,China,"[IKGLYLPR, QTNQLSISCRSVL, TNFKICVAVTRLHA, CTHA...",69,1,...,0,0,1,0,1,0,0,0,0,0
MN985325,MN985325,29882,"(A, T, T, A, A, A, G, G, T, T, T, A, T, A, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[IKGLYLPR, QTNQLSISCRSVL, TNFKICVAVTRLHA, CTHA...",69,1,...,0,0,1,0,1,0,0,0,0,0


In [21]:
for col in df.columns: 
    print(col, df[col].sum()) 

name NC_045512MT350236MT350237MT350238MT350239MT350240MT350241MT350242MT350243MT350244MT350245MT350246MT350247MT350248MT350249MT350250MT350251MT350252MT350253MT350254MT350255MT350256MT350257MT350263MT350264MT350265MT350266MT350267MT350268MT350269MT350270MT350271MT350272MT350273MT350274MT350275MT350276MT350277MT350278MT350279MT350280MT350282MT344944MT344945MT344946MT344947MT344948MT344949MT344950MT344951MT344952MT344953MT344954MT344955MT344956MT344957MT344958MT344959MT344960MT344961MT344962MT344963MT345798MT345799MT345801MT345802MT345803MT345805MT345806MT345808MT345809MT345811MT345812MT345814MT345815MT345816MT345817MT345819MT345821MT345822MT345823MT345824MT345825MT345826MT345827MT345828MT345829MT345830MT345832MT345833MT345834MT345835MT345836MT345838MT345840MT345841MT345842MT345843MT345844MT345846MT345847MT345848MT345849MT345850MT345851MT345852MT345853MT345854MT345855MT345856MT345857MT345858MT345859MT345860MT345861MT345862MT345865MT345866MT345867MT345868MT345869MT345870MT345871MT345872MT

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 165
SHSFN-8-0 430
LLLIV-28-0 1
LRDTL-34-0 1
QRKLS-5-0 165
PKGKM-4409-8 2
RQISS-7-0 165
IS-2-0 595
TVHLT-13-0 164
LVGLQ-7-0 163
LNFH-4-0 430
HCECF-7-0 165
QKKKK-9-0 5
DFSIK-7-0 1
CTIVF-2701-6 1
QKKKK-14-3 1
PYRCC-12-1 429
HLLYT-12-0 1
LKLLI-25-0 2
SIYTL-8-0 165
LLSFY-8-0 429
NL-2-0 1
SHLRI-8-0 430
PLTFD-7-0 5
EXXXX-11-1 1
VGT-3-0 1
CVISL-10-0 165
NLLTM-19-0 165
GGLER-21-0 1
QKKKK-15-1 1
PSLLR-17-0 231
ILLLH-6-0 429
KLKKA-34-1 161
ML-2-0 165
QCVSL-12-0 429
STSCR-6-0 231
LCVPA-20-0 1
VEYGY-9-0 429
CVKLI-32-1 1
IWLEN-8-0 1
FGFYY-5-0 164
QDTSN-35-4 1
CHRCD-16-0 165
KVTFM-27-0 163
FTVMY-12-0 231
AHLLS-6-0 165
END-3-0 10
TRQKH-21-1 1
DS-2-0 860
FQLDT-9-0 230
EARNS-32-1 4
IIKIS-26-0 231
LLRCS-20-0 430
RNAGQ-16-0 430
QLNEQ-14-0 165
KSHHI-36-0 3
AYGFV-14-0 2
LLXXX-16-1 3
TSTFT-11-0 2
LKILL-24-0 1
SIQCN-30-2 3
KVWII-38-0 1
PKGKM-4409-22 1
FC-2-0 165
NLIIH-42-1 163
FTTIR-8-0 429
IRH-3-0 165
NXXXX-29-0 1
KTQAF-13-0 165
FLQTI-12-0 430
IQRRN-39-3 1
TDKYY-30-2 1
RSTFQ-19-0 408
KEQSR-15-0 165
RPTFQ-19

IKTLL-10-0 165
GLYGT-13-0 1
CTIVF-2701-16 1
GRKFC-5-0 165
QDFLK-9-0 74
ENDXK-12-0 1
KYSII-14-0 231
CYADN-10-0 429
KPFFV-37-0 165
CV-2-0 231
YGTLK-30-0 165
HMDNS-18-1 162
YTSVY-30-1 428
LQTLA-43-3 1
KL-2-0 231
CRIV-4-0 165
VTER-4-0 1
FCK-3-0 2
CGSCS-5-0 430
RKQQS-8-0 165
CYPHV-10-0 1
STPRT-10-0 430
KCFC-4-0 164
LHHKP-7-0 165
VVXXX-34-0 1
TDKYY-30-1 424
INMHX-17-0 1
CLS-3-0 430
ESS-3-0 1
SHA-3-0 165
ENDKK-17-0 1
ESSQL-5-0 429
KPLNF-35-0 2
CXXXX-14-0 1
HRSYW-6-0 429
LLVIL-27-0 165
ESSKL-7-0 2
VDHRQ-18-0 430
SEVFI-7-0 430
CQXXX-8-0 1
TNMKI-123-6 1
GGLER-21-2 2
KEL-3-0 430
CCP-3-0 430
LQTLA-43-0 2
PKGKM-4409-55 1
RHA-3-0 430
HLSFG-15-0 1
QKKKK-11-0 36
GLAVL-5-0 1
RIKQP-12-0 1
QXXKK-7-0 1
PKGKM-4409-47 1
RSCCF-433-6 1
LPGNK-39-0 1
CVKLI-19-0 15
YSLS-4-0 2
SLVSV-89-2 3
QSNQL-13-0 3
NICL-4-0 231
IRLLT-15-0 165
CVKLI-32-12 1
LEKTT-1293-4 1
VIHLK-21-0 1
KD-2-0 430
ENSLH-24-0 231
KCKVT-7-0 165
PTVIR-6-0 165
ITHSV-9-0 231
ENXKK-14-0 1
LFAFL-6-0 430
PGCCS-9-0 430
LP-2-0 165
KVEIS-7-0 1
FKNYR-20-0 4

RPTTT-10-0 429
KLVIY-7-0 165
CVKLI-33-1 1
QTIPS-21-0 165
WWQFV-7-0 163
RHNDE-25-0 165
VKHLS-27-1 161
ENDKK-18-1 1
SCTKP-13-2 425
LSKRD-7-0 1
LA-2-0 595
LLNKT-55-2 1
NASES-35-2 1
QKKKK-14-0 2
YL-2-0 165
TMLRC-46-2 396
RLFRV-440-0 1
LTSLA-25-0 165
FPGNK-39-1 1
TNMKI-96-0 1
LRDTL-34-1 229
INMHS-17-0 224
CTIVF-2694-76 1
LLRLL-9-0 165
PKGKM-4409-27 1
DGXXX-46-0 1
TKFKI-14-0 1
MGR-3-0 429
QTFKL-13-0 7
MVY-3-0 231
ENHKK-7-0 1
MLLRP-11-0 165
CLSFV-13-1 1
HQEQI-51-0 89
ENDKK-16-1 1
TNLWI-7-0 231
IVRG-4-0 228
WLSYF-7-0 430
CCSRS-14-0 1
CYDVH-6-0 429
QIHRW-15-1 1
TVHQT-79-1 163
WKILP-14-0 165
PKGKM-4409-39 3
LEKTT-1293-25 3
DLFSK-26-0 1
EGCIL-7-0 430
QXNQL-13-0 1
GCRSC-13-1 425
SYHIP-8-0 429
LLWLY-16-0 165
VVLSV-20-0 228
ALCVH-19-0 165
FGYQQ-25-0 1
SCCTT-19-0 430
SCTKP-12-0 1
LVVQL-14-0 164
VLFMH-36-3 161
GNNPI-37-0 1
RTNGK-15-0 1
RSK-3-0 430
KVC-3-0 29
HSTC-4-0 430
QCVIL-12-0 1
ENDKK-8-0 2
NHW-3-0 165
FFFRF-19-0 2
CTIVF-2701-70 1
PIDSF-9-0 231
AGCC-4-0 429
LNSGL-27-1 425
ETHFL-5-0 165
LNKAL-6-0 

TSLRA-42-0 165
LVDLF-28-0 7
KLIVV-15-0 1
LYCML-12-0 2
CTIVF-2701-74 1
CL-2-0 230
EKYLL-10-0 231
SCRIP-5-0 430
GPAVL-5-0 230
EARNS-32-0 425
GDI-3-0 165
LLRRM-21-2 1
STQPL-6-0 165
WTPKS-22-1 1
FLAYV-8-0 430
GFIP-4-0 430
VTTDV-8-0 164
INVVE-12-0 2
RRF-3-0 1
FRCRN-5-0 430
VLSGC-62-1 67
CFFFC-38-0 1
LXTFD-7-0 1
RWNLH-12-0 165
AVLTQ-20-0 135
CVKLI-35-1 1
YCC-3-0 430
NLLSS-13-2 428
CRFNF-7-0 165
IKPFP-9-0 164
QSR-3-0 430
EGIKI-9-0 165
KTLTL-45-0 1
QY-2-0 165
LLRRM-19-0 2
KVLLL-36-0 165
HTLCA-14-2 1
SGNNF-5-0 430
SHLT-4-0 164
PKGKM-4409-10 3
KLQKR-10-0 165
ENDKK-16-0 1
CVTNV-70-0 1
NCLKI-12-0 231
QLSIS-10-0 5
TCISL-18-0 1
LTSIC-7-0 430
HFKVL-7-0 165
PSHRQ-8-0 1
TLAAN-41-0 1
IKLFP-9-0 1
TVFVL-36-0 165
RSSGQ-13-0 1
LLLA-4-0 165
ASAQR-290-1 1
QKKKK-14-1 1
CYPHV-10-3 1
KNGFT-22-0 429
QDTSN-35-2 246
INALR-8-0 231
LLRRM-16-1 44
LVIVQ-49-0 230
QRSKF-13-0 427
LLRRV-16-0 1
CSLPT-19-1 1
PAWW-4-0 279
HIFSG-11-0 165
RSCCF-433-4 10
THA-3-0 430
RTS-3-0 165
MKSNQ-12-1 7
PXQIP-22-0 1
IQLCS-7-0 1
CFFFS-33-0 1


In [15]:
df["-0-0"].sum()

41772