<h1>Table of Contents<span class="tocSkip"></span></h1>
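<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Exploratory-Data-Analysis" data-toc-modified-id="Exploratory-Data-Analysis-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Exploratory Data Analysis</a></span></li></ul></div>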

# Covid 19 Notebook

<strong>Goals and notes</strong>

In [1]:
%%bash

# Print the working directory and list its files; the parsing cell opens
# "sequences.fasta" by relative path, so it has to be present here.
pwd
ls

/home/ruggm/BioInformatics/000_Research/Covid19
Covid_Notebook.ipynb
sequences.fasta


## Libraries

In [1]:
import numpy as np
import pandas as pd

from Bio import SeqIO

## Exploratory Data Analysis

In [2]:
# Parse every FASTA record into a per-accession dict of metadata and collect
# the set of all translated peptide fragments seen across the records.
sequence_dict = dict()
protein_set = set()

# Plain "r" already gives universal-newline handling in Python 3. The legacy
# "rU" flag only produced a DeprecationWarning and is rejected with a
# ValueError from Python 3.11 onwards.
with open("sequences.fasta", "r") as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Split the pipe-delimited description once instead of once per field.
        # NOTE(review): in the df.head() output the NC_045512 row shows
        # type="refseq" and species="complete", so the header layout does not
        # look uniform across records -- confirm these positions per source.
        fields = record.description.split("|")

        # These sequences are very much out of alignment, but the protein view
        # is wanted for phylogenetics: translate the DNA and cut the result at
        # every stop codon ("*") to get a list of peptide fragments.
        amino_list = str(record.seq.translate()).split("*")

        # Key order is kept as before so the DataFrame columns do not move.
        sequence_dict[record.id] = {
            "name": record.id,
            "length": len(record.seq),
            "sequence": record.seq,
            "description": fields[1],
            "type": fields[3],
            "species": fields[4],
            "country": fields[6],
            "amino_list": amino_list,
        }

        # Accumulate the distinct fragments to explore which ones each
        # record uses.
        protein_set |= set(amino_list)

  with open("sequences.fasta", "rU") as fasta_file:


In [3]:
# There are a lot of entries here, so we need to figure out a way to categorize them:
protein_list = list(protein_set)

# Make a sorted list of proteins
protein_list.sort(key=lambda item: (-len(item), item))

In [4]:
# Give every fragment in protein_list two labels:
#   family  -> "<first five residues>-<length>"
#   variant -> "<first five residues>-<length>-<count>", where count is a running
#              index over consecutive list entries sharing the same
#              five-residue prefix (it restarts at 0 when the prefix changes).
protein_dict = dict()

count = 0
# The first fragment has no predecessor. Indexing with [index - 1] at index 0
# would wrap around to the LAST list entry and could start the count at 1.
previous_prefix = None
for protein in protein_list:
    prefix = protein[0:5]
    if prefix == previous_prefix:
        count += 1
    else:
        count = 0
    previous_prefix = prefix

    # Here we make two different classifications, variant and family.
    protein_dict[protein] = {
        "variant": "{}-{}-{}".format(prefix, len(protein), count),
        "family": "{}-{}".format(prefix, len(protein)),
    }

In [5]:
# This next step can be done in pandas, but it is quicker to iterate through the dict
for record in sequence_dict:  
    for entry in protein_set:
            sequence_dict[record][protein_dict[entry]["variant"]] = sequence_dict[record]["amino_list"].count(entry)

In [49]:
# Outer dict keys become columns by default; transpose so each record is a row.
df = pd.DataFrame(sequence_dict).transpose()

In [53]:
# Preview the first rows: metadata columns first, then one count column per variant label.
df.head()

Unnamed: 0,name,length,sequence,description,type,species,country,amino_list,-0-0,SSIGW-10-0,...,TALCV-21-0,PKGKM-4409-15,NLITH-42-0,KLLCT-31-0,TVHQT-79-0,LKLLI-25-0,RQISS-7-0,PXXXX-22-0,GTL-3-0,WPTLQ-10-0
NC_045512,NC_045512,29903,"(A, T, T, A, A, A, G, G, T, T, T, A, T, A, C, ...",Severe acute respiratory syndrome coronavirus ...,refseq,complete,Severe acute respiratory syndrome-related coro...,"[IKGLYLPR, QTNQLSISCRSVL, TNFKICVAVTRLHA, CTHA...",69,1,...,1,0,0,0,0,0,0,0,0,0
MT350236,MT350236,29865,"(A, C, T, T, T, C, G, A, T, C, T, C, T, T, G, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[TFDLL, ICSLNEL, NLCGCHSAACLVHSRSIINN, LLSLTGH...",54,0,...,0,0,0,1,0,0,1,0,1,1
MT350237,MT350237,29866,"(A, A, C, T, T, T, C, G, A, T, C, T, C, T, T, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[NFRSLVDLFSKRTLKSVWLSLGCMLSALTQYN, , LITVVDRTR...",14,0,...,0,0,0,0,0,0,0,0,0,0
MT350238,MT350238,29826,"(C, C, A, A, C, T, T, T, C, G, A, T, C, T, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[PTFDLL, ICSLNEL, NLCGCHSAACLVHSRSIINN, LLSLTG...",55,0,...,0,0,0,1,0,0,1,0,1,1
MT350239,MT350239,29897,"(T, G, T, T, T, A, T, A, C, C, T, T, C, C, C, ...",Severe acute respiratory syndrome coronavirus ...,complete,Homo sapiens,USA,"[CLYLPRXQTNQLSISCRSVL, TNFKICVAVTRLHA, CTHAV, ...",69,1,...,1,0,0,0,0,0,0,0,0,0


In [8]:
# Rebuild the set of distinct fragments from the fragment lists already stored
# on each record.
protein_set = {
    fragment
    for record_fields in sequence_dict.values()
    for fragment in record_fields["amino_list"]
}

In [9]:
# Number of distinct fragments collected across all records.
len(protein_set)

2788