#### corona genome explorer

In [1]:
from Bio import SeqIO
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt
import zlib
import os
from IPython.display import display, HTML

# data folders
# `root_folder` is string-concatenated with a virus name and "/" further down,
# so it must keep its trailing path separator.
root_folder = "./data/genbank/"
# Each entry names a sub-folder of `root_folder` that is scanned for ".gb" files.
viruses = ["coronaviridae"]

Parse the genome of each virus sample (stored as a GenBank file) and condense the information into a data frame.

In [10]:
# Build one data frame per virus folder: one row per sequence record found in
# the ".gb" files of that folder.
# NOTE: `df` is rebuilt on every iteration, so if `viruses` holds more than one
# entry only the frame of the last one is left in `df` after the loop.
for vrs in viruses:
    print ("[+] reading genomes")

    # define data frame structure
    feature_columns = ["country", "collection_date", "host", "strain"]
    sample_columns = ["id", "length", "information", "sequence"]
    total_columns = sample_columns + feature_columns
    vrs_frame = {column: [] for column in total_columns}

    # loop over gb files (sorted, because os.listdir returns entries in
    # arbitrary order and the row order should be reproducible between runs)
    data_folder = root_folder + vrs + "/"
    for filename in sorted(os.listdir(data_folder)):
        if not filename.endswith(".gb"):
            continue

        for seq_record in SeqIO.parse(os.path.join(data_folder, filename), "genbank"):

            # get several additional variables from the "source" feature of
            # the genbank record (if available)
            source_feature = next(
                (ftrs for ftrs in seq_record.features if ftrs.type == "source"),
                None,
            )
            if source_feature is None:
                # report once per record that lacks a source feature
                print("not source on : ", filename)
                qualifiers = {}
            else:
                qualifiers = source_feature.qualifiers

            # Always append exactly one value per feature column, so that every
            # column list keeps the same length as the sample columns below.
            for var in feature_columns:
                values = qualifiers.get(var)
                vrs_frame[var].append(values[0] if values else "unknown")

            # genome sequence (as string), converted once and reused
            genome_sequence = str(seq_record.seq)

            # compress virus genome to get an idea of the amount of information.
            info = len(zlib.compress(genome_sequence.encode('utf-8')))

            # genome length
            genome_length = len(seq_record.seq)

            vrs_frame["id"].append(seq_record.id)
            vrs_frame["information"].append(info)
            vrs_frame["length"].append(genome_length)
            vrs_frame["sequence"].append(genome_sequence)

    df = pd.DataFrame.from_dict(vrs_frame)
    print(df)

[+] reading genomes
              id  length  information  \
0     LC063818.1   27904         8340   
1     GU553365.1   29644         8863   
2     KX425847.1   27709         8269   
3     LP731475.1   28038         8372   
4     LC063838.1   27481         8224   
...          ...     ...          ...   
3410  KJ569769.1   25422         7693   
3411  JX860640.1   31028         9211   
3412  KX348117.1   27685         8255   
3413  MG197712.1   30497         9036   
3414  KT253328.1   27270         8087   

                                               sequence          country  \
0     CAATTCAACTAAACGAAATTTTGTCCTTCCGGCCGCATGTCCATGC...  Japan: Kumamoto   
1     CGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGT...              USA   
2     ACTGAAAATAGATATTATTATATATCTATTACACTAGCCTTGCGCT...            China   
3     ACTTAAAAAGATTTTCTATCTACAGATAGTTAGCTCTTTTTCTAGA...          unknown   
4     CAATTCAACTAAACGAAATTTTGTCCTTCCGGCCGCATGTCCATGC...    Japan: Miyagi   
...                            