## Create a cleaned CSV with sequences and metadata from NCBI Virus

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Inputs: the protein FASTA file and the comma-separated metadata table that
# is later joined to it on the accession.
fasta, csv = (
    "../src/prot/sequences.fasta",
    "../src/prot/sequences.csv",
)

# Destination of the merged sequences + metadata table.
output = "../src/sarscov2_beta_prot.csv"

In [11]:
def fasta_to_df(path):
    """Parse a FASTA file into a DataFrame of accessions and sequences.

    Only the accession and the sequence of each record are kept, so the result
    can later be joined with the metadata held in a separate CSV.

    The accession is the header text between the leading ">" and the first
    "|", with surrounding whitespace removed. The sequence is the
    concatenation of all following lines (each stripped of whitespace) up to
    the next header or the end of the file. Lines that appear before the first
    header are ignored.

    Progress is printed to stdout: the running number of records every 1000
    records, and the total number of records once the file is exhausted.

    Parameters
    ----------
    path : str or path-like
        Location of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Accession`` and ``Fasta``, one row per FASTA record, in file
        order.
    """
    accessions = []
    sequences = []
    chunks = None  # pieces of the record being read; None until the first header

    with open(path) as handle:
        for line in handle:
            if line.startswith(">"):
                # A new header closes the previous record, if there is one.
                if chunks is not None:
                    sequences.append("".join(chunks))
                accessions.append(line.strip().split("|")[0].lstrip(">").strip())
                chunks = []
                # Count records, not loop iterations, so the numbers printed
                # always match the number of rows returned.
                if len(accessions) % 1000 == 0:
                    print(len(accessions))
            elif chunks is not None:
                chunks.append(line.strip())
            # else: stray line before the first header -> skipped

        # Close the last record (a header with no sequence yields "").
        if chunks is not None:
            sequences.append("".join(chunks))

    print(len(accessions))
    return pd.DataFrame({'Accession': accessions, 'Fasta': sequences})
                
# Parse the FASTA file into a two-column (Accession, Fasta) frame.
df_seq = fasta_to_df(path=fasta)

# Bare expression so the notebook renders the frame as the cell output.
df_seq

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
26580


Unnamed: 0,Accession,Fasta
0,AAU04661,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...
1,AAU04648,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...
2,AAU04658,MSDNGPQSNQRSAPRITFGGPTDSTDNNQNGGRNGARPKQRRPQGL...
3,AAU04650,MDLFMRIFTLGSITAQPVKIDNASHASTVRATATIPLQASLPFGWL...
4,AAU04656,MNELTLIDFYLCFLAFLLFLVLIMLIIFWFSLEIQDLEEPCTKV
...,...,...
26575,CAA28657,MALIGPKTTIAAVFIGPFLVACMLGIGLVYLLQLQVQIFHVKDTIR...
26576,CAA27802,MSSTTQAPGPVYQWTADEAVQFLKEWNFSLGIILLFITIILQFGYT...
26577,CAA25497,MSFVPGQENAGSRSSSGNRAGNGILKKTTWADQTERGLNNQNRGRK...
26578,CAA25197,MSSTTQAPEPVYQWTADEAVQFLKEWNFSLGIILLFITIILQFGYT...


In [12]:
# Load the comma-separated metadata table; it shares the "Accession" column
# with the sequence frame and is joined to it in a later cell.
df_meta = pd.read_csv(csv)
df_meta

Unnamed: 0,Accession,Release_Date,Species,Genus,Family,Length,Sequence_Type,Nuc_Completeness,Genotype,Segment,Authors,Publications,Geo_Location,Host,Isolation_Source,Collection_Date,BioSample,GenBank_Title
0,AAU04661,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,1255,GenBank,partial,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,spike glycoprotein [SARS coronavirus civet014]
1,AAU04648,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,7073,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,replicase p1AB [SARS coronavirus civet010]
2,AAU04658,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,422,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,nucleocapsid protein [SARS coronavirus civet010]
3,AAU04650,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,274,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,orf3 [SARS coronavirus civet010]
4,AAU04656,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,44,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,orf9 [SARS coronavirus civet010]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26575,CAA28657,1987-06-07T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,139,GenBank,partial,,,"Skinner,M.A., Siddell,S.G.",3973564,,,,,,unnamed protein product [Murine hepatitis virus]
26576,CAA27802,1987-03-09T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,228,GenBank,partial,,,"Pfleiderer,M., Skinner,M.A., Siddell,S.G.",3748812,,,,,,unnamed protein product [Murine hepatitis virus]
26577,CAA25497,1985-06-13T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,455,GenBank,partial,,,"Skinner,M.A., Siddell,S.G.",6308569,,,,,,nucleocapsid protein [Murine hepatitis virus]
26578,CAA25197,1983-12-06T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,228,GenBank,partial,,,"Spaan,W., Delius,H., Skinner,M., Armstrong,J.,...","6196191, 6687635, 6325918, 1604932",,,,,,E1 glycoprotein [Murine hepatitis virus]


In [13]:
"""joining tables"""
# Attach the metadata to every parsed sequence. The left join keeps sequences
# that have no metadata row (their metadata columns are filled with NaN).
# validate="many_to_one" makes pandas raise MergeError if an accession occurs
# more than once in df_meta, instead of silently duplicating sequence rows in
# the exported table.
df = df_seq.merge(df_meta, on="Accession", how="left", validate="many_to_one")
df

Unnamed: 0,Accession,Fasta,Release_Date,Species,Genus,Family,Length,Sequence_Type,Nuc_Completeness,Genotype,Segment,Authors,Publications,Geo_Location,Host,Isolation_Source,Collection_Date,BioSample,GenBank_Title
0,AAU04661,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,1255,GenBank,partial,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,spike glycoprotein [SARS coronavirus civet014]
1,AAU04648,MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,7073,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,replicase p1AB [SARS coronavirus civet010]
2,AAU04658,MSDNGPQSNQRSAPRITFGGPTDSTDNNQNGGRNGARPKQRRPQGL...,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,422,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,nucleocapsid protein [SARS coronavirus civet010]
3,AAU04650,MDLFMRIFTLGSITAQPVKIDNASHASTVRATATIPLQASLPFGWL...,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,274,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,orf3 [SARS coronavirus civet010]
4,AAU04656,MNELTLIDFYLCFLAFLLFLVLIMLIIFWFSLEIQDLEEPCTKV,2005-08-23T00:00:00Z,Severe acute respiratory syndrome-related coro...,Betacoronavirus,Coronaviridae,44,GenBank,complete,,,"Wang,M., Yan,M., Xu,H., Liang,W., Kan,B., Zhen...",16485471,China: Southern China,Viverridae,feces,,,orf9 [SARS coronavirus civet010]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26575,CAA28657,MALIGPKTTIAAVFIGPFLVACMLGIGLVYLLQLQVQIFHVKDTIR...,1987-06-07T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,139,GenBank,partial,,,"Skinner,M.A., Siddell,S.G.",3973564,,,,,,unnamed protein product [Murine hepatitis virus]
26576,CAA27802,MSSTTQAPGPVYQWTADEAVQFLKEWNFSLGIILLFITIILQFGYT...,1987-03-09T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,228,GenBank,partial,,,"Pfleiderer,M., Skinner,M.A., Siddell,S.G.",3748812,,,,,,unnamed protein product [Murine hepatitis virus]
26577,CAA25497,MSFVPGQENAGSRSSSGNRAGNGILKKTTWADQTERGLNNQNRGRK...,1985-06-13T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,455,GenBank,partial,,,"Skinner,M.A., Siddell,S.G.",6308569,,,,,,nucleocapsid protein [Murine hepatitis virus]
26578,CAA25197,MSSTTQAPEPVYQWTADEAVQFLKEWNFSLGIILLFITIILQFGYT...,1983-12-06T00:00:00Z,Murine coronavirus,Betacoronavirus,Coronaviridae,228,GenBank,partial,,,"Spaan,W., Delius,H., Skinner,M., Armstrong,J.,...","6196191, 6687635, 6325918, 1604932",,,,,,E1 glycoprotein [Murine hepatitis virus]


In [14]:
# Persist the merged table. index=True is the pandas default, spelled out here
# because it means the row index is written as the first (unnamed) column.
df.to_csv(path_or_buf=output, index=True)