In [2]:
from Bio import SeqIO
import re
from collections import Counter
import pandas as pd
import os.path
import panel as pn
# Mitochondrial genes to process; parser() uses each name both as the FASTA
# file stem and as the name of the count column in the species table.
genes = [
    'A6', 'A8',
    'CO1', 'CO2', 'CO3',
    'Cytb',
    'ND1', 'ND2', 'ND3', 'ND4', 'ND4L', 'ND5', 'ND6',
]

# Folder holding the per-gene FASTA files (read as '<folder>/<gene>.fasta').
PATH_TO_MIDORI_FOLDER = '../../../MIDORI'
# Folder where the generated CSV/XLSX tables are written.
PATH_TO_MIDORI_TABLE = '../../interim/MIDORI'

In [15]:
def parser(taxa, gene):
    """Count sequences per species for one gene and merge them into the species table.

    Reads ``{PATH_TO_MIDORI_FOLDER}/{gene}.fasta``, keeps every record whose id
    contains ``taxa`` and counts how often each species key occurs. The counts
    are stored in a column named after ``gene`` and merged (outer join on
    ``'Species'``) into ``{PATH_TO_MIDORI_TABLE}/midori_sp_table.csv``, which is
    created if it does not exist yet. Missing counts are written as 0.

    The species key is built from the ';'-separated record id as
    ``field7|field4|field5|fields1-5 joined by ';'``.
    NOTE(review): presumably species / order / family / higher taxonomy in the
    MIDORI header layout -- confirm against the database release in use.

    Parameters
    ----------
    taxa : str
        Substring that must occur in a record id for the record to be counted.
    gene : str
        Gene name; selects the FASTA file and names the count column.

    Returns
    -------
    None
        The result is written to the CSV table on disk.

    Raises
    ------
    IndexError
        If a matching record id has fewer than 8 ';'-separated fields.
    """
    table_path = f'{PATH_TO_MIDORI_TABLE}/midori_sp_table.csv'
    db = f'{PATH_TO_MIDORI_FOLDER}/{gene}.fasta'

    counted_sp = Counter()
    for entry in SeqIO.parse(db, 'fasta'):
        if taxa in entry.id:
            taxonomy = entry.id.split(';')
            #Species and Taxonomy are merged for now
            counted_sp[f'{taxonomy[7]}|{taxonomy[4]}|{taxonomy[5]}|{";".join(taxonomy[1:6])}'] += 1

    tmp_df = pd.DataFrame({'Species': list(counted_sp.keys()),
                           gene: list(counted_sp.values())})

    #Check if we already have data on other genes
    if os.path.isfile(table_path):
        df = pd.read_csv(table_path)
    else:
        df = pd.DataFrame()

    if 'Species' in df.columns:
        # Join explicitly on the species key. Without `on=`, merge joins on every
        # shared column, so a repeated run for the same gene would also join on
        # the count column and duplicate rows. Dropping a stale column for this
        # gene first lets a repeated run replace the counts instead.
        df = df.drop(columns=[gene], errors='ignore')
        df = pd.merge(df, tmp_df, on='Species', how='outer')
    else:
        # No table yet: the new counts are the table. (DataFrame.append, used
        # here previously, was removed in pandas 2.0.)
        df = tmp_df

    # Species missing from one side of the join have no count for that gene.
    df = df.fillna(0)
    df.to_csv(table_path, index=False)
    




In [16]:
# Start from a clean table. parser() merges into whatever CSV is already on
# disk, and this cell later overwrites that same CSV with the species key split
# into separate columns. Without this reset, re-running the cell would merge
# fresh 'species|order|family|taxonomy' keys against already-split names and
# the split below would fail.
table_path = f'{PATH_TO_MIDORI_TABLE}/midori_sp_table.csv'
if os.path.isfile(table_path):
    os.remove(table_path)

#Parse
for gene in genes:
    parser('Insecta', gene)

#Separate taxonomy from species
df = pd.read_csv(table_path)
# Each key is 'species|order|family|taxonomy' as written by parser().
key_parts = [sp_fam.split('|') for sp_fam in df['Species']]

df['Species'] = [parts[0] for parts in key_parts]
df['Orders'] = [parts[1] for parts in key_parts]
df['Families'] = [parts[2] for parts in key_parts]
df['Complete Taxonomy'] = [parts[3] for parts in key_parts]

# Descriptive columns first, then one count column per gene in `genes` order.
df = df[['Species', 'Orders', 'Families', 'Complete Taxonomy', *genes]]
# Non-inplace sort: avoids mutating a column-selected frame in place.
df = df.sort_values(by=['Orders', 'Families'])
df.to_csv(table_path, index=False)

  df = df.append(tmp_df)


In [17]:
#Complete stats of df
# Reload the table from disk so the numbers reflect what was actually saved.
df = pd.read_csv(f'{PATH_TO_MIDORI_TABLE}/midori_sp_table.csv')

# Number of rows (one per species) followed by the total sequence count per gene.
print(f'Sp_Num = {df["Species"].size}')
for gene in genes:
    gene_total = sum(df[gene])
    print(f'{gene} - {gene_total}')

Sp_Num = 125214
A6 - 9768.0
A8 - 7705.0
CO1 - 834893.0
CO2 - 52078.0
CO3 - 10269.0
Cytb - 41420.0
ND1 - 15596.0
ND2 - 12076.0
ND3 - 8409.0
ND4 - 13018.0
ND4L - 7957.0
ND5 - 21779.0
ND6 - 9012.0


In [19]:
# Reload from disk so this cell always starts from the unmodified table.
df = pd.read_csv(f'{PATH_TO_MIDORI_TABLE}/midori_sp_table.csv')

# Per gene: how many species have five or more sequences.
for gene in genes:
    at_least_5 = int((df[gene] >= 5).sum())
    print(f'Number of species with at least 5 sequences of {gene} - {at_least_5}')

# Every column except the descriptive ones is a per-gene count.
col_list = df.columns.tolist()
for descriptive_col in ('Species', 'Orders', 'Families', 'Complete Taxonomy'):
    col_list.remove(descriptive_col)

# Row-wise sum over all gene columns: total sequences per species.
df['Total'] = df[col_list].sum(axis=1)
enough_in_total = int((df['Total'] >= 5).sum())
print(f'At least 5 sequences of any gene - {enough_in_total}')

Number of species with at least 5 sequences of A6 - 119
Number of species with at least 5 sequences of A8 - 55
Number of species with at least 5 sequences of CO1 - 31690
Number of species with at least 5 sequences of CO2 - 1627
Number of species with at least 5 sequences of CO3 - 139
Number of species with at least 5 sequences of Cytb - 1145
Number of species with at least 5 sequences of ND1 - 256
Number of species with at least 5 sequences of ND2 - 199
Number of species with at least 5 sequences of ND3 - 78
Number of species with at least 5 sequences of ND4 - 182
Number of species with at least 5 sequences of ND4L - 61
Number of species with at least 5 sequences of ND5 - 476
Number of species with at least 5 sequences of ND6 - 113
At least 5 sequences of any gene - 38938


In [26]:
#Make table of only CO1 with at least 5 seqs per sp
df = pd.read_csv(f'{PATH_TO_MIDORI_TABLE}/midori_sp_table.csv')

# Keep species with five or more CO1 sequences, then drop every other gene column.
other_genes = [gene for gene in genes if gene != 'CO1']
df = df[df['CO1'] >= 5].drop(columns=other_genes)

# Write the same table in both CSV and Excel form.
df.to_csv(f'{PATH_TO_MIDORI_TABLE}/midori_CO1_table.csv', index=False)
df.to_excel(f'{PATH_TO_MIDORI_TABLE}/midori_CO1_table.xlsx', index=False)