In [1]:
#Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import re

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [19]:
# Tab-separated hit tables to load (names or paths), one per searched database.
# Keep the same order as `databases_names` below.
files = [
    "bombina/bombina_corset_DEGS__unk_not-unk.fasta.transdecoder.pep_nr.tsv",
    "bombina/bombina_corset_DEGS__unk_not-unk.fasta.transdecoder.pep_tr.tsv",
    "bombina/bombina_corset_DEGS__unk_not-unk.fasta.transdecoder.pep_sp.tsv",
]

# Title of the graphs; also stored in the `row` column of the annotation table
title = "bombina_pachypus_blastp"

# Display name of the database behind each entry of `files`
databases_names = ["Nr", "TrEMBL", "Swiss-Prot"]

# Tab-separated table that provides the `transcript` IDs to annotate
table_path = "./bombina/bombina_unref_vs_not_unkref_table_padj_0.05----log2fc_1 (4).tsv"

# Where the graphs are written
path = f"bombina/report/{title}"

# Output format: the space-separated column names of the hit tables, in file order.
# The cells below read the qseqid, sseqid, slen, evalue, length, pident and
# stitle columns, so those must be present.
# If the files already contain column names then set outfmt = None
outfmt = (
    "qseqid qlen sseqid sallseqid slen qstart qend sstart send "
    "qseq full_qseq sseq full_sseq evalue bitscore score length "
    "pident nident mismatch positive gapopen gaps ppos qframe btop cigar "
    "staxids sscinames sskingdoms skingdoms sphylums stitle salltitles "
    "qcovhsp scovhsp qtitle qqual full_qqual qstrand"
)

# Columns of the final annotation table, in output order
features = [
    "transcript",
    "row",
    "log2FoldChange",
    "padj",
    "protein_accession",
    "sequence_identity",
    "alignment_length",
    "evalue",
    "database",
    "locus_name",
    "sequence_description",
    "sequence_length",
    "organism",
    "protein_product",
]

# Load the tab-separated transcript table; later cells use its `transcript`
# column as the merge key and take `log2FoldChange` / `padj` from it
table = pd.read_csv(table_path, delimiter="\t")

In [20]:
# Preview the first five rows of the transcript table
table.head(n=5)

Unnamed: 0,transcript,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,NODE_100481_length_1662_cov_45.942320_g39073_i0,113.572774,3.729238,0.749879,4.973122,6.588303e-07,0.001008
1,NODE_100687_length_1658_cov_35.724701_g39182_i0,34.441283,2.713316,0.640121,4.238757,2.247607e-05,0.016458
2,NODE_100697_length_1658_cov_14.098680_g3928_i1,20.244586,-5.263078,1.052069,-5.002597,5.656316e-07,0.000894
3,NODE_101001_length_1653_cov_22.559899_g39352_i0,34.277619,2.555678,0.589651,4.334222,1.462765e-05,0.012317
4,NODE_101191_length_1650_cov_9.495262_g39455_i0,19.673311,-5.888866,1.467984,-4.011534,6.032561e-05,0.036529


In [21]:
def get_transcript(row, transcripts=None):
    """Return the transcript ID that a query sequence ID starts with.

    The IDs are compared as literal text. They used to be compiled as regular
    expressions, so a '.' inside an ID matched any character and IDs holding
    other regex metacharacters could fail or raise.

    Parameters
    ----------
    row : str
        Query sequence ID (a value of the `qseqid` column).
    transcripts : iterable of str, optional
        Candidate transcript IDs. Defaults to the `transcript` column of the
        global `table`.

    Returns
    -------
    str or None
        The longest candidate ID that is a prefix of `row`, so that an ID
        ending in "_i10" is not assigned to the candidate ending in "_i1".
        On a tie the first candidate wins. None when nothing matches or when
        `row` is not a string (e.g. NaN).
    """
    if not isinstance(row, str):
        return None

    if transcripts is None:
        transcripts = table.transcript

    best_match = None
    for candidate in transcripts:
        # Skip missing IDs (NaN) instead of failing on them
        if not isinstance(candidate, str) or not row.startswith(candidate):
            continue
        if best_match is None or len(candidate) > len(best_match):
            best_match = candidate
    return best_match

def _title_description(stitle, stop_prefix):
    """Return the words of a subject title between its ID and its organism field.

    The first space-separated token (the sequence ID) is dropped and the text is
    cut at the first token that starts with `stop_prefix`. When no such token
    exists the remainder of the title is returned instead of raising. Non-string
    input (e.g. NaN) gives None.
    """
    if not isinstance(stitle, str):
        return None
    tokens = stitle.split(" ")
    for position, token in enumerate(tokens):
        if token.startswith(stop_prefix):
            return " ".join(tokens[1:position])
    return " ".join(tokens[1:])


def _bracket_organism(stitle):
    """Organism of an "ID description [organism]" title: the text after the last '['."""
    if not isinstance(stitle, str):
        return None
    opening = stitle.rfind("[")
    if opening == -1:
        return None
    # The final character is dropped because it is expected to be the closing ']'
    return stitle[opening + 1:-1]


def _bracket_description(stitle):
    """Description of an "ID description [organism]" title."""
    return _title_description(stitle, "[")


def _first_word(sseqid):
    """Accession of a plain subject ID: everything before the first space."""
    return sseqid.split(" ")[0] if isinstance(sseqid, str) else None


def _no_locus(sseqid):
    """Plain subject IDs carry no locus name."""
    return None


def _uniprot_organism(stitle):
    """Organism of a title with "OS=... OX=..." fields: the text between them."""
    if not isinstance(stitle, str):
        return None
    start = stitle.find("OS=")
    if start == -1:
        return None
    start += 3
    end = stitle.find("OX=", start)
    if end == -1:
        # No OX field: keep everything that follows "OS="
        return stitle[start:]
    # end - 1 also removes the separator character in front of "OX="
    return stitle[start:end - 1]


def _uniprot_description(stitle):
    """Description of a title whose organism is introduced by "OS="."""
    return _title_description(stitle, "OS=")


def _uniprot_accession(sseqid):
    """Second '|'-separated field of the subject ID (the whole ID if there is no '|')."""
    if not isinstance(sseqid, str):
        return None
    fields = sseqid.split("|")
    return fields[1] if len(fields) > 1 else sseqid


def _uniprot_locus(sseqid):
    """Third '|'-separated field of the subject ID, or None when it is absent."""
    if not isinstance(sseqid, str):
        return None
    fields = sseqid.split("|")
    return fields[2] if len(fields) > 2 else None


# (organism, description, accession, locus) parsers for the two title layouts
_BRACKET_PARSERS = (_bracket_organism, _bracket_description, _first_word, _no_locus)
_UNIPROT_PARSERS = (_uniprot_organism, _uniprot_description, _uniprot_accession, _uniprot_locus)

# Fail before any file is read rather than with an IndexError halfway through
if len(databases_names) < len(files):
    raise ValueError("databases_names must provide one name for each entry of files")

# outfmt = None means the files carry their own header row
blast_columns = outfmt.split() if outfmt else None

annotated_frames = []

for blast_file, database_name in zip(files, databases_names):

    # Import the dataset
    df_tmp = pd.read_csv(blast_file, sep="\t", names=blast_columns)

    # The layout of a whole file is decided from the title of its first hit;
    # an empty file simply contributes no rows
    uniprot_style = (not df_tmp.empty) and "OS=" in str(df_tmp["stitle"].iloc[0])
    (get_sciname, get_protein_function,
     get_protein_accession, get_locus_name) = (
        _UNIPROT_PARSERS if uniprot_style else _BRACKET_PARSERS
    )

    df_tmp["transcript"] = df_tmp["qseqid"].apply(get_transcript)
    df_tmp["row"] = title
    df_tmp["sequence_identity"] = df_tmp["pident"]
    df_tmp["alignment_length"] = df_tmp["length"]
    df_tmp["sequence_description"] = df_tmp["stitle"]
    df_tmp["sequence_length"] = df_tmp["slen"]
    df_tmp["database"] = database_name

    df_tmp["organism"] = df_tmp["stitle"].apply(get_sciname)
    df_tmp["protein_accession"] = df_tmp["sseqid"].apply(get_protein_accession)
    df_tmp["protein_product"] = df_tmp["stitle"].apply(get_protein_function)
    df_tmp["locus_name"] = df_tmp["sseqid"].apply(get_locus_name)

    # Inner merge: hits whose query matches no transcript of `table` are dropped
    df_tmp = pd.merge(df_tmp, table, on="transcript")

    annotated_frames.append(df_tmp[features])

# Concatenate once at the end; each file keeps its own row labels, as before
df = pd.concat(annotated_frames) if annotated_frames else pd.DataFrame(columns=features)

In [31]:
# Show the ten first hits after ordering by transcript, then by ascending e-value
df.sort_values(by=["transcript", "evalue"]).head(n=10)

Unnamed: 0,transcript,row,log2FoldChange,padj,protein_accession,sequence_identity,alignment_length,evalue,database,locus_name,sequence_description,sequence_length,organism,protein_product
0,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,A0A6P7WME3,38.9,332,1.87e-64,TrEMBL,A0A6P7WME3_9AMPH,tr|A0A6P7WME3|A0A6P7WME3_9AMPH epidermal diffe...,340,Microcaecilia unicolor,epidermal differentiation-specific protein-like
0,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_030044427.1,38.9,332,4.8099999999999996e-64,Nr,,XP_030044427.1 epidermal differentiation-speci...,340,Microcaecilia unicolor,epidermal differentiation-specific protein-like
1,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,A0A6P8NSY5,38.9,337,1.02e-63,TrEMBL,A0A6P8NSY5_GEOSA,tr|A0A6P8NSY5|A0A6P8NSY5_GEOSA epidermal diffe...,339,Geotrypetes seraphini,epidermal differentiation-specific protein-like
1,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_033779482.1,38.9,337,2.62e-63,Nr,,XP_033779482.1 epidermal differentiation-speci...,339,Geotrypetes seraphini,epidermal differentiation-specific protein-like
2,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_028822286.1,37.5,339,7.369999999999999e-63,Nr,,XP_028822286.1 epidermal differentiation-speci...,339,Denticeps clupeoides,epidermal differentiation-specific protein-like
3,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_028822242.1,37.5,339,7.369999999999999e-63,Nr,,XP_028822242.1 epidermal differentiation-speci...,339,Denticeps clupeoides,epidermal differentiation-specific protein-like
4,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_041100159.1,36.8,337,1.0100000000000002e-62,Nr,,XP_041100159.1 epidermal differentiation-speci...,338,Polyodon spathula,epidermal differentiation-specific protein-like
5,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_041099455.1,36.5,337,1.4300000000000001e-62,Nr,,XP_041099455.1 epidermal differentiation-speci...,338,Polyodon spathula,epidermal differentiation-specific protein-like
6,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_034770713.1,38.2,335,2.45e-62,Nr,,XP_034770713.1 epidermal differentiation-speci...,345,Acipenser ruthenus,epidermal differentiation-specific protein-like
7,NODE_100481_length_1662_cov_45.942320_g39073_i0,bombina_pachypus_blastp,3.729238,0.001008,XP_041117575.1,36.0,336,2.84e-62,Nr,,XP_041117575.1 epidermal differentiation-speci...,338,Polyodon spathula,epidermal differentiation-specific protein-like


In [1]:
# Build a list with one set per file; each set holds the distinct non-missing
# values of that file's `index` column
# NOTE(review): `index` (a column name) is not defined in the cells shown here;
# confirm it is set in the configuration before this cell runs
sets_list = []

for f in files:
    # Import the dataset under its own name so that the annotation table `df`
    # built earlier is not overwritten; outfmt = None means the file has a header
    hits_raw = pd.read_csv(f, sep="\t", names=outfmt.split() if outfmt else None)

    # Drop missing values without mutating the frame in place
    column_values = hits_raw[index].dropna()

    # The column name itself is removed from the set, presumably to discard a
    # header row that was read as data
    sets_list.append(set(column_values) - {index})

NameError: name 'files' is not defined