In [1]:
#Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
pd.set_option('display.max_columns', None)

In [51]:
# Paths of the BLAST tabular (tsv) result files to load, one per database.
files = [
    "salamandra/fast-slow.pep_nr.tsv",
    "salamandra/fast-slow.pep_tr.tsv",
    "salamandra/fast-slow.pep_sp.tsv"   
]

# Title of the analysis: it is written to the 'row' column of every hit and
# is also used as the stem of the output file name.
title = "salamandra_salamandra_fast-slow_blastp"

# Database labels, in the same order as `files` (files[i] is labelled
# databases_names[i]).
databases_names =[
    "Nr", 
    "TrEMBL",
    "Swiss-Prot",
]

# Tab-separated table joined to the hits on its 'transcript' column.
# NOTE(review): the file name suggests it was pre-filtered at padj < 0.05 and
# |log2FC| > 1 -- confirm against the step that produced it.
table_path = "./salamandra/salamandra_fast-slow_table_padj_0.05----log2fc_1.tsv"

# Output path stem (a suffix and extension are appended when saving).
path = "salamandra/" + title

# Column names of the tsv files, as a whitespace-separated string, in file order.
# e.g. 
# outfmt = "qseqid qlen sseqid sallseqid slen qstart qend sstart send qseq full_qseq sseq full_sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos qframe btop cigar staxids sscinames sskingdoms skingdoms sphylums stitle salltitles qcovhsp scovhsp qtitle qqual full_qqual qstrand"
# If the files already contain a header row with the column names, set outfmt = None
outfmt = "qseqid qlen sseqid sallseqid slen qstart qend sstart send qseq full_qseq sseq full_sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos qframe btop cigar staxids sscinames sskingdoms skingdoms sphylums stitle salltitles qcovhsp scovhsp qtitle qqual full_qqual qstrand"

# Columns kept in the final table, in output order.
features = ["transcript", "row", "log2FoldChange", "padj", 
            "protein_accession", "sequence_identity", "alignment_length", 
            "evalue", "database", "gene", "locus_name", "sequence_description",
            "sequence_length", "organism", "protein_product"]

# Load the transcript table that the BLAST hits are matched against.
table = pd.read_csv(table_path, sep='\t')

In [36]:
# Preview the first rows of the transcript table to check that it loaded as expected.
table.head()

Unnamed: 0,transcript,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,Cluster-100188.0_A0.TRINITY_DN3407_c2_g1_i7,46.75532,2.474862,0.42474,5.826772,5.650976e-09,2e-06
1,Cluster-103630.0_A1.NODE_107801_length_1491_co...,6.341873,5.436436,1.285694,4.228405,2.353534e-05,0.006518
2,Cluster-104712.3_A1.NODE_93399_length_1771_cov...,6.240299,5.416404,1.132887,4.781062,1.743718e-06,0.00057
3,Cluster-108867.3_A0.TRINITY_DN715_c0_g1_i4,7.525759,6.290294,1.694775,3.711581,0.0002059683,0.044778
4,Cluster-109105.0_A1.NODE_118123_length_1331_co...,4.115501,4.782975,1.069578,4.471831,7.755254e-06,0.002337


In [37]:
# Scratch check: Series.unique() returns each distinct value once.
# NOTE(review): exploratory cell, not used by the pipeline -- candidate for removal.
pd.Series([1, 1, 2, 3]).unique()

array([1, 2, 3])

In [59]:
def get_transcripts_from_id(transcripts, table):
    """Map each distinct query id to the table transcript it starts with.

    A query id is assigned to a transcript when the id begins with the
    transcript name, compared as literal text. Names are NOT treated as
    regular expressions, so characters such as '.' only match themselves.
    When several transcript names are prefixes of the same id (e.g. '..._i7'
    and '..._i70'), the longest, most specific name is chosen.

    Parameters
    ----------
    transcripts : pandas.Series
        Query ids to resolve; duplicates are collapsed with ``.unique()``.
    table : pandas.DataFrame
        Table with a ``transcript`` column holding the candidate names.

    Returns
    -------
    dict
        ``{query_id: transcript_name}`` for every id that matched. Ids with
        no match, and non-string ids such as NaN, are left out.
    """
    # Longest names first, so the first prefix hit is the most specific one.
    candidates = sorted(
        {name for name in table['transcript'] if isinstance(name, str) and name},
        key=len,
        reverse=True,
    )

    dic = dict()

    for t in transcripts.unique():
        if not isinstance(t, str):
            # Missing ids (NaN) cannot match anything.
            continue
        for name in candidates:
            if t.startswith(name):
                dic[t] = name
                break
    return dic

In [60]:
def get_transcript(row, transcripts=None):
    """Return the transcript name that the query id ``row`` starts with.

    The comparison is a literal prefix test (transcript names are not used as
    regular expressions). If several names are prefixes of ``row``, the
    longest one is returned.

    Parameters
    ----------
    row : str
        A single query id.
    transcripts : iterable of str, optional
        Candidate transcript names. Defaults to the ``transcript`` column of
        the module-level ``table``.

    Returns
    -------
    str or None
        The matching transcript name, or None when nothing matches or ``row``
        is not a string (e.g. NaN).
    """
    if not isinstance(row, str):
        return None
    if transcripts is None:
        transcripts = table.transcript

    best = None
    for t in transcripts:
        if isinstance(t, str) and t and row.startswith(t):
            if best is None or len(t) > len(best):
                best = t
    return best

# --- Title / id parsing helpers ---------------------------------------------
# Defined once, before the loop; each file then just picks the set that fits
# its title format. Every helper returns None (or the best available text)
# instead of raising when an expected marker is missing, so one unusual title
# does not abort the whole run.

def _text_between(text, start_tag, end_tag):
    """Return the text after ``start_tag`` up to one character before ``end_tag``.

    The character just before ``end_tag`` (the separating space in titles such
    as 'OS=Homo sapiens OX=9606') is dropped. Returns None when either tag is
    absent.
    """
    start = text.find(start_tag)
    end = text.find(end_tag)
    if start == -1 or end == -1:
        return None
    return text[start + len(start_tag):end - 1]


def _words_until(text, prefix):
    """Join the space-separated words after the first one, stopping before the
    first word that starts with ``prefix`` (or at the end if there is none)."""
    words = text.split(" ")
    stop = next((pos for pos, word in enumerate(words) if word.startswith(prefix)),
                len(words))
    return ' '.join(words[1:stop])


def _pipe_field(text, position):
    """Return the ``position``-th '|'-separated field, or None if it is missing."""
    fields = text.split("|")
    return fields[position] if len(fields) > position else None


# Titles without 'OS=' (organism given in trailing square brackets).
def _bracket_sciname(x):
    """Return the text inside the last '[...]' of the title, or None without '['."""
    start = x.rfind('[')
    if start == -1:
        return None
    # The final character is dropped on the assumption that it is the closing ']'.
    return x[start + 1:-1]


def _bracket_protein_function(x):
    """Return the words between the leading accession and the first '[' word."""
    return _words_until(x, '[')


def _bracket_protein_accession(x):
    """Return the subject id up to its first space."""
    return x.split(" ")[0]


def _no_value(x):
    """Placeholder for fields that this title format does not provide."""
    return None


# Titles carrying 'OS=' / 'OX=' / 'GN=' / 'PE=' tags and 'db|accession|locus' ids.
def _tagged_sciname(x):
    """Return the organism name found between 'OS=' and 'OX='."""
    return _text_between(x, 'OS=', 'OX=')


def _tagged_protein_function(x):
    """Return the words between the leading id and the 'OS=' tag."""
    return _words_until(x, 'OS=')


def _tagged_protein_accession(x):
    """Return the second '|'-separated field of the subject id."""
    return _pipe_field(x, 1)


def _tagged_locus_name(x):
    """Return the third '|'-separated field of the subject id."""
    return _pipe_field(x, 2)


def _tagged_gene(x):
    """Return the gene name found between 'GN=' and 'PE=', or None if absent."""
    return _text_between(x, 'GN=', 'PE=')


# --- Load, annotate and stack the hits of every file -------------------------
# `outfmt = None` means the files carry their own header row.
column_names = outfmt.split() if outfmt else None

annotated_frames = []

for i in range(len(files)):

    # Import the dataset
    df_tmp = pd.read_csv(files[i], sep="\t", names=column_names)

    if df_tmp.empty:
        # A search with no hits contributes nothing to the final table.
        continue

    # Resolve every query id to its transcript once per distinct id.
    df_tmp['transcript'] = df_tmp['qseqid'].map(get_transcripts_from_id(df_tmp['qseqid'], table))
    df_tmp['row'] = title
    df_tmp['sequence_identity'] = df_tmp['pident']
    df_tmp['alignment_length'] = df_tmp['length']
    df_tmp['sequence_description'] = df_tmp['stitle']
    df_tmp['sequence_length'] = df_tmp['slen']
    df_tmp['database'] = databases_names[i]

    # The title format of the whole file is inferred from its first hit.
    tagged_titles = "OS=" in df_tmp['stitle'].iloc[0]

    get_gene = _tagged_gene if tagged_titles else _no_value
    get_sciname = _tagged_sciname if tagged_titles else _bracket_sciname
    get_protein_accession = _tagged_protein_accession if tagged_titles else _bracket_protein_accession
    get_protein_function = _tagged_protein_function if tagged_titles else _bracket_protein_function
    get_locus_name = _tagged_locus_name if tagged_titles else _no_value

    df_tmp['gene'] = df_tmp['stitle'].apply(get_gene)
    df_tmp['organism'] = df_tmp['stitle'].apply(get_sciname)
    df_tmp['protein_accession'] = df_tmp['sseqid'].apply(get_protein_accession)
    df_tmp['protein_product'] = df_tmp['stitle'].apply(get_protein_function)
    df_tmp['locus_name'] = df_tmp['sseqid'].apply(get_locus_name)

    # Inner join: only hits whose transcript appears in `table` are kept.
    df_tmp = pd.merge(df_tmp, table, on='transcript')

    annotated_frames.append(df_tmp[features])

# Concatenate once at the end instead of growing a frame inside the loop.
if annotated_frames:
    df = pd.concat(annotated_frames)
else:
    df = pd.DataFrame(columns=features)

# Group the hits of each transcript together, best (lowest) e-value first.
df = df.sort_values(['transcript', 'evalue']).reset_index(drop=True)

In [61]:
# Show the transcript-level values only on the first hit of each transcript:
# on every later row that repeats them, the four columns are blanked out.
transcript_cols = ['transcript', 'row', 'log2FoldChange', 'padj']
repeated_hit = df.duplicated(subset=transcript_cols)

# Cast to object explicitly before writing '' into the numeric columns;
# relying on implicit upcasting during assignment is deprecated in pandas.
df = df.astype({col: object for col in transcript_cols})
df.loc[repeated_hit, transcript_cols] = ''

In [63]:
# Preview the first 15 rows of the combined hits table.
df.head(15)

Unnamed: 0,transcript,row,log2FoldChange,padj,protein_accession,sequence_identity,alignment_length,evalue,database,gene,locus_name,sequence_description,sequence_length,organism,protein_product
0,Cluster-109348.0_A1.NODE_1820_length_13293_cov...,salamandra_salamandra_fast-slow_blastp,-9.292442,4.9e-05,KAG6930936.1,85.1,1083,0.0,Nr,,,KAG6930936.1 dishevelled associated activator ...,1084,Chelydra serpentina,dishevelled associated activator of morphogene...
1,,,,,XP_019405160.1,84.8,1082,0.0,Nr,,,XP_019405160.1 PREDICTED: disheveled-associate...,1084,Crocodylus porosus,PREDICTED: disheveled-associated activator of ...
2,,,,,XP_006131817.1,84.3,1084,0.0,Nr,,,XP_006131817.1 disheveled-associated activator...,1083,Pelodiscus sinensis,disheveled-associated activator of morphogenes...
3,,,,,XP_034620688.1,84.6,1084,0.0,Nr,,,XP_034620688.1 disheveled-associated activator...,1083,Trachemys scripta elegans,disheveled-associated activator of morphogenes...
4,,,,,XP_048699425.1,84.9,1083,0.0,Nr,,,XP_048699425.1 disheveled-associated activator...,1084,Caretta caretta,disheveled-associated activator of morphogenes...
5,,,,,XP_042308588.1,85.3,1085,0.0,Nr,,,XP_042308588.1 disheveled-associated activator...,1087,Sceloporus undulatus,disheveled-associated activator of morphogenes...
6,,,,,XP_044865070.1,84.0,1085,0.0,Nr,,,XP_044865070.1 disheveled-associated activator...,1084,Mauremys mutica,disheveled-associated activator of morphogenes...
7,,,,,XP_040207747.1,85.4,1081,0.0,Nr,,,XP_040207747.1 disheveled-associated activator...,1080,Rana temporaria,disheveled-associated activator of morphogenes...
8,,,,,XP_037751474.1,84.9,1083,0.0,Nr,,,XP_037751474.1 disheveled-associated activator...,1084,Chelonia mydas,disheveled-associated activator of morphogenes...
9,,,,,XP_030413148.1,84.5,1083,0.0,Nr,,,XP_030413148.1 disheveled-associated activator...,1084,Gopherus evgoodei,disheveled-associated activator of morphogenes...


In [64]:
# Write the combined hits table to '<path>-2.xlsx', without the row index.
df.to_excel(path + '-2.xlsx', index=False)