In [1]:
import pandas as pd
import numpy as np
import subprocess

In [5]:
# Print the SLURM batch script for Siderastrea siderea so the submitted
# settings are recorded in the cell output, then submit it with sbatch.
# NOTE: the two commands are joined with ';', so sbatch runs even if cat fails,
# and re-running this cell submits a new job every time.
!cat ../proteinfer_ssid.sh ; sbatch ../proteinfer_ssid.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64GB
#SBATCH --time=48:00:00
#SBATCH --job-name=proteinfer

source ~/.bashrc
conda activate proteinfer
cd /storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/proteinfer
python proteinfer.py \
    --i ../funannotate_annotation/ssid_annotation/annotate_results/Siderastrea_siderea.proteins.fa \
    --o ../funannotate_annotation/ssid_annotation/annotate_results/Siderastrea_siderea.proteinfer.tsv
Submitted batch job 14349557


In [10]:
# Print the SLURM batch script for Dendrogyra cylindrus so the submitted
# settings are recorded in the cell output, then submit it with sbatch.
# NOTE: the two commands are joined with ';', so sbatch runs even if cat fails,
# and re-running this cell submits a new job every time.
!cat ../proteinfer_dcyl.sh ; sbatch ../proteinfer_dcyl.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64GB
#SBATCH --time=48:00:00
#SBATCH --job-name=proteinfer

source ~/.bashrc
conda activate proteinfer
cd /storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/proteinfer
python proteinfer.py \
    --i ../funannotate_annotation/dcyl_annotation/annotate_results/Dendrogyra_cylindrus.proteins.fa \
    --o ../funannotate_annotation/dcyl_annotation/annotate_results/Dendrogyra_cylindrus.proteinfer.tsv
Submitted batch job 14349744


In [11]:
# Print the SLURM batch script for Colpophyllia natans so the submitted
# settings are recorded in the cell output, then submit it with sbatch.
# NOTE: the two commands are joined with ';', so sbatch runs even if cat fails,
# and re-running this cell submits a new job every time.
!cat ../proteinfer_cnat.sh ; sbatch ../proteinfer_cnat.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64GB
#SBATCH --time=48:00:00
#SBATCH --job-name=proteinfer

source ~/.bashrc
conda activate proteinfer
cd /storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/proteinfer
python proteinfer.py \
    --i ../funannotate_annotation/cnat_annotation/annotate_results/Colpophyllia_natans.proteins.fa \
    --o ../funannotate_annotation/cnat_annotation/annotate_results/Colpophyllia_natans.proteinfer.tsv
Submitted batch job 14349779


In [79]:
# Path prefixes (one per species); the input and output file names are built by
# appending a suffix to each prefix.
SPECIES_PREFIXES = [
    "../funannotate_annotation/cnat_annotation/annotate_results/Colpophyllia_natans",
    "../funannotate_annotation/dcyl_annotation/annotate_results/Dendrogyra_cylindrus",
    "../funannotate_annotation/ssid_annotation/annotate_results/Siderastrea_siderea",
]

# Columns, in order, of the table written to "<prefix>.annotations_proteinfer.txt".
ANNOTATION_COLUMNS = [
    'GeneID', 'TranscriptID', 'Feature', 'Contig', 'Start', 'Stop',
    'Strand', 'Name', 'Product', 'Alias/Synonyms', 'EC_number', 'BUSCO',
    'PFAM', 'InterPro', 'EggNog', 'COG', 'GO Terms', 'Secreted', 'Membrane',
    'Protease', 'CAZyme', 'Notes', 'gDNA', 'mRNA', 'CDS-transcript',
    'Translation',
]


def collapse_per_transcript(transcript_ids, values, name):
    """Join the strings in ``values`` with ';' for each transcript.

    Parameters
    ----------
    transcript_ids : pandas.Series
        Transcript ID of each entry; must share its index with ``values``.
    values : pandas.Series
        Strings to concatenate, in their original row order.
    name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        One ';'-joined string per transcript ID, indexed by transcript ID.
        Empty when ``values`` is empty.
    """
    if values.empty:
        # Short-circuit so a species without any matching predictions yields an
        # empty result instead of relying on empty-groupby behaviour.
        return pd.Series(dtype=object, name=name)
    return values.groupby(transcript_ids).agg(";".join).rename(name)


def add_proteinfer_annotations(proteinfer, annotations):
    """Fill empty 'GO Terms' and 'PFAM' cells with ProteInfer predictions.

    Only cells that are missing in ``annotations`` are filled; existing
    annotations are never overwritten. Rows of ``proteinfer`` whose label is
    missing, or is neither a GO term nor a Pfam family, are ignored.

    Parameters
    ----------
    proteinfer : pandas.DataFrame
        Predictions with columns "TranscriptID", "predicted_label" and
        "description".
    annotations : pandas.DataFrame
        Annotation table containing every column in ``ANNOTATION_COLUMNS``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``annotations`` with the gaps filled, columns ordered as
        ``ANNOTATION_COLUMNS``.

    Raises
    ------
    KeyError
        If a required column is absent from either input.
    """
    labels = proteinfer["predicted_label"]
    # na=False: a missing label counts as "no match" rather than producing an
    # NA-containing mask, which cannot be used for boolean indexing.
    go_rows = proteinfer[labels.str.startswith("GO:", na=False)]
    pfam_rows = proteinfer[labels.str.startswith("Pfam:PF", na=False)]

    # Append " - <description>" only where a description is present, so a
    # missing description cannot turn the whole entry into NaN.
    go_label = go_rows["predicted_label"]
    go_description = go_rows["description"]
    go_text = go_label.where(go_description.isna(), go_label + " - " + go_description)
    # NOTE(review): every GO term is labelled "GO_component" regardless of its
    # namespace (process / function / component); the columns used here carry
    # no namespace information. Confirm this is acceptable downstream.
    go_text = "GO_component: " + go_text + " [ProteInfer]"
    go_terms = collapse_per_transcript(go_rows["TranscriptID"], go_text, "GO Terms")

    # "Pfam:PF00001" -> "PF00001"
    pfam_ids = pfam_rows["predicted_label"].str.split(":").str[1]
    pfam_terms = collapse_per_transcript(pfam_rows["TranscriptID"], pfam_ids, "PFAM")

    merged = annotations.set_index("TranscriptID").copy()
    # Series.fillna aligns on the TranscriptID index, so only missing cells of
    # transcripts that have predictions are touched.
    merged["GO Terms"] = merged["GO Terms"].fillna(go_terms)
    merged["PFAM"] = merged["PFAM"].fillna(pfam_terms)
    return merged.reset_index(drop=False)[ANNOTATION_COLUMNS]


# True when executed as a notebook cell or script; lets the helpers above be
# imported elsewhere without touching the filesystem.
if __name__ == "__main__":
    for spp in SPECIES_PREFIXES:
        proteinfer = pd.read_csv(spp + ".proteinfer.tsv", sep="\t")
        proteinfer.columns = ["TranscriptID", "predicted_label", "confidence", "description"]
        annotations = pd.read_csv(spp + ".annotations.txt", sep="\t")
        annotations_adjusted = add_proteinfer_annotations(proteinfer, annotations)
        annotations_adjusted.to_csv(spp + ".annotations_proteinfer.txt", sep="\t", index=False)