In [9]:
import os
import sys
import pandas as pd
import math
import re
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Making summary file(s) for Cenote-Taker

In [10]:
# length/repeat file
length_table = "/Users/u241374/mike_tisza/sandbox/test0811d/ct2_tmp/hallmark_contigs_terminal_repeat_summary.tsv"
# ct name/original name file
name_table = "/Users/u241374/mike_tisza/sandbox/test0811d/ct2_tmp/contig_name_map.tsv"
# gene to contig file
gene_to_contig_table = "/Users/u241374/mike_tisza/sandbox/test0811d/final_genes_to_contigs_annotation_summary.tsv"
# taxonomy file
tax_table = "/Users/u241374/mike_tisza/sandbox/test0811d/ct2_tmp/final_taxonomy/virus_taxonomy_summary.tsv"

sequin_dir = "/Users/u241374/mike_tisza/sandbox/test0811d/sequin_and_genome_maps"


In [14]:
main_annot_df = pd.read_csv(gene_to_contig_table, sep = "\t")

length_df = pd.read_csv(length_table, sep = "\t")

name_df = pd.read_csv(name_table, sep = "\t", names=['contig', 'input_name'])

tax_df = pd.read_csv(tax_table, sep = "\t")


In [16]:
merge_df = pd.merge(main_annot_df, length_df, on = ["contig", "dtr_seq"], how = "left")

merge_df = pd.merge(merge_df, name_df, on = "contig", how = "left")

merge_df = pd.merge(merge_df, tax_df, on = ["contig", "chunk_name"], how = "left")


merge_df['taxon'] = merge_df['taxon'].fillna("unclassified virus")



In [17]:
merge_df.columns

Index(['contig', 'gene_name', 'gene_orient', 'gene_start', 'gene_stop',
       'contig_length', 'dtr_seq', 'evidence_acession', 'evidence_description',
       'Evidence_source', 'vscore_category', 'chunk_name', 'chunk_length',
       'chunk_start', 'chunk_stop', 'in_length_contig', 'out_length_contig',
       'itr_seq', 'input_name', 'taxon', 'taxonomy_hierarchy', 'taxon_level',
       'avg_hallmark_AAI_to_ref'],
      dtype='object')

In [18]:
## parse fasta description

finalseq_list = []
for fsa in os.listdir(sequin_dir):
    if fsa.endswith('.fsa'):
        f = os.path.join(sequin_dir, fsa)

        if os.path.isfile(f) and os.path.getsize(f) > 0:
            finalseq_list.append(f)

if not finalseq_list:
    print("no files found for seqIO parse " + str(sequin_dir))
    exit


desc_list = []
for seq_file in finalseq_list:
    seq_record = SeqIO.read(seq_file, "fasta")
    try:
        if "@" in seq_record.id:
            contig = seq_record.id.split("@")[0]
            chunkq = seq_record.id.split("@")[1]
        else:
            contig = seq_record.id
            chunkq = None
        fields = re.findall(r'\[.*?\]', seq_record.description)
        organism = re.search(r'\[organism=(.*?)\]', fields[0]).group(1)
        gcode = re.search(r'\[gcode=(.*?)\]', fields[1]).group(1)
        desc_list.append([contig, chunkq, organism, gcode])
    except:
        print("except")

desc_df = pd.DataFrame(desc_list, columns=["contig", "chunk_name", "organism", "genetic_code"])


In [19]:
print(desc_df)

            contig chunk_name                    organism genetic_code
0   test0811d_1724       None     Winoviridae sp. ctTYXNQ           11
1   test0811d_2007       None  Caudoviricetes sp. ctIEOEM           11
2   test0811d_6316       None  Caudoviricetes sp. ctIG6WX           11
3   test0811d_3192    Chunk_0  Caudoviricetes sp. ct7P3DF           11
4   test0811d_1690       None  Caudoviricetes sp. ct45KJK           11
..             ...        ...                         ...          ...
76  test0811d_1675       None  Caudoviricetes sp. ct5Z7DX           11
77  test0811d_1339       None    Peduoviridae sp. ctKRX8H           11
78  test0811d_5360       None  Caudoviricetes sp. ctXL5I9           11
79  test0811d_1853       None  Caudoviricetes sp. ctK7K2C           11
80  test0811d_6279       None  Caudoviricetes sp. ctN4FSD           11

[81 rows x 4 columns]


In [None]:
print(merge_df[['contig','chunk_name']])

In [20]:
org_info_df = pd.merge(merge_df, desc_df, on = ["contig", "chunk_name"], how = "left")


In [21]:
org_info_df.columns

Index(['contig', 'gene_name', 'gene_orient', 'gene_start', 'gene_stop',
       'contig_length', 'dtr_seq', 'evidence_acession', 'evidence_description',
       'Evidence_source', 'vscore_category', 'chunk_name', 'chunk_length',
       'chunk_start', 'chunk_stop', 'in_length_contig', 'out_length_contig',
       'itr_seq', 'input_name', 'taxon', 'taxonomy_hierarchy', 'taxon_level',
       'avg_hallmark_AAI_to_ref', 'organism', 'genetic_code'],
      dtype='object')

In [None]:
def summarize_ct(x):
    ds = {}
    ds['gene_count'] = x['gene_name'].nunique()
    ds['hallmark_count'] = x.query("Evidence_source = 'hallmark_hmm'")['gene_name'].nunique()
    return pd.Series(ds, index=['gene_count', 'hallmark_count'])

def hm_count(x):
    return x.value_counts()['hallmark_hmm']

org_info_df.groupby(['contig', 'contig_length', 'dtr_seq', 'chunk_name', 'chunk_length',
                     'itr_seq', 'input_name', 'taxon', 'taxonomy_hierarchy', 'taxon_level',
                     'avg_hallmark_AAI_to_ref', 'organism', 'genetic_code'], dropna = False)\
                     .agg(gene_count=('gene_name', 'nunique'),
                          hallmark_count=('Evidence_source', 'hm_count')).reset_index()
                     #.apply(summarize_ct).reset_index()

In [48]:
type(org_info_df['contig'])

pandas.core.series.Series

In [73]:
grouped_df = org_info_df.groupby(['contig', 'contig_length', 'dtr_seq', 'chunk_name', 'chunk_length',
                     'itr_seq', 'input_name', 'taxon', 'taxonomy_hierarchy', 'taxon_level',
                     'avg_hallmark_AAI_to_ref', 'organism', 'genetic_code'], dropna = False)

summary_list = []
for name, group in grouped_df:
    gene_count = group['gene_name'].nunique()
    hallmark_count = group.query("Evidence_source == 'hallmark_hmm'")['gene_name'].nunique()
    hallmark_list = '|'.join(
        list(group.query("Evidence_source == 'hallmark_hmm'")['evidence_description'])
        ).replace("-", " ")
    if name[2]:
        end_type = "DTR"
    elif name[5]:
        end_type = "ITR"
    else:
        end_type = "None"
        
    if gene_count >= 1:
        summary_list.append([name[0], name[6], name[11], name[1], end_type, gene_count, hallmark_count, hallmark_list, name[8]])

summary_df = pd.DataFrame(summary_list, columns=['contig', 'input_name', 'organism', 'contig_length', 'end_feature', 'gene_count', 'hallmark_count', 'hallmark_genes', 'taxonomy_hierarchy'])