# In [None]:
import pandas as pd
from ete3 import NCBITaxa
ncbi = NCBITaxa()

import pandas as pd

# Load the bacterial reference-genome metadata table (tab-separated; the UHGG
# genome-all_metadata.tsv file was used during development).
bacteria_db_metadata = pd.read_csv(
    str(snakemake.input.bacteria_db_metadata),
    sep='\t',
)

# Break the semicolon-delimited 'Lineage' string into one column per rank.
lineage_columns = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
bacteria_db_metadata[lineage_columns] = bacteria_db_metadata['Lineage'].str.split(';', expand=True)

# Clean every rank except species (which is left exactly as split):
#   1. keep the text following the rank prefix (a bare prefix such as 'g__'
#      therefore becomes an empty string);
#   2. keep only what precedes the first underscore, which drops suffixes
#      appended to a taxon name after an underscore.
rank_prefixes = {
    'superkingdom': 'd__',
    'phylum': 'p__',
    'class': 'c__',
    'order': 'o__',
    'family': 'f__',
    'genus': 'g__',
}
for rank_column, prefix in rank_prefixes.items():
    without_prefix = bacteria_db_metadata[rank_column].str.partition(prefix)[2]
    bacteria_db_metadata[rank_column] = without_prefix.str.partition('_')[0]

# Load the PHIST host-prediction table (CSV; must contain a 'host' column).
phist_results = pd.read_csv(str(snakemake.input.phist))

# Keep only predictions that name a host. The explicit copy makes the column
# added below a write to an independent frame instead of to a slice of the raw
# table, which avoids pandas' SettingWithCopyWarning.
phist_results = phist_results[phist_results['host'].notnull()].copy()

# Derive the key used to join with the metadata: the host file name up to the
# first '.gff.gz'. str.partition matches the text literally, whereas
# str.split('.gff.gz') would interpret the multi-character pattern as a
# regular expression in which every '.' matches any character.
phist_results['Genome'] = phist_results['host'].str.partition('.gff.gz')[0]

# Inner join: predictions whose genome is absent from the metadata are dropped.
phist_results_metadata = phist_results.merge(bacteria_db_metadata, on='Genome')

# Keep only predictions whose phage/host pair shares more than
# min_common_kmers k-mers (strict inequality).
phist_results_metadata_hq = phist_results_metadata[
    phist_results_metadata['#common-kmers'] > snakemake.params.min_common_kmers
]

# Restrict to the columns needed for the consensus calculation.
merged = phist_results_metadata_hq[['phage', 'host', '#common-kmers', 'adj-pvalue',
                                    'superkingdom', 'phylum', 'class', 'order',
                                    'family', 'genus']]

# Discard any row without a host.
merged2 = merged[merged['host'].notnull()]

# Highest shared-k-mer count seen for each phage. Only the k-mer column is
# aggregated: taking the maximum of every column would also compute string
# maxima of the taxonomy columns, which is wasted work and may fail when such
# a column mixes strings with missing values.
max_kmers = (
    merged2.groupby(by=['phage'], as_index=False)['#common-kmers']
    .max()
    .rename(columns={'#common-kmers': 'max_common-kmers'})
)

# Keep, for each phage, only the hit(s) that reach that maximum (ties are kept).
max_kmers_merged = merged2.merge(max_kmers, on='phage', how='left')
max_kmers_merged2 = max_kmers_merged[
    max_kmers_merged['#common-kmers'] == max_kmers_merged['max_common-kmers']
]

# Count the best hits per phage (rows with a non-null superkingdom) and attach
# the count to every hit as 'total_hits'.
phist_counts = (
    max_kmers_merged2.groupby(by=['phage'], as_index=False)['superkingdom']
    .count()
    .rename(columns={'superkingdom': 'total_hits'})
)
phist_counts_taxonomy = max_kmers_merged2.merge(phist_counts, on='phage', how='left')

# determine if any genomes have > min_agreement agreement at taxonomic level
def determine_consensus(taxonomic_rank, phist_table, min_agreement=None):
    """Return, per phage, the taxon at ``taxonomic_rank`` that enough hits agree on.

    Args:
        taxonomic_rank: Name of the column holding the rank to evaluate
            (e.g. ``'genus'``).
        phist_table: DataFrame with one row per hit. Must contain the columns
            ``'phage'``, ``taxonomic_rank`` and ``'total_hits'`` (the number
            of hits of that phage, used as the denominator).
        min_agreement: Threshold in percent (0-100). A taxon qualifies when
            strictly more than this percentage of the phage's hits carry it.
            When omitted, ``snakemake.params.min_agreement`` is used.

    Returns:
        DataFrame with at most one row per phage that has a qualifying taxon.
        Two columns are added: ``<rank>_hits`` (hits carrying the taxon) and
        ``<rank>_percent_agreement`` (``<rank>_hits / total_hits``, stored as
        a fraction between 0 and 1). Every other column holds the first
        non-null value among the phage's rows for the chosen taxon.
    """
    if min_agreement is None:
        # Fall back to the threshold configured for the workflow run.
        min_agreement = snakemake.params.min_agreement

    hits_column = taxonomic_rank + '_hits'
    agreement_column = taxonomic_rank + '_percent_agreement'

    # Number of hits per (phage, taxon). Rows whose taxon is missing are not
    # counted because groupby drops missing keys.
    taxonomy_hits = (
        phist_table.groupby(['phage', taxonomic_rank])
        .size()
        .rename(hits_column)
        .reset_index()
    )

    # Attach the per-taxon count to every hit and express it as a fraction of
    # the phage's total hits.
    taxonomy_hits_merged = phist_table.merge(
        taxonomy_hits, on=['phage', taxonomic_rank], how='outer')
    taxonomy_hits_merged[agreement_column] = (
        taxonomy_hits_merged[hits_column] / taxonomy_hits_merged['total_hits'])

    # The stored value is a fraction, the threshold a percentage.
    taxonomy_hits_consensus = taxonomy_hits_merged[
        taxonomy_hits_merged[agreement_column] * 100 > min_agreement]

    # With a threshold below 50 several taxa of one phage can qualify; put the
    # best-supported one first so it is the one reported. The sort is stable,
    # so rows with equal agreement keep their previous relative order.
    taxonomy_hits_consensus = taxonomy_hits_consensus.sort_values(
        agreement_column, ascending=False, kind='mergesort')

    return taxonomy_hits_consensus.groupby('phage', as_index=False).first()

# Walk the ranks from most to least specific. At each rank a consensus is
# sought only for the phages that no more specific rank has annotated yet, and
# every rank more specific than the one that produced the call is set to 'NA'.
rank_cascade = ['genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
consensus_by_rank = {}
unannotated_by_rank = {}
still_unannotated = phist_counts_taxonomy
for depth, rank in enumerate(rank_cascade):
    rank_consensus = determine_consensus(rank, still_unannotated)

    # Blank out the ranks below the one at which agreement was reached.
    for finer_rank in rank_cascade[:depth]:
        rank_consensus[finer_rank] = 'NA'

    # Hits of phages annotated at this rank are removed before moving on.
    already_called = still_unannotated['phage'].isin(rank_consensus['phage'])
    still_unannotated = still_unannotated[~already_called]

    consensus_by_rank[rank] = rank_consensus
    unannotated_by_rank[rank] = still_unannotated

# Expose the per-rank results under their individual names.
(genus_consensus, family_consensus, order_consensus,
 class_consensus, phylum_consensus, superkingdom_consensus) = [
    consensus_by_rank[name] for name in rank_cascade]
(genus_unannotated, family_unannotated, order_unannotated,
 class_unannotated, phylum_unannotated, superkingdom_unannotated) = [
    unannotated_by_rank[name] for name in rank_cascade]

# Stack the calls made at every rank; columns a given rank did not produce
# (and any other missing value) are written as 'NA'.
rank_tables = [genus_consensus, family_consensus, order_consensus,
               class_consensus, phylum_consensus, superkingdom_consensus]
final_consensus = pd.concat(rank_tables).fillna('NA')

# Build the semicolon-separated lineage, from superkingdom down to genus.
ranks_coarse_to_fine = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus']
taxonomy = final_consensus[ranks_coarse_to_fine[0]]
for rank in ranks_coarse_to_fine[1:]:
    taxonomy = taxonomy + ';' + final_consensus[rank]
final_consensus['taxonomy'] = taxonomy

# Report layout: identifiers first, then for each rank its name, hit count and
# agreement, and finally the lineage string.
report_columns = ['phage', '#common-kmers', 'total_hits']
for rank in ranks_coarse_to_fine:
    report_columns += [rank, rank + '_hits', rank + '_percent_agreement']
report_columns.append('taxonomy')

# Prefix every column with 'phist_', except the phage identifier, which is
# published as 'viral_genome' with everything from the first '.fna' removed.
final_consensus = final_consensus[report_columns].add_prefix('phist_')
final_consensus = final_consensus.rename(columns={'phist_phage': 'viral_genome'})
final_consensus['viral_genome'] = final_consensus['viral_genome'].str.partition('.fna')[0]

# Full report, then the two-column genome -> lineage table.
final_consensus.to_csv(str(snakemake.output.report), index=False)
host_results = final_consensus[['viral_genome', 'phist_taxonomy']]
host_results.to_csv(str(snakemake.output.taxonomy), index=False)