# 02_pm_contig_vs_vmag_gtdb

## Load packages and data

In [None]:
# load packages
import pandas as pd
import os
import os.path as op
import sys
import csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import warnings

# notebook-wide settings
pd.options.display.max_columns = None  # show every column when a DataFrame is displayed
warnings.simplefilter("ignore")  # blanket filter: silences all warning categories from here on

# load functions
def count_classified(df, level):
    """Count the rows of ``df`` that carry a real annotation in column ``level``.

    A row is treated as unannotated when its value is missing (NaN/None), the
    literal 'Unclassified', an empty string, or only the bare rank prefix built
    from the first letter of ``level`` plus two underscores (e.g. 'g__' for
    'genus').

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a column named ``level``.
    level : str
        Name of the taxonomy column to inspect.

    Returns
    -------
    int
        Number of rows with an informative value in that column.
    """
    bare_prefix = f'{level[0]}__'
    values = df[level]
    # placeholders that look like values but carry no taxonomic information
    is_placeholder = values.isin(['Unclassified', '', bare_prefix])
    is_annotated = values.notna() & ~is_placeholder
    return int(is_annotated.sum())

# file paths
# The project folder sits in a different location on each collaborator's machine
# (e.g. '/Users/juliabrown/Google Drive/My Drive/projects/OMZvir_round2/MH_project/').
# Set the MH_PROJECT_DIR environment variable to point at your copy instead of
# editing this cell; the original location is kept as the default.
DEFAULT_PROJECT_DIR = '/Users/melissaherring/Google Drive/My Drive/MH_project/'
# op.join(..., '') guarantees a trailing separator, matching the default path's form
path = op.join(os.environ.get('MH_PROJECT_DIR', DEFAULT_PROJECT_DIR), '')

# load proximeta master table
proximeta_master = pd.read_csv(op.join(path, 'tables/proximeta_master.csv'))

# subset all observations with a classified host: drop rows whose classification
# is missing or is one of the two "unclassified" placeholder labels
has_classified_host = (
    proximeta_master['classification'].notna()
    & (proximeta_master['classification'] != 'Unclassified Bacteria')
    & (proximeta_master['classification'] != 'Unclassified')
)
# .copy() makes each subset an independent frame, so later edits to a subset can
# never be ambiguous views onto proximeta_master
proximeta_hosts = proximeta_master[has_classified_host].copy()

# create a dataframe for just contigs
contig_hosts = proximeta_hosts[proximeta_hosts['virus_type'] == 'contig'].copy()

# create a dataframe for just vmags
vmag_hosts = proximeta_hosts[proximeta_hosts['virus_type'] == 'vmag'].copy()


## How many cells were classified to each taxonomic level (or finer)?

In [None]:
''' contigs '''

# Summarise, for the contig-linked hosts, how many rows carry an annotation at
# each taxonomic level, then plot the counts and the percentages side by side.

contig_level_counts = [] # list collecting one count per level, in the same order as `levels`

levels = ['domain','phyla','class','order','family','genus','species'] # taxonomy columns to check

# count the annotated rows at each level using the count_classified function created above
for level in levels:
    contig_lcount = count_classified(contig_hosts, level)
    # f-string keeps the period attached to the level name (no stray space before '.')
    print(f'There are {contig_lcount} cells annotated to {level}.')
    contig_level_counts.append(contig_lcount)

# create a dictionary with the number classified for each level
contig_tax_dict = {'tax_level': levels, 'num_cells_classified': contig_level_counts}

# create a dataframe from the dictionary (one row per level)
contig_tax_df = pd.DataFrame(data = contig_tax_dict)

# total number of rows in contig_hosts; used as the denominator for the percentages
contig_tot = len(contig_hosts)

# add a percent column
contig_tax_df['percent_classified'] = contig_tax_df['num_cells_classified']/contig_tot*100

# add a type column
contig_tax_df['type'] = 'Contig'

print(contig_tax_df)

fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12, 6), sharex=True)

# left panel: raw counts per level
sns.barplot(x = 'tax_level', y= 'num_cells_classified', data = contig_tax_df, color= 'royalblue', ax = axes[0])
axes[0].set_xlabel('Taxonomic classification')
axes[0].set_ylabel('Number of Cells')
axes[0].set_title('Number of Cells with Viral Contigs Classified with GTDB')

# right panel: the same counts as a percentage of contig_tot, on a fixed 0-100 axis
sns.barplot(x = 'tax_level', y= 'percent_classified', data = contig_tax_df, color = 'crimson', ax = axes[1])
axes[1].set_xlabel('Taxonomic classification')
axes[1].set_ylabel('Percent')
axes[1].set_ylim(0,100)
# trailing ';' stops the notebook from echoing the Text(...) repr under the figure
axes[1].set_title('Percent of Cells with Viral Contigs Classified with GTDB');

In [None]:
''' vmags '''

# Summarise, for the vmag-linked hosts, how many rows carry an annotation at
# each taxonomic level, then plot the counts and the percentages side by side.

vmag_level_counts = [] # list collecting one count per level, in the same order as `levels`

levels = ['domain','phyla','class','order','family','genus','species'] # taxonomy columns to check

# count the annotated rows at each level using the count_classified function created above
for level in levels:
    vmag_lcount = count_classified(vmag_hosts, level)
    # f-string keeps the period attached to the level name (no stray space before '.')
    print(f'There are {vmag_lcount} cells annotated to {level}.')
    vmag_level_counts.append(vmag_lcount)

# create a dictionary with the number classified for each level
vmag_tax_dict = {'tax_level': levels, 'num_cells_classified': vmag_level_counts}

# create a dataframe from the dictionary (one row per level)
vmag_tax_df = pd.DataFrame(data = vmag_tax_dict)

# total number of rows in vmag_hosts; used as the denominator for the percentages
vmag_tot = len(vmag_hosts)

# add a percent column
vmag_tax_df['percent_classified'] = vmag_tax_df['num_cells_classified']/vmag_tot*100

# add a type column
vmag_tax_df['type'] = 'vmag'

print(vmag_tax_df)

fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12, 6), sharex=True)

# left panel: raw counts per level
sns.barplot(x = 'tax_level', y= 'num_cells_classified', data = vmag_tax_df, color= 'royalblue', ax = axes[0])
axes[0].set_xlabel('Taxonomic classification')
axes[0].set_ylabel('Number of vmags')
axes[0].set_title('Number of vmags Classified with GTDB')

# right panel: the same counts as a percentage of vmag_tot, on a fixed 0-100 axis
sns.barplot(x = 'tax_level', y= 'percent_classified', data = vmag_tax_df, color = 'crimson', ax = axes[1])
axes[1].set_xlabel('Taxonomic classification')
axes[1].set_ylabel('Percent')
axes[1].set_ylim(0,100)
# trailing ';' stops the notebook from echoing the Text(...) repr under the figure
axes[1].set_title('Percent of vmags Classified with GTDB');

In [None]:
# Stack the per-level summaries for contigs and vmags into one long-form table.
# ignore_index=True gives the result a fresh index instead of carrying over the
# duplicated 0-6 row labels from the two input frames.
classified_plot = pd.concat([contig_tax_df, vmag_tax_df], ignore_index=True)

# bar colours keyed by the values stored in the 'type' column
custom_palette = {'vmag': '#1f78b4', 'Contig': '#b2df8a'}

fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12, 6), sharex=True)

# left panel: raw counts per taxonomic level, one bar per type
sns.barplot(x = 'tax_level', y= 'num_cells_classified', data = classified_plot, palette=custom_palette, hue='type', ax = axes[0])
axes[0].set_xlabel('Taxonomic classification')
axes[0].set_ylabel('Number of Cells')
axes[0].get_legend().remove() # keep a single legend, on the right-hand panel

# right panel: percentages on a fixed 0-100 axis
sns.barplot(x = 'tax_level', y= 'percent_classified', data = classified_plot, palette=custom_palette, hue='type', ax = axes[1])
axes[1].set_xlabel('Taxonomic classification')
axes[1].set_ylabel('Percent')
axes[1].set_ylim(0,100)
# Attach the legend to the right-hand axes explicitly rather than through pyplot's
# implicit "current axes", and anchor it just outside the plotting area.
# The trailing ';' suppresses the Legend repr in the cell output.
axes[1].legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left');