# 01_create_proximeta_master_table

This script creates a master table of all ProxiMeta data (viral contigs and vMAGs), with columns for all of the information available in Google Drive.

In [4]:
# Load packages
import pandas as pd
import os
import os.path as op
import sys
import csv
import numpy as np
import matplotlib
import glob
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

# file paths
path = '/Users/melissaherring/Google Drive/My Drive/MH_project/'
#path = '/Users/juliabrown/Google Drive/My Drive/projects/OMZvir_round2/MH_project/'

# define functions 
def split_classification(df):
    """Split a GTDB-style ``classification`` column into one column per rank.

    Each ``classification`` string is split on ';' and written to the new
    columns domain, phyla, class, order, family, genus and species, with the
    leading rank tag (``d__``, ``p__``, ...) removed.  ``df`` is modified in
    place and is also returned.

    Entries with fewer than seven ';'-separated fields (for example
    'Unclassified Bacteria') leave the remaining rank columns empty.  This
    also holds when no row in ``df`` carries a full seven-field lineage.
    """
    rank_prefixes = (
        ('domain', 'd__'),
        ('phyla', 'p__'),
        ('class', 'c__'),
        ('order', 'o__'),
        ('family', 'f__'),
        ('genus', 'g__'),
        ('species', 's__'),
    )
    n_ranks = len(rank_prefixes)

    # cap the number of splits so any text past the seventh field stays in `species`
    parts = df['classification'].str.split(';', n=n_ranks - 1, expand=True)
    # expand=True only creates as many columns as the longest row needs, so pad
    # to seven; object dtype keeps the .str accessor usable on all-empty columns
    parts = parts.reindex(columns=range(n_ranks)).astype(object)

    for position, (rank, prefix) in enumerate(rank_prefixes):
        # anchored pattern: only the leading rank tag is stripped
        df[rank] = parts[position].str.replace('^' + prefix, '', regex=True)
    return df
def split_contig_id(df):
    """Break the '|'-delimited ``contig_id`` column into separate columns.

    The first field becomes ``virus_name``; the second and third fields have
    their 'N=' / 'L=' tags removed and are stored as numbers in ``N`` and
    ``L``.  ``df`` is modified in place and is also returned.
    """
    # split once and pick the individual fields out of the resulting lists
    fields = df['contig_id'].str.split("|")
    df['virus_name'] = fields.str.get(0)
    for column, position, tag in (('N', 1, 'N='), ('L', 2, 'L=')):
        untagged = fields.str.get(position).str.replace(tag, '')
        df[column] = pd.to_numeric(untagged)
    return df

# global options: lift pandas' display limits on both columns and rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)


''' JOIN ASSOCIATED CONTIG AND VMAG DATA '''


# every run keeps its host-association tables under the same folder layout,
# so a table is located from the run number and its path inside the run folder
def _read_host_associations(run, relative_file):
    """Read one tab-separated host-association table of run jv-<run>."""
    return pd.read_csv(op.join(path, f'data/proximeta_viral_files/jv-{run}_proximeta/{relative_file}'), sep='\t')


# input contig data
jv119_contig, jv121_contig, jv132_contig, jv154_contig = (
    _read_host_associations(run, 'viral_hosts/viral_host_associations_filtered.tsv')
    for run in ('119', '121', '132', '154')
)

# create a list of all contig dataframes
contig_list = [jv119_contig, jv121_contig, jv132_contig, jv154_contig]

# input vMAG data
jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag = (
    _read_host_associations(run, 'vMag_hosts/vMAG_host_associations_filtered.tsv')
    for run in ('119', '121', '132', '154')
)

# create a list of all vmag dataframes
vmag_list = [jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag]

# create virus_type column for each dataframe, labelled by the list it belongs to
for type_label, frames in (('contig', contig_list), ('vmag', vmag_list)):
    for df in frames:
        df['virus_type'] = type_label

# rename columns to match between contig and vmag dataframes

## create dictionaries to use for column mapping -> old_col_name : new_col_name
## the virus columns of the two tables differ only in their prefix
## ('mobile_contig_' vs 'mobile_cluster_'); all remaining columns are shared
_virus_columns = {
    'name': 'virus_name',
    'length (bp)': 'virus_length',
    'read_count (reads)': 'virus_read_count',
    'read_depth (reads/kbp)': 'virus_read_depth',
    'read_depth_in_this_cluster (reads/kbp)': 'virus_read_depth_in_host',
}
_shared_columns = {
    'cluster_name': 'host_name',
    'cluster_length (bp)': 'host_length',
    'cluster_read_count (reads)': 'host_read_count',
    'cluster_read_depth (reads/kbp)': 'host_read_depth',
    'intra_read_count (reads)': 'intra_read_count',
    'intra_linkage_density (reads/kbp^2)': 'intra_linkage_density',
    'inter_read_count (reads)': 'inter_read_count',
    'raw_inter_linkage_density (reads/kbp^2)': 'raw_inter_linkage_density',
    'raw_inter_vs_intra_ratio': 'raw_inter_vs_intra_ratio',
    'mobile_element_copies_per_cell': 'viral_copies_per_cell',
    'adjusted_inter_connective_linkage_density (reads/kbp^2)': 'adjusted_inter_linkage_density',
    'adjusted_inter_vs_intra_ratio': 'adjusted_inter_vs_intra_ratio',
    'sample_name': 'sample_name',
    'virus_type': 'virus_type',
    'sample_depth': 'sample_depth',
}
column_mapping_contig = {
    **{'mobile_contig_' + old: new for old, new in _virus_columns.items()},
    **_shared_columns,
}
column_mapping_vmag = {
    **{'mobile_cluster_' + old: new for old, new in _virus_columns.items()},
    **_shared_columns,
}

## rename the columns of every dataframe in place, pairing each list with its own mapping
for frames, mapping in ((contig_list, column_mapping_contig), (vmag_list, column_mapping_vmag)):
    for df in frames:
        df.rename(columns=mapping, inplace=True)
    
# create sample_name and sample_depth columns, and prefix host_name with the sample name
# host_name has to read '<sample>_<cluster>' for every sample: it is the key that is
# later merged against the MAG GTDB user_genome values, which carry the same
# '<sample>_' prefix (the JV132 prefix previously lacked the underscore)
sample_frames = (
    ('JV119', 400, (jv119_contig, jv119_vmag)),
    ('JV121', 95, (jv121_contig, jv121_vmag)),
    ('JV132', 80, (jv132_contig, jv132_vmag)),
    ('JV154', 140, (jv154_contig, jv154_vmag)),
)
for sample_label, depth, frames in sample_frames:
    for df in frames:
        df['sample_name'] = sample_label
        df['sample_depth'] = depth
        df['host_name'] = f'{sample_label}_' + df['host_name']

# merge all contig and vmag dataframes together (contigs first, then vmags)
proximeta = pd.concat([
    jv119_contig, jv121_contig, jv132_contig, jv154_contig,
    jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag,
])

''' MERGE MAG GTDB DATA TOGETHER '''


# input mag gtdb files: each sample has an ar53 and a bac120 summary table
def _read_gtdb_summary(run, marker_set):
    """Read the tab-separated <marker_set> summary table of run jv-<run>."""
    return pd.read_csv(op.join(path, f'data/mag_data/jv-{run}_gtdbtk/jv-{run}.{marker_set}.summary.tsv'), sep='\t')


jv119_arc, jv119_bac = _read_gtdb_summary('119', 'ar53'), _read_gtdb_summary('119', 'bac120')
jv121_arc, jv121_bac = _read_gtdb_summary('121', 'ar53'), _read_gtdb_summary('121', 'bac120')
jv132_arc, jv132_bac = _read_gtdb_summary('132', 'ar53'), _read_gtdb_summary('132', 'bac120')
jv154_arc, jv154_bac = _read_gtdb_summary('154', 'ar53'), _read_gtdb_summary('154', 'bac120')

# create sample_name and sample_depth column for each mag gtdb dataframe,
# and prefix user_genome with the sample name
gtdb_frames = (
    ('JV119', 400, (jv119_arc, jv119_bac)),
    ('JV121', 95, (jv121_arc, jv121_bac)),
    ('JV132', 80, (jv132_arc, jv132_bac)),
    ('JV154', 140, (jv154_arc, jv154_bac)),
)
for sample_label, depth, frames in gtdb_frames:
    for df in frames:
        df['sample_name'] = sample_label
        df['sample_depth'] = depth
        df['user_genome'] = sample_label + '_' + df['user_genome']

# combine all mag gtdb dataframes, then split the classification into tax level columns
mag_gtdb = split_classification(
    pd.concat([jv119_arc, jv119_bac, jv121_arc, jv121_bac, jv132_arc, jv132_bac, jv154_arc, jv154_bac])
)

''' MERGE PROXIMETA AND GTDB DATA '''


# give the mag_gtdb genome column the name used in the proximeta dataframe
mag_gtdb = mag_gtdb.rename(columns={'user_genome': 'host_name'})

# outer-merge the two dataframes; empty strings and missing values both end up as "NA"
proximeta_gtdb = (
    proximeta
    .merge(mag_gtdb, how='outer', on=["host_name", "sample_name", "sample_depth"])
    .replace('', np.nan)
    .fillna("NA")
)

# rows still lacking a virus_type are MAGs with no associated viruses
proximeta_gtdb['virus_type'] = proximeta_gtdb['virus_type'].replace({'NA': 'uninfected mag'})

# input vmag summary files
jv119_vmag_sum, jv121_vmag_sum, jv132_vmag_sum, jv154_vmag_sum = (
    pd.read_csv(op.join(path, f'data/proximeta_viral_files/jv-{run}_proximeta/viral_MAGs/viral_mags_summary.tsv'), sep='\t')
    for run in ('119', '121', '132', '154')
)

# create sample_name column, then split up contig_id into separate columns
# (split_contig_id adds its columns to the dataframe it is given)
summaries_by_sample = {
    'JV119': jv119_vmag_sum,
    'JV121': jv121_vmag_sum,
    'JV132': jv132_vmag_sum,
    'JV154': jv154_vmag_sum,
}
for sample_label, df in summaries_by_sample.items():
    df['sample_name'] = sample_label
for df in summaries_by_sample.values():
    split_contig_id(df)

# combine all vmag summary dataframes, drop contig_id and turn empty observations to NA
vmag_sums = (
    pd.concat(list(summaries_by_sample.values()))
    .drop(columns='contig_id')
    .fillna("NA")
)

# outer-merge with proximeta_gtdb; empty strings and missing values both end up as "NA"
proximeta_sum = (
    proximeta_gtdb
    .merge(vmag_sums, how='outer', on=["sample_name", "virus_name"])
    .replace('', np.nan)
    .fillna("NA")
)

# rows still lacking a virus_type are labelled unassociated vmag
proximeta_sum['virus_type'] = proximeta_sum['virus_type'].replace({'NA': 'unassociated vmag'})

proximeta_sum

Unnamed: 0,virus_name,virus_length,virus_read_count,virus_read_depth,virus_read_depth_in_host,host_name,host_length,host_read_count,host_read_depth,intra_read_count,intra_linkage_density,inter_read_count,raw_inter_linkage_density,raw_inter_vs_intra_ratio,viral_copies_per_cell,adjusted_inter_linkage_density,adjusted_inter_vs_intra_ratio,virus_type,sample_name,sample_depth,classification,fastani_reference,fastani_reference_radius,fastani_taxonomy,fastani_ani,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,pplacer_taxonomy,classification_method,note,"other_related_references(genome_id,species_name,radius,ANI,AF)",msa_percent,translation_table,red_value,warnings_x,domain,phyla,class,order,family,genus,species,contig_length,provirus,proviral_length,gene_count,viral_genes,host_genes,checkv_quality,miuvig_quality,completeness,completeness_method,contamination,kmer_freq,warnings_y,N,L
0,k141_30505,6674.0,91.0,13.635001,4.999501,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,11.0,0.009322,0.822507,0.257554,0.036195,3.193528,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
1,k141_4246884,10943.0,244.0,22.297359,11.891925,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,40.0,0.020674,1.824134,0.612625,0.033747,2.977572,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
2,k141_3910817,15693.0,295.0,18.79819,6.914277,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,32.0,0.011533,1.0176,0.356196,0.032379,2.856854,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
3,k141_4680690,8982.0,141.0,15.698063,4.485161,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,10.0,0.006297,0.555597,0.231058,0.027253,2.404583,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
4,k141_6141634,5905.0,114.0,19.305673,8.190286,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,14.0,0.01341,1.183154,0.421931,0.031782,2.804141,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
5,k141_3651856,8253.0,231.0,27.989822,9.329941,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,23.0,0.015763,1.39075,0.480641,0.032795,2.89353,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
6,k141_7190536,8240.0,101.0,12.257282,6.511681,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,17.0,0.011669,1.029568,0.335456,0.034785,3.069159,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
7,k141_4910956,5917.0,180.0,30.420821,12.285332,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,21.0,0.020074,1.771132,0.632891,0.031717,2.798477,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
8,k141_2067012,15781.0,295.0,18.693365,7.771399,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,37.0,0.013261,1.170039,0.400352,0.033123,2.922529,contig,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,,,,,,,,,,,,,,,
9,vMAG_1,116593.0,1993.0,17.093651,6.387307,JV119_bin_165,176803.0,3432.0,19.411435,166.0,0.011334,210.0,0.010187,0.898835,0.329049,0.03096,2.731617,vmag,JV119,400.0,Unclassified Bacteria,,,,,,,,,,,,,,,,,,Insufficient number of amino acids in MSA (1.6%),Unclassified Bacteria,,,,,,,118393.0,No,,156.0,53.0,9.0,Medium-quality,Genome-fragment,54.91,AAI-based (medium-confidence),0.0,1.0,,10.0,116593.0


In [None]:
# keep only the JV119 rows and write them to their own table (row index omitted)
is_jv119 = proximeta_sum['sample_name'] == 'JV119'
jv119 = proximeta_sum[is_jv119]
jv119_csv = op.join(path, 'tables/jv119.csv')
jv119.to_csv(jv119_csv, index=False)

In [None]:
# write the full master table (row index omitted)
master_csv = op.join(path, 'tables/proximeta_master.csv')
proximeta_sum.to_csv(master_csv, index=False)

In [None]:
# sort by sample, then by virus name (both ascending), and write the result to
# tables/proximeta_master.csv
sort_columns = ['sample_name', 'virus_name']
sorted_df = proximeta_sum.sort_values(by=sort_columns, ascending=True)
sorted_csv = op.join(path, 'tables/proximeta_master.csv')
sorted_df.to_csv(sorted_csv, index=False)