# 01_create_proximeta_associations_table

Run this notebook to build a single table that combines the viral contig and vMAG host-virus associations from all four samples (JV119, JV121, JV132 and JV154). All input data are ProxiMeta output files.

In [12]:
# Load Packages
import pandas as pd
import os
import os.path as op
import sys
import csv
import numpy as np
import matplotlib
import glob
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

# file paths
# Project root folder. Defaults to Melissa's Google Drive checkout. To run the
# notebook from another machine, set the MH_PROJECT_DIR environment variable
# instead of editing this cell, e.g.
#   '/Users/juliabrown/Google Drive/My Drive/projects/OMZvir_round2/MH_project/'
m_path = os.environ.get('MH_PROJECT_DIR') or '/Users/melissaherring/Google Drive/My Drive/MH_project/'


def _read_associations(sample_id, subdir, filename):
    """Read one filtered host-association table for a single sample.

    sample_id -- sample number as it appears in the folder name
                 (e.g. '119' for the folder 'jv-119_proximeta')
    subdir    -- folder inside the sample folder ('viral_hosts' or 'vMag_hosts')
    filename  -- name of the tab-separated file inside that folder

    Returns the file contents as a pandas DataFrame.
    """
    rel_path = 'data/proximeta_viral_files/jv-{}_proximeta/{}/{}'.format(sample_id, subdir, filename)
    return pd.read_csv(op.join(m_path, rel_path), sep='\t')


# Input contig data
jv119_contig = _read_associations('119', 'viral_hosts', 'viral_host_associations_filtered.tsv')
jv121_contig = _read_associations('121', 'viral_hosts', 'viral_host_associations_filtered.tsv')
jv132_contig = _read_associations('132', 'viral_hosts', 'viral_host_associations_filtered.tsv')
jv154_contig = _read_associations('154', 'viral_hosts', 'viral_host_associations_filtered.tsv')

# Input vMAG data
jv119_vmag = _read_associations('119', 'vMag_hosts', 'vMAG_host_associations_filtered.tsv')
jv121_vmag = _read_associations('121', 'vMag_hosts', 'vMAG_host_associations_filtered.tsv')
jv132_vmag = _read_associations('132', 'vMag_hosts', 'vMAG_host_associations_filtered.tsv')
jv154_vmag = _read_associations('154', 'vMag_hosts', 'vMAG_host_associations_filtered.tsv')

# Tag every data frame with where its rows came from: the sample label
# (sample_name), the kind of viral sequence (virus_type) and the sample's
# depth value (sample_depth; units are not stated in this notebook).
# Each row below is: (sample label, depth, contig data frame, vMAG data frame)
for label, depth, contig_frame, vmag_frame in (
    ('JV119', 400, jv119_contig, jv119_vmag),
    ('JV121', 95, jv121_contig, jv121_vmag),
    ('JV132', 80, jv132_contig, jv132_vmag),
    ('JV154', 140, jv154_contig, jv154_vmag),
):
    for kind, frame in (('contig', contig_frame), ('vMAG', vmag_frame)):
        frame['sample_name'] = label
        frame['virus_type'] = kind
        frame['sample_depth'] = depth

# Original viral-contig column headers -> unified column names, grouped by
# what each column describes. The groups are merged in the order listed, so
# column_mapping_contig holds the same old_col_name : new_col_name pairs, in
# the same order, as a single flat dict would.
_contig_virus_cols = {
    'mobile_contig_name': 'virus_name',
    'mobile_contig_length (bp)': 'virus_length',
    'mobile_contig_read_count (reads)': 'virus_read_count',
    'mobile_contig_read_depth (reads/kbp)': 'virus_read_depth',
    'mobile_contig_read_depth_in_this_cluster (reads/kbp)': 'virus_read_depth_in_host',
}
_contig_host_cols = {
    'cluster_name': 'host_name',
    'cluster_length (bp)': 'host_length',
    'cluster_read_count (reads)': 'host_read_count',
    'cluster_read_depth (reads/kbp)': 'host_read_depth',
}
_contig_linkage_cols = {
    'intra_read_count (reads)': 'intra_read_count',
    'intra_linkage_density (reads/kbp^2)': 'intra_linkage_density',
    'inter_read_count (reads)': 'inter_read_count',
    'raw_inter_linkage_density (reads/kbp^2)': 'raw_inter_linkage_density',
    'raw_inter_vs_intra_ratio': 'raw_inter_vs_intra_ratio',
    'mobile_element_copies_per_cell': 'viral_copies_per_cell',
    'adjusted_inter_connective_linkage_density (reads/kbp^2)': 'adjusted_inter_linkage_density',
    'adjusted_inter_vs_intra_ratio': 'adjusted_inter_vs_intra_ratio',
}
# Columns added earlier in this notebook; they keep their names.
_contig_added_cols = {
    'sample_name':'sample_name',
    'virus_type': 'virus_type',
    'sample_depth': 'sample_depth',
}
column_mapping_contig = {
    **_contig_virus_cols,
    **_contig_host_cols,
    **_contig_linkage_cols,
    **_contig_added_cols,
}

# All contig data frames, in sample order
df_list_contig = [jv119_contig, jv121_contig, jv132_contig, jv154_contig]

# Rename the columns of each contig data frame in place using the mapping above
for df in df_list_contig:
    df.rename(columns=column_mapping_contig, inplace=True)

# Original vMAG column headers -> unified column names, grouped by what each
# column describes. The groups are merged in the order listed, so
# column_mapping_vmag holds the same old_col_name : new_col_name pairs, in
# the same order, as a single flat dict would.
_vmag_virus_cols = {
    'mobile_cluster_name': 'virus_name',
    'mobile_cluster_length (bp)': 'virus_length',
    'mobile_cluster_read_count (reads)': 'virus_read_count',
    'mobile_cluster_read_depth (reads/kbp)': 'virus_read_depth',
    'mobile_cluster_read_depth_in_this_cluster (reads/kbp)': 'virus_read_depth_in_host',
}
_vmag_host_cols = {
    'cluster_name': 'host_name',
    'cluster_length (bp)': 'host_length',
    'cluster_read_count (reads)': 'host_read_count',
    'cluster_read_depth (reads/kbp)': 'host_read_depth',
}
_vmag_linkage_cols = {
    'intra_read_count (reads)': 'intra_read_count',
    'intra_linkage_density (reads/kbp^2)': 'intra_linkage_density',
    'inter_read_count (reads)': 'inter_read_count',
    'raw_inter_linkage_density (reads/kbp^2)': 'raw_inter_linkage_density',
    'raw_inter_vs_intra_ratio': 'raw_inter_vs_intra_ratio',
    'mobile_element_copies_per_cell': 'viral_copies_per_cell',
    'adjusted_inter_connective_linkage_density (reads/kbp^2)': 'adjusted_inter_linkage_density',
    'adjusted_inter_vs_intra_ratio': 'adjusted_inter_vs_intra_ratio',
}
# Columns added earlier in this notebook; they keep their names.
_vmag_added_cols = {
    'sample_name':'sample_name',
    'virus_type': 'virus_type',
    'sample_depth': 'sample_depth',
}
column_mapping_vmag = {
    **_vmag_virus_cols,
    **_vmag_host_cols,
    **_vmag_linkage_cols,
    **_vmag_added_cols,
}

# All vMAG data frames, in sample order
df_list_vmag = [jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag]

# Rename the columns of each vMAG data frame in place using the mapping above
for df in df_list_vmag:
    df.rename(columns=column_mapping_vmag, inplace=True)

# merge data frames
# Stack the eight per-sample data frames (contigs first, then vMAGs) into one
# table. ignore_index=True gives the combined table a fresh 0..n-1 row index;
# without it each input frame keeps its own row labels, so the same label
# would appear several times in combo and label-based lookups (combo.loc[...])
# or index alignment would be ambiguous.
combo = pd.concat(
    [
        jv119_contig, jv121_contig, jv132_contig, jv154_contig,
        jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag,
    ],
    ignore_index=True,
)

# write csv (left disabled; uncomment the next line to write the table to disk)
#combo.to_csv(op.join(m_path,'tables/proximeta_associations_table.csv'), index=False)

In [14]:
# Display the column labels of the combined table
combo.columns

Index(['virus_name', 'virus_length', 'virus_read_count', 'virus_read_depth',
       'virus_read_depth_in_host', 'host_name', 'host_length',
       'host_read_count', 'host_read_depth', 'intra_read_count',
       'intra_linkage_density', 'inter_read_count',
       'raw_inter_linkage_density', 'raw_inter_vs_intra_ratio',
       'viral_copies_per_cell', 'adjusted_inter_linkage_density',
       'adjusted_inter_vs_intra_ratio', 'sample_name', 'virus_type',
       'sample_depth'],
      dtype='object')