# In [104]:
# Load packages
import pandas as pd
import os
import os.path as op
import sys
import csv
import numpy as np
import matplotlib
import glob
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

# file paths
# Root directory of the project. Every input and output location used below
# is built by joining a relative path onto this value with op.join(path, ...).
# The commented-out line is an alternative root; enable exactly one of the two.
path = '/Users/melissaherring/Google Drive/My Drive/MH_project/'
#path = '/Users/juliabrown/Google Drive/My Drive/projects/OMZvir_round2/MH_project/'

# define functions 
def split_classification(df):
    """Split the ``classification`` column into one column per taxonomic rank.

    The ``classification`` strings are expected to be ';'-separated lineages
    whose fields carry rank prefixes (``d__``, ``p__``, ``c__``, ``o__``,
    ``f__``, ``g__``, ``s__``). The columns ``domain``, ``phyla``, ``class``,
    ``order``, ``family``, ``genus`` and ``species`` are added with the
    prefixes removed.

    Lineages with fewer than seven fields (for example a bare
    ``Unclassified`` entry) leave the missing ranks as NaN, even when no row
    in the table has all seven fields.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string column named ``classification``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.

    Raises
    ------
    ValueError
        If any lineage has more than seven ';'-separated fields.
    """
    ranks = ['domain', 'phyla', 'class', 'order', 'family', 'genus', 'species']
    prefixes = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']

    pieces = df['classification'].str.split(';', expand=True)
    if pieces.shape[1] > len(ranks):
        raise ValueError(
            'classification has more than {} ";"-separated fields'.format(len(ranks))
        )

    # expand=True only creates as many columns as the longest lineage has
    # fields; pad up to seven so that every rank column can be filled.
    pieces = pieces.reindex(columns=range(len(ranks)))

    for position, (rank, prefix) in enumerate(zip(ranks, prefixes)):
        # astype(object) keeps the .str accessor usable on padded, all-NaN columns
        rank_values = pieces[position].astype(object)
        df[rank] = rank_values.str.replace(prefix, '', regex=False)
    return df


''' JOIN ASSOCIATED CONTIG AND VMAG DATA '''


# Load the filtered virus-host association table of each sample: first the
# contig tables, then the vMAG tables (one dataframe per sample and type).
_proximeta_runs = ('jv-119', 'jv-121', 'jv-132', 'jv-154')
_contig_assoc_tsv = 'data/proximeta_viral_files/{}_proximeta/viral_hosts/viral_host_associations_filtered.tsv'
_vmag_assoc_tsv = 'data/proximeta_viral_files/{}_proximeta/vMag_hosts/vMAG_host_associations_filtered.tsv'

jv119_contig, jv121_contig, jv132_contig, jv154_contig = [
    pd.read_csv(op.join(path, _contig_assoc_tsv.format(run)), sep ='\t')
    for run in _proximeta_runs
]
jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag = [
    pd.read_csv(op.join(path, _vmag_assoc_tsv.format(run)), sep ='\t')
    for run in _proximeta_runs
]

# keep the tables grouped by virus type, in sample order
contig_list = [jv119_contig, jv121_contig, jv132_contig, jv154_contig]
vmag_list = [jv119_vmag, jv121_vmag, jv132_vmag, jv154_vmag]

# record in every table which kind of virus its rows describe
for kind, tables in (('contig', contig_list), ('vmag', vmag_list)):
    for table in tables:
        table['virus_type'] = kind

# record the sample each table came from, together with that sample's depth;
# contig_list and vmag_list are both in the same sample order as this tuple
_sample_labels = (('JV119', 400), ('JV121', 95), ('JV132', 80), ('JV154', 140))
for (label, depth), contig_table, vmag_table in zip(_sample_labels, contig_list, vmag_list):
    for table in (contig_table, vmag_table):
        table['sample_name'] = label
        table['sample_depth'] = depth

# Column renaming, old_col_name : new_col_name, so that contig and vmag tables
# end up with identical headers. Only the five virus columns are named
# differently in the two inputs; everything else is shared.
_shared_column_mapping = {
    'cluster_name': 'host_name',
    'cluster_length (bp)': 'host_length',
    'cluster_read_count (reads)': 'host_read_count',
    'cluster_read_depth (reads/kbp)': 'host_read_depth',
    'intra_read_count (reads)': 'intra_read_count',
    'intra_linkage_density (reads/kbp^2)': 'intra_linkage_density',
    'inter_read_count (reads)': 'inter_read_count',
    'raw_inter_linkage_density (reads/kbp^2)': 'raw_inter_linkage_density',
    'raw_inter_vs_intra_ratio': 'raw_inter_vs_intra_ratio',
    'mobile_element_copies_per_cell': 'viral_copies_per_cell',
    'adjusted_inter_connective_linkage_density (reads/kbp^2)': 'adjusted_inter_linkage_density',
    'adjusted_inter_vs_intra_ratio': 'adjusted_inter_vs_intra_ratio',
    'sample_name':'sample_name',
    'virus_type': 'virus_type',
    'sample_depth': 'sample_depth',
}
column_mapping_contig = {
    'mobile_contig_name': 'virus_name',
    'mobile_contig_length (bp)': 'virus_length',
    'mobile_contig_read_count (reads)': 'virus_read_count',
    'mobile_contig_read_depth (reads/kbp)': 'virus_read_depth',
    'mobile_contig_read_depth_in_this_cluster (reads/kbp)': 'virus_read_depth_in_host',
    **_shared_column_mapping,
}
column_mapping_vmag = {
    'mobile_cluster_name': 'virus_name',
    'mobile_cluster_length (bp)': 'virus_length',
    'mobile_cluster_read_count (reads)': 'virus_read_count',
    'mobile_cluster_read_depth (reads/kbp)': 'virus_read_depth',
    'mobile_cluster_read_depth_in_this_cluster (reads/kbp)': 'virus_read_depth_in_host',
    **_shared_column_mapping,
}

# apply the matching mapping to every table, renaming in place
for tables, mapping in ((contig_list, column_mapping_contig), (vmag_list, column_mapping_vmag)):
    for table in tables:
        table.rename(columns=mapping, inplace=True)

# stack everything into one table: all contig tables first, then all vmag tables
proximeta = pd.concat(contig_list + vmag_list)


''' MERGE MAG GTDB DATA TOGETHER '''


def _load_gtdb_summary(run, marker_set):
    """Read one gtdbtk summary table (tab-separated) for a sample.

    run is the sample's directory prefix (e.g. 'jv-119') and marker_set the
    part of the file name between the run and '.summary.tsv'.
    """
    relative = 'data/mag_data/{0}_gtdbtk/{0}.{1}.summary.tsv'.format(run, marker_set)
    return pd.read_csv(op.join(path, relative), sep='\t')


# input mag gtdb files: an ar53 and a bac120 summary per sample
jv119_arc = _load_gtdb_summary('jv-119', 'ar53')
jv119_bac = _load_gtdb_summary('jv-119', 'bac120')
jv121_arc = _load_gtdb_summary('jv-121', 'ar53')
jv121_bac = _load_gtdb_summary('jv-121', 'bac120')
jv132_arc = _load_gtdb_summary('jv-132', 'ar53')
jv132_bac = _load_gtdb_summary('jv-132', 'bac120')
jv154_arc = _load_gtdb_summary('jv-154', 'ar53')
jv154_bac = _load_gtdb_summary('jv-154', 'bac120')

# tag both tables of a sample with the sample's name and depth
_gtdb_by_sample = (
    ('JV119', 400, (jv119_arc, jv119_bac)),
    ('JV121', 95, (jv121_arc, jv121_bac)),
    ('JV132', 80, (jv132_arc, jv132_bac)),
    ('JV154', 140, (jv154_arc, jv154_bac)),
)
for label, depth, tables in _gtdb_by_sample:
    for table in tables:
        table['sample_name'] = label
        table['sample_depth'] = depth

# stack all tables (sample by sample, arc before bac), then break the
# classification string into one column per taxonomic level
mag_gtdb = split_classification(
    pd.concat([table for _, _, tables in _gtdb_by_sample for table in tables])
)


''' MERGE PROXIMETA AND GTDB DATA '''


# the gtdb table names its genome column user_genome; give it the proximeta
# name so both tables share the join key
mag_gtdb = mag_gtdb.rename(columns={'user_genome':'host_name'})

# outer join keeps hosts that appear in only one of the two tables
_host_keys = ["host_name","sample_name","sample_depth"]
proximeta_gtdb = pd.merge(proximeta, mag_gtdb, how='outer', on=_host_keys)
# empty strings and missing values both end up as the literal string "NA"
proximeta_gtdb = proximeta_gtdb.replace('', np.nan).fillna("NA")

# a virus_type of "NA" means the row has no virus information after the join
# (a MAG with no associated viruses); label those rows explicitly
proximeta_gtdb['virus_type'] = proximeta_gtdb['virus_type'].replace('NA', 'uninfected mag')

# vmag summary table of every sample
_vmag_summary_tsv = 'data/proximeta_viral_files/{}_proximeta/viral_MAGs/viral_mags_summary.tsv'
jv119_vmag_sum, jv121_vmag_sum, jv132_vmag_sum, jv154_vmag_sum = [
    pd.read_csv(op.join(path, _vmag_summary_tsv.format(run)), sep = '\t')
    for run in ('jv-119', 'jv-121', 'jv-132', 'jv-154')
]

# tag each summary with its sample, then stack them in sample order
_vmag_summaries = [jv119_vmag_sum, jv121_vmag_sum, jv132_vmag_sum, jv154_vmag_sum]
for label, summary in zip(('JV119', 'JV121', 'JV132', 'JV154'), _vmag_summaries):
    summary['sample_name'] = label
vmag_sums = pd.concat(_vmag_summaries)

# contig_id is split on "|": field 0 becomes the virus name, fields 1 and 2
# have their "N=" / "L=" labels stripped and are converted to numbers
_id_fields = vmag_sums['contig_id'].str.split("|")
vmag_sums['virus_name'] = _id_fields.str[0]
vmag_sums['N'] = pd.to_numeric(_id_fields.str[1].str.replace('N=', ''))
vmag_sums['L'] = pd.to_numeric(_id_fields.str[2].str.replace('L=', ''))

# contig_id is fully unpacked now; drop it and mark missing values as "NA"
vmag_sums = vmag_sums.drop(columns='contig_id').fillna("NA")

# attach the vmag summaries by virus and sample, again as an outer join with
# blanks and missing values written as "NA"
proximeta_sum = pd.merge(proximeta_gtdb, vmag_sums, how='outer', on=["virus_name","sample_name"])
proximeta_sum = proximeta_sum.replace('', np.nan).fillna("NA")

# In [107]:
# write the combined table to tables/proximeta_master.csv under the project
# root; the dataframe index is written as the first, unnamed column
_master_csv = op.join(path, 'tables/proximeta_master.csv')
proximeta_sum.to_csv(_master_csv)