# In this script you will calculate MAG relative abundance by normalizing mAG Truncated Average sequencing Depth (TAD80) values and metagenome Genome Equivalent values. You will also use metagenome size (number of reads and number of base pairs) to calculate other coverage statistics.

## Input must include the following column-separated files
### 1. List of MAGs and corresponding sizes (two columns)
### 2. List of metagenomes and corresponding number of reads and number of base pairs, as separate columns. In my sample file I also have column 2 containing sample metadata ('Day').
### 3. List of MAGs to check against #1
### 4. MAG-specific files containing raw TAD80 values for each sample (MAGs in column one, TAD80s in column two). These should be in a subdirectory, which I call here "tads_RAW".
### 5. Sample-specific files containing the Microbe Census output, including Genome Equivalent value

In [1]:
import numpy as np
import pandas as pd
from functools import reduce
import os as os
import glob as glob

In [2]:
insert your own working directory here
os.chdir("/Users/npatin3/Dropbox (GaTech)/Norovirus/TADs/Jupyter/Data")

### Constant dataframes

In [3]:
MAG_list = pd.read_csv("Noro_dRep_winners.csv", names=['MAG'])
MAG_sizes = pd.read_csv("MAG_sizes.csv", header=0, names=['MAG','Size'])
mgn_bps = pd.read_csv("Noro_mgn_reads.csv")

In [5]:
#sanity check
MAG_list.head()

Unnamed: 0,MAG
0,13_1_001
1,14_1_002
2,14_1_003
3,14_1_005
4,14_4_005


### Check if there are any MAGs from the list that didn't end up in the merged df with sizes

In [118]:
MAG_list[(~MAG_list.MAG.isin(sizes.MAG))]

### Function to extract Genome Equivalents value from text file

In [6]:
def extract_GE(GE):
    """Use the MicrobeCensus output file to extract the Genome Equivalent value of the metagenome"""
    with open(GE, "rt") as GE_file:
        # read all lines into a list
        lines = GE_file.readlines()
        # extract just the genome equivalent value, which is the second value on the 12th line
        ge = float(lines[12].split('\t')[1])
    return(ge)

### Loop through all MAG-specific files with raw TAD values and use each file to build new dataframe with sample and coverage data

In [7]:
df = []
for file in glob.glob("tads_RAW/*.csv"):
    a, b = file.split('/')
    c, d = b.split('.')
    e, f = c.split('-')
    raw_tads = pd.read_csv(file, names=['MAG','Raw_TAD'])
    # Merge all three dfs: MAG list, MAG sizes, and raw TADs
    dfs = [MAG_list, raw_tads, MAG_sizes]
    # make new data frame 'tads'
    tads = reduce(lambda left,right: pd.merge(left, right, on=['MAG']), dfs)
    # extract GE for sample
    GE_path = '%s_GE.txt' % e
    x = extract_GE(GE_path)
    # make GE list for each MAG
    ge_list = x * len(tads.index)
    bps = mgn_bps.loc[mgn_bps['Sample'] == e, 'bp'].iloc[0]
    bps_list = bps * len(tads.index)
    # make new dataframe 'tads' and populate it with all the stats
    tads['sample'] = e
    tads['GE'] = ge_list
    tads['mgn_bps'] = bps_list
    tads['Normalized_coverage'] = tads['Raw_TAD'] / tads['GE']
    tads['MAG_size'] = MAG_sizes['Size']
    tads['MAG_proportion'] = (tads['Raw_TAD'] * tads['MAG_size']) / tads['mgn_bps']
    tads = tads[['sample','GE','mgn_bps','MAG','Raw_TAD','MAG_size','Normalized_coverage','MAG_proportion']]
    #print(file, b, e)
    df.append(tads)
    
total_df = pd.concat(df)

In [8]:
total_df.head()

Unnamed: 0,sample,GE,mgn_bps,MAG,Raw_TAD,MAG_size,Normalized_coverage,MAG_proportion
0,13_1,36751.376735,74426278660,13_1_001,125.673472,4374699,0.00342,0.007387
1,13_1,36751.376735,74426278660,14_1_002,0.0,2251753,0.0,0.0
2,13_1,36751.376735,74426278660,14_1_003,0.388714,4356764,1.1e-05,2.3e-05
3,13_1,36751.376735,74426278660,14_1_005,0.0,2787927,0.0,0.0
4,13_1,36751.376735,74426278660,14_4_005,0.0,2878704,0.0,0.0


In [9]:
total_df.shape

(1742, 8)

#### Reformat data frame to show just values of interest, eg TAD80 values ('Normalized_coverage')

In [12]:
df_for_table = pd.pivot_table(data=df_to_save, index='MAG', values='Normalized_coverage', columns='sample')

In [13]:
df_for_table.head()

sample,13_1,13_2,13_4,15_1,15_3,15_6,15_7,15_8,15_9,28_1,...,37_8,37_9,38_1,38_6,41_1,41_7,49_1,49_3,49_4,4_1
MAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13_1_001,0.00342,0.004224,0.002117,0.00027,0.00011,0.0002527719,0.000472,0.000139,0.000385,5.2e-05,...,0.000489,0.000416,0.000935,0.00104,0.001795,0.002725,0.0008062754,0.000227,0.000882,0.001432
14_1_002,0.0,0.0,0.0,6e-06,2e-06,1.531116e-05,1.2e-05,8e-06,1.3e-05,6.8e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14_1_003,1.1e-05,0.0,2e-06,0.0,0.0,2.33796e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5e-06,0.0,0.0,0.0,0.0,0.0,0.0
14_1_005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000164,...,0.0,0.0,0.0,0.0,0.000275,0.00016,1.04148e-06,0.0,0.0,0.0
14_4_005,0.0,0.0,0.0,8.1e-05,4e-05,9.936179e-05,3.3e-05,2.3e-05,1.1e-05,0.0,...,6e-05,5.5e-05,0.0,0.0,0.000238,7.6e-05,7.173552e-07,0.0,0.0,0.0


In [14]:
df_for_table.to_csv("MAG_TAD80s.csv")