In [1]:
import glob 
import pandas as pd 
import numpy as np 
import os 
from utils import * 
import subprocess
import itertools

%load_ext autoreload
%autoreload 2

In [13]:
# Load the per-sample metadata and flag metatranscriptome samples, i.e. rows whose
# sample_id contains the substring 'metat'.
sample_metadata_df = pd.read_csv('sample_metadata.csv', index_col=0).assign(
    metat=lambda df: df.sample_id.str.contains('metat')
)


def _metadata_lookup(key_column:str, value_column:str) -> dict:
    '''Return a {key_column value: value_column value} dictionary built from sample_metadata_df.'''
    return sample_metadata_df.set_index(key_column)[value_column].to_dict()


# Lookup tables between the two sample identifiers (ggkbase_name, sample_id) and the year.
ggkbase_name_to_year_map = _metadata_lookup('ggkbase_name', 'year')
sample_id_to_year_map = _metadata_lookup('sample_id', 'year')
ggkbase_name_to_sample_id_map = _metadata_lookup('ggkbase_name', 'sample_id')
sample_id_to_ggkbase_name_map = _metadata_lookup('sample_id', 'ggkbase_name')

# Location of the trimmed paired-end reads; filled in with year, ggkbase_name and paired_end.
sample_path_template = '/groups/banfield/sequences/{year}/{ggkbase_name}/raw.d/{ggkbase_name}_trim_clean.{paired_end}.fastq.gz'

# id_to_ggkbase_name_map is not defined in this notebook (presumably provided by `from utils import *`).
ref_genome_ids = list(id_to_ggkbase_name_map.keys())
ref_genome_dir = '../data/'
# NOTE(review): these paths point into '../data/data/' whereas ref_genome_dir is '../data/' -- confirm
# which directory actually holds the .fn files.
ref_genome_paths = [f'../data/data/{file_name}.fn' for file_name in id_to_ggkbase_name_map.keys()]

sample_ids = sample_metadata_df['sample_id'].unique()

In [3]:
def clean_fasta_file(path:str):
    '''Remove extra information from the FASTA file headers, editing the file in place.

    On every header line (a line starting with '>'), everything from the first space
    onward is deleted. All other lines are left untouched.

    :param path: Path to the FASTA file to modify.
    :raises subprocess.CalledProcessError: If sed exits with a non-zero status.
    '''
    # Pass an argument list instead of a shell string so paths containing spaces or shell
    # metacharacters are handled literally; '--' stops a path starting with '-' from being
    # parsed as an option. The '/^>/' address restricts the substitution to header lines.
    subprocess.run(['sed', '-i', '-e', '/^>/s/ .*//', '--', path], check=True)

# Clean the headers of every reference genome FASTA file in place.
for fasta_path in ref_genome_paths:
    clean_fasta_file(fasta_path)

In [4]:
# Space-separated list of read files (PE.1 followed by PE.2 for each sample) covering every
# sample that is not flagged as a metatranscriptome.
coverm_sample_paths = ' '.join(
    sample_path_template.format(ggkbase_name=sample.ggkbase_name, paired_end=read_end, year=sample.year)
    for sample in sample_metadata_df[~sample_metadata_df.metat].itertuples()
    for read_end in ('PE.1', 'PE.2')
)

coverm_fields = 'mean trimmed_mean covered_bases variance count rpkm tpm'

# Write one sbatch submission line per reference genome to the CoverM mapping script.
with open('../scripts/coverm_mapping.sh', 'w') as script:
    for genome_id in id_to_ggkbase_name_map.keys():
        if 'metat' in genome_id: # Don't do this for the transcripts.
            continue

        genome_path = os.path.join(ref_genome_dir, f'{genome_id}.fn')
        tsv_path = '/home/philippar/data/coverm/' + f'{genome_id}.tsv'.lower()
        coverm_cmd = (
            f'coverm contig -c {coverm_sample_paths} -r {genome_path} '
            '--min-read-percent-identity 97 --min-read-aligned-percent 80 '
            '--trim-min 5 --trim-max 95 '
            f'-m {coverm_fields} -t 20 -o {tsv_path}'
        )
        script.write(f'sbatch --wrap "{coverm_cmd}"\n')

In [5]:
# with open('../scripts/bbduk_library_sizes.sh', 'w') as f:
#     for name, sample_id in sample_id_map.items():
#             paired_ends_1_path = sample_path_template.format(paired_end='PE.1', sample_id=sample_id, year=sample_year_map.get(sample_id, 2025))
#             paired_ends_2_path = sample_path_template.format(paired_end='PE.2', sample_id=sample_id, year=sample_year_map.get(sample_id, 2025))
#             output_path = f'/home/philippar/data/bbduk/{name}.txt'
#             cmd = f'bbduk.sh in={paired_ends_1_path} in2={paired_ends_2_path} out=/dev/null stats={output_path} threads=64'
#             f.write(f'sbatch --wrap "{cmd}"\n')

In [16]:
def bbmap_get_mapping_command(sample_id:str, ref_genome_id:str, output_dir:str='../data/metat/', input_dir:str='../data/'):
    '''Build the shell pipeline that maps one sample's paired-end reads to one reference genome with bbmap.sh.

    The reads are located through sample_path_template, using the ggkbase name and year
    looked up for sample_id. The bbmap.sh output is piped through shrinksam and sambam and
    redirected to {output_dir}/{ref_genome_id}/{sample_id}.bam. The output directory is not
    created here.

    :param sample_id: Key into sample_id_to_ggkbase_name_map and sample_id_to_year_map.
    :param ref_genome_id: Reference genome ID; the reference is read from {input_dir}/{ref_genome_id}.fn.
    :param output_dir: Parent directory for the per-genome output directories.
    :param input_dir: Directory containing the reference genome .fn files.
    :return: The command line as a single string.
    :raises KeyError: If sample_id is missing from either lookup table.
    '''
    ref_genome_path = os.path.join(input_dir, f'{ref_genome_id}.fn')

    # Look the sample up once and reuse the values for both read files.
    ggkbase_name = sample_id_to_ggkbase_name_map[sample_id]
    year = sample_id_to_year_map[sample_id]
    input_path_1 = sample_path_template.format(ggkbase_name=ggkbase_name, paired_end='PE.1', year=year)
    input_path_2 = sample_path_template.format(ggkbase_name=ggkbase_name, paired_end='PE.2', year=year)

    output_path = os.path.join(output_dir, ref_genome_id, f'{sample_id}.bam')

    # 'out=stdout.sam' used to be listed twice; it is only needed once.
    params = 'pigz=t unpigz=t ambiguous=random minid=0.96 idfilter=0.97 threads=64 out=stdout.sam editfilter=5'
    cmd = f'bbmap.sh {params} in1={input_path_1} in2={input_path_2} ref={ref_genome_path} nodisk | shrinksam | sambam > {output_path}'
    return cmd


output_dir = '../data/metat'

# Write the BBMap submission script: for each reference genome, one mkdir line followed by
# one sbatch line per sample.
# NOTE(review): sample_ids contains every sample in the metadata, not only the 'metat' ones,
# although the output directory is named 'metat' -- confirm this is intended.
with open('../scripts/bbmap_mapping.sh', 'w') as script:
    for genome_id in ref_genome_ids:
        genome_output_dir = os.path.join(output_dir, genome_id)
        script.write(f'mkdir -p {genome_output_dir}\n') # Make sure the output directory exists.
        script.writelines(
            'sbatch --wrap "{}"\n'.format(bbmap_get_mapping_command(sid, genome_id, output_dir=output_dir))
            for sid in sample_ids
        )

# def get_counting_command(bam_path:str, ref_path:str=None, output_dir:str=output_dir):
#     output_file_name = os.path.basename(bam_path).replace('.bam', '')
#     output_file_name += '_read_counts'
#     output_path = os.path.join(output_dir, output_file_name)
#     return f'featureCounts -p -T 64 -g ID -t CDS -a {ref_path} -s 2 -o {output_path} {bam_path}' 



In [14]:
# Display the reference genome IDs that the BBMap script generation iterates over.
ref_genome_ids

['mp_4',
 'mp_1',
 'mp_3',
 'mp_5',
 'mp_2',
 'jupiter_mini_borg_1',
 'jupiter_mini_borg_2',
 'jupiter_mini_borg_3',
 'jupiter_mini_borg_4',
 'jupiter_mini_borg_6',
 'jupiter_mini_borg_7',
 'jupiter_mini_borg_8',
 'jupiter_mini_borg_9',
 'saturn_mini_borg_1',
 'saturn_mini_borg_2',
 'saturn_mini_borg_3',
 'saturn_mini_borg_4',
 'unclassified_mini_borg',
 'unclassified_borg',
 'amethyst_borg',
 'oxblood_borg',
 'pink_borg',
 'purple_borg',
 'rose_borg',
 'vermilion_borg',
 'mercury_mini_borg',
 'saturn_mini_borg_like',
 'ruby_borg_related',
 'black_borg',
 'linear_ece_19kb']

In [7]:
# Build two scripts, one command per (reference, sample) pair: sbatch-wrapped mapping commands
# and plain counting commands.
# NOTE(review): this cell currently fails with a NameError. ref_paths, sample_paths and
# get_mapping_command are not defined anywhere in this notebook, and get_counting_command
# only exists as commented-out code. get_sbatch_command is presumably provided by
# `from utils import *` -- confirm. The cell needs to be rewired or removed.
i = 0 

mapping_path, counting_path = '../scripts/metat_mapping.sh', '../scripts/metat_counting.sh'
mapping, counting = list(), list()
for ref_path in ref_paths:
    for sample_path in sample_paths:
        mapping_command, bam_path = get_mapping_command(sample_path, ref_path=ref_path)
        # NOTE(review): str.replace swaps every 'fn' substring in the path, not only the file
        # extension -- verify no directory or file name contains 'fn'.
        counting_command = get_counting_command(bam_path, ref_path=ref_path.replace('fn', 'gff'))
        # The running pair index i is passed as the job name.
        mapping.append(get_sbatch_command(mapping_command, job_name=i))
        counting.append(counting_command)

        i += 1

# One command per line, with no trailing newline after the last command.
with open(mapping_path, 'w') as f:
    f.write('\n'.join(mapping))

with open(counting_path, 'w') as f:
    f.write('\n'.join(counting))

NameError: name 'ref_paths' is not defined