In [None]:
# Short analysis label -> sequencing sample name (used as both the directory name and the
# file stem of the read files). Labels ending in '_metat' point at the '_metaT' samples;
# every value must be unique, and every key must be unique (a repeated key in a dict
# literal silently overwrites the earlier entry).
sample_name_map = {
    # 2025 bioreactor samples.
    'ck_bottom_2025': 'SR-VP_Bioreactor_ck_bot_05_17_2025',
    'ck_middle_2025': 'SR-VP_Bioreactor_ck_mid_05_17_2025',
    'ck_top_2025': 'SR-VP_Bioreactor_ck_top_05_17_2025',
    'n_bottom_2025': 'SR-VP_Bioreactor_N_bot_05_17_2025',
    'n_middle_2025': 'SR-VP_Bioreactor_N_mid_05_17_2025',
    'n_top_2025': 'SR-VP_Bioreactor_N_top_05_17_2025',

    # 2024 samples.
    # NOTE(review): these are absent from sample_year_map, so path lookups that use
    # `.get(sample_name, 2025)` place them under the 2025 directory -- confirm.
    'n_middle_2024': 'SR-VP_05_06_2024_N_middle',
    'n_top_2024': 'SR-VP_05_06_2024_N_top',
    'n_bottom_2024': 'SR-VP_05_06_2024_N_bottom',
    'ck_bottom_2024': 'SR-VP_05_06_2024_ck_bottom',

    # Samples labelled '<N>cm_<year>'.
    # The two 90cm samples from 26_10_2019 used to share the key '90cm_2020', so replicate 1
    # was overwritten by replicate 2 and never reached any generated script. Replicate 1 now
    # has its own key; '90cm_2020' still refers to replicate 2, as it effectively did before,
    # so existing output names keep their meaning.
    '90cm_2020_1': 'SR-VP_26_10_2019_1_90cm',
    '100cm_2020': 'SR-VP_26_10_2019_1_100cm',
    '100cm_2021_1': 'SR-VP_26_10_2020_1_100CM',
    '100cm_2021_2': 'SR-VP_26_10_2020_2_100CM',
    '80cm_2023': 'SR-VP_11_27_2022_S1_80cm_MG',
    '40cm_2020': 'SR-VP_26_10_2019_C_40cm',
    # NOTE(review): the sibling 11_27_2022 samples are labelled '_2023' -- confirm that
    # '100cm_2022' is the intended label rather than '100cm_2023'.
    '100cm_2022': 'SR-VP_11_27_2022_S1_100cm_MG',
    '90cm_2023': 'SR-VP_07_25_2022_A1_90cm_MG',
    '90cm_2020': 'SR-VP_26_10_2019_2_90cm',
    '60cm_2023': 'SR-VP_11_27_2022_S1_60cm_MG',

    # 2025 bioreactor '_metaT' samples.
    'ck_bottom_2025_metat': 'SR-VP_Bioreactor_ck_bot_05_17_2025_metaT',
    'ck_middle_2025_metat': 'SR-VP_Bioreactor_ck_mid_05_17_2025_metaT',
    'ck_top_2025_metat': 'SR-VP_Bioreactor_ck_top_05_17_2025_metaT',
    'n_top_2025_metat': 'SR-VP_Bioreactor_N_top_05_17_2025_metaT',
    'n_bottom_2025_metat': 'SR-VP_Bioreactor_N_bot_05_17_2025_metaT',
    'n_middle_2025_metat': 'SR-VP_Bioreactor_N_mid_05_17_2025_metaT',

    # 2024 bioreactor '_metaT' samples.
    'ck_bottom_2024_metat': 'SR-VP_Bioreactor_ck_bot_05_06_2024_metaT',
    'n_top_2024_metat': 'SR-VP_Bioreactor_N_top_05_06_2024_metaT',
    'n_bottom_2024_metat': 'SR-VP_Bioreactor_N_bot_05_06_2024_metaT',
    'n_middle_2024_metat': 'SR-VP_Bioreactor_N_mid_05_06_2024_metaT'}


# Year component of the sequence directory for each sample listed here. Note that it is
# not the year embedded in the sample name. Lookups in this notebook go through
# `.get(sample_name, 2025)`, so any sample missing from this table resolves to 2025.
sample_year_map = dict([
    ('SR-VP_26_10_2019_1_90cm', 2020),
    ('SR-VP_26_10_2019_1_100cm', 2020),
    ('SR-VP_26_10_2020_1_100CM', 2021),
    ('SR-VP_26_10_2020_2_100CM', 2021),
    ('SR-VP_11_27_2022_S1_80cm_MG', 2023),
    ('SR-VP_26_10_2019_C_40cm', 2020),
    ('SR-VP_11_27_2022_S1_100cm_MG', 2023),
    ('SR-VP_07_25_2022_A1_90cm_MG', 2023),
    ('SR-VP_26_10_2019_2_90cm', 2020),
    ('SR-VP_11_27_2022_S1_60cm_MG', 2023),
])

# Location of one read file; fill in with year, sample_name and paired_end ('PE.1' / 'PE.2').
input_path_template = (
    '/groups/banfield/sequences/{year}/{sample_name}/raw.d/'
    '{sample_name}_trim_clean.{paired_end}.fastq.gz'
)

In [None]:
coverm_ref_paths = ['../data/ece_26_1334.fn', '../data/methanoperedens_2.fn']
coverm_fields = 'mean trimmed_mean covered_bases variance count rpkm tpm'

# Meaning of the less obvious CoverM columns requested in coverm_fields:
# variance: Coverage variance, a statistical measure of how uneven the read depth is across a contig or genome.
# trimmed_mean: Trimmed mean coverage per contig, i.e. the mean depth after discarding the positions with the
#   lowest and highest per-base coverage. The command below passes --trim-min 5 --trim-max 95, so the bottom 5%
#   and top 5% of positions are removed. Read depth is defined per nucleotide base.
# rpkm: Reads per kilobase of contig per million mapped reads.
# tpm: TPM-normalized coverage, i.e. the RPKM of a contig divided by the sum of all RPKM values. This controls
#   for both contig length and sequencing depth.


# Write one `sbatch --wrap` line per (reference, sample) pair to the CoverM submission script.
with open('../scripts/coverm_mapping.sh', 'w') as f:
    for ref_path in coverm_ref_paths:
        target_name = os.path.basename(ref_path).replace('.fn', '')
        for name, sample_name in sample_name_map.items():
            if 'metat' in name: # Don't do this for the transcripts.
                continue
            # Samples without an entry in sample_year_map live under the 2025 directory.
            sample_year = sample_year_map.get(sample_name, 2025)
            paired_ends_1_path = input_path_template.format(paired_end='PE.1', sample_name=sample_name, year=sample_year)
            paired_ends_2_path = input_path_template.format(paired_end='PE.2', sample_name=sample_name, year=sample_year)
            output_file_name = f'{name}-{target_name}.tsv'.lower()
            output_path = f'/home/philippar/data/coverm/{output_file_name}'
            # -1 takes the forward reads and -2 the reverse reads. Previously the PE.1 file was
            # passed to both flags, so the PE.2 reads were never mapped.
            cmd = f'coverm contig -1 {paired_ends_1_path} -2 {paired_ends_2_path} -r {ref_path} --min-read-percent-identity 97 --min-read-aligned-percent 80 --trim-min 5 --trim-max 95 -m {coverm_fields} -t 20 -o {output_path}'
            f.write(f'sbatch --wrap "{cmd}"\n')

In [None]:
# One sbatch line per entry in sample_name_map (no filtering, so the '_metat' entries are
# included). Each job runs bbduk.sh on both read files, throws the reads away
# (out=/dev/null) and keeps only the stats file -- presumably to read library sizes from it.
with open('../scripts/bbduk_library_sizes.sh', 'w') as f:
    for name, sample_name in sample_name_map.items():
        # Same template for both mates; only the paired_end field differs.
        paired_ends_1_path, paired_ends_2_path = (
            input_path_template.format(
                paired_end=paired_end,
                sample_name=sample_name,
                year=sample_year_map.get(sample_name, 2025),
            )
            for paired_end in ('PE.1', 'PE.2')
        )
        output_path = f'/home/philippar/data/bbduk/{name}.txt'
        cmd = ' '.join([
            'bbduk.sh',
            f'in={paired_ends_1_path}',
            f'in2={paired_ends_2_path}',
            'out=/dev/null',
            f'stats={output_path}',
            'threads=64',
        ])
        print(f'sbatch --wrap "{cmd}"', file=f)

In [None]:
# Settings for the cells that follow (outputs go to the 'metat' directory).
# NOTE(review): this rebinds input_path_template, which an earlier cell defines with a
# {year} field, to a template fixed to the 2025 directory. Re-running an earlier cell
# after this one would pick up this version -- confirm the shadowing is intended.
input_path_template = (
    '/groups/banfield/sequences/2025/{sample_name}/raw.d/'
    '{sample_name}_trim_clean.{paired_end}.fastq.gz'
)

ece_id = 'ece_26_1334'
output_dir = '/home/philippar/data/metat/'
# Reference FASTA files, in the order they are processed.
ref_paths = [
    f'/home/philippar/data/{stem}.fn'
    for stem in ('methanoperedens_1', 'methanoperedens_2', ece_id)
]

In [None]:

def get_mapping_command(sample_path:str, ref_path:str=None, output_dir:str=output_dir):
    """Build the shell pipeline that maps one sample's paired-end reads to a reference.

    The read files are taken from `<sample_path>/raw.d/<sample>_trim_clean.PE.1.fastq.gz`
    and `...PE.2.fastq.gz`, where `<sample>` is the last component of `sample_path`. The
    command runs bbmap.sh, pipes its SAM output through `shrinksam` and `sambam`, and
    redirects the result to `<output_dir>/<label>-<reference>.bam`.

    `<label>` comes from the module-level `sample_name_map`. The sample directory name is
    first tried as a key (the original behaviour); if it is not a key it is looked up among
    the values and the matching key is used. This matters because the map defined in this
    notebook is keyed by short label with the directory name as the value.
    NOTE(review): confirm that the short label is what the BAM file should be named after.

    Args:
        sample_path: Directory of one sample; a trailing '/' is tolerated.
        ref_path: Path to the reference FASTA. Required, despite the `None` default.
        output_dir: Directory the BAM path is placed in.

    Returns:
        A `(cmd, output_path)` tuple: the shell command string and the BAM path it writes.

    Raises:
        TypeError: If `ref_path` is None.
        KeyError: If the sample is neither a key nor a value of `sample_name_map`.
    """
    # TODO: Should look into what paired-end reads are and how that works experimentally.
    if ref_path is None:
        raise TypeError('get_mapping_command() requires ref_path, got None')

    # Strip only a trailing '.fn'; str.replace would also remove '.fn' from the middle of a name.
    target_name = os.path.basename(ref_path)
    if target_name.endswith('.fn'):
        target_name = target_name[:-len('.fn')]

    # basename('/a/b/') is '', so drop trailing separators before taking the last component.
    sample_name = os.path.basename(sample_path.rstrip('/'))
    input_path_1 = os.path.join(sample_path, 'raw.d', f'{sample_name}_trim_clean.PE.1.fastq.gz')
    input_path_2 = os.path.join(sample_path, 'raw.d', f'{sample_name}_trim_clean.PE.2.fastq.gz')

    if sample_name in sample_name_map:
        label = sample_name_map[sample_name]
    else:
        # Reverse lookup: directory name -> short label.
        labels_by_sample = {full_name: short_name for short_name, full_name in sample_name_map.items()}
        if sample_name not in labels_by_sample:
            raise KeyError(f'{sample_name!r} is neither a key nor a value of sample_name_map')
        label = labels_by_sample[sample_name]
    output_path = os.path.join(output_dir, f'{label}-{target_name}.bam')

    # 'out=stdout.sam' used to appear twice in this string; once is enough.
    params = 'pigz=t unpigz=t ambiguous=random minid=0.96 idfilter=0.97 threads=64 editfilter=5 out=stdout.sam'
    cmd = f'bbmap.sh {params} in1={input_path_1} in2={input_path_2} ref={ref_path} nodisk | shrinksam | sambam > {output_path}'
    return cmd, output_path


def get_counting_command(bam_path:str, ref_path:str=None, output_dir:str=output_dir):
    """Build the featureCounts command line for one BAM file.

    The counts file is named after the BAM file, with '.bam' removed and '_read_counts'
    appended, and is placed in `output_dir`.

    Args:
        bam_path: BAM file to count reads in.
        ref_path: Value passed to `-a` (the annotation file). It is formatted into the
            command as-is, so the `None` default yields the literal text 'None'.
        output_dir: Directory for the counts file.

    Returns:
        The command string (no sbatch wrapper).
    """
    bam_stem = os.path.basename(bam_path).replace('.bam', '')
    output_path = os.path.join(output_dir, bam_stem + '_read_counts')
    template = 'featureCounts -p -T 64 -g ID -t CDS -a {annotation} -s 2 -o {counts} {bam}'
    return template.format(annotation=ref_path, counts=output_path, bam=bam_path)


def get_sbatch_command(cmd, job_name:str=None):
    """Wrap a shell command in an `sbatch --wrap` submission line.

    Args:
        cmd: Command to submit. It is placed between double quotes without any escaping.
        job_name: Used only to name the log file, `../slurm.out/<job_name>.out`. Any value
            is formatted with `str()`, so the `None` default gives 'None.out'.

    Returns:
        The full sbatch command string.
    """
    log_path = '../slurm.out/{}.out'.format(job_name)
    return 'sbatch --wrap "{}" --output {}'.format(cmd, log_path)



In [None]:
# Build two scripts: one with an sbatch line per mapping job, and one with the bare
# featureCounts command for each resulting BAM file (the counting commands are NOT wrapped
# in sbatch).
# NOTE(review): sample_paths is not defined in the cells shown alongside this one; it must
# come from an earlier cell for this to run on a fresh kernel.
i = 0  # Running job index; it names each mapping job's Slurm log file.

mapping_path, counting_path = '../scripts/metat_mapping.sh', '../scripts/metat_counting.sh'
mapping, counting = list(), list()
for ref_path in ref_paths:
    # The annotation sits next to the reference FASTA with a .gff extension. Swap only the
    # extension: str.replace('fn', 'gff') would also rewrite any other 'fn' in the path.
    annotation_path = os.path.splitext(ref_path)[0] + '.gff'
    for sample_path in sample_paths:
        mapping_command, bam_path = get_mapping_command(sample_path, ref_path=ref_path)
        counting_command = get_counting_command(bam_path, ref_path=annotation_path)
        mapping.append(get_sbatch_command(mapping_command, job_name=i))
        counting.append(counting_command)

        i += 1

# One command per line, each newline-terminated, like the other generated scripts.
with open(mapping_path, 'w') as f:
    f.writelines(f'{line}\n' for line in mapping)

with open(counting_path, 'w') as f:
    f.writelines(f'{line}\n' for line in counting)