### Colombia 012025, 032024 combined assembly

Concatenate the 012025 reads for each species. The re-sequenced 032024 samples (032024_COL_SAN_T5_140_OFAV_S30 and 032024_COL_SAN_T5_163_PSTR_S31) were removed from the 012025 sample lists; they still need to be concatenated with their original 032024 samples.

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 5:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-cat-%j.out  # %j = job ID

# Purpose: build one forward (R1) and one reverse (R2) gzip FASTQ per species
# by concatenating the per-sample 012025 read files listed in SAMPLELIST.
# Concatenated gzip streams are themselves a valid gzip file, so plain `cat`
# is sufficient.
# Edit SAMPLENAME to run for another species.

module load conda/latest
conda activate assembly

SAMPLENAME="ofav"
SAMPLELIST="012025_${SAMPLENAME}_sampleids.txt"
READSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_final_filtered/repaired"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
WORKDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/concat_reads_ready"

OUT_R1="$WORKDIR/012025_${SAMPLENAME}_reads_R1.fastq.gz"
OUT_R2="$WORKDIR/012025_${SAMPLENAME}_reads_R2.fastq.gz"

if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Sample list not readable: $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi

mkdir -p "$WORKDIR" || exit 1

# Start from empty outputs: the loop appends, so leftovers from an earlier
# run would otherwise duplicate every read in the combined files.
: > "$OUT_R1" || exit 1
: > "$OUT_R2" || exit 1

# CONCATENATE all f and r seqs into single file (1 for f, 1 for r)
skipped=0
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # Tolerate Windows line endings and blank lines in the sample list
    SAMPLEID=${SAMPLEID%$'\r'}
    [[ -n "$SAMPLEID" ]] || continue

    # Construct the file paths for forward and reverse reads
    FORWARD_READ="$READSPATH/${SAMPLEID}_host_removed_R1.tagged_filter_ready.fastq.gz"
    REVERSE_READ="$READSPATH/${SAMPLEID}_host_removed_R2.tagged_filter_ready.fastq.gz"

    # Both mates must exist. Appending only one of them would leave the
    # R1 and R2 files with different read counts/order, which breaks
    # paired-end assembly downstream.
    pair_ok=1
    if [[ ! -e "$FORWARD_READ" ]]; then
        echo "Forward read file not found for sample $SAMPLEID" >&2
        pair_ok=0
    fi
    if [[ ! -e "$REVERSE_READ" ]]; then
        echo "Reverse read file not found for sample $SAMPLEID" >&2
        pair_ok=0
    fi
    if (( pair_ok == 0 )); then
        skipped=$((skipped + 1))
        continue
    fi

    cat "$FORWARD_READ" >> "$OUT_R1" || { echo "Failed to append $FORWARD_READ" >&2; exit 1; }
    cat "$REVERSE_READ" >> "$OUT_R2" || { echo "Failed to append $REVERSE_READ" >&2; exit 1; }
done < "$LISTPATH/${SAMPLELIST}"

conda deactivate

if (( skipped > 0 )); then
    echo "WARNING: $skipped sample(s) skipped because a read file was missing" >&2
fi
echo "Concatenation completed!"

# JOB-ID:50346545
# bash script file name: nikea/COL/bash_scripts/Col_concat.sh

IDs for the other species \
pstr: 50347739 \
dlab: 50347756 \
mcav: 50347784

concatenated the re-sequenced samples to original ones and re-concatenated the 032024 to include them.

concatenated \
032024_COL_SAN_T5_140_OFAV_S30_host_removed_R1.tagged_filter_ready.fastq.gz and 032024_COL_SAN_T5_140_OFAV_S18_host_removed_R1.tagged_filter_ready.fastq.gz \
032024_COL_SAN_T5_140_OFAV_S30_host_removed_R2.tagged_filter_ready.fastq.gz and 032024_COL_SAN_T5_140_OFAV_S18_host_removed_R2.tagged_filter_ready.fastq.gz

032024_COL_SAN_T5_163_PSTR_S31_host_removed_R1.tagged_filter_ready.fastq.gz and 032024_COL_SAN_T5_163_PSTR_S11_host_removed_R1.tagged_filter_ready.fastq.gz \
032024_COL_SAN_T5_163_PSTR_S31_host_removed_R2.tagged_filter_ready.fastq.gz and 032024_COL_SAN_T5_163_PSTR_S11_host_removed_R2.tagged_filter_ready.fastq.gz

new files in /032024_resequenced \
032024_COL_SAN_T5_140_OFAV_S18c R1 and R2 \
032024_COL_SAN_T5_163_PSTR_S11c R1 and R2 \
**moved these samples back into species folders with the other 032024 samples, and reconcatenated**

concatenated all samples for each species

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/pstr/slurm-cat-%j.out  # %j = job ID

# Purpose: merge the per-run concatenated reads (032024 + 012025) of one
# species into a single R1 and a single R2 file for co-assembly.
# Edit SAMPLENAME to run for another species.
# NOTE(review): the SLURM log path above points at the pstr directory even
# though SAMPLENAME is dlab -- confirm whether that is intended.

SAMPLENAME="dlab"
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/concat_reads_ready"
OUTDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/all_sequences_concat"

mkdir -p "$OUTDIR" || exit 1
cd "$READSPATH" || { echo "Cannot cd to $READSPATH" >&2; exit 1; }

for MATE in R1 R2; do
    IN_032024="./032024_${SAMPLENAME}_reads_${MATE}.fastq.gz"
    IN_012025="./012025_${SAMPLENAME}_reads_${MATE}.fastq.gz"
    OUT="$OUTDIR/${SAMPLENAME}_all_reads_${MATE}.fastq.gz"

    # Refuse to write a partial combined file if either input is absent/empty
    for INPUT in "$IN_032024" "$IN_012025"; do
        if [[ ! -s "$INPUT" ]]; then
            echo "Input file missing or empty: $READSPATH/${INPUT#./}" >&2
            exit 1
        fi
    done

    # Overwrite (>) rather than append (>>): re-running the job must not
    # stack a second copy of the reads onto an existing output file.
    # Both mates are written in the same run order (032024 then 012025) so
    # the R1 and R2 files stay in sync.
    cat "$IN_032024" "$IN_012025" > "$OUT" \
        || { echo "Failed to write $OUT" >&2; exit 1; }
done


# JOB-ID: 50352583
# bash script file name: nikea/COL/bash_scripts/Col_concat_all.sh

ran for the others \
mcav: 50353368 \
ofav: 50353621 \
pstr: 50354758

### Megahit assembly all data (both 032024 and 012025)

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH --qos=long # extend time limit to longer than 2 days
#SBATCH -t 72:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-assembly-ofav-%j.out  # %j = job ID

# Purpose: co-assemble all reads of one species (032024 + 012025) with MEGAHIT.
# Edit SAMPLENAME (and the log name above) to run for another species.

module load conda/latest
conda activate assembly

SAMPLENAME="ofav"
WORKDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/all_sequences_concat"
OUTDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/assemblies_all"
ASSEMBLY_DIR="$OUTDIR/megahit_assembly_${SAMPLENAME}"
READS_R1="$WORKDIR/${SAMPLENAME}_all_reads_R1.fastq.gz"
READS_R2="$WORKDIR/${SAMPLENAME}_all_reads_R2.fastq.gz"

# Size MEGAHIT to the SLURM allocation instead of the whole node. Without
# -t/-m it uses every CPU it can see and 90% of the machine's total memory,
# which can exceed what the job was granted.
THREADS="${SLURM_CPUS_PER_TASK:-24}"
# SLURM_MEM_PER_NODE is in MB; give MEGAHIT 90% of it, in bytes.
# The fallback (184320 MB) matches the --mem=180G request above.
MEM_BYTES=$(( ${SLURM_MEM_PER_NODE:-184320} * 1024 * 1024 / 10 * 9 ))

mkdir -p "$OUTDIR" || exit 1

for READS in "$READS_R1" "$READS_R2"; do
    if [[ ! -s "$READS" ]]; then
        echo "Input reads missing or empty: $READS" >&2
        exit 1
    fi
done

# MEGAHIT has to create its own output directory and fails if it already
# exists, so check up front and give a clear message.
if [[ -e "$ASSEMBLY_DIR" ]]; then
    echo "Output directory already exists: $ASSEMBLY_DIR (remove it or choose another name)" >&2
    exit 1
fi

# ASSEMBLE reads into contigs (contiguous sequences: reads are joined together
# based on overlap, with no gaps)
megahit --presets meta-large \
    -1 "$READS_R1" \
    -2 "$READS_R2" \
    -t "$THREADS" \
    -m "$MEM_BYTES" \
    -o "$ASSEMBLY_DIR" --out-prefix "${SAMPLENAME}_all"
status=$?

conda deactivate

if (( status != 0 )); then
    echo "megahit failed with exit status $status" >&2
    exit "$status"
fi
echo "Assembly completed!"

# JOB-ID: 50356739
# bash script file name: nikea/COL/bash_scripts/Col_assemble_ofav.sh

didn't add "#SBATCH --qos=long" to ofav script, but I might need it, so adding it for the others

pstr: 50366433 \
dlab: 50368001 \
mcav: 50367881

I got them running before I realized I didn't add quast into the script ugh. Is it worth it to run metaquast on a co-assembly? 

## Mapping

Mapping reads to the co-assemblies that include all data. Separate scripts are used for mapping the 012025 reads and the 032024 reads because the sequence files are in different places.

#### 012025 first

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH --qos=long # extend time limit to longer than 2 days
#SBATCH -t 72:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-mapping012025_ofav-%j.out  # %j = job ID

# Purpose: reformat the co-assembly contigs, build a bowtie2 index from them,
# then map each 012025 sample's reads to that index and produce a sorted,
# indexed BAM per sample.
# Edit SAMPLENAME (and the log name above) to run for another species.

module load conda/latest
conda activate anvio-8

SAMPLENAME="ofav"
READSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_final_filtered/repaired"
CONTIGPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/assemblies_all/megahit_assembly_${SAMPLENAME}"
CONTIGFILE="${SAMPLENAME}_all.contigs.fa"
WORKPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/012025"
XTRAFILES="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/012025/sams"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
SAMPLELIST="012025_${SAMPLENAME}_sampleids.txt"
FIXEDCON="${SAMPLENAME}.contigs-fixed.fsa"
# Use the cores SLURM granted; fall back to 11 when run outside SLURM
THREADS="${SLURM_CPUS_PER_TASK:-11}"

die() { echo "ERROR: $*" >&2; exit 1; }

mkdir -p "$WORKPATH" "$XTRAFILES" || die "cannot create $WORKPATH or $XTRAFILES"
[[ -s "$CONTIGPATH/$CONTIGFILE" ]] || die "contigs not found: $CONTIGPATH/$CONTIGFILE"
[[ -r "$LISTPATH/${SAMPLELIST}" ]] || die "sample list not readable: $LISTPATH/${SAMPLELIST}"

# Fix deflines (filters contigs and reformats so naming is cleaner).
# Filtering at a sequence length of 1000 bp... need to play around with
# filtering based on bp length.
# defline = sequence definition line; it comes directly before its associated
# sequence in a fasta file.
anvi-script-reformat-fasta "$CONTIGPATH/$CONTIGFILE" \
    -o "$WORKPATH/$FIXEDCON" \
    -l 1000 --simplify-names \
    --report-file "$WORKPATH/${SAMPLENAME}_contig-rename-report.txt" \
    || die "anvi-script-reformat-fasta failed"

# The index is built with relative names, so run from WORKPATH
cd "$WORKPATH" || die "cannot cd to $WORKPATH"

# Build an index of the contigs; this only needs to happen once
bowtie2-build --threads "$THREADS" "$FIXEDCON" "${SAMPLENAME}_contigs" \
    || die "bowtie2-build failed"

failed=0
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # Tolerate Windows line endings and blank lines in the sample list
    SAMPLEID=${SAMPLEID%$'\r'}
    [[ -n "$SAMPLEID" ]] || continue

    R1="$READSPATH/${SAMPLEID}_host_removed_R1.tagged_filter_ready.fastq.gz"
    R2="$READSPATH/${SAMPLEID}_host_removed_R2.tagged_filter_ready.fastq.gz"
    SAM="$XTRAFILES/${SAMPLEID}.sam"
    RAW_BAM="$WORKPATH/${SAMPLEID}-RAW.bam"

    # Align reads to the contigs and collect the alignments in a .sam file.
    # -x must point to the index prefix, not the FIXEDCON fasta.
    if ! bowtie2 --threads "$THREADS" -x "${SAMPLENAME}_contigs" -1 "$R1" -2 "$R2" -S "$SAM"; then
        echo "bowtie2 failed for sample $SAMPLEID" >&2
        failed=$((failed + 1))
        continue
    fi

    # Convert the sam file to a bam file, dropping unmapped reads (-F 4).
    # The result is neither sorted nor indexed; anvi-init-bam handles that.
    if ! samtools view -F 4 -b -S "$SAM" -o "$RAW_BAM"; then
        echo "samtools view failed for sample $SAMPLEID" >&2
        failed=$((failed + 1))
        continue
    fi

    # Sort and index the bam file; keep the raw BAM if this step fails
    if ! anvi-init-bam "$RAW_BAM" -o "$WORKPATH/${SAMPLEID}.bam"; then
        echo "anvi-init-bam failed for sample $SAMPLEID" >&2
        failed=$((failed + 1))
        continue
    fi

    rm -- "$RAW_BAM"
done < "$LISTPATH/${SAMPLELIST}"

if (( failed > 0 )); then
    echo "Mapping finished with $failed failed sample(s)" >&2
    exit 1
fi
echo "Mapping success!"

#JOB ID: 50420392
#bash script: nikea/COL/bash_scripts/Col_mapping_012025_ofav.sh

job IDs \
mcav: 50427381 \
dlab: 50433768 \
pstr: 50433798

#### next 032024

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH --qos=long # extend time limit to longer than 2 days
#SBATCH -t 72:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-mapping032024_ofav-%j.out  # %j = job ID

# Purpose: map each 032024 sample's reads to the co-assembly contigs and
# produce a sorted, indexed BAM per sample.
# The contigs are NOT reformatted or indexed here: this script reuses the
# bowtie2 index that the 012025 mapping job wrote to CONTIGDB, so that job
# must have finished its bowtie2-build step first.
# Edit SAMPLENAME (and the log name above) to run for another species.

module load conda/latest
conda activate anvio-8

SAMPLENAME="ofav"
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/repaired"
WORKPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/032024"
XTRAFILES="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/032024/sams"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
SAMPLELIST="032024_${SAMPLENAME}_sampleids_new.txt"
# Directory holding the bowtie2 index built by the 012025 mapping job
CONTIGDB="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/012025"
INDEX="$CONTIGDB/${SAMPLENAME}_contigs"
# Use the cores SLURM granted; fall back to 11 when run outside SLURM
THREADS="${SLURM_CPUS_PER_TASK:-11}"

die() { echo "ERROR: $*" >&2; exit 1; }

mkdir -p "$WORKPATH" "$XTRAFILES" || die "cannot create $WORKPATH or $XTRAFILES"
[[ -r "$LISTPATH/${SAMPLELIST}" ]] || die "sample list not readable: $LISTPATH/${SAMPLELIST}"

# bowtie2 writes <prefix>.1.bt2 for a small index or <prefix>.1.bt2l for a
# large one; fail early if neither is present.
if [[ ! -e "${INDEX}.1.bt2" && ! -e "${INDEX}.1.bt2l" ]]; then
    die "bowtie2 index not found at $INDEX (run the 012025 mapping script first)"
fi

cd "$WORKPATH" || die "cannot cd to $WORKPATH"

failed=0
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # Tolerate Windows line endings and blank lines in the sample list
    SAMPLEID=${SAMPLEID%$'\r'}
    [[ -n "$SAMPLEID" ]] || continue

    R1="$READSPATH/${SAMPLEID}_host_removed_R1.tagged_filter_ready.fastq.gz"
    R2="$READSPATH/${SAMPLEID}_host_removed_R2.tagged_filter_ready.fastq.gz"
    SAM="$XTRAFILES/${SAMPLEID}.sam"
    RAW_BAM="$WORKPATH/${SAMPLEID}-RAW.bam"

    # Align reads to the contigs and collect the alignments in a .sam file.
    # -x must point to the index prefix, not the contigs fasta.
    if ! bowtie2 --threads "$THREADS" -x "$INDEX" -1 "$R1" -2 "$R2" -S "$SAM"; then
        echo "bowtie2 failed for sample $SAMPLEID" >&2
        failed=$((failed + 1))
        continue
    fi

    # Convert the sam file to a bam file, dropping unmapped reads (-F 4).
    # The result is neither sorted nor indexed; anvi-init-bam handles that.
    if ! samtools view -F 4 -b -S "$SAM" -o "$RAW_BAM"; then
        echo "samtools view failed for sample $SAMPLEID" >&2
        failed=$((failed + 1))
        continue
    fi

    # Sort and index the bam file; keep the raw BAM if this step fails
    if ! anvi-init-bam "$RAW_BAM" -o "$WORKPATH/${SAMPLEID}.bam"; then
        echo "anvi-init-bam failed for sample $SAMPLEID" >&2
        failed=$((failed + 1))
        continue
    fi

    rm -- "$RAW_BAM"
done < "$LISTPATH/${SAMPLELIST}"

if (( failed > 0 )); then
    echo "Mapping finished with $failed failed sample(s)" >&2
    exit 1
fi
echo "Mapping success!"

#JOB ID: 50427641
#bash script: nikea/COL/bash_scripts/Col_032024_mapping_ofav.sh

job IDs \
mcav: 50434140\
dlab: 50434147\
pstr: 50434158 (used new sample id table here too)

#### blasting contigs to probiotics database (custom db I made with the probiotic genomes)

https://www.ncbi.nlm.nih.gov/books/NBK279684/ good reference for local blast flags

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=10G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 3:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-probioticsblast-%j.out  # %j = job ID

# Purpose: BLAST the reformatted co-assembly contigs (query) against a
# nucleotide database built from the probiotic genomes, writing a tabular
# hit report.
# Edit SAMPLENAME to run for another species.

module load conda/latest
conda activate /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/.conda/envs/quast #this env has blast

#set parameters
SAMPLENAME="ofav"
PROBPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/cordap_isolates/filtered_assembly/renamed_headers/probiotics_db"
PROBFASTA="probiotics7062_1_to_10.fasta"
CONTIGPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/012025"
CONTIGS="${SAMPLENAME}.contigs-fixed.fsa"
OUTDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/probiotics_blast"
# Use the cores SLURM granted; fall back to 4 when run outside SLURM
THREADS="${SLURM_CPUS_PER_TASK:-4}"

die() { echo "ERROR: $*" >&2; exit 1; }

mkdir -p "$OUTDIR" || die "cannot create $OUTDIR"
[[ -s "$PROBPATH/$PROBFASTA" ]] || die "probiotics fasta not found: $PROBPATH/$PROBFASTA"
[[ -s "$CONTIGPATH/$CONTIGS" ]] || die "contigs not found: $CONTIGPATH/$CONTIGS"

# Open question: is it better to build the database from the contigs and
# BLAST the probiotics fasta against it, or (as done here) the other way
# around?
makeblastdb -in "$PROBPATH/$PROBFASTA" -parse_seqids -dbtype nucl \
    || die "makeblastdb failed"

# Tabular output (format 6) with a custom column set
blastn -db "$PROBPATH/$PROBFASTA" -query "$CONTIGPATH/$CONTIGS" \
    -num_threads "$THREADS" \
    -out "$OUTDIR/${SAMPLENAME}_prob_blast_out.txt" \
    -outfmt "6 qseqid sseqid length qstart qend evalue bitscore"
status=$?

conda deactivate

if (( status != 0 )); then
    die "blastn failed with exit status $status"
fi

#JOB ID: 50432106
#bash script: probiotics_blast_ofav.sh

Do the same for the other species, but first decide whether this is helpful or whether I need a different approach.