### Colombia 012025 sample processing

In [None]:
# Allocate a scratch workspace named "col_data" for this dataset.
# NOTE(review): the trailing 30 is presumably the workspace duration in days — confirm with `ws_allocate -h`.
ws_allocate col_data 30
# Path of the allocated workspace (used as the output root by the scripts below):
# /scratch4/workspace/nikea_ulrich_uml_edu-col_data

Ran fastqc/multiqc on raw reads \
**Reads with high adaptor content**
- 012025_COL_SAN_T5_569_DLAB_S2 R1 & R2
- 012025_COL_SAN_T5_571_DLAB_S4 R1 & R2
- 012025_COL_SAN_T5_573_PSTR_S6 R1 & R2
- 012025_COL_SAN_T5_582_PSTR_S15 R1 & R2
- 012025_COL_SAN_T5_586_DLAB_S19 R1 & R2
- 012025_COL_SAN_T5_590_PSTR_S23 R1 & R2

### QC - trim galore

In [None]:
# Make the sample ID list: one ID per line, taken from the R1 file names in the
# current directory with the trailing "_R1_001.fastq.gz" stripped automatically
# (no manual editing of the list needed afterwards).
ls *R1_001.fastq.gz | sed 's/_R1_001\.fastq\.gz$//' > 012025_sampleids.txt

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=100G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 48:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-012025qc-%j.out  # %j = job ID

# Adapter/quality trimming of the paired-end raw reads with Trim Galore,
# followed by FastQC on the trimmed output (--fastqc).
# Input : $FILEPATH/<sample>_R{1,2}_001.fastq.gz for every ID in $SAMPLE_NAMES_FILE
# Output: trimmed reads and FastQC reports in $OUTDIR
# Exit  : 0 if every sample was trimmed, 1 if any sample was skipped or failed.

module load conda/latest

# Run qc with trim galore and fastqc
conda activate qc

# Define the paths and variables
FILEPATH="/scratch3/workspace/nikea_ulrich_uml_edu-data_download/012025_COL_SAN_T5_raw"
OUTDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/trimmed"
mkdir -p "$OUTDIR"
# Value passed to trim_galore -j.
# NOTE(review): the Trim Galore guide states that -j 4 uses considerably more
# than 4 cores for paired, gzipped data, which is presumably why 24 cores are
# requested above — confirm against the user guide.
NSLOTS=4

SAMPLE_NAMES_FILE="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/012025_sampleids.txt"

# Check if the file exists
if [ ! -e "$SAMPLE_NAMES_FILE" ]; then
    echo "Error: $SAMPLE_NAMES_FILE does not exist."
    exit 1
fi

# Tracks whether any sample was skipped or failed so the job's exit code reflects it.
failed=0

# Read each line from the file and perform actions.
# `|| [ -n "$sample_id" ]` keeps the last sample when the list has no trailing
# newline (read returns non-zero at EOF even though it filled the variable).
while IFS= read -r sample_id || [ -n "$sample_id" ]; do
    # Ignore blank lines in the sample list
    [ -z "$sample_id" ] && continue

    # Form the full file names
    input_r1="$FILEPATH/${sample_id}_R1_001.fastq.gz"
    input_r2="$FILEPATH/${sample_id}_R2_001.fastq.gz"

    # Ensure the input files exist before running the tools
    if [ ! -e "$input_r1" ] || [ ! -e "$input_r2" ]; then
        echo "Error: Input files do not exist for sample $sample_id"
        failed=1
        continue
    fi

    # Run trim_galore; report a failure but keep going with the remaining samples
    if ! trim_galore -j "$NSLOTS" -q 20 --phred33 --length 20 --paired "$input_r1" "$input_r2" --fastqc -o "$OUTDIR"; then
        echo "Error: trim_galore failed for sample $sample_id"
        failed=1
    fi

done < "$SAMPLE_NAMES_FILE"

# Non-zero exit makes SLURM mark the job as FAILED when any sample had a problem
exit "$failed"

# JOB-ID: 49466490
# bash script: COL_qc.sh

In [None]:
# run multiqc
# Loads conda, activates the "multiqc" environment, and runs MultiQC on the
# current directory (it scans "." for reports to aggregate).
# NOTE(review): assumes this is run from the directory holding the FastQC
# output of the trimming step — confirm the working directory before running.
module load conda/latest
conda activate multiqc
multiqc .

All reads failed the FastQC "Per base sequence content" check, but see https://sequencing.qcfail.com/articles/positional-sequence-bias-in-random-primed-libraries/. This shouldn't be a problem: it has more to do with the library prep kit, which does random PCR, than with read quality. Proceeding to the next step.

### Host removal

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 48:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-mcav_hostremoval-%j.out  # %j = job ID

# Host-read removal for one species (set by SAMPLENAME / GENOME below).
# For every sample ID in the list: map the trimmed read pairs to the host
# reference with bowtie2, keep only pairs where BOTH mates are unmapped, and
# write them back out as paired fastq.gz files.
# Input : $RAWREADSPATH/<sample>_R1_001_val_1.fq.gz and _R2_001_val_2.fq.gz
# Output: $READSPATH/<sample>_host_removed_R{1,2}.fastq.gz
#         (intermediate SAM/BAM files are kept in $EXTRAFILESPATH)
# Exit  : stops with status 1 at the first sample for which any step fails.

module load conda/latest
conda activate anvio-8

#Remove host from sample reads
#set general parameters:
SAMPLENAME="mcav"
SAMPLELIST="012025_${SAMPLENAME}_sampleids.txt"
#manually created the mcav_sampleids file from the whole sample list
RAWREADSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/trimmed"
READSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_host_removed/${SAMPLENAME}"
mkdir -p "$READSPATH"
EXTRAFILESPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_host_removed/${SAMPLENAME}/bams"
mkdir -p "$EXTRAFILESPATH"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL"
#set step parameters
GENOME="Mcav"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"

# Fail early if the sample list is missing; otherwise the loop below would be
# skipped and the final success message would still be printed.
if [ ! -e "$LISTPATH/${SAMPLELIST}" ]; then
    echo "Error: $LISTPATH/${SAMPLELIST} does not exist."
    exit 1
fi

#loop through samples
# `|| [ -n "$SAMPLEID" ]` keeps the last sample when the (manually created)
# list has no trailing newline; without it that sample is silently skipped.
while IFS= read -r SAMPLEID || [ -n "$SAMPLEID" ]; do
    # Ignore blank lines in the sample list
    [ -z "$SAMPLEID" ] && continue

    # The steps are chained with && so that a failure in ANY of them (not just
    # the final samtools fastq) is reported and stops the job:
    #   1. re-align reads back to the index
    #   2. convert sam file from bowtie to a bam file for processing
    #   3. extract only the reads of which both do not match against the host
    #      genome (-f 12: read and mate unmapped; -F 256: drop secondary alignments)
    #   4. sort by read name so both mates are together
    #   5. extract them back as .fastq.gz files
    if bowtie2 -p 8 -x "$INPUTPATH/$INDEX" \
            -1 "$RAWREADSPATH/${SAMPLEID}_R1_001_val_1.fq.gz" \
            -2 "$RAWREADSPATH/${SAMPLEID}_R2_001_val_2.fq.gz" \
            -S "$EXTRAFILESPATH/${SAMPLEID}_mapped_and_unmapped.sam" \
        && samtools view -bS "$EXTRAFILESPATH/${SAMPLEID}_mapped_and_unmapped.sam" \
            > "$EXTRAFILESPATH/${SAMPLEID}_mapped_and_unmapped.bam" \
        && samtools view -b -f 12 -F 256 "$EXTRAFILESPATH/${SAMPLEID}_mapped_and_unmapped.bam" \
            > "$EXTRAFILESPATH/${SAMPLEID}_bothReadsUnmapped.bam" \
        && samtools sort -n -m 5G -@ 2 "$EXTRAFILESPATH/${SAMPLEID}_bothReadsUnmapped.bam" \
            -o "$EXTRAFILESPATH/${SAMPLEID}_bothReadsUnmapped_sorted.bam" \
        && samtools fastq -c 6 -@ 8 "$EXTRAFILESPATH/${SAMPLEID}_bothReadsUnmapped_sorted.bam" \
            -1 "$READSPATH/${SAMPLEID}_host_removed_R1.fastq.gz" \
            -2 "$READSPATH/${SAMPLEID}_host_removed_R2.fastq.gz" \
            -0 /dev/null -s /dev/null -n
    then
        echo "host removal completed successfully for sample: $SAMPLEID"
    else
        echo "host removal encountered an error for sample: $SAMPLEID"
        exit 1
    fi
done < "$LISTPATH/${SAMPLELIST}"
conda deactivate
echo "Host removal: All samples processed successfully."

# JOB-ID: 49505851
# bash script file name: Col_host_removal_mcav.sh

ran the other species. pstr used Cnat reference, all others used reference genomes of the same species \
ofav: 49505852 \
pstr: 49505853 \
dlab: 49505956 (032024 samples were mapped to Cnat, so need to re-process those by mapping to the new draft ref DLAB)


### Symbiont screening + removal

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 48:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-mcav-symbremoval-%j.out  # %j = job ID

# Screens the host-removed reads of one species (set by SAMPLENAME) with
# fastq_screen against the genomes configured in fastq_screen.conf and keeps
# only reads with no hit to any of them (--nohits).
# Input : $READSPATH/<sample>_host_removed_R{1,2}.fastq.gz
# Output: fastq_screen results written to $OUTDIR
# Exit  : stops with status 1 at the first sample for which fastq_screen fails.

module load conda/latest
conda activate assembly

#Remove symbiont and human seqs using fastq screen
SAMPLENAME="mcav"
SAMPLELIST="012025_${SAMPLENAME}_sampleids.txt"
READSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_host_removed/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL"
FASTQSCREEN='/home/nikea_ulrich_uml_edu/.conda/envs/assembly/share/fastq-screen-0.16.0-0'
OUTDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_final_filtered"
mkdir -p "$OUTDIR"

# Fail early if the sample list is missing; otherwise the loop below would be
# skipped and the final success message would still be printed.
if [ ! -e "$LISTPATH/${SAMPLELIST}" ]; then
    echo "Error: $LISTPATH/${SAMPLELIST} does not exist."
    exit 1
fi

# `|| [ -n "$SAMPLEID" ]` keeps the last sample when the list has no trailing
# newline; without it that sample is silently skipped.
while IFS= read -r SAMPLEID || [ -n "$SAMPLEID" ]; do
    # Ignore blank lines in the sample list
    [ -z "$SAMPLEID" ] && continue

    # --nohits = output reads do not map to any genomes
    if "$FASTQSCREEN/fastq_screen" --nohits --aligner bowtie2 \
            --conf "$FASTQSCREEN/fastq_screen.conf" --outdir "$OUTDIR" \
            "$READSPATH/${SAMPLEID}_host_removed_R1.fastq.gz" \
            "$READSPATH/${SAMPLEID}_host_removed_R2.fastq.gz"
    then
        echo "fastq_screen completed successfully for sample: $SAMPLEID"
    else
        echo "fastq_screen encountered an error for sample: $SAMPLEID"
        exit 1
    fi
done < "$LISTPATH/${SAMPLELIST}"
conda deactivate
echo "Symbiont removal: All samples processed successfully."

# JOB-ID: 50003769
# bash script file name: Col_symb_removal.sh

ran the other species \
dlab: 49608039 \
pstr: 49608608 \
ofav: 49608433

for some reason host removal wasn't completed for 012025_COL_SAN_T5_595_MCAV_S28 \
so reran host removal: 50157203 \
and then symb removal: 50242334

In [None]:
# Summarise the fastq_screen reports of the final filtered reads with MultiQC.
cd /scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_final_filtered
# Environment name fixed from "mutliqc" to "multiqc", the environment used for
# the earlier MultiQC run in this notebook; the misspelled name does not exist.
conda activate multiqc
multiqc .

Before concatenating and performing co-assemblies, let's try to BLAST these filtered seqs against a database composed of the probiotic genomes to get an idea of how abundant they are in this MG data. BLAST can't take fastq files directly, but I could maybe map with bowtie instead. Actually, it might make more sense to assemble first and then BLAST against that assembly.

In [None]:
# Prepend "7062_015_" to every FASTA header (inserted right after the leading ">")
# and write the result to a new *_renamed.fasta file; the input is left untouched.
sed -e 's/^>/>7062_015_/' \
    < 7062_015_filtered_assembly.fasta \
    > 7062_015_filtered_assembly_renamed.fasta

#### repair reads

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-repair012025-%j.out  # %j = job ID

# Re-pairs the filtered R1/R2 files of every sample with BBTools repair.sh.
# Input : $READSPATH/<sample>_host_removed_R{1,2}.tagged_filter.fastq.gz
# Output: $OUTDIR/<sample>_host_removed_R{1,2}.tagged_filter_ready.fastq.gz
#         plus reads that lost their mate in $OUTDIR/<sample>singletons.fq
# Exit  : stops with status 1 at the first sample for which repair.sh fails.

module load conda/latest
conda activate assembly

SAMPLELIST="012025_sampleids.txt"
READSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_final_filtered"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
OUTDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/012025_final_filtered/repaired"
mkdir -p "$OUTDIR"

# Fail early if the sample list is missing; otherwise the loop below would be
# skipped and the final success message would still be printed.
if [ ! -e "$LISTPATH/${SAMPLELIST}" ]; then
    echo "Error: $LISTPATH/${SAMPLELIST} does not exist."
    exit 1
fi

#Lets try this! Using repair.sh script from:https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/
# `|| [ -n "$SAMPLEID" ]` keeps the last sample when the list has no trailing
# newline; without it that sample is silently skipped.
while IFS= read -r SAMPLEID || [ -n "$SAMPLEID" ]; do
    # Ignore blank lines in the sample list
    [ -z "$SAMPLEID" ] && continue

    # NOTE(review): the singleton file name has no separator between the sample
    # ID and "singletons.fq"; kept as-is so existing outputs keep their names.
    if repair.sh \
            in1="$READSPATH/${SAMPLEID}_host_removed_R1.tagged_filter.fastq.gz" \
            in2="$READSPATH/${SAMPLEID}_host_removed_R2.tagged_filter.fastq.gz" \
            out1="$OUTDIR/${SAMPLEID}_host_removed_R1.tagged_filter_ready.fastq.gz" \
            out2="$OUTDIR/${SAMPLEID}_host_removed_R2.tagged_filter_ready.fastq.gz" \
            outs="$OUTDIR/${SAMPLEID}singletons.fq" repair
    then
        echo "repair completed successfully for sample: $SAMPLEID"
    else
        echo "repair encountered an error for sample: $SAMPLEID"
        exit 1
    fi
done < "$LISTPATH/${SAMPLELIST}"

conda deactivate
echo "Repair: All samples processed successfully."

# JOB-ID: 50277995
# bash script file name: nikea/COL/bash_scripts/bbtools_repair.sh