# Assembly COL_032024 seq data

Reads have been run through QC and trimming (Col_qc.sh). Continuing with the trimmed paired-end files SAMPLEID_R1_001_val_1.fq.gz and SAMPLEID_R2_001_val_2.fq.gz. Sequence data is in /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed, separated by coral species.

In [None]:
#INSTALLATION envs
# One-time setup of the two conda environments used in this notebook:
#   assembly - MEGAHIT (assembler) and QUAST (assembly QC)
#   anvio-8  - anvi'o v8 plus the command-line tools it drives
# Every conda command passes -y so the setup does not stop at a confirmation
# prompt, and lists conda-forge ahead of bioconda because Bioconda packages
# take their dependencies from conda-forge.
module load conda/latest
conda create -y -n assembly
conda activate assembly
conda install -y -c conda-forge -c bioconda megahit
# NOTE(review): the python=2.7 pin restricts which QUAST builds the solver can
# pick - confirm it is still required.
conda install -y -c conda-forge -c bioconda quast python=2.7

module load conda/latest
conda create -y --name anvio-8 python=3.10
conda activate anvio-8
conda install -y -c conda-forge -c bioconda python=3.10 \
        sqlite prodigal idba mcl muscle=3.8.1551 famsa hmmer diamond \
        blast megahit spades bowtie2 bwa graphviz "samtools>=1.9" \
        trimal iqtree trnascan-se fasttree vmatch r-base r-tidyverse \
        r-optparse r-stringi r-magrittr bioconductor-qvalue meme ghostscript \
        nodejs
# -f makes curl fail on an HTTP error instead of saving the error page as the
# tarball, which would otherwise only surface as a confusing pip failure.
curl -fL https://github.com/merenlab/anvio/releases/download/v8/anvio-8.tar.gz \
        --output anvio-8.tar.gz
pip install anvio-8.tar.gz
# The anvi'o interactive server did not start after installation; continuing
# anyway, since the package itself appears to be installed.

Documentation for anvi'o https://anvio.org/install/linux/stable/

Developing from Brooke's scripts, with these steps:

1)remove host from sample reads \
2)remove symbiont and human seqs using fastq screen \
3)concatenate all f and r seqs into single file (1 for f, 1 for r) \
4)ASSEMBLE reads into contigs (contiguous sequences: overlapping reads are joined into longer, gap-free sequences, so that larger portions of a genome, if not all of it, end up together in one sequence)

to run the script below: sbatch Col_host_removal.sh

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/MCAV/slurm-removal-%j.out  # %j = job ID
#
# Col_host_removal.sh - step 1 of the assembly workflow: remove host reads.
#
# For every sample ID listed in $LISTPATH/$SAMPLELIST, the trimmed read pair
# is aligned to the host genome index with bowtie2, and only pairs in which
# BOTH mates failed to align are written back out as gzipped FASTQ files in
# $READSPATH. Intermediate SAM/BAM files are kept in $EXTRAFILESPATH.
#
# Usage: sbatch Col_host_removal.sh
# Exit status: 0 if every sample succeeded; 1 on the first failing step.
#
# NOTE(review): the log directory in the -o directive above is spelled "MCAV"
# while RAWREADSPATH below uses "mcav"; confirm that directory exists with
# that exact case.

module load conda/latest
conda activate anvio-8

# 1)remove host from sample reads
#set general parameters:
SAMPLENAME="mcav"
SAMPLELIST="032024_mcav_sampleids.txt"
RAWREADSPATH='/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/mcav'
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/host_removed"
EXTRAFILESPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/host_removal/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
#set step parameters
GENOME="Mcav"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"
# Give bowtie2 every core SLURM allocated to the task; fall back to the
# previous fixed value of 8 when the script is run outside SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-8}"

mkdir -p "$READSPATH" || exit 1
mkdir -p "$EXTRAFILESPATH" || exit 1

#build a bowtie2 index from a known genome
# NOTE(review): the command below was written for a run that used the Ofav
# assembly (pstr is more closely related to ofav than mcav; that index had only
# been built for hisat, so it needed rebuilding for bowtie2). GENOME is now
# "Mcav" - confirm that $INPUTPATH/$INDEX exists and was built from the
# intended reference before submitting.
#bowtie2-build $INPUTPATH/GCF_002042975.1_ofav_dov_v1_genomic.fna $INPUTPATH/"$INDEX"

#######################################
# Run the host-removal pipeline for a single sample.
# Globals:   THREADS, INPUTPATH, INDEX, RAWREADSPATH, EXTRAFILESPATH,
#            READSPATH (all read)
# Arguments: $1 - sample ID (prefix of the trimmed FASTQ file names)
# Outputs:   intermediate SAM/BAM files in EXTRAFILESPATH and
#            <sample>_host_removed_R1/R2.fastq.gz in READSPATH
# Returns:   0 on success, 1 as soon as any step fails, so that a broken
#            intermediate file is never passed on to the next step
#######################################
remove_host_reads() {
  local sample_id=$1
  local sam="$EXTRAFILESPATH/${sample_id}_mapped_and_unmapped.sam"
  local bam="$EXTRAFILESPATH/${sample_id}_mapped_and_unmapped.bam"
  local unmapped="$EXTRAFILESPATH/${sample_id}_bothReadsUnmapped.bam"
  local sorted="$EXTRAFILESPATH/${sample_id}_bothReadsUnmapped_sorted.bam"

  #re-align reads back to the index
  bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
    -1 "$RAWREADSPATH/${sample_id}_R1_001_val_1.fq.gz" \
    -2 "$RAWREADSPATH/${sample_id}_R2_001_val_2.fq.gz" \
    -S "$sam" || return 1

  #convert sam file from bowtie to a bam file for processing
  samtools view -bS "$sam" > "$bam" || return 1

  #extract only the reads of which both do not match against the host genome
  # -f 12 keeps pairs where the read and its mate are both unmapped;
  # -F 256 drops secondary alignments.
  samtools view -b -f 12 -F 256 "$bam" > "$unmapped" || return 1

  # sorts the file so both mates are together and then extracts them back as .fastq files
  samtools sort -n -m 5G -@ 2 "$unmapped" -o "$sorted" || return 1
  samtools fastq -c 6 -@ 8 "$sorted" \
    -1 "$READSPATH/${sample_id}_host_removed_R1.fastq.gz" \
    -2 "$READSPATH/${sample_id}_host_removed_R2.fastq.gz" \
    -0 /dev/null -s /dev/null -n || return 1
}

# Fail early if the sample list is missing: otherwise the loop would simply
# not run and the script would still report success.
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
  echo "sample list not found or not readable: $LISTPATH/${SAMPLELIST}" >&2
  exit 1
fi

#loop through samples
# The list is read on file descriptor 3 so that nothing inside the loop can
# consume it through stdin; the "|| [[ -n ... ]]" keeps the last ID when the
# file has no trailing newline.
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
  # Skip blank lines rather than running the tools on an empty sample ID.
  [[ -n "$SAMPLEID" ]] || continue

  if remove_host_reads "$SAMPLEID"; then
    echo "host removal completed successfully for sample: $SAMPLEID"
  else
    echo "host removal encountered an error for sample: $SAMPLEID" >&2
    exit 1
  fi
done 3< "$LISTPATH/${SAMPLELIST}"
conda deactivate
echo "Host removal: All samples processed successfully."

# JOB-ID:26516717
# bash script file name: nikea/COL/Col_host_removal.sh