# Assembly COL_032024 seq data

Reads have been run through QC and trimming (Col_qc.sh). Continuing on with the SAMPLEID_R1_001_val_1.fq.gz / SAMPLEID_R2_001_val_2.fq.gz files. Seq data is in /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed, separated into one subdirectory per coral species.

In [None]:
#INSTALLATION envs
# Two conda environments are created here:
#   assembly - megahit (assembler) and quast
#   anvio-8  - anvi'o v8 plus the command-line tools installed alongside it
#              (the host-removal scripts activate this env for bowtie2 and samtools)

# --- "assembly" environment ---
module load conda/latest
conda create -n assembly
conda activate assembly
conda install -c bioconda megahit
# this command also pins python to 2.7 inside the assembly env
conda install -c bioconda quast python=2.7

# --- "anvio-8" environment ---
module load conda/latest
conda create -y --name anvio-8 python=3.10
conda activate anvio-8
# packages installed from conda-forge/bioconda before anvi'o itself
conda install -y -c conda-forge -c bioconda python=3.10 \
        sqlite prodigal idba mcl muscle=3.8.1551 famsa hmmer diamond \
        blast megahit spades bowtie2 bwa graphviz "samtools>=1.9" \
        trimal iqtree trnascan-se fasttree vmatch r-base r-tidyverse \
        r-optparse r-stringi r-magrittr bioconductor-qvalue meme ghostscript \
        nodejs
# download the anvi'o v8 release tarball and install it with pip into the active env
curl -L https://github.com/merenlab/anvio/releases/download/v8/anvio-8.tar.gz \
        --output anvio-8.tar.gz
pip install anvio-8.tar.gz
# The anvi'o interactive server did not start when tested; moving forward anyway,
# pretty sure the install itself is fine.

Documentation for anvi'o https://anvio.org/install/linux/stable/

Developing from Brooke's scripts, with these steps:

1)remove host from sample reads \
2)remove symbiont and human seqs using fastq screen \
3)concatenate all f and r seqs into single file (1 for f, 1 for r) \
4)ASSEMBLE reads into contigs (contiguous sequence - joins them together based on read overlap, and ensures there are no gaps - larger portions of genomes if not all are now together in one sequence)

## Host removal (step 1)

### Started with mcav files first!

In [None]:
# Submit the host-removal batch script (contents shown in the next cell) to SLURM
sbatch Col_host_removal.sh

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/MCAV/slurm-removal-%j.out  # %j = job ID

# 1) remove host from sample reads (MCAV samples)
# For every sample ID in the sample list: map the trimmed read pairs to the host
# genome with bowtie2, keep only the pairs where both mates are unmapped, and
# write those pairs back out as gzipped fastq files for the next step.
# NOTE(review): the log directory above is ".../trimmed/MCAV" while the reads
# below come from ".../trimmed/mcav" - confirm the upper-case directory exists.

module load conda/latest
conda activate anvio-8

#set general parameters:
SAMPLENAME="mcav"
SAMPLELIST="032024_mcav_sampleids.txt"
#manually created the mcav_sampleids file from the whole sample list
RAWREADSPATH='/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/mcav'
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/host_removed"
EXTRAFILESPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/host_removal/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
#set step parameters
GENOME="Mcav"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"
# use every core SLURM granted to the job; fall back to 8 outside of SLURM
THREADS="${SLURM_CPUS_PER_TASK:-8}"

if ! mkdir -p "$READSPATH" "$EXTRAFILESPATH"; then
    echo "Error: could not create $READSPATH or $EXTRAFILESPATH" >&2
    exit 1
fi
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Error: sample list $LISTPATH/${SAMPLELIST} not found or not readable" >&2
    exit 1
fi

#build a bowtie2 index from a known genome (this was already built for Mcav)
#bowtie2-build $INPUTPATH/genomic.fna $INPUTPATH/"$INDEX"

# Runs the full host-removal chain for one sample.
# Arguments: $1 - sample ID (file name prefix of the trimmed reads)
# Returns:   0 when every step succeeded, 1 as soon as one step fails, so a
#            failed alignment or conversion is reported instead of only the
#            status of the final command.
remove_host_reads() {
    local sample="$1"
    local prefix="$EXTRAFILESPATH/${sample}"

    #re-align reads back to the index
    bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
        -1 "$RAWREADSPATH/${sample}_R1_001_val_1.fq.gz" \
        -2 "$RAWREADSPATH/${sample}_R2_001_val_2.fq.gz" \
        -S "${prefix}_mapped_and_unmapped.sam" || return 1
    #convert sam file from bowtie to a bam file for processing
    samtools view -bS "${prefix}_mapped_and_unmapped.sam" \
        > "${prefix}_mapped_and_unmapped.bam" || return 1
    #extract only the pairs of which both reads do not match against the host genome
    # -f 12 = read unmapped (4) + mate unmapped (8); -F 256 = drop secondary alignments
    samtools view -b -f 12 -F 256 "${prefix}_mapped_and_unmapped.bam" \
        > "${prefix}_bothReadsUnmapped.bam" || return 1
    # sort by read name so both mates are together, then extract them back as .fastq.gz files
    samtools sort -n -m 5G -@ 2 "${prefix}_bothReadsUnmapped.bam" \
        -o "${prefix}_bothReadsUnmapped_sorted.bam" || return 1
    samtools fastq -c 6 -@ 8 "${prefix}_bothReadsUnmapped_sorted.bam" \
        -1 "$READSPATH/${sample}_host_removed_R1.fastq.gz" \
        -2 "$READSPATH/${sample}_host_removed_R2.fastq.gz" \
        -0 /dev/null -s /dev/null -n || return 1
}

#loop through samples
# - "|| [[ -n ... ]]" keeps the last ID even if the list has no newline after it
#   (plain "while read" silently drops that line)
# - the list is read on file descriptor 3 so that no command inside the loop
#   can consume sample IDs from standard input
SAMPLECOUNT=0
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # trim leading/trailing whitespace (spaces, tabs, Windows carriage returns)
    SAMPLEID="${SAMPLEID#"${SAMPLEID%%[![:space:]]*}"}"
    SAMPLEID="${SAMPLEID%"${SAMPLEID##*[![:space:]]}"}"
    # blank lines in the list are not samples
    if [[ -z "$SAMPLEID" ]]; then
        continue
    fi

    if remove_host_reads "$SAMPLEID"; then
        echo "host removal completed successfully for sample: $SAMPLEID"
        SAMPLECOUNT=$((SAMPLECOUNT + 1))
    else
        echo "host removal encountered an error for sample: $SAMPLEID" >&2
        exit 1
    fi
done 3< "$LISTPATH/${SAMPLELIST}"
conda deactivate

if (( SAMPLECOUNT == 0 )); then
    echo "Host removal: no sample IDs found in $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi
echo "Host removal: All samples processed successfully."

# JOB-ID:26516717
# bash script file name: nikea/COL/Col_host_removal.sh

Outputs:

The SAMs, sorted and unsorted BAMs are in /project.../COL_files/host_removal/mcav/ for storage reasons \
The host-removed fastq.gz files are in /work/.../COL/assembly/mcav/host_removed/ for next step

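Realized later that the last sample (032024_COL_SAN_T5_150_MCAV_S8) didn't run (the loop skipped it). Likely cause: the sample list had no newline after the last ID, so `read` hit end-of-file and the `while` loop ended before processing that line. Hopefully fixed in a later version of the script. Ran the job for that sample separately (Job ID: 26677005).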

### Ofav files next

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurm-removal-%j.out  # %j = job ID

# 1) remove host from sample reads (OFAV samples)
# For every sample ID in the sample list: map the trimmed read pairs to the host
# genome with bowtie2, keep only the pairs where both mates are unmapped, and
# write those pairs back out as gzipped fastq files for the next step.

module load conda/latest
conda activate anvio-8

#set general parameters:
SAMPLENAME="ofav"
SAMPLELIST="032024_ofav_sampleids.txt"
RAWREADSPATH='/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/ofav'
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/host_removed"
EXTRAFILESPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/host_removal/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
#set step parameters
GENOME="Ofav"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"
# use every core SLURM granted to the job; fall back to 8 outside of SLURM
THREADS="${SLURM_CPUS_PER_TASK:-8}"

if ! mkdir -p "$READSPATH" "$EXTRAFILESPATH"; then
    echo "Error: could not create $READSPATH or $EXTRAFILESPATH" >&2
    exit 1
fi
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Error: sample list $LISTPATH/${SAMPLELIST} not found or not readable" >&2
    exit 1
fi

#build a bowtie2 index from a known genome (this was already built for Ofav)
#bowtie2-build $INPUTPATH/genomic.fna $INPUTPATH/"$INDEX"

# Runs the full host-removal chain for one sample.
# Arguments: $1 - sample ID (file name prefix of the trimmed reads)
# Returns:   0 when every step succeeded, 1 as soon as one step fails, so a
#            failed alignment or conversion is reported instead of only the
#            status of the final command.
remove_host_reads() {
    local sample="$1"
    local prefix="$EXTRAFILESPATH/${sample}"

    #re-align reads back to the index
    bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
        -1 "$RAWREADSPATH/${sample}_R1_001_val_1.fq.gz" \
        -2 "$RAWREADSPATH/${sample}_R2_001_val_2.fq.gz" \
        -S "${prefix}_mapped_and_unmapped.sam" || return 1
    #convert sam file from bowtie to a bam file for processing
    samtools view -bS "${prefix}_mapped_and_unmapped.sam" \
        > "${prefix}_mapped_and_unmapped.bam" || return 1
    #extract only the pairs of which both reads do not match against the host genome
    # -f 12 = read unmapped (4) + mate unmapped (8); -F 256 = drop secondary alignments
    samtools view -b -f 12 -F 256 "${prefix}_mapped_and_unmapped.bam" \
        > "${prefix}_bothReadsUnmapped.bam" || return 1
    # sort by read name so both mates are together, then extract them back as .fastq.gz files
    samtools sort -n -m 5G -@ 2 "${prefix}_bothReadsUnmapped.bam" \
        -o "${prefix}_bothReadsUnmapped_sorted.bam" || return 1
    samtools fastq -c 6 -@ 8 "${prefix}_bothReadsUnmapped_sorted.bam" \
        -1 "$READSPATH/${sample}_host_removed_R1.fastq.gz" \
        -2 "$READSPATH/${sample}_host_removed_R2.fastq.gz" \
        -0 /dev/null -s /dev/null -n || return 1
}

#loop through samples
# - "|| [[ -n ... ]]" keeps the last ID even if the list has no newline after it
#   (plain "while read" silently drops that line)
# - the list is read on file descriptor 3 so that no command inside the loop
#   can consume sample IDs from standard input
SAMPLECOUNT=0
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # trim leading/trailing whitespace (spaces, tabs, Windows carriage returns)
    SAMPLEID="${SAMPLEID#"${SAMPLEID%%[![:space:]]*}"}"
    SAMPLEID="${SAMPLEID%"${SAMPLEID##*[![:space:]]}"}"
    # blank lines in the list are not samples
    if [[ -z "$SAMPLEID" ]]; then
        continue
    fi

    if remove_host_reads "$SAMPLEID"; then
        echo "host removal completed successfully for sample: $SAMPLEID"
        SAMPLECOUNT=$((SAMPLECOUNT + 1))
    else
        echo "host removal encountered an error for sample: $SAMPLEID" >&2
        exit 1
    fi
done 3< "$LISTPATH/${SAMPLELIST}"
conda deactivate

if (( SAMPLECOUNT == 0 )); then
    echo "Host removal: no sample IDs found in $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi
echo "Host removal: All samples processed successfully."

# JOB-ID:26590329
# bash script file name: nikea/COL/Col_host_removal.sh

For some reason, one sample (032024_COL_SAN_T5_167_OFAV_S21) didn't get run, even though the script reported that all samples were processed successfully. Everything else looks fine (the script ended before the time limit I allotted). Will re-run for just that sample (job id: 26657822).

### PSTR next

Will be using the new *Colpophyllia natans* genome as a reference (from this paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC11370458/)

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurm-removal-%j.out  # %j = job ID

# 1) remove host from sample reads (PSTR samples, mapped against the C. natans genome)
# For every sample ID in the sample list: map the trimmed read pairs to the host
# genome with bowtie2, keep only the pairs where both mates are unmapped, and
# write those pairs back out as gzipped fastq files for the next step.

module load conda/latest
conda activate anvio-8

#set general parameters:
SAMPLENAME="pstr"
SAMPLELIST="032024_pstr_sampleids.txt"
RAWREADSPATH='/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/pstr'
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/host_removed"
EXTRAFILESPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/host_removal/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
#set step parameters
GENOME="Cnat"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"
# use every core SLURM granted to the job; fall back to 8 outside of SLURM
THREADS="${SLURM_CPUS_PER_TASK:-8}"

if ! mkdir -p "$READSPATH" "$EXTRAFILESPATH"; then
    echo "Error: could not create $READSPATH or $EXTRAFILESPATH" >&2
    exit 1
fi
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Error: sample list $LISTPATH/${SAMPLELIST} not found or not readable" >&2
    exit 1
fi

#build a bowtie2 index from a known genome (using C. natans genome)
# Only build when the index is not there yet, so re-submitting this script does
# not redo the indexing; the reverse-index file is used as the "already built" marker.
if [[ -e "$INPUTPATH/$INDEX.rev.2.bt2" || -e "$INPUTPATH/$INDEX.rev.2.bt2l" ]]; then
    echo "bowtie2 index $INPUTPATH/$INDEX already exists, skipping bowtie2-build"
elif ! bowtie2-build --threads "$THREADS" "$INPUTPATH/Cnat_genomic.fna" "$INPUTPATH/$INDEX"; then
    echo "Error: bowtie2-build failed for $INPUTPATH/Cnat_genomic.fna" >&2
    exit 1
fi

# Runs the full host-removal chain for one sample.
# Arguments: $1 - sample ID (file name prefix of the trimmed reads)
# Returns:   0 when every step succeeded, 1 as soon as one step fails, so a
#            failed alignment or conversion is reported instead of only the
#            status of the final command.
remove_host_reads() {
    local sample="$1"
    local prefix="$EXTRAFILESPATH/${sample}"

    #re-align reads back to the index
    bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
        -1 "$RAWREADSPATH/${sample}_R1_001_val_1.fq.gz" \
        -2 "$RAWREADSPATH/${sample}_R2_001_val_2.fq.gz" \
        -S "${prefix}_mapped_and_unmapped.sam" || return 1
    #convert sam file from bowtie to a bam file for processing
    samtools view -bS "${prefix}_mapped_and_unmapped.sam" \
        > "${prefix}_mapped_and_unmapped.bam" || return 1
    #extract only the pairs of which both reads do not match against the host genome
    # -f 12 = read unmapped (4) + mate unmapped (8); -F 256 = drop secondary alignments
    samtools view -b -f 12 -F 256 "${prefix}_mapped_and_unmapped.bam" \
        > "${prefix}_bothReadsUnmapped.bam" || return 1
    # sort by read name so both mates are together, then extract them back as .fastq.gz files
    samtools sort -n -m 5G -@ 2 "${prefix}_bothReadsUnmapped.bam" \
        -o "${prefix}_bothReadsUnmapped_sorted.bam" || return 1
    samtools fastq -c 6 -@ 8 "${prefix}_bothReadsUnmapped_sorted.bam" \
        -1 "$READSPATH/${sample}_host_removed_R1.fastq.gz" \
        -2 "$READSPATH/${sample}_host_removed_R2.fastq.gz" \
        -0 /dev/null -s /dev/null -n || return 1
}

#loop through samples
# - "|| [[ -n ... ]]" keeps the last ID even if the list has no newline after it
#   (plain "while read" silently drops that line)
# - the list is read on file descriptor 3 so that no command inside the loop
#   can consume sample IDs from standard input
SAMPLECOUNT=0
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # trim leading/trailing whitespace (spaces, tabs, Windows carriage returns)
    SAMPLEID="${SAMPLEID#"${SAMPLEID%%[![:space:]]*}"}"
    SAMPLEID="${SAMPLEID%"${SAMPLEID##*[![:space:]]}"}"
    # blank lines in the list are not samples
    if [[ -z "$SAMPLEID" ]]; then
        continue
    fi

    if remove_host_reads "$SAMPLEID"; then
        echo "host removal completed successfully for sample: $SAMPLEID"
        SAMPLECOUNT=$((SAMPLECOUNT + 1))
    else
        echo "host removal encountered an error for sample: $SAMPLEID" >&2
        exit 1
    fi
done 3< "$LISTPATH/${SAMPLELIST}"
conda deactivate

if (( SAMPLECOUNT == 0 )); then
    echo "Host removal: no sample IDs found in $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi
echo "Host removal: All samples processed successfully."

# JOB-ID:26661001
# bash script file name: nikea/COL/Col_host_removal_pstr.sh

Again last sample didn't run, must be something wrong with the loop or sample list file. Figure out before running DLAB!
Job ID for 032024_COL_SAN_T5_162_PSTR_S39: 26667854

### DLAB last

also using *C. natans* genome for mapping. Added command to remove .sam files in this bash script instead of deleting after. Also made sure I pressed enter after the last sample in the sample list (maybe that will help it read the last sample ID?) Will report back.

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurm-removal-%j.out  # %j = job ID

# 1) remove host from sample reads (DLAB samples, mapped against the C. natans genome)
# For every sample ID in the sample list: map the trimmed read pairs to the host
# genome with bowtie2, keep only the pairs where both mates are unmapped, and
# write those pairs back out as gzipped fastq files for the next step.
# Each sample's .sam file is deleted as soon as its .bam has been written.

module load conda/latest
conda activate anvio-8

#set general parameters:
SAMPLENAME="dlab"
SAMPLELIST="032024_dlab_sampleids.txt"
RAWREADSPATH='/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/trimmed/dlab'
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/host_removed"
EXTRAFILESPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/host_removal/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
#set step parameters
GENOME="Cnat"
INPUTPATH="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/${GENOME}_genome"
INDEX="${GENOME}_DB"
# use every core SLURM granted to the job; fall back to 8 outside of SLURM
THREADS="${SLURM_CPUS_PER_TASK:-8}"

if ! mkdir -p "$READSPATH" "$EXTRAFILESPATH"; then
    echo "Error: could not create $READSPATH or $EXTRAFILESPATH" >&2
    exit 1
fi
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Error: sample list $LISTPATH/${SAMPLELIST} not found or not readable" >&2
    exit 1
fi

#build a bowtie2 index from a known genome (using C. natans genome, indexes already built)
#bowtie2-build $INPUTPATH/Cnat_genomic.fna $INPUTPATH/"$INDEX"

# Runs the full host-removal chain for one sample.
# Arguments: $1 - sample ID (file name prefix of the trimmed reads)
# Returns:   0 when every step succeeded, 1 as soon as one step fails, so a
#            failed alignment or conversion is reported instead of only the
#            status of the final command.
remove_host_reads() {
    local sample="$1"
    local prefix="$EXTRAFILESPATH/${sample}"

    #re-align reads back to the index
    bowtie2 -p "$THREADS" -x "$INPUTPATH/$INDEX" \
        -1 "$RAWREADSPATH/${sample}_R1_001_val_1.fq.gz" \
        -2 "$RAWREADSPATH/${sample}_R2_001_val_2.fq.gz" \
        -S "${prefix}_mapped_and_unmapped.sam" || return 1
    #convert sam file from bowtie to a bam file for processing
    samtools view -bS "${prefix}_mapped_and_unmapped.sam" \
        > "${prefix}_mapped_and_unmapped.bam" || return 1
    # The .bam now holds the same records, so drop the .sam right away instead
    # of after the whole loop: SAMs no longer accumulate for every sample, and
    # they are also cleaned up for finished samples when a later one fails.
    rm -f -- "${prefix}_mapped_and_unmapped.sam"
    #extract only the pairs of which both reads do not match against the host genome
    # -f 12 = read unmapped (4) + mate unmapped (8); -F 256 = drop secondary alignments
    samtools view -b -f 12 -F 256 "${prefix}_mapped_and_unmapped.bam" \
        > "${prefix}_bothReadsUnmapped.bam" || return 1
    # sort by read name so both mates are together, then extract them back as .fastq.gz files
    samtools sort -n -m 5G -@ 2 "${prefix}_bothReadsUnmapped.bam" \
        -o "${prefix}_bothReadsUnmapped_sorted.bam" || return 1
    samtools fastq -c 6 -@ 8 "${prefix}_bothReadsUnmapped_sorted.bam" \
        -1 "$READSPATH/${sample}_host_removed_R1.fastq.gz" \
        -2 "$READSPATH/${sample}_host_removed_R2.fastq.gz" \
        -0 /dev/null -s /dev/null -n || return 1
}

#loop through samples
# - "|| [[ -n ... ]]" keeps the last ID even if the list has no newline after it
#   (plain "while read" silently drops that line)
# - the list is read on file descriptor 3 so that no command inside the loop
#   can consume sample IDs from standard input
SAMPLECOUNT=0
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # trim leading/trailing whitespace (spaces, tabs, Windows carriage returns)
    SAMPLEID="${SAMPLEID#"${SAMPLEID%%[![:space:]]*}"}"
    SAMPLEID="${SAMPLEID%"${SAMPLEID##*[![:space:]]}"}"
    # blank or whitespace-only lines in the list are not samples
    if [[ -z "$SAMPLEID" ]]; then
        continue
    fi

    echo "starting on $SAMPLEID"
    if remove_host_reads "$SAMPLEID"; then
        echo "host removal completed successfully for sample: $SAMPLEID"
        SAMPLECOUNT=$((SAMPLECOUNT + 1))
    else
        echo "host removal encountered an error for sample: $SAMPLEID" >&2
        exit 1
    fi
done 3< "$LISTPATH/${SAMPLELIST}"
# remove any .sam files still left in the directory; ":?" aborts instead of
# expanding to "/*.sam" should the variable ever be empty
rm -f -- "${EXTRAFILESPATH:?}"/*.sam
conda deactivate

if (( SAMPLECOUNT == 0 )); then
    echo "Host removal: no sample IDs found in $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi
echo "Host removal: All samples processed successfully."

# JOB-ID: 26674335
# bash script file name: nikea/COL/Col_host_removal.sh

This script went through all samples but failed at the end because I had a few extra spaces added to the sample list. *Make sure to "enter" just once after the last sample in the list.*

*Once host removal was finished for all samples, I moved the /trimmed sequence files (...val...fq.gz) to /project, so that /work holds only the sequences I'm currently moving forward with.*

## Fastq-screen for symbiont and human seq removal (step 2)
https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/

In [None]:
#Installation
# fastq-screen goes into the existing "assembly" env (the same env the
# fastq_screen job script activates)
module load conda/latest
conda activate assembly
conda install -c bioconda fastq-screen

Made the fastq-screen.conf file (tells the program what aligner to use and where all the databases are) \
fastq_screen.conf file below \
*added more symbiont databases to it. Used a quick bowtie script to index the new genomes I added*

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/slurm-index-%j.out  # %j = job ID

# Build bowtie2 indexes for the newly added symbiont genomes.
# Every genome is attempted even if an earlier one fails; failures are reported
# by name and the job exits non-zero, so a failed build shows up in the log
# and in the SLURM job state.

module load conda/latest
conda activate assembly

GENOMEDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/Genomes"
INDEXDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed"
# same thread count for every build; defaults to 20 outside of SLURM
THREADS="${SLURM_CPUS_PER_TASK:-20}"

# fasta files to index, relative to $GENOMEDIR, laid out as <accession>/<file>.fna
# the index is written to $INDEXDIR/<accession>_index
GENOMES=(
    "GCA_963969995.1/GCA_963969995.1_Durusdinium_trenchii_SCF082.fna"
    "GCA_947184155.2/GCA_947184155.2_Cgoreaui_SCF055-01_v2.1_genomic.fna"
)

if ! mkdir -p "$INDEXDIR"; then
    echo "Error: could not create $INDEXDIR" >&2
    exit 1
fi

FAILED=0
for FASTA in "${GENOMES[@]}"; do
    ACCESSION="${FASTA%%/*}"   # directory part = assembly accession

    # an empty or missing fasta is reported up front
    if [[ ! -s "$GENOMEDIR/$FASTA" ]]; then
        echo "Error: genome file missing or empty: $GENOMEDIR/$FASTA" >&2
        FAILED=1
        continue
    fi

    #build a bowtie2 index
    if bowtie2-build --threads "$THREADS" "$GENOMEDIR/$FASTA" "$INDEXDIR/${ACCESSION}_index"; then
        echo "bowtie2-build completed successfully for $ACCESSION"
    else
        echo "bowtie2-build encountered an error for $ACCESSION" >&2
        FAILED=1
    fi
done

conda deactivate
exit "$FAILED"

# Job ID: 26690898
# bash script file name: bowtie2_indexing.sh

Couldn't get GCA_963969995.1 to index for some reason. There is already a Durusdinium_trenchii indexed genome in the folder so I will keep with that for now.

In [None]:
############################
## Bowtie, Bowtie 2 or BWA #
############################
## If the Bowtie, Bowtie 2 or BWA binary is not in your PATH, you can set 
## this value to tell the program where to find your chosen aligner.  Uncomment 
## the relevant line below and set the appropriate location.  Please note, 
## this path should INCLUDE the executable filename.

#BOWTIE	/usr/local/bin/bowtie/bowtie
BOWTIE2 /home/nikea_ulrich_uml_edu/.conda/envs/assembly/bin/bowtie2
#BWA /usr/local/bwa/bwa



############################################
## Bismark (for bisulfite sequencing only) #
############################################
## If the Bismark binary is not in your PATH then you can set this value to 
## tell the program where to find it.  Uncomment the line below and set the 
## appropriate location. Please note, this path should INCLUDE the executable 
## filename.

#BISMARK	/usr/local/bin/bismark/bismark



############
## Threads #
############
## Genome aligners can be made to run across multiple CPU cores to speed up 
## searches.  Set this value to the number of cores you want for mapping reads.

THREADS		12



##############
## DATABASES #
##############
## This section enables you to configure multiple genomes databases (aligner index 
## files) to search against in your screen.  For each genome you need to provide a 
## database name (which can't contain spaces) and the location of the aligner index 
## files.
##
## The path to the index files SHOULD INCLUDE THE BASENAME of the index, e.g:
## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37
## Thus, the index files (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.) 
## are found in a folder named 'GRCh37'.
##
## If, for example, the Bowtie, Bowtie2 and BWA indices of a given genome reside in 
## the SAME FOLDER, a SINGLE path may be provided to ALL of the indices.  The index 
## used will be the one compatible with the chosen aligner (as specified using the 
## --aligner flag).  
##
## The entries shown below are only suggested examples, you can add as many DATABASE 
## sections as required, and you can comment out or remove as many of the existing 
## entries as desired.  We suggest including genomes and sequences that may be sources 
## of contamination either because they were run on your sequencer previously, or may 
## have contaminated your sample during the library preparation step.
##
## Human - sequences available from
## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/
## (Kraken2 RefSeq db)
DATABASE	Human	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/ref_databases/standard/library/human/index
##
## added more databases and updated a few listed here with their updated assemblies 12.11.2024
## Symbionts
DATABASE	Symbiont1	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/Durusdinium_trenchii_indexed
## Symbionts
DATABASE	Symbiont2	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_000507305.1_index
## Symbionts
DATABASE	Symbiont3	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_001939145.1_index
## Symbionts
DATABASE	Symbiont4	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_003297005.1_index
## Symbionts
DATABASE	Symbiont5	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_009767595.1_index
## Symbionts
DATABASE	Symbiont6	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_018327485.1_index
## Symbionts
DATABASE	Symbiont7	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_905221635.1_index
## Symbionts
DATABASE	Symbiont8	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_947184155.2_index
## Symbionts
DATABASE	Symbiont9	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_003297045.1_index
## Symbionts
DATABASE	Symbiont10	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_905231905.1_index
## Symbionts
DATABASE	Symbiont11	/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/symbionts/indexed/GCA_905231915.1_index
##
## Ecoli- sequence available from EMBL accession U00096.2
#DATABASE	Ecoli	/data/public/Genomes/Ecoli/Ecoli
##
## PhiX - sequence available from Refseq accession NC_001422.1
#DATABASE	PhiX	/data/public/Genomes/PhiX/phi_plus_SNPs
##
## Adapters - sequence derived from the FastQC contaminants file found at: www.bioinformatics.babraham.ac.uk/projects/fastqc
#DATABASE	Adapters	/data/public/Genomes/Contaminants/Contaminants
##
## Vector - Sequence taken from the UniVec database
## http://www.ncbi.nlm.nih.gov/VecScreen/UniVec.html
#DATABASE	Vectors		/data/public/Genomes/Vectors/Vectors

Moved the conf file to the fastq screen directory in the assembly environment

In [None]:
# Put the edited config in the fastq-screen directory of the assembly env;
# the fastq_screen job script passes this location to --conf
mv fastq_screen.conf /home/nikea_ulrich_uml_edu/.conda/envs/assembly/share/fastq-screen-0.16.0-0

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/pstr/slurm-HSremoval-%j.out  # %j = job ID

# 2) remove symbiont and human seqs using fastq screen
# For every sample ID in the sample list, the host-removed R1 and R2 files are
# screened against the databases listed in fastq_screen.conf and the reads that
# hit none of them are written to $OUTPUTDIR.
# Switch SAMPLENAME / SAMPLELIST to run another species.

module load conda/latest
conda activate assembly

SAMPLENAME="pstr"
SAMPLELIST="032024_pstr_sampleids.txt"
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/host_removed"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"

FASTQSCREEN='/home/nikea_ulrich_uml_edu/.conda/envs/assembly/share/fastq-screen-0.16.0-0'
OUTPUTDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/final_reads_filtered"

if ! mkdir -p "$OUTPUTDIR"; then
    echo "Error: Failed to create output directory $OUTPUTDIR" >&2
    exit 1
fi
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Error: sample list $LISTPATH/${SAMPLELIST} not found or not readable" >&2
    exit 1
fi

# - "|| [[ -n ... ]]" keeps the last ID even if the list has no newline after it
#   (plain "while read" silently drops that line)
# - the list is read on file descriptor 3 so that no command inside the loop
#   can consume sample IDs from standard input
SAMPLECOUNT=0
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # trim leading/trailing whitespace (spaces, tabs, Windows carriage returns)
    SAMPLEID="${SAMPLEID#"${SAMPLEID%%[![:space:]]*}"}"
    SAMPLEID="${SAMPLEID%"${SAMPLEID##*[![:space:]]}"}"
    # blank lines in the list are not samples
    if [[ -z "$SAMPLEID" ]]; then
        continue
    fi

    # --nohits = output reads do not map to any genomes
    # NOTE(review): R1 and R2 are passed as two independent files; confirm that
    # the two filtered outputs still contain the same reads in the same order
    # before they are used as paired input for the assembly.
    if "$FASTQSCREEN/fastq_screen" --nohits --aligner bowtie2 \
        --conf "$FASTQSCREEN/fastq_screen.conf" --outdir "$OUTPUTDIR" \
        "$READSPATH/${SAMPLEID}_host_removed_R1.fastq.gz" \
        "$READSPATH/${SAMPLEID}_host_removed_R2.fastq.gz"; then
        echo "fastq_screen completed successfully for sample: $SAMPLEID"
        SAMPLECOUNT=$((SAMPLECOUNT + 1))
    else
        echo "fastq_screen encountered an error for sample: $SAMPLEID" >&2
        exit 1
    fi
done 3< "$LISTPATH/${SAMPLELIST}"
conda deactivate

if (( SAMPLECOUNT == 0 )); then
    echo "fastq_screen: no sample IDs found in $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi
# message corrected: this step removes symbiont and human reads (host removal is step 1)
echo "Symbiont and human removal: All samples processed successfully."

# JOB-ID:26697214
# bash script file name: nikea/COL/bash_scripts/Col_human_symiont_removal.sh

*note*- to find path for bowtie2 aligner for fastq_screen.conf, I loaded the assembly conda environment and did the command: bowtie2 --version \
This gives you the path to put in the conf file.

Started with pstr (Job ID: 26697214). Switched out sample name/sample list to run the others \
dlab (Job ID: 26699799) *don't forget to move slurm output back into dlab folder*\
ofav (Job ID: ) \
mcav (Job ID: ) 


## Concatenating reads and assembling into contigs (steps 3 & 4)

https://github.com/voutcn/megahit/wiki/An-example-of-real-assembly

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 56:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/assembly/slurm-assembly-%j.out  # %j = job ID

# Steps 3 and 4: concatenate the filtered reads of all samples of one species
# into one forward and one reverse file, then co-assemble them with megahit.
# NOTE(review): the log path above is ".../nikea/assembly/" while every other
# path in this workflow is under ".../nikea/COL/assembly/" - confirm it exists.

module load conda/latest
conda activate assembly

# 3)CONCATENATE all f and r seqs into single file (1 for f, 1 for r)
SAMPLENAME="pstr"
SAMPLELIST="032024_pstr_sampleids.txt"
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/final_reads_filtered"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"

OUTDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}"
R1_ALL="$OUTDIR/${SAMPLENAME}_reads_R1_ALL.fastq.gz"
R2_ALL="$OUTDIR/${SAMPLENAME}_reads_R2_ALL.fastq.gz"

if ! mkdir -p "$OUTDIR"; then
    echo "Error: Failed to create output directory $OUTDIR" >&2
    exit 1
fi
# The sample list is the same file the earlier steps use ($LISTPATH/$SAMPLELIST).
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Error: sample list $LISTPATH/${SAMPLELIST} not found or not readable" >&2
    exit 1
fi

# Start from empty output files: the loop appends, so without this a re-run
# would add every sample's reads a second time.
: > "$R1_ALL" || exit 1
: > "$R2_ALL" || exit 1

# Read the sample IDs from the file
# - "|| [[ -n ... ]]" keeps the last ID even if the list has no newline after it
# - the list is read on file descriptor 3 so nothing inside the loop can
#   consume sample IDs from standard input
SAMPLECOUNT=0
while IFS= read -r -u 3 SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    # trim leading/trailing whitespace (spaces, tabs, Windows carriage returns)
    SAMPLEID="${SAMPLEID#"${SAMPLEID%%[![:space:]]*}"}"
    SAMPLEID="${SAMPLEID%"${SAMPLEID##*[![:space:]]}"}"
    # blank lines in the list are not samples
    if [[ -z "$SAMPLEID" ]]; then
        continue
    fi

    # Construct the file paths for forward and reverse reads
    FORWARD_READ="$READSPATH/${SAMPLEID}_host_removed_R1.tagged_filter.fastq.gz"
    REVERSE_READ="$READSPATH/${SAMPLEID}_host_removed_R2.tagged_filter.fastq.gz"

    # A sample is only added when BOTH mates exist; appending just one of them
    # would leave the R1 and R2 files with different reads.
    if [[ -e "$FORWARD_READ" && -e "$REVERSE_READ" ]]; then
        if cat "$FORWARD_READ" >> "$R1_ALL" && cat "$REVERSE_READ" >> "$R2_ALL"; then
            SAMPLECOUNT=$((SAMPLECOUNT + 1))
        else
            echo "Error: concatenation failed for sample $SAMPLEID" >&2
            exit 1
        fi
    else
        if [[ ! -e "$FORWARD_READ" ]]; then
            echo "Forward read file not found for sample $SAMPLEID"
        fi
        if [[ ! -e "$REVERSE_READ" ]]; then
            echo "Reverse read file not found for sample $SAMPLEID"
        fi
        echo "Skipping sample $SAMPLEID (both read files are required)"
    fi
done 3< "$LISTPATH/${SAMPLELIST}"

if (( SAMPLECOUNT == 0 )); then
    echo "Error: no read files were concatenated, not starting the assembly" >&2
    exit 1
fi
echo "Concatenated reads of $SAMPLECOUNT samples"

# 4)ASSEMBLE reads into contigs (contiguous sequence - joins them together based on read overlap, and ensures there are no gaps
# The options are collected in an array so optional flags can be switched on or
# off without line-continuation backslashes.
# -o is a relative path: the output directory is created in the directory the
# job is submitted from.
MEGAHIT_ARGS=(
    --presets meta-large
    -1 "$R1_ALL"
    -2 "$R2_ALL"
    --keep-tmp-files
    -o megahit_reads_filtered
    --out-prefix "$SAMPLENAME"
)
# inside SLURM, tell megahit how many cores the job was granted
if [[ -n "${SLURM_CPUS_PER_TASK:-}" ]]; then
    MEGAHIT_ARGS+=(-t "$SLURM_CPUS_PER_TASK")
fi
# NOTE(review): megahit's memory limit is left at its default; confirm that it
# stays within the 180G requested above.
#MEGAHIT_ARGS+=(--continue)
#this one has to make the directory; will fail if it already exists

if ! megahit "${MEGAHIT_ARGS[@]}"; then
    echo "Error: megahit assembly failed for $SAMPLENAME" >&2
    exit 1
fi
echo "Assembly finished for $SAMPLENAME"

# JOB-ID: 
# bash script file name: 