# Euler classifier training

The following code chunks were run on Euler to train a classifier for our fungal ITS data; do not attempt to run them in this notebook. They also cover the cutadapt trimming settings and the different UNITE classifiers we tried (self-trained and pre-trained).


In [None]:
# Setting up euler
# Get qiime2 on euler (30 mins to set up, around 5gb)
# Euler is a Linux (Slurm) cluster, so the *linux* environment file is needed;
# the osx file pins macOS builds that conda cannot resolve on Linux.
conda env create -n q2 --file https://data.qiime2.org/distro/amplicon/qiime2-amplicon-2024.5-py39-linux-conda.yml

# Now qiime2 can be activated with:
conda activate q2

In [None]:
# Fetch the pundemic data that is trimmed / denoised below
cd /cluster/scratch/ameara/ # change to your directory if running yourself

# Lay out the project folder (scripts, data, unite) in one call, then enter it
mkdir -p apbio2/scripts apbio2/data apbio2/unite
cd apbio2

# Sample metadata and the demultiplexed forward reads live on polybox
POLYBOX="https://polybox.ethz.ch/index.php/s"
wget -O data/pundemic_metadata.tsv "$POLYBOX/7LxWSbaw2q37yof/download"
wget -O data/pundemic_forward_reads.qza "$POLYBOX/o8HqHJqvuf9e2on/download"

# Save the following scripts in the scripts folder

# Trimming and filtering script: scripts/trim_filter.sh
# (the "#!/bin/bash" line below must be the first line of the saved file)
#!/bin/bash
source ~/miniconda3/etc/profile.d/conda.sh
conda activate q2
DATA="/cluster/scratch/ameara/apbio2/data" # change to yours

echo "starting trimming"

# Note: no leading "!" in front of qiime. That prefix is notebook syntax; inside a
# bash script it negates the command's exit status and hides failures.
# Remove the forward primer given to --p-front; reads in which it is not found are
# discarded (--p-discard-untrimmed). The verbose cutadapt report is copied to
# cutadapt_results.log via tee.
qiime cutadapt trim-single \
  --i-demultiplexed-sequences "$DATA/pundemic_forward_reads.qza" \
  --p-front CTTGGTCATTTAGAGGAAGTAA \
  --p-error-rate 0.2 \
  --p-discard-untrimmed \
  --verbose \
  --o-trimmed-sequences "$DATA/pundemic_forward_reads_trimmed.qza" | tee "$DATA/cutadapt_results.log"

echo "finished trimming"

# Quality-filter the trimmed reads (min quality 15, quality window 3)
qiime quality-filter q-score \
    --i-demux "$DATA/pundemic_forward_reads_trimmed.qza" \
    --p-min-quality 15 \
    --p-quality-window 3 \
    --o-filtered-sequences "$DATA/pundemic_forward_reads_trimmed_QCfiltered.qza" \
    --o-filter-stats "$DATA/pundemic_forward_reads_trimmed_QCfiltered_stats.qza"


# Running the script (from the apbio2 project folder)
sbatch --time=60 --ntasks=4 --mem-per-cpu=30G scripts/trim_filter.sh


# Denoising script: scripts/denoise.sh
# (the "#!/bin/bash" line below must be the first line of the saved file)
#!/bin/bash
source ~/miniconda3/etc/profile.d/conda.sh
conda activate q2
DATA="/cluster/scratch/ameara/apbio2/data" # change to yours

echo "starting denoising"

# Denoise the trimmed + quality-filtered reads with DADA2, truncating reads at
# 140 bp and using 4 threads. Writes the feature table, the representative
# sequences and the denoising stats into $DATA.
qiime dada2 denoise-single \
   --i-demultiplexed-seqs "$DATA/pundemic_forward_reads_trimmed_QCfiltered.qza" \
   --p-trunc-len 140 \
   --p-n-threads 4 \
   --o-table "$DATA/dada2_table.qza" \
   --o-representative-sequences "$DATA/dada2_rep_seq.qza" \
   --o-denoising-stats "$DATA/dada2_stats.qza"

echo "finished denoising"

# Running the script
# Submit only after the trim_filter.sh job has finished: the input of this step
# (pundemic_forward_reads_trimmed_QCfiltered.qza) is produced by that job.
sbatch --time=60 --ntasks=4 --mem-per-cpu=30G scripts/denoise.sh


## Training a classifier

In [None]:
# Download the UNITE reference used to train the fungal ITS classifier
# (version 10.0, eukaryotes, dynamic clustering, singletons excluded)
UNITE_DIR="unite_dynamic"

qiime rescript get-unite-data \
    --verbose \
    --p-version 10.0 \
    --p-cluster-id dynamic \
    --p-taxon-group eukaryotes \
    --p-no-singletons \
    --output-dir "$UNITE_DIR"

# Clean up the taxonomy: the regex ';sh__.*' matches everything from ";sh__" to the
# end of each taxonomy string and replaces it with nothing.
qiime rescript edit-taxonomy \
    --p-use-regex \
    --p-search-strings ';sh__.*' \
    --p-replacement-strings '' \
    --i-taxonomy "$UNITE_DIR/taxonomy.qza" \
    --o-edited-taxonomy "$UNITE_DIR/taxonomy-no-SH.qza"



In [None]:
# Train classifier
# Submit the naive-Bayes training as a batch job. --parsable makes sbatch print
# only the job ID (optionally followed by ";cluster"), so it can be captured and
# reused below instead of pasting it by hand.
JOB_ID=$(sbatch --parsable --time=300 --ntasks=4 --mem-per-cpu=30G --wrap="\
    qiime feature-classifier fit-classifier-naive-bayes \
    --i-reference-reads unite_dynamic/sequences.qza \
    --i-reference-taxonomy unite_dynamic/taxonomy-no-SH.qza \
    --o-classifier unite_dynamic/classifier.qza")
JOB_ID="${JOB_ID%%;*}" # drop the ";cluster" suffix if present
echo "submitted training job $JOB_ID"

# See how long it took on euler (run this once the job has finished):
sacct -j "$JOB_ID" --format=JobID,JobName,Elapsed
# job ID here was 11433149, elapsed time: 02:07:47

In [None]:
# Evaluate the classifier
# Script for eval: scripts/ceval_dynamic.sh
# (the "#!/bin/bash" line below must be the first line of the saved file)
#!/bin/bash
# Activate qiime2 inside the job, like the other batch scripts do, so the script
# does not depend on the environment of the submitting shell.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate q2

echo "starting eval"

# NOTE: --o-classifier writes to unite_dynamic/classifier.qza, the same path the
# training job writes to, so that file is replaced by this step.
# Paths are relative: submit from the directory that contains unite_dynamic/.
qiime rescript evaluate-fit-classifier \
    --i-sequences unite_dynamic/sequences.qza   \
    --i-taxonomy unite_dynamic/taxonomy-no-SH.qza \
    --p-n-jobs 4 \
    --o-classifier unite_dynamic/classifier.qza \
    --o-evaluation unite_dynamic/classifier-evaluation.qzv \
    --o-observed-taxonomy unite_dynamic/predicted-taxonomy.qza

echo "finished fit eval"

# Compare the reference taxonomy with the taxonomy predicted above
qiime rescript evaluate-taxonomy \
  --i-taxonomies unite_dynamic/taxonomy-no-SH.qza unite_dynamic/predicted-taxonomy.qza \
  --p-labels ref-taxonomy predicted-taxonomy \
  --o-taxonomy-stats unite_dynamic/both-taxonomy-evaluation.qzv

echo "finished tax eval"

# Running the script
sbatch --time=360 --ntasks=4 --mem-per-cpu=30G scripts/ceval_dynamic.sh


In [None]:
# Use the classifier on our data
# Script for classify: classify_d.sh
# (the "#!/bin/bash" line below must be the first line of the saved file)

#!/bin/bash
# Activate qiime2 inside the job, like the other batch scripts do, so the script
# does not depend on the environment of the submitting shell.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate q2

echo "Start classifying!"

# NOTE(review): these paths point at .../apbio/..., while the denoising step in
# this notebook writes dada2_rep_seq.qza to .../apbio2/data. Confirm which
# project folder holds the inputs before running.
PDATA="/cluster/scratch/ameara/apbio/processed_data"
CDATA="/cluster/scratch/ameara/apbio/unite_dynamic"

# Assign taxonomy to the DADA2 representative sequences with the self-trained classifier
qiime feature-classifier classify-sklearn \
    --i-classifier "$CDATA/classifier.qza" \
    --i-reads "$PDATA/dada2_rep_seq.qza" \
    --o-classification "$PDATA/taxonomy_d.qza"

echo "Classification done."

# Run classify script
sbatch --time=360 --ntasks=4 --mem-per-cpu=30G /cluster/home/ameara/applied_bioinformatics/project/scripts/classify_d.sh


## Classifying with a pre-trained classifier

In [None]:
# Classify our data with a pre-trained UNITE classifier

# Download the pre-trained classifier from the unite-train GitHub releases
# options -> eukaryotes inc, 99, no singletons
CLASSIFIER="unite_ver10_99_all_04.04.2024-Q2-2024.5.qza"
RELEASE="https://github.com/colinbrislawn/unite-train/releases/download/v10.0-v04.04.2024-qiime2-2024.5"
wget -O "/cluster/scratch/ameara/apbio2/unite/$CLASSIFIER" "$RELEASE/$CLASSIFIER"
#wget -O /home/jovyan/assignments/pundemic/data/unite_ver10_99_all_04.04.2024-Q2-2024.5.qza https://github.com/colinbrislawn/unite-train/releases/download/v10.0-v04.04.2024-qiime2-2024.5/unite_ver10_99_all_04.04.2024-Q2-2024.5.qza

# Batch script scripts/classify.sh: run the pre-trained classifier on our data
#!/bin/bash
source ~/miniconda3/etc/profile.d/conda.sh
conda activate q2

CDATA="/cluster/scratch/ameara/apbio2/unite"
DATA="/cluster/scratch/ameara/apbio2/data"

echo "Start classifying!"

# Reads in: DADA2 representative sequences; taxonomy out: $DATA/taxonomy.qza
qiime feature-classifier classify-sklearn \
    --i-reads "$DATA/dada2_rep_seq.qza" \
    --i-classifier "$CDATA/unite_ver10_99_all_04.04.2024-Q2-2024.5.qza" \
    --o-classification "$DATA/taxonomy.qza"

echo "Classification done."

# Submit the script as a single-task job with 128G of memory
sbatch --time=360 --ntasks=1 --mem-per-cpu=128G scripts/classify.sh


In [None]:
# Copy the results from euler to the current local directory
EULER="ameara@euler.ethz.ch"

# Classifier evaluation visualizations
EVAL_DIR="/cluster/scratch/ameara/apbio/unite_dynamic"
for qzv in both-taxonomy-evaluation.qzv classifier-evaluation.qzv; do
    scp "$EULER:$EVAL_DIR/$qzv" .
done

# Taxonomy assigned to our data
scp "$EULER:/cluster/scratch/ameara/apbio2/data/taxonomy.qza" .

In [12]:
# Reference lookup only: list the sub-commands of the q2-cutadapt plugin (output kept below)
! qiime cutadapt --help


Usage: [94mqiime cutadapt[0m [OPTIONS] COMMAND [ARGS]...

  Description: This QIIME 2 plugin uses cutadapt to work with adapters (e.g.
  barcodes, primers) in sequence data.

  Plugin website: https://github.com/qiime2/q2-cutadapt

  Getting user support: Please post to the QIIME 2 forum for help with this
  plugin: https://forum.qiime2.org

[1mOptions[0m:
  [94m--version[0m            Show the version and exit.
  [94m--example-data[0m PATH  Write example data and exit.
  [94m--citations[0m          Show citations and exit.
  [94m--help[0m               Show this message and exit.

[1mCommands[0m:
  [94mdemux-paired[0m  Demultiplex paired-end sequence data with barcodes in-
                sequence.
  [94mdemux-single[0m  Demultiplex single-end sequence data with barcodes in-
                sequence.
  [94mtrim-paired[0m   Find and remove adapters in demultiplexed paired-end
                sequences.
  [94mtrim-single[0m   Find and remove adapters in demultiplex

In [1]:
# Reference lookup only: show the help text of vsearch closed-reference clustering (output kept below)
! qiime vsearch cluster-features-closed-reference --help

Usage: [94mqiime vsearch cluster-features-closed-reference[0m [OPTIONS]

  Given a feature table and the associated feature sequences, cluster the
  features against a reference database based on user-specified percent
  identity threshold of their sequences. This is not a general-purpose closed-
  reference clustering method, but rather is intended to be used for
  clustering the results of quality-filtering/dereplication methods, such as
  DADA2, or for re-clustering a FeatureTable at a lower percent identity than
  it was originally clustered at. When a group of features in the input table
  are clustered into a single feature, the frequency of that single feature in
  a given sample is the sum of the frequencies of the features that were
  clustered in that sample. Feature identifiers will be inherited from the
  centroid feature of each cluster. See the vsearch documentation for details
  on how sequence clustering is performed.

[1mInputs[0m:
  [94m[4m--i-sequences[0m ARTI