# Euler classifier training

The following code chunks were run on the Euler cluster (euler.ethz.ch) in order to train a classifier for our fungal ITS data. They are included for documentation only; do not attempt to run them in this notebook.


In [None]:
# Setting up Euler
# Install the QIIME 2 amplicon distribution (2024.5) into a conda env named "q2".
# Takes about 30 minutes and around 5 GB of disk space.
# Euler is a Linux (Slurm) cluster, so the *linux* environment file must be used;
# the osx file pins macOS package builds and is only meant for local Mac installs.
conda env create -n q2 --file https://data.qiime2.org/distro/amplicon/qiime2-amplicon-2024.5-py39-linux-conda.yml

# Now qiime2 can be activated with:
conda activate q2

In [None]:
# Getting our pundemic data and denoising
# Download the sample metadata and the demultiplexed forward reads.
# NOTE(review): both files are written to the current directory, while the denoising
# script below reads the reads from $RDATA -- presumably these commands are meant to
# be run from inside that raw_data directory; confirm.
wget -O pundemic_metadata.tsv https://polybox.ethz.ch/index.php/s/7LxWSbaw2q37yof/download
wget -O pundemic_forward_reads.qza https://polybox.ethz.ch/index.php/s/o8HqHJqvuf9e2on/download

# Denoising script (saved as scripts/denoising.sh):
#!/bin/bash
# Abort on the first error / unset variable so that the final "finished" message
# is only printed when DADA2 actually succeeded.
set -euo pipefail

RDATA="/cluster/home/ameara/applied_bioinformatics/project/raw_data"
PDATA="/cluster/scratch/ameara/apbio/processed_data"

# Make sure the output directory exists before QIIME 2 tries to write into it.
mkdir -p "$PDATA"

echo "starting denoising"

# Denoise the single-end reads with DADA2 (reads truncated at position 129, 4 threads).
# Outputs: feature table, representative sequences and denoising statistics.
qiime dada2 denoise-single \
   --i-demultiplexed-seqs "$RDATA/pundemic_forward_reads.qza" \
   --p-trunc-len 129 \
   --p-n-threads 4 \
   --o-table "$PDATA/dada2_table.qza" \
   --o-representative-sequences "$PDATA/dada2_rep_seq.qza" \
   --o-denoising-stats "$PDATA/dada2_stats.qza"

echo "finished denoising"

# Running the script
# DADA2 runs as one multi-threaded process, so request one task with 4 CPUs
# (--cpus-per-task) instead of 4 separate tasks; this keeps all 4 cores on one node.
# --time is given in minutes.
sbatch --time=60 --ntasks=1 --cpus-per-task=4 --mem-per-cpu=30G scripts/denoising.sh


In [None]:
# Download the UNITE reference data used to build the fungal ITS classifier.
# Requested: version 10.0, taxon group "eukaryotes", cluster id "dynamic",
# singletons excluded. Everything is written to the unite_dynamic/ directory.
qiime rescript get-unite-data \
    --output-dir unite_dynamic \
    --p-version 10.0 \
    --p-cluster-id dynamic \
    --p-taxon-group eukaryotes \
    --p-no-singletons \
    --verbose

# Clean up the UNITE taxonomy: with regex matching enabled, everything from
# ';sh__' to the end of each taxonomy string is replaced by an empty string.
# The result is stored next to the original as taxonomy-no-SH.qza.
qiime rescript edit-taxonomy \
    --p-use-regex \
    --p-search-strings ';sh__.*' \
    --p-replacement-strings '' \
    --i-taxonomy unite_dynamic/taxonomy.qza \
    --o-edited-taxonomy unite_dynamic/taxonomy-no-SH.qza



In [None]:
# Train classifier
# Submit the naive Bayes training as a Slurm job (--time is in minutes).
# --parsable makes sbatch print only the job ID (optionally followed by ";cluster"),
# so it can be captured and reused for the accounting query below.
JOB_ID=$(sbatch --parsable --time=300 --ntasks=4 --mem-per-cpu=30G --wrap="\
    qiime feature-classifier fit-classifier-naive-bayes \
    --i-reference-reads unite_dynamic/sequences.qza \
    --i-reference-taxonomy unite_dynamic/taxonomy-no-SH.qza \
    --o-classifier unite_dynamic/classifier.qza")
JOB_ID=${JOB_ID%%;*}   # strip a trailing ";cluster" suffix if present
echo "Submitted batch job $JOB_ID"

# See how long it took on euler (run this once the job has finished).
# A literal "<jobID>" placeholder cannot be used here: the shell would parse the
# angle brackets as redirections instead of passing a job ID to sacct.
sacct -j "$JOB_ID" --format=JobID,JobName,Elapsed
# job ID here was 11433149, elapsed time: 02:07:47

In [None]:
# Evaluate the classifier
# Script for eval (saved as scripts/ceval_dynamic.sh)
#!/bin/bash
# Stop at the first failing command so that the second evaluation is not started
# on missing inputs and the "finished" messages are only printed on success.
set -euo pipefail

# Absolute location of the UNITE reference files. This is the same directory the
# classify scripts read the classifier from and the evaluation results are later
# downloaded from, so the script no longer depends on the submission directory.
CDATA="/cluster/scratch/ameara/apbio/unite_dynamic"

echo "starting eval"

# Fit and evaluate a classifier on the reference sequences/taxonomy using 4 jobs.
# NOTE: --o-classifier writes $CDATA/classifier.qza, i.e. it overwrites the file
# produced by the fit-classifier-naive-bayes training step at the same location.
qiime rescript evaluate-fit-classifier \
    --i-sequences "$CDATA/sequences.qza" \
    --i-taxonomy "$CDATA/taxonomy-no-SH.qza" \
    --p-n-jobs 4 \
    --o-classifier "$CDATA/classifier.qza" \
    --o-evaluation "$CDATA/classifier-evaluation.qzv" \
    --o-observed-taxonomy "$CDATA/predicted-taxonomy.qza"

echo "finished fit eval"

# Compare the reference taxonomy with the taxonomy predicted above;
# the labels are given in the same order as the input taxonomies.
qiime rescript evaluate-taxonomy \
  --i-taxonomies "$CDATA/taxonomy-no-SH.qza" "$CDATA/predicted-taxonomy.qza" \
  --p-labels ref-taxonomy predicted-taxonomy \
  --o-taxonomy-stats "$CDATA/both-taxonomy-evaluation.qzv"

echo "finished tax eval"

# Running the script
# The evaluation uses 4 parallel jobs inside a single batch script, so request one
# task with 4 CPUs rather than 4 tasks; this keeps all cores (and their memory) on
# one node. --time is given in minutes.
sbatch --time=360 --ntasks=1 --cpus-per-task=4 --mem-per-cpu=30G scripts/ceval_dynamic.sh


In [None]:
# Classify our denoised sequences with the classifier stored in unite_dynamic/
# Contents of classify_d.sh:

#!/bin/bash

# Directories holding the classifier (CDATA) and the denoised data (PDATA)
CDATA="/cluster/scratch/ameara/apbio/unite_dynamic"
PDATA="/cluster/scratch/ameara/apbio/processed_data"

echo "Start classifying!"

# Assign taxonomy to the DADA2 representative sequences; the result is written
# to taxonomy_d.qza in the processed-data directory.
qiime feature-classifier classify-sklearn \
    --i-reads ${PDATA}/dada2_rep_seq.qza \
    --i-classifier ${CDATA}/classifier.qza \
    --o-classification ${PDATA}/taxonomy_d.qza

echo "Classification done."

# Submit the classify script to Slurm (--time in minutes)
sbatch --mem-per-cpu=30G --ntasks=4 --time=360 /cluster/home/ameara/applied_bioinformatics/project/scripts/classify_d.sh


In [None]:
# Classify our data with a pre-trained classifier

# The pre-trained classifier was taken from
# https://github.com/colinbrislawn/unite-train/releases/tag/v10.0-v04.04.2024-qiime2-2024.5
# and copied from the local machine into the unite_dynamic directory on Euler:
scp unite_ver10_dynamic_s_all_04.04.2024-Q2-2024.5.qza ameara@euler.ethz.ch:/cluster/scratch/ameara/apbio/unite_dynamic/

# Contents of classify_pre.sh:
#!/bin/bash

# Directories holding the classifier (CDATA) and the denoised data (PDATA)
CDATA="/cluster/scratch/ameara/apbio/unite_dynamic"
PDATA="/cluster/scratch/ameara/apbio/processed_data"

echo "Start classifying!"

# Same classification step as for our own classifier, but using the downloaded
# classifier artifact; the result is written to taxonomy_d_pre.qza.
qiime feature-classifier classify-sklearn \
    --i-reads ${PDATA}/dada2_rep_seq.qza \
    --i-classifier ${CDATA}/unite_ver10_dynamic_s_all_04.04.2024-Q2-2024.5.qza \
    --o-classification ${PDATA}/taxonomy_d_pre.qza

echo "Classification done."

# Submit the script to Slurm: a single task with 128G of memory (--time in minutes)
sbatch --mem-per-cpu=128G --ntasks=1 --time=360 /cluster/home/ameara/applied_bioinformatics/project/scripts/classify_pre.sh


In [None]:
# Copy the results from Euler into the current directory of the local computer.
# The first two files are the classifier evaluations, the last two are the
# classifications of our data (own classifier and pre-trained classifier).
for remote_file in \
    ameara@euler.ethz.ch:/cluster/scratch/ameara/apbio/unite_dynamic/both-taxonomy-evaluation.qzv \
    ameara@euler.ethz.ch:/cluster/scratch/ameara/apbio/unite_dynamic/classifier-evaluation.qzv \
    ameara@euler.ethz.ch:/cluster/scratch/ameara/apbio/processed_data/taxonomy_d.qza \
    ameara@euler.ethz.ch:/cluster/scratch/ameara/apbio/processed_data/taxonomy_d_pre.qza
do
    # One scp call per file, in the listed order
    scp "$remote_file" .
done