# FUNGI

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# import packages
import qiime2 as q2
import os
from qiime2.plugins import demux
from qiime2.plugins import cutadapt
from qiime2.plugins import dada2
from qiime2.plugins import feature_table

In [3]:
# define workdir
%env WORKDIR /home/nezapa/qiime-thesis
WORKDIR = os.environ.get("WORKDIR")

env: WORKDIR=/home/nezapa/qiime-thesis


### IMPORT DATA

In [4]:
# set manifest path
manifest_path = './00.manifest.f.tsv'

#import data
single_end_demux = q2.Artifact.import_data('SampleData[SequencesWithQuality]', 
    view_type='SingleEndFastqManifestPhred33V2', 
    view=manifest_path)

In [5]:
#summarise and visualise
demux_summary = demux.visualizers.summarize(single_end_demux)
demux_summary.visualization

<Figure size 432x288 with 0 Axes>

In [6]:
# load and view metadata
sample_metadata = q2.Metadata.load( f'{WORKDIR}/proteus_all/fungi/00.sample-metadata.tsv')
sample_metadata.to_dataframe()

Unnamed: 0_level_0,sample,location,origin,year,parkelj,specimen,population
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P01A,P01A,Planinska_jama,Paa200_t0,2015,no,Paa200,linija reke Ljubljanice
P03A,P03A,Planinska_jama,Paa201_t0,2015,no,Paa201,linija reke Ljubljanice
P05A,P05A,Stobe,Paa204_t10d,2016,no,Paa204,dolenjska linija
P07A,P07A,Planinska_jama,Paa209_t0,2017,no,Paa209,linija reke Ljubljanice
P09A,P09A,Planinska_jama,Paa210_t0,2019,no,Paa210,linija reke Ljubljanice
P11A,P11A,Planinska_jama,Paa211_t0,2018,no,Paa211,linija reke Ljubljanice
P13A,P13A,Planinska_jama,Paa219_t0,2019,no,Paa219,linija reke Ljubljanice
P15A,P15A,Planinska_jama,Paa220_t0,2019,no,Paa220,linija reke Ljubljanice
P17A,P17A,Planinska_jama,Paa221_t0,2019,no,Paa221,linija reke Ljubljanice
P19A,P19A,Obrsec,PB271_parkelj,2016,yes,PB271,črna podvrsta


### TRIM PRIMERS

In [7]:
# adaoter has to be a reverse complement
single_end_trimmed = cutadapt.methods.trim_single(
    demultiplexed_sequences = single_end_demux,
    front = ['TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGNNNNNAACTTTYRRCAAYGGATCWCT'],
    adapter = ['AYTTAAGCATATCAATAAGCGGAGGCTGTCTCTTATACACATCTCCGAGCCCACGAGAC'], 
    error_rate = 0.2,
    cores = 28
)

Running external command line application. This may print messages to stdout and/or stderr.
The commands to be run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: cutadapt --cores 28 --error-rate 0.2 --times 1 --overlap 3 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-l6x5dsvc/P01A_0_L001_R1_001.fastq.gz --adapter AYTTAAGCATATCAATAAGCGGAGGCTGTCTCTTATACACATCTCCGAGCCCACGAGAC --front TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGNNNNNAACTTTYRRCAAYGGATCWCT /tmp/qiime2-archive-wdeue40n/c619a7fd-9a40-4107-bd02-480e4e27ec83/data/P01A_0_L001_R1_001.fastq.gz

This is cutadapt 4.0 with Python 3.8.13
Command line parameters: --cores 28 --error-rate 0.2 --times 1 --overlap 3 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-l6x5dsvc/P01A_0_L001_R1_001.fastq.gz --adapter AYTTAAGCATATCAATAAGCGGAGGCTGTCTCTTATACACATCTCCGAGCCCACGAGAC --front TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGNNNNNAACTTTYRRCAAYGGATCWC

Finished in 1.21 s (38 µs/read; 1.57 M reads/minute).

=== Summary ===

Total reads processed:                  31,667
Reads with adapters:                    31,509 (99.5%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads written (passing filters):        31,667 (100.0%)

Total basepairs processed:     9,450,217 bp
Total written (filtered):      8,632,612 bp (91.3%)

=== Adapter 1 ===

Sequence: AYTTAAGCATATCAATAAGCGGAGGCTGTCTCTTATACACATCTCCGAGCCCACGAGAC; Type: regular 3'; Length: 59; Trimmed: 6 times

Minimum overlap: 3
No. of allowed errors:
1-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11

Bases preceding removed adapters:
  A: 50.0%
  C: 0.0%
  G: 33.3%
  T: 16.7%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
3	1	494.8	0	1
4	2	123.7	0	2
5	1	30.9	1	0 1
25	2	0.0	5	2


=== Adapter 2 ===

Sequence: TC

Finished in 0.85 s (88 µs/read; 0.69 M reads/minute).

=== Summary ===

Total reads processed:                   9,732
Reads with adapters:                     9,586 (98.5%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads written (passing filters):         9,732 (100.0%)

Total basepairs processed:     2,907,856 bp
Total written (filtered):      2,659,466 bp (91.5%)

=== Adapter 1 ===

Sequence: AYTTAAGCATATCAATAAGCGGAGGCTGTCTCTTATACACATCTCCGAGCCCACGAGAC; Type: regular 3'; Length: 59; Trimmed: 25 times

Minimum overlap: 3
No. of allowed errors:
1-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11

Bases preceding removed adapters:
  A: 84.0%
  C: 16.0%
  G: 0.0%
  T: 0.0%
  none/other: 0.0%
    The adapter is preceded by 'A' extremely often.
    The provided adapter sequence could be incomplete at its 5' end.

Overview of removed sequences
length	co

Finished in 1.53 s (27 µs/read; 2.19 M reads/minute).

=== Summary ===

Total reads processed:                  55,883
Reads with adapters:                    55,692 (99.7%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads written (passing filters):        55,883 (100.0%)

Total basepairs processed:    16,737,077 bp
Total written (filtered):     15,291,836 bp (91.4%)

=== Adapter 1 ===

Sequence: AYTTAAGCATATCAATAAGCGGAGGCTGTCTCTTATACACATCTCCGAGCCCACGAGAC; Type: regular 3'; Length: 59; Trimmed: 18 times

Minimum overlap: 3
No. of allowed errors:
1-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11

Bases preceding removed adapters:
  A: 22.2%
  C: 22.2%
  G: 33.3%
  T: 22.2%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
3	4	873.2	0	4
4	4	218.3	0	4
5	4	54.6	1	0 4
6	1	13.6	1	0 1
7	2	3.4	1	0 2
25	3	0.0	5	3




### START DADA2 MERGING AND CLUSTERING

In [8]:
# don't cut low quality ends!
denoisetable_all, rep_seqs_all, denoising_stats_all = dada2.methods.denoise_single(
    demultiplexed_seqs = single_end_trimmed.trimmed_sequences,
    pooling_method = 'pseudo',
    trunc_len = 0,
    n_threads = 14
)

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_single.R /tmp/qiime2-archive-buvx9ixb/785e3f81-e601-43c5-a033-2a7f2cb4a848/data /tmp/tmp9042hyth/output.tsv.biom /tmp/tmp9042hyth/track.tsv /tmp/tmp9042hyth 0 0 2.0 2 Inf pseudo consensus 1.0 14 1000000 NULL 16

R version 4.1.3 (2022-03-10) 


Loading required package: Rcpp


DADA2: 1.22.0 / Rcpp: 1.0.8.3 / RcppParallel: 5.1.5 
1) Filtering .............
2) Learning Error Rates
85087929 total bases in 311643 reads from 13 samples will be used for learning the error rates.
3) Denoise samples .............
  Pseudo-pool step .............
4) Remove chimeras (method = consensus)
5) Report read numbers through the pipeline
6) Write output


### FILTERING BY ABUNDANCE

In [9]:
# based od Mock community results set the min_frequency
denoisetable = feature_table.methods.filter_features(
    table = denoisetable_all,
    min_frequency = 20,
)

In [10]:
# summarise and visualise
feature_table.visualizers.summarize(
    table = denoisetable.filtered_table,
    sample_metadata = sample_metadata
).visualization

In [11]:
# visualise the new set of representative sequences
rep_seqs = feature_table.methods.filter_seqs(
    data = rep_seqs_all,
    table = denoisetable.filtered_table,
)

feature_table.visualizers.tabulate_seqs(data = rep_seqs.filtered_data).visualization

In [12]:
## save the output
denoisetable.filtered_table.save('./results/denoisetable.qza')
rep_seqs.filtered_data.save('./results/rep_seqs.qza')
denoising_stats_all.save('./results/denoising_stats.qza')

'./results/denoising_stats.qza'