In [None]:
import os

#make files if don't already exist
os.makedirs("profile_o", exist_ok=True)
os.makedirs("sorted_reads", exist_ok=True)
os.makedirs("seq_reads", exist_ok=True)
os.makedirs("summary_o", exist_ok=True)
os.makedirs("classifier", exist_ok=True)
os.makedirs("classifier_training_data", exist_ok=True)

#downstream analysis: https://www.youtube.com/watch?v=9AmM-BHYxJo

In [9]:
%%bash -s "$fastq_file"

#check if files are PHRED33
check_fastq_encoding() {
    # Check if a file is provided as an argument
    if [ $# -ne 1 ]; then
        echo "Usage: $0 <fastq_file>"
        exit 1
    fi
    
    fastq_file="$1"

    # Read the first quality line from the FASTQ file
    quality_line=$(head -n 40 "$fastq_file" | grep -E '^\+[[:space:]]*$' -A 1 | tail -n 1)

    # Loop through each character in the quality line and check its ASCII value
    phred33=1
    phred64=1
    for (( i=0; i<${#quality_line}; i++ )); do
        char=$(printf "%d" "'${quality_line:$i:1}")
        if [ $char -lt 33 ] || [ $char -gt 73 ]; then
            phred33=0
        fi
        if [ $char -lt 64 ] || [ $char -gt 104 ]; then
            phred64=0
        fi
    done

    # Output the result
    if [ $phred33 -eq 1 ]; then
        echo "PHRED 33 encoding detected."
    elif [ $phred64 -eq 1 ]; then
        echo "PHRED 64 encoding detected."
    else
        echo "Unknown encoding format."
    fi
}

processed="$(pwd)/processed"

echo "$processed/ERR1351807_1.fastq.gz"

check_fastq_encoding "$processed/ERR1351807_1.fastq.gz"


/home/miloscola/augene/training_data/processed/ERR1351807_1.fastq.gz
PHRED 33 encoding detected.


In [1]:
%%bash

current_dir=$(pwd)

#gzip any randomly unzipped files
for file in $current_dir/seq_reads/*.fastq; do
    # Re-zip each file
    gzip -f "$file"
    #remove unzipped file
    #rm "$file"
done

for file in $current_dir/seq_reads/*_R1_001.fastq.gz; do
    # Extract the prefix (e.g., ERR1351807) from the filename
    prefix="${file%_R1_001.fastq.gz}"
    
    # Rename the forward read file
    mv "$file" "${prefix}_1.fastq.gz"
    
done

for file in $current_dir/seq_reads/*_R2_001.fastq.gz; do
    # Extract the prefix (e.g., ERR1351807) from the filename
    prefix="${file%_R2_001.fastq.gz}"
    
    # Rename the forward read file
    mv "$file" "${prefix}_2.fastq.gz"
    
done
                                                                                                                                    
for file in $current_dir/seq_reads/*.fastq; do
    if [[ $file == *.fastq ]]; then
        echo "$file"
    fi
done 

/home/miloscola/augene/training_data/seq_reads/*.fastq


gzip: /home/miloscola/augene/training_data/seq_reads/*.fastq: No such file or directory


In [None]:
%%bash
#LIST FILES
current_dir=$(pwd)

for file in $current_dir/seq_reads/*; do
    # Extract the prefix (e.g., ERR1351807) from the filename
    echo $file
    
done

In [1]:
## SORT RAW SKINOME READS ##

import os
import shutil
import pandas as pd
import glob

# Define paths
current_dir = os.getcwd()
sorted_reads = os.path.join(current_dir, 'sorted_reads')
seq_reads = os.path.join(current_dir, 'seq_reads')
master_manifest = os.path.join(current_dir, 'DataFrame_2_Pruned.tsv')

# load in df
df = pd.read_csv(master_manifest, sep='\t', encoding='latin1')
df = df.dropna(subset=['Submission'])
submissions = df['Submission'].unique()
submissions = submissions.tolist()

#create seprate file for each submission
for sub in submissions: 
    new_direc = os.path.join(sorted_reads, sub)
    os.makedirs(new_direc, exist_ok=True)
    pruned_df = df[df['Submission'] == sub]
    p_s = pruned_df['Library_Layout'].unique().tolist()[0]
    if p_s == 'PAIRED':
        for run in pruned_df['Run'].tolist():
            sample_direc = os.path.join(new_direc, run)
            sample_1_path = os.path.join(seq_reads, run + '_1.fastq.gz')
            sample_2_path = os.path.join(seq_reads, run + '_2.fastq.gz')
            
            if (not os.path.exists(sample_1_path)) or  (not os.path.exists(sample_2_path)):
                
                if not os.path.exists(sample_1_path):
                    print('file path ' + sample_1_path + ' was not found for paired sample ' + run)
                    
                if not os.path.exists(sample_1_path):
                    print('file path ' + sample_1_path + ' was not found for paired sample ' + run)
                
                continue
                
            os.makedirs(sample_direc, exist_ok=True)
            shutil.move(sample_1_path, sample_direc)
            shutil.move(sample_2_path, sample_direc)
    elif p_s == 'SINGLE':
        for run in pruned_df['Run'].tolist():
            sample_direc = os.path.join(new_direc, run)
            sample_1_path = os.path.join(seq_reads, run + '_1.fastq.gz')
            
            if not os.path.exists(sample_1_path):
                print('file path ' + sample_1_path + ' was not found for single sample ' + run)
                continue
            
            os.makedirs(sample_direc, exist_ok=True)
            shutil.move(sample_1_path, sample_direc)

In [7]:
!pip install biopython



In [1]:
## Run Qiime2 ##

import os
import shutil
import pandas as pd
import glob
import subprocess
import re
import qiime2
from qiime2.plugins.dada2.methods import denoise_single, denoise_paired
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.feature_classifier.methods import classify_sklearn
from qiime2.plugins.taxa.visualizers import barplot
from Bio import SeqIO
from qiime2.plugins.feature_table.methods import relative_frequency
from Bio.Blast import NCBIWWW, NCBIXML
from qiime2 import Metadata


def qiime2_profile(sub_num, sub_path, master_manifest_path, profile_o_path, summary_o_folder, classifier_map, trim_left_f=13, trim_left_r=13, trunc_len_f=150, trunc_len_r=150, n_threads=8):
    """
    calculates microbiome profiles from a submission of reads and stores the results in a .qza file in the specified folder

    Parameters
    ----------
    sub_num : str
        ID of the submission containing the reads.
    sub_path : str
        path to the submission.
    master_manifest_path : str
        path to the master meta data file (should .tsv file) Manifest must 
        have the following columns: Run or id, Library_Layout, Region_16S, 
        Submission
    profile_o_path : str
        path to the output folder for microbiome profile
    summary_o_path : str:
        path to the output folder for read summary table
    classifier_map : dict
        map of 16s rRNA region to classifier built to classify that region
    trim_left_f : int
        number of nucleotides to trim from the 5' end of the forward read before denoising
    trim_left_r : int
        number of nucleotides to trim from the 5' end of the reverse read before denoising
    trim_left_f : int
        specifies the length at which the forward read will be truncated
    trim_left_r : int
        specifies the length at which the reverse read will be truncated

    Returns
    -------
    none
    """
    
    #define variables
    current_dir = os.getcwd()
    summary_o_path = os.path.join(summary_o_folder, sub_num)
    manifest_path = os.path.join(summary_o_path, f"{sub_num}-manifest.tsv")
    
    #string representing weather reads are paired or single
    p_s = None           
    #string representing what 16s regions were used
    region_16s = None           

    #validate inputs
    if not type(sub_num) == str:
        raise TypeEror("sub_num must be a string")
    if not os.path.exists(sub_path):
        raise FileNotFoundError(f"{sub_num} submission path is invalid")
    if not os.path.exists(master_manifest_path):
        raise FileNotFoundError("master manifest path is invalid")
    if not type(classifier_map) == dict:
        raise TypeError("classifier_map must be a dict")
    for classifier in classifier_map.values():
        if not os.path.exists(classifier):
            raise FileNotFoundError(f"classifier map contains invalid paths. {classifier} is not a valid path")
    
    #if output folders do not exist, create them
    os.makedirs(profile_o_path, exist_ok=True)
    os.makedirs(summary_o_folder, exist_ok=True)
    os.makedirs(summary_o_path, exist_ok=True)
    
    #generate manifest
    manifest = pd.read_csv(master_manifest_path, sep='\t', encoding='latin1')
    
    #Rename "Run" to 'id' to support compatability with qiime2
    if 'Run' in manifest.columns:
        manifest.rename(columns={'Run': 'id'}, inplace=True)
    
    #prune manifest
    manifest = manifest[manifest['Submission'] == sub_num]
    reads = os.listdir(sub_path)
    len_manifest = len(manifest)
    len_reads = len(reads)
    manifest = manifest[manifest['id'].isin(reads)]
    len_filtered_manifest = len(manifest)
    
    #check that names and number of samples match on manifest and in folder
    if manifest.empty:
        raise ValueError("Pruned manifest is empty. Submission number may not be in manifest or reads may be improperly named.")
    if len_manifest > len_reads:
        print("Warning: more entries in manifest than in folder")
    if len_manifest < len_reads:
        print("Warning: more entries in folder than in manifest")
    if len_manifest == len_reads and len_filtered_manifest < len_reads:
        print("Warning: mismatch between some names on manifest and in folder")
        
    #check that the submission does not have mixed single and paired reads
    unique_layouts = manifest['Library_Layout'].unique().tolist()
    
    if len(unique_layouts) > 1:
        raise ValueError("Multiple layouts detected. Submission may contain single and paired reads.")
    p_s = unique_layouts[0]
    
    #check that the submission does not have samples with different 16s regions
    unique_16s = manifest['Region_16S'].unique().tolist()
    if len(unique_layouts) > 1:
        raise ValueError("Multiple regions detected. Submission may contain reads from different 16s regions.")
    region_16s = unique_16s[0]
    
    #add file paths to manifest
    if p_s == 'PAIRED':
        manifest['forward-absolute-filepath'] = 0
        manifest['reverse-absolute-filepath'] = 0
        for sample in manifest['id'].tolist():
            sample_path = os.path.join(sub_path, sample)
            reads = os.listdir(sample_path)
            #check for correct number of reads
            if len(reads) != 2:
                raise ValueError(f"Incorrect number of files for {sample}. Should be 2. Got {len(reads)}.")
            #add read paths to manifest
            manifest.loc[manifest['id'] == sample, 'forward-absolute-filepath'] = os.path.join(sample_path, reads[0])
            manifest.loc[manifest['id'] == sample, 'reverse-absolute-filepath'] = os.path.join(sample_path, reads[1])
    elif p_s == 'SINGLE':
        manifest['absolute-filepath'] = 0
        for sample in manifest['id'].tolist():
            sample_path = os.path.join(sub_path, sample)
            reads = os.listdir(sample_path)
            #check for correct number of reads
            if len(reads) != 1:
                raise ValueError(f"incorrect number of files for {sample}. Should be 1. Got {len(reads)}.")
            #add read path to manifest
            manifest.loc[manifest['id'] == sample, 'absolute-filepath'] = os.path.join(sample_path, reads[0])
    else:
        raise ValueError("invalid value for library layout")
        
    #save the manifest
    manifest.to_csv(manifest_path, sep='\t', index=False)

    ## execute qiime2 pipeline ##
    
    print(f"Processing {sub_num}")
       
    # Execute import command
    if p_s == 'PAIRED':
        demux = qiime2.Artifact.import_data(
            'SampleData[PairedEndSequencesWithQuality]', 
            manifest_path, 
            view_type='PairedEndFastqManifestPhred33V2'
        )
    elif p_s == 'SINGLE':
        demux = qiime2.Artifact.import_data(
            'SampleData[SequencesWithQuality]', 
            manifest_path, 
            view_type='SingleEndFastqManifestPhred33V2'
        )

    # Execute denoise command
    denoising_stats_output = os.path.join(summary_o_path, f"{sub_num}-denoise-stats.qza") 

    if p_s == 'PAIRED':
        denoise_results = denoise_paired(
            demultiplexed_seqs=demux,
            trim_left_f=trim_left_f,
            trim_left_r=trim_left_r,
            trunc_len_f=trunc_len_f,
            trunc_len_r=trunc_len_r, 
            n_threads=n_threads
        )
    elif p_s == 'SINGLE':
        denoise_results = denoise_single(
            demultiplexed_seqs=demux,
            trim_left=trim_left_f,
            trunc_len=trunc_len_f, 
            n_threads=n_threads
        )

    denoise_results.denoising_stats.save(denoising_stats_output)
    
    #temporay? Keep for longer?
    denoise_results.representative_sequences.save(os.path.join(summary_o_path, f"{sub_num}-rep_seqs"))
    denoise_results.table.save(os.path.join(summary_o_path, f"{sub_num}-freq_table"))
    denoise_results.denoising_stats.save(denoising_stats_output)
    print(f"{sub_num} denoising complete")

    # Execute summary command
    visualization_output = os.path.join(summary_o_path, f"{sub_num}-summary")
    metadata = qiime2.Metadata.load(manifest_path)

    summary_result = summarize(
        table=denoise_results.table,
        sample_metadata=metadata
    )
    summary_result.visualization.save(visualization_output)
    print(f"{sub_num} summary table complete")

    # Execute classify command
    classifier = qiime2.Artifact.load(classifier_map[region_16s])
    taxonomy_output = os.path.join(summary_o_path, f"{sub_num}-taxonomy")

    taxonomy = classify_sklearn(
        reads=denoise_results.representative_sequences,
        classifier=classifier, 
        n_jobs=-1,
    )
    taxonomy.classification.save(taxonomy_output)
    
    print(f"{sub_num} taxonomy classification complete")
    
    """from qiime2 import Artifact
    current_dir = os.getcwd()
    profile_o_path = os.path.join(current_dir, "profile_o")
    taxonomy_path = os.path.join(profile_o_path, f"{sub_num}-taxonomy.qza")
    temp_path = os.path.join(current_dir, "temp")
    denoise_path = os.path.join(temp_path, "denoise.qza")
    manifest=os.path.join(temp_path, "manifest.tsv")
    taxonomy = Artifact.load(taxonomy_path)
    denoise_results = Artifact.load(denoise_path)"""
    
    #get assignments from BLAST
    
    
    #Generate Relitive Frequencies
    rel_freq = relative_frequency(denoise_results.table)[0] #add .table for denoise results
    print(f"{sub_num} relitive frequency table complete")
    
    #Generate Taxonomy Bar Graph
    barplot_path = os.path.join(summary_o_path, f"{sub_num}-barplot") 
    metadata = Metadata.load(manifest_path)
    
    barplot_vis = barplot(
        table=denoise_results.table, 
        taxonomy=taxonomy.classification, 
        metadata=metadata
    )
    barplot_vis.visualization.save(barplot_path)
    print(f"{sub_num} barplot complete")
    
    #generate final output data
    rel_freq_df = rel_freq.view(pd.DataFrame)
    rel_freq_df = rel_freq_df.transpose()
    taxonomy_df = taxonomy.classification.view(pd.DataFrame)
    
    #display_table(rel_freq_df)
    #display_table(taxonomy_df)
    
    merged_df = pd.merge(rel_freq_df, taxonomy_df, left_index=True, right_index=True, how='inner')
    merged_df = merged_df.drop(columns=['Confidence']) #used to also drop feature ID, will see if it works
    #sum accross multiple OTUs that are the same species
    merged_df = merged_df.groupby('Taxon').sum()
    
    #DEBUG: 
    #display_table(merged_df)
    
    
    #print("\n========== fel_freq_df ==========\n")
    #print(set(rel_freq_df.index))
    #print("\n========== taxon_df ==========\n")
    #print(set(taxonomy_df['Feature ID']))
    #print("\n========== join ==========\n")
    #print(set(rel_freq_df.index) & set(taxonomy_df['Feature ID']))
    
    #save final output data
    profile_output = os.path.join(profile_o_path, f"{sub_num}-profile.csv")
    merged_df.to_csv(profile_output, index=True)
    print(f"==================== {sub_num} processing complete ====================")


#TODO: double check sequence assignments with blast
def display_table(table):
    display(table.head())
    num_rows, num_cols = table.shape
    print("Number of rows:", num_rows)
    print("Number of columns:", num_cols)
    


In [2]:
#Run Pipeline
#generate file paths
current_dir = os.getcwd()
classifier_folder = os.path.join(current_dir, "classifier")
master_manifest_path = os.path.join(current_dir, "DataFrame_2_Pruned.tsv")
profile_o_path = os.path.join(current_dir, "profile_o")
summary_o_path = os.path.join(current_dir, "summary_o")
gg_nb = os.path.join(classifier_folder, "gg-13-8-99-nb-classifier.qza") 

#only run the pipeline on unprocessed reads
sub_nums = set(os.listdir(os.path.join(current_dir, "sorted_reads")))
completed = set(os.listdir(profile_o_path))
completed = [x[:-len("-profile.csv")] for x in completed]
sub_nums.difference_update(completed)


classifier_map = {
    "V4":gg_nb, 
    "V1-V3":gg_nb, 
    "V1-V2":gg_nb, 
    "V3-V4":gg_nb, 
    "V1-V2 and v4":gg_nb, 
    "V4-V5":gg_nb, 
    "V1-V4":gg_nb
}

#BUG: wrong set being used to join taxon df and freq df

print(f"Processing submissions {sub_nums}\n")
for sub_num in sub_nums:
    sub_path = os.path.join(current_dir, "sorted_reads", sub_num)
    try:
        qiime2_profile(sub_num, sub_path, master_manifest_path, profile_o_path, summary_o_path, classifier_map, trim_left_f=0, trim_left_r=0, trunc_len_f=0, trunc_len_r=0, )
    except Exception as e:
        print(f"Error processing {sub_num}: {e}")
        print(f"==================== {sub_num} processing failed ====================")

Processing submissions {'ERA1111396', 'ERA292152', 'SRA578693', 'SRA1038019', 'SRA298410', 'SRA860003', 'SRA575681', 'SRA996410'}

Processing ERA1111396
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_single.R /tmp/qiime2-archive-fz0pn7ls/2d82f39b-0744-4caa-8965-e42a868b27f3/data /tmp/tmp6_w64ewo/output.tsv.biom /tmp/tmp6_w64ewo/track.tsv /tmp/tmp6_w64ewo 0 0 2.0 2 Inf independent consensus 1.0 8 1000000 NULL 16

ERA1111396 denoising complete


  os.path.join(output_dir, 'sample-frequency-detail.csv'))
  os.path.join(output_dir, 'feature-frequency-detail.csv'))


ERA1111396 summary table complete
ERA1111396 taxonomy classification complete
ERA1111396 relitive frequency table complete
ERA1111396 barplot complete


Unnamed: 0_level_0,ERR2162452,ERR2162453,ERR2162454,ERR2162455,ERR2162456,ERR2162457,ERR2162458,ERR2162459,ERR2162460,ERR2162461,...,ERR2163209,ERR2163210,ERR2163211,ERR2163212,ERR2163213,ERR2163214,ERR2163215,ERR2163216,ERR2163217,ERR2163218
Taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unassigned,0.0,0.000261,0.052964,0.0,0.0,0.0,0.0,0.001969,0.000192,0.0,...,0.0,0.0,0.029153,0.000133,8.1e-05,3.4e-05,0.0,0.0,0.0,0.0
k__Archaea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Archaea; p__Crenarchaeota; c__Thaumarchaeota; o__Cenarchaeales; f__Cenarchaeaceae; g__Nitrosopumilus; s__,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Archaea; p__Crenarchaeota; c__Thaumarchaeota; o__Nitrososphaerales; f__Nitrososphaeraceae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Archaea; p__Crenarchaeota; c__Thaumarchaeota; o__Nitrososphaerales; f__Nitrososphaeraceae; g__Candidatus Nitrososphaera; s__,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Number of rows: 2181
Number of columns: 761
Processing ERA292152
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_single.R /tmp/qiime2-archive-3ffhmlxu/b1dea680-b371-4f96-9a85-3c2f07afbf8f/data /tmp/tmpcgfdg1sd/output.tsv.biom /tmp/tmpcgfdg1sd/track.tsv /tmp/tmpcgfdg1sd 0 0 2.0 2 Inf independent consensus 1.0 8 1000000 NULL 16

Error processing ERA292152: An error was encountered while running DADA2 in R (return code 1), please inspect stdout and stderr to learn more.
Error processing SRA578693: Pruned manifest is empty. Submission number may not be in manifest or reads may be improperly named.
Processing SRA1038019
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run a

  os.path.join(output_dir, 'sample-frequency-detail.csv'))
  os.path.join(output_dir, 'feature-frequency-detail.csv'))


SRA1038019 summary table complete
SRA1038019 taxonomy classification complete
SRA1038019 relitive frequency table complete
SRA1038019 barplot complete


Unnamed: 0_level_0,SRR11027332,SRR11027333,SRR11027334,SRR11027335,SRR11027336,SRR11027337,SRR11027338,SRR11027339,SRR11027340,SRR11027341,...,SRR11027344,SRR11027345,SRR11027346,SRR11027347,SRR11027348,SRR11027349,SRR11027350,SRR11027351,SRR11027352,SRR11027353
Taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
k__Bacteria,0.0,0.0,0.0,0.0,0.901408,0.34127,0.0,0.0,0.0,0.0,...,0.111842,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0
k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Porphyromonadaceae; g__Porphyromonas; s__,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.412214,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.916667,1.0,0.0,0.0,0.0,0.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; g__Clostridium; s__,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156489,...,0.0,0.0,0.425,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__[Tissierellaceae]; g__Anaerococcus; s__,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.526316,0.0,0.148855,...,0.888158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Number of rows: 15
Number of columns: 22
Processing SRA298410
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_paired.R /tmp/tmpscm0387u/forward /tmp/tmpscm0387u/reverse /tmp/tmpscm0387u/output.tsv.biom /tmp/tmpscm0387u/track.tsv /tmp/tmpscm0387u/filt_f /tmp/tmpscm0387u/filt_r 0 0 0 0 2.0 2.0 2 independent consensus 1.0 8 1000000

Error processing SRA298410: An error was encountered while running DADA2 in R (return code -11), please inspect stdout and stderr to learn more.
Processing SRA860003
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_paired.R /tmp/tmprx5v08qh/forward /tmp/tmprx5v

  os.path.join(output_dir, 'sample-frequency-detail.csv'))
  os.path.join(output_dir, 'feature-frequency-detail.csv'))


SRA860003 summary table complete
SRA860003 taxonomy classification complete
SRA860003 relitive frequency table complete
SRA860003 barplot complete


Unnamed: 0_level_0,SRR8728265
Taxon,Unnamed: 1_level_1
k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Corynebacteriaceae; g__Corynebacterium; s__,0.082248
k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Dermabacteraceae; g__Brachybacterium,0.020562
k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Dietziaceae; g__Dietzia; s__,0.002443
k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Micrococcaceae,0.00285
k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Micrococcaceae; g__Kocuria; s__palustris,0.036849


Number of rows: 49
Number of columns: 1
Error processing SRA575681: Pruned manifest is empty. Submission number may not be in manifest or reads may be improperly named.
Processing SRA996410
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_single.R /tmp/qiime2-archive-uoy8c3cn/d6fc8df8-2285-46bb-a7e6-d4210da98c38/data /tmp/tmp9eye8jzr/output.tsv.biom /tmp/tmp9eye8jzr/track.tsv /tmp/tmp9eye8jzr 0 0 2.0 2 Inf independent consensus 1.0 8 1000000 NULL 16

SRA996410 denoising complete


  os.path.join(output_dir, 'sample-frequency-detail.csv'))
  os.path.join(output_dir, 'feature-frequency-detail.csv'))


SRA996410 summary table complete
SRA996410 taxonomy classification complete
SRA996410 relitive frequency table complete
SRA996410 barplot complete


Unnamed: 0_level_0,SRR10447626,SRR10447627,SRR10447628,SRR10447629,SRR10447630,SRR10447631,SRR10447632,SRR10447633,SRR10447634,SRR10447635,...,SRR10447734,SRR10447735,SRR10447736,SRR10447737,SRR10447738,SRR10447739,SRR10447740,SRR10447741,SRR10447742,SRR10447743
Taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
k__Archaea; p__Crenarchaeota; c__Thaumarchaeota; o__Nitrososphaerales; f__Nitrososphaeraceae; g__Candidatus Nitrososphaera; s__SCA1170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Bacteria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006458,0.0,0.003238,0.0,0.0,0.0,0.0,0.0
k__Bacteria; p__; c__; o__; f__; g__; s__,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Bacteria; p__Acidobacteria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
k__Bacteria; p__Acidobacteria; c__Acidobacteria-5; o__; f__; g__; s__,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Number of rows: 994
Number of columns: 118


In [None]:
#test Pipeline

#generate file paths
current_dir = os.getcwd()
sub_num = "ERA1522711"
classifier_folder = os.path.join(current_dir, "classifier")
sub_path = os.path.join(current_dir, f"sorted_reads/{sub_num}")
master_manifest_path = os.path.join(current_dir, "DataFrame_2_Pruned.tsv")
profile_o_path = os.path.join(current_dir, "profile_o")
summary_o_path = os.path.join(current_dir, "summary_o")

#classifiers
silva_nb = os.path.join(classifier_folder, "silva-138-99-nb-classifier.qza") #warning: species are unreliable
gg_nb = os.path.join(classifier_folder, "gg-13-8-99-nb-classifier.qza") 

classifier_map = {
    "V4":gg_nb, 
    "V1-V3":gg_nb, 
    "V1-V2":gg_nb, 
    "V3-V4":gg_nb, 
    "V1-V2 and v4":gg_nb, 
    "V4-V5":gg_nb, 
    "V1-V4":gg_nb
}

qiime2_profile(sub_num, sub_path, master_manifest_path, profile_o_path, summary_o_path, classifier_map)

In [None]:
#no trim test
#generate bar plot graph
from qiime2 import Visualization
from qiime2 import Artifact
from qiime2.plugins import demux, dada2, feature_classifier, feature_table
from qiime2 import Metadata
import os
from qiime2.plugins.taxa.visualizers import barplot

current_dir = os.getcwd()
sub_num = "ERA1777984"
classifier_folder = os.path.join(current_dir, "classifier")
sub_path = os.path.join(current_dir, f"sorted_reads/{sub_num}")
master_manifest_path = os.path.join(current_dir, "DataFrame_2_Pruned.tsv")
profile_o_path = os.path.join(current_dir, "profile_o")
summary_o_path = os.path.join(current_dir, "summary_o")

#classifiers
silva_nb = os.path.join(classifier_folder, "silva-138-99-nb-classifier.qza") #warning: species are unreliable
gg_nb = os.path.join(classifier_folder, "gg-13-8-99-nb-classifier.qza") 

classifier_map = {
    "V4":gg_nb, 
    "V1-V3":gg_nb, 
    "V1-V2":gg_nb, 
    "V3-V4":gg_nb, 
    "V1-V2 and v4":gg_nb, 
    "V4-V5":gg_nb, 
    "V1-V4":gg_nb
}

qiime2_profile(sub_num, sub_path, master_manifest_path, profile_o_path, summary_o_path, classifier_map, trim_left_f=0, trim_left_r=0, trunc_len_f=0, trunc_len_r=0)


barplot_path = os.path.join(summary_o_path, sub_num, f"{sub_num}-barplot.qza")
barplot_vis = qiime2.Visualization.load('path_to_barplot.qzv')
barplot_vis

In [4]:
#generate bar plot graph
from qiime2 import Visualization
from qiime2 import Artifact
from qiime2.plugins import demux, dada2, feature_classifier, feature_table
from qiime2 import Metadata
import os
from qiime2.plugins.taxa.visualizers import barplot


sub_num = "ERA1522711"
current_dir = os.getcwd()
profile_o_path = os.path.join(current_dir, "profile_o")
taxonomy_path = os.path.join(profile_o_path, f"{sub_num}-taxonomy.qza")
temp_path = os.path.join(current_dir, "temp")
denoise_path = os.path.join(temp_path, "denoise.qza")
manifest=os.path.join(temp_path, "manifest.tsv")


metadata = Metadata.load(manifest)

taxonomy_qza = Artifact.load(taxonomy_path)

denoise_qza = Artifact.load(denoise_path)

# Calculate the relative frequency of features
relative_frequency_result = feature_table.methods.relative_frequency(denoise_qza)


# Generate the barplot visualization
barplot_viz = barplot(table=denoise_qza, taxonomy=taxonomy_qza, metadata=metadata)

# Visualize the barplot
barplot_viz.visualization



In [1]:
#test conversion to machine learning data
from qiime2 import Visualization
from qiime2 import Artifact
from qiime2.plugins import demux, dada2, feature_classifier, feature_table
from qiime2 import Metadata
import os
import pandas as pd
from qiime2.plugins.taxa.visualizers import barplot
from Bio.Blast import NCBIWWW
from Bio import SeqIO

sub_num = "ERA1522711"
current_dir = os.getcwd()
profile_o_path = os.path.join(current_dir, "profile_o")
summary_o_path = os.path.join(current_dir, "summary_o", sub_num)
taxonomy_path = os.path.join(profile_o_path, f"{sub_num}-taxonomy.qza")
temp_path = os.path.join(current_dir, "temp")
#denoise_path = os.path.join(temp_path, "denoise.qza")
manifest=os.path.join(temp_path, "manifest.tsv")
seq_path = os.path.join(summary_o_path, f"{sub_num}-rep_seqs.qza")
denoise_path = os.path.join(summary_o_path, f"{sub_num}-freq_table.qza")
profile_o_path = os.path.join(current_dir, "profile_o")
    

#assign new file paths
#denoise_biom_path = os.path.join(temp_path, f"{sub_num}-denoise.biom")
#taxonomy_tsv_path = os.path.join(temp_path, f"{sub_num}-taxonomy.tsv")

#re-format old files
denoise_qza = Artifact.load(denoise_path)
seq_qza = Artifact.load(seq_path)
taxonomy_qza = Artifact.load(taxonomy_path)

#get realitive frequencies
rel_freq_qza = feature_table.methods.relative_frequency(denoise_qza)[0]
#print(rel_freq_qza[0])
#rel_freq_qza.save(os.path.join(temp_path, "relative_frequency.qza"))
#denoise_qza.save(denoise_biom_path)
#taxonomy_qza.save(taxonomy_tsv_path)

rel_freq_df = rel_freq_qza.view(pd.DataFrame)
rel_freq_df = rel_freq_df.transpose()
taxonomy_df = taxonomy_qza.view(pd.DataFrame)
taxonomy_df.index=taxonomy_df['Feature ID']

#denoise_df = pd.read_csv(denoise_biom_path, sep='\t', index_col=0, skiprows=[1])
#taxonomy_df = pd.read_csv(taxonomy_tsv_path, sep='\t', index_col=0)


# Subset the DataFrames to only include common indices
#denoise_df = denoise_df.loc[common_indices]
#taxonomy_df = taxonomy_df.loc[common_indices]

merged_df = rel_freq_df.merge(taxonomy_df['Taxon'], left_index=True, right_index=True)
merged_df.set_index('Taxon', inplace=True)
merged_df = merged_df.groupby('Taxon').sum()

display(rel_freq_df.head())

num_rows, num_cols = rel_freq_df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

display(taxonomy_df.head())

num_rows, num_cols = taxonomy_df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

display(merged_df.head())

num_rows, num_cols = merged_df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

merged_df.to_csv(os.path.join(profile_o_path, f"{sub_num}-profile.csv"), index=False)

"""# Iterate over samples and write to CSV
for sample in feature_table_df.columns:
    sample_df = merged_df[[sample, 'Taxon']]
    sample_df.columns = ['Relative Frequency', 'Bacteria']
    sample_df = sample_df[['Bacteria', 'Relative Frequency']]
    sample_df.to_csv(os.path.join(output_dir, f'{sample}.csv'), index=False)"""

def displaylay_file(denoise, temp_path, sub_num):
    # Convert the denoised sequences to a pandas DataFrame
    sequences_df = denoise.view(pd.DataFrame)

    display(sequences_df.head())

    num_rows, num_cols = sequences_df.shape

    print("Number of rows:", num_rows)
    print("Number of columns:", num_cols)
    
def blast(denoise, temp_path, sub_num, blast_program="blastp", database="nr", num_alignments=10, evalue=0.01):
    sequences_df = denoise.view(pd.DataFrame)
    
    fasta_path = os.path.joinoin(temp_path, f'{sub_num}_output_sequences.fasta')
    
    with open(fasta_path, 'w') as fasta_file:
        for index, row in sequences_df.iterrows():
            fasta_file.write(f'>{index}\n{row[0]}\n')
    
    # Read the FASTA file
    query_sequence = SeqIO.read(fasta_path, "fasta")

    # Perform BLAST search
    result_handle = NCBIWWW.qblast(blast_program, database, query_sequence.seq, expect=evalue, hitlist_size=num_alignments)
    
    #convert blast results into a pandas dataframe
    rows = []
    for record in blast_records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                rows.append({
                    "Query ID": record.query,
                    "Hit ID": alignment.title,
                    "Length": alignment.length,
                    "E-value": hsp.expect,
                    "Score": hsp.score,
                    "Query Seq": hsp.query,
                    "Hit Seq": hsp.sbjct
                })
    return pd.DataFrame(rows)
    
displaylay_file(seq_qza)
blast_resultslts = blast(seq_qza, temp_path, sub_num)
displaylay_file(blast_results)



ValueError: /home/miloscola/augene/aggit2/SKINOME_training_data/profile_o/ERA1522711-taxonomy.qza does not exist.

In [None]:
#Visuialize data with pandas (quick and dirty)

import qiime2
import pandas as pd
import os
from qiime2.plugins import demux, dada2, feature_classifier, feature_table

current_dir = os.getcwd()

# Define path to taxonomy.qza file
taxonomy_qza_path = os.path.join(current_dir, "profile_o/ERA1522711-taxonomy.qza")

# Load the taxonomy.qza file
taxonomy_artifact = qiime2.Artifact.load(taxonomy_qza_path)

# Extract the data from the artifact
taxonomy_df = taxonomy_artifact.view(pd.DataFrame)

# Display the taxonomy dataframe
print(taxonomy_df)

In [None]:
    #OLD CODE PROB WONT NEED
  

    #execute import command
    if p_s == 'PAIRED':
        demux_output = os.path.join(temp_path, "paired-end-demux.qza")
        import_command = f"""qiime tools import --type SampleData[PairedEndSequencesWithQuality] \
            --input-path {manifest_path} \
            --output-path {demux_output} \
            --input-format PairedEndFastqManifestPhred33V2""" 
    elif p_s == 'SINGLE':
        demux_output = os.path.join(temp_path, "single-end-demux.qza")
        import_command = f"""qiime tools import --type SampleData[SequencesWithQuality] \
            --input-path {manifest_path} \
            --output-path {demux_output} \
            --input-format SingleEndFastqManifestPhred33V2"""
        
    #run_command(import_command)
    
    #execute denoise command
    rep_seqs_dada2_output = os.path.join(temp_path, "rep-seqs-dada2.qza")
    table_dada2_output = os.path.join(temp_path, "table-dada2.qza")
    denoising_stats_output = os.path.join(summary_o_path, f"{sub_num}-stats-dada2.qza") 
    
    if p_s == 'PAIRED':
        denoise_command = f"""qiime dada2 denoise-paired \
        --i-demultiplexed-seqs {demux_output} \
        --p-trim-left-f {trim_left_f} \
        --p-trim-left-r {trim_left_r} \
        --p-trunc-len-f {trunc_len_f} \
        --p-trunc-len-r {trunc_len_r} \
        --o-representative-sequences {rep_seqs_dada2_output} \
        --o-table {table_dada2_output} \
        --o-denoising-stats {denoising_stats_output}"""
    elif p_s == 'SINGLE':
        denoise_command = f"""qiime dada2 denoise-paired \
        --i-demultiplexed-seqs {demux_output} \
        --p-trim-left {trim_left_f} \
        --p-trunc-len {trunc_len_f} \
        --o-representative-sequences {rep_seqs_dada2_output} \
        --o-table {table_dada2_output} \
        --o-denoising-stats {denoising_stats_output}"""
    
    #run_command(denoise_command)
    
    #execute summary command
    visuialization_output = os.path.join(summary_o_path, f"{sub_num}-table-dada2.qzv")
    
    summerize_command = f"""qiime feature-table summarize \
    --i-table {table_dada2_output} \
    --o-visualization {visuialization_output} \
    --m-sample-metadata-file {manifest_path}"""
    
    #run_command(summerize_command)
    
    #execute classify command
    classifier = classifier_map[region_16s]
    taxonomy_output = os.path.join(profile_o_path, f"{sub_num}-taxonomy.qza")
    
    classify_command = f"""qiime feature-classifier classify-sklearn \
        --i-classifier {classifier} \
        --i-reads {rep_seqs_dada2_output} \
        --o-classification {taxonomy_output}"""
    
    run_command(classify_command) 
    
    
def run_command(command):
    """
    runs a bash command

    Parameters
    ----------
    command : str
        string of the command to execute

    Returns
    -------
    none
    """
    print("__________________________________________________________")
    pretty_command = re.sub('--', '\n--', command)
    print(f"Running: {pretty_command}")
    try:
        result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
        print(f"\n{result.stdout}")
        if result.stderr:
            print(f"\n{result.stderr}")
    except subprocess.CalledProcessError as e:
        print(f"\nCommand failed with exit code {e.returncode}")
        print(f"\nstdout: {e.stdout}")
        print(f"\nstderr: {e.stderr}")
        raise

In [None]:
#PYTHON TRANSLATION OF THE ABOVE CODE:
# Execute import command
if p_s == 'PAIRED':
    demux_output = os.path.join(temp_path, "paired-end-demux.qza")
    demux = qiime2.Artifact.import_data('SampleData[PairedEndSequencesWithQuality]', manifest_path, view_type='PairedEndFastqManifestPhred33V2')
elif p_s == 'SINGLE':
    demux_output = os.path.join(temp_path, "single-end-demux.qza")
    demux = qiime2.Artifact.import_data('SampleData[SequencesWithQuality]', manifest_path, view_type='SingleEndFastqManifestPhred33V2')
    
demux.save(demux_output)

# Execute denoise command
rep_seqs_dada2_output = os.path.join(temp_path, "rep-seqs-dada2.qza")
table_dada2_output = os.path.join(temp_path, "table-dada2.qza")
denoising_stats_output = os.path.join(summary_o_path, f"{sub_num}-stats-dada2.qza") 

if p_s == 'PAIRED':
    denoise_results = denoise_paired(
        demultiplexed_seqs=demux,
        trim_left_f=trim_left_f,
        trim_left_r=trim_left_r,
        trunc_len_f=trunc_len_f,
        trunc_len_r=trunc_len_r
    )
elif p_s == 'SINGLE':
    denoise_results = denoise_single(
        demultiplexed_seqs=demux,
        trim_left=trim_left_f,
        trunc_len=trunc_len_f
    )

denoise_results.representative_sequences.save(rep_seqs_dada2_output)
denoise_results.table.save(table_dada2_output)
denoise_results.denoising_stats.save(denoising_stats_output)

# Execute summary command
visualization_output = os.path.join(summary_o_path, f"{sub_num}-table-dada2.qzv")
metadata = qiime2.Metadata.load(manifest_path)

summary_result = summarize(
    table=denoise_results.table,
    sample_metadata=metadata
)
summary_result.visualization.save(visualization_output)

# Execute classify command
classifier = qiime2.Artifact.load(classifier_map[region_16s])
taxonomy_output = os.path.join(profile_o_path, f"{sub_num}-taxonomy.qza")

classification_result = classify_sklearn(
    reads=denoise_results.representative_sequences,
    classifier=classifier
)
classification_result.classification.save(taxonomy_output)

Licenzing and Citations:
QIIME2 overall:


QIIME2 pluging:


classifiers:

Silva:
Michael S Robeson II, Devon R O’Rourke, Benjamin D Kaehler, Michal Ziemski, Matthew R Dillon, Jeffrey T Foster, Nicholas A Bokulich. RESCRIPt: Reproducible sequence taxonomy reference database management for the masses. bioRxiv 2020.10.05.326504; doi: https://doi.org/10.1101/2020.10.05.326504

See the SILVA website for the latest citation information for SILVA. https://www.arb-silva.de/

GTDB:
UUID: 5d5461cc-6a51-434b-90ab-040f388e4221
SHA256: 07aadcf7472d9cc6f853f6b4615348619f1a3eceb56c1fb1b6d8dbb20554765f
Sklearn Version: 1.4.2
Date Trained: 2024-05-30
Citations: Parks et al. (2021), Parks et al. (2020), Parks et al. (2018), Rinke et al. (2021)

In [6]:
#Remove old manifest and generate new manifest #OUTDATED

import os
import shutil
import pandas as pd

#TODO: make manifest for unpaired

# Define paths
current_dir = os.getcwd()
paired_unproc = os.path.join(current_dir, 'paired_unproc')
unpaired_unproc = os.path.join(current_dir, 'unpaired_unproc')
paired_master_manifest = os.path.join(current_dir, 'DataFrame_2_Pruned_paired.tsv')
unpaired_master_manifest = os.path.join(current_dir, 'DataFrame_2_Pruned_paired.tsv')

# Remove old manifest files if they exist
paired_manifest = os.path.join(paired_unproc, 'manifest.tsv')
unpaired_manifest = os.path.join(unpaired_unproc, 'manifest.tsv')

if os.path.exists(paired_manifest):
    os.remove(paired_manifest)

if os.path.exists(unpaired_manifest):
    os.remove(unpaired_manifest)

### Generate manifest in paired folder ####
samples = [f for f in os.listdir(paired_unproc) if os.path.isdir(os.path.join(paired_unproc, f))]

# Read the manifest file
df = pd.read_csv(paired_master_manifest, sep='\t', encoding='latin1')
df.rename(columns={'Run': 'id'}, inplace=True)


# Update the paired manifest
for sample in samples:
    sample_path = os.path.join(paired_unproc, sample)
    reads = os.listdir(sample_path)
    
    print(sample)
    
    df.loc[df['id'] == sample, 'forward-absolute-filepath'] = os.path.join(sample_path, reads[0])
    df.loc[df['id'] == sample, 'reverse-absolute-filepath'] = os.path.join(sample_path, reads[1])

print(df.head)

#remove all rows of df without sample
df = df[df['forward-absolute-filepath'] != 'na']

print(df.head)

# Save the modified TSV
df.to_csv(paired_manifest, sep='\t', index=False)
    
# You can add similar code to handle the unpaired manifest if needed
# For example:
# shutil.copy(os.path.join(current_dir, 'DataFrame_2_Pruned_unpaired.tsv'), unpaired_manifest)
# for sample in unpaired_samples:
#     sample_path = os.path.join(unpaired_unproc, sample)
#     update_manifest(unpaired_manifest, sample_path, sample)

ERR1351807
<bound method NDFrame.head of               id  Year_of_release     Release_Date       Load_Date Submission  \
0     ERR1351807             2017  8/21/2017 23:46  8/23/2017 3:56  ERA600241   
1     ERR1351808             2017  8/21/2017 23:46  8/23/2017 3:57  ERA600241   
2     ERR1351809             2017  8/21/2017 23:46  8/23/2017 3:57  ERA600241   
3     ERR1351810             2017  8/21/2017 23:46  8/23/2017 3:57  ERA600241   
4     ERR1351811             2017  8/21/2017 23:46  8/23/2017 3:57  ERA600241   
...          ...              ...              ...             ...        ...   
5941  SRR9660388             2020    8/8/2020 0:00  7/9/2019 14:28  SRA919014   
5942  SRR9660389             2020    8/8/2020 0:00  7/9/2019 14:28  SRA919014   
5943  SRR9660390             2020    8/8/2020 0:00  7/9/2019 14:28  SRA919014   
5944  SRR9660391             2020    8/8/2020 0:00  7/9/2019 14:29  SRA919014   
5945  SRR9660392             2020    8/8/2020 0:00  7/9/2019 14:29  

In [7]:
%%bash
#run qiime2

#different sequencing areas
#V4
#V1-V3
#V1-V2
#V3-V4
#V1-V2 and v4
#V4-V5
#V1-V4

current_dir=$(pwd)
paired_unproc="$current_dir/paired_unproc"
paired_proc="$current_dir/paired_proc"

qiime tools import \
    --type 'SampleData[PairedEndSequencesWithQuality]' \
    --input-path "$paired_unproc/manifest.tsv" \
    --output-path "$current_dir/qza_files/paired-end-demux.qza" \
    --input-format PairedEndFastqManifestPhred33V2 

mv $paired_unproc/manifest.tsv $current_dir/qza_files/sample-metadata.tsv

#mv $paired_unproc/* $paired_proc
        
echo "initial processing complete"
        
qiime dada2 denoise-paired \
    --i-demultiplexed-seqs qza_files/paired-end-demux.qza \
    --p-trim-left-f 13 \
    --p-trim-left-r 13 \
    --p-trunc-len-f 150 \
    --p-trunc-len-r 150 \
    --o-representative-sequences "$current_dir/qza_files/rep-seqs-dada2.qza" \
    --o-table "$current_dir/qza_files/table-dada2.qza" \
    --o-denoising-stats "$current_dir/qza_files/stats-dada2.qza"
        
echo "denoising complete"
        
qiime feature-table summarize \
    --i-table "$current_dir/qza_files/table-dada2.qza" \
    --o-visualization "$current_dir/qza_files/table-dada2.qzv" \
    --m-sample-metadata-file "$current_dir/qza_files/sample-metadata.tsv"
        
echo "feature table complete"

#FIX COMMAND INPUTS
qiime feature-classifier classify-sklearn \
    --i-classifier "$current_dir/classifier/gg-13-8-99-515-806-nb-classifier.qza" \
    --i-reads "$current_dir/qza_files/rep-seqs-dada2.qza" \
    --o-classification "$current_dir/qza_files/taxonomy.qza"
    
echo "classiication complete"
    
#FIX COMMAND INPUTS
qiime metadata tabulate \
    --m-input-file "$current_dir/qza_files/taxonomy.qza" \
    --o-visualization "$current_dir/qza_files/taxonomy.qzv"
    
echo "meta data tabulation complete"
        
qiime taxa barplot \
    --i-table "$current_dir/qza_files/table-dada2.qza" \
    --i-taxonomy "$current_dir/qza_files/taxonomy.qza" \
    --m-metadata-file "$current_dir/qza_files/sample-metadata.tsv" \
    --o-visualization "$current_dir/qza_files/taxa-bar-plots.qzv"
        
echo "bar plot complete"

rm $current_dir/qza_files/sample-metadata.tsv

Imported /home/miloscola/augene/training_data/paired_unproc/manifest.tsv as PairedEndFastqManifestPhred33V2 to /home/miloscola/augene/training_data/qza_files/paired-end-demux.qza
initial processing complete
Saved FeatureTable[Frequency] to: /home/miloscola/augene/training_data/qza_files/table-dada2.qza
Saved FeatureData[Sequence] to: /home/miloscola/augene/training_data/qza_files/rep-seqs-dada2.qza
Saved SampleData[DADA2Stats] to: /home/miloscola/augene/training_data/qza_files/stats-dada2.qza
denoising complete
Saved Visualization to: /home/miloscola/augene/training_data/qza_files/table-dada2.qzv
feature table complete
bar plot complete


Usage: qiime taxa barplot [OPTIONS]

  This visualizer produces an interactive barplot visualization of
  taxonomies. Interactive features include multi-level sorting, plot
  recoloring, sample relabeling, and SVG figure export.

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency]
                         Feature table to visualize at various taxonomic
                         levels.                                    [required]
  --i-taxonomy ARTIFACT FeatureData[Taxonomy]
                         Taxonomic annotations for features in the provided
                         feature table. All features in the feature table must
                         have a corresponding taxonomic annotation. Taxonomic
                         annotations that are not present in the feature table
                         will be ignored.                           [required]
Parameters:
  --m-metadata-file METADATA...
    (multiple            The sample metadata.
     arguments will be   
     merged)

In [16]:
#SORT RAW DATA INTO SAMPLE FILES #OUTDATED
import os
import shutil
import glob

# Enable extended pattern matching
# (In Python, glob.glob() handles pattern matching)

current_dir = os.getcwd()
seq_reads = os.path.join(current_dir, "seq_reads")

# TEST WITHOUT FOR LOOP
files = sorted(glob.glob(os.path.join(seq_reads, "*"))[:2])
print(files)

# Check if files list is not empty and contains expected files
if len(files) >= 2 and files[0].endswith("_1.fastq.gz") and files[1].endswith("_2.fastq.gz"):
    print("IF CONDITION TRIGGERED")
    # Move both files to the processing directory
    filename = os.path.basename(files[0])
    print(filename)
    sample_id = filename[:-11]  # Remove '_1.fastq.gz' from the filename

    paired_unproc_dir = os.path.join(current_dir, "paired_unproc", sample_id)
    os.makedirs(paired_unproc_dir)
    shutil.move(files[0], paired_unproc_dir)
    shutil.move(files[1], paired_unproc_dir)

else:
    # Move only the top file to the processing directory
    if len(files) > 0:
        filename = os.path.basename(files[0])
        sample_id = filename[:-11]  # Remove '_1.fastq.gz' from the filename

        unpaired_unproc_dir = os.path.join(current_dir, "unpaired_unproc", sample_id)
        os.makedirs(unpaired_unproc_dir)
        shutil.move(files[0], unpaired_unproc_dir)

['/home/miloscola/augene/aggit2/SKINOME_training_data/seq_reads/ERR1514335_1.fastq.gz', '/home/miloscola/augene/aggit2/SKINOME_training_data/seq_reads/SRR5869529_1.fastq.gz']


In [12]:
%%bash
#NO FOR LOOP TEST #OUTDATED
#Sort Raw Data

# Enable extended pattern matching
shopt -s extglob

current_dir=$(pwd)
seq_reads="seq_reads"
seq_reads="$current_dir/$seq_reads"


#TEST WITHOUT FOR LOOP
files=($(ls -1 "$seq_reads" | head -n 2 | sort))
    
#echo "Moved ${files[0]} and ${files[1]} to processing directory."

if [[ "${files[0]}" == *_1.fastq.gz && "${files[1]}" == *_2.fastq.gz ]]; then
    # Move both files to the processing directory
    filename=${files[0]}
    sample_id="${filename%_1.fastq.gz}"
    mkdir "$current_dir/paired_unproc/$sample_id" 
    mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/paired_unproc/$sample_id"

else
    # Move only the top file to the processing directory
    filename=${files[0]}
    sample_id="${filename%_1.fastq.gz}"
    mkdir "$current_dir/unpaired_unproc/$sample_id" 
    mv "$seq_reads/${files[0]}" "$current_dir/unpaired_unproc/$sample_id"
fi

In [3]:
%%bash
#Sort Raw Data #REPLACE WITH CODE ABOVE WHEN DONE TESTING #OUTDATED

# Enable extended pattern matching
shopt -s extglob

current_dir=$(pwd)
seq_reads="seq_reads"
seq_reads="$current_dir/$seq_reads"

# Enter the loop
while [ "$(ls -A $seq_reads)" ]; do
    files=($(ls -1 "$seq_reads" | head -n 2 | sort))
    
    #echo "Moved ${files[0]} and ${files[1]} to processing directory."

    if [[ "${files[0]}" == *_1.fastq.gz && "${files[1]}" == *_2.fastq.gz ]]; then
        # Move both files to the processing directory
        filename=${files[0]}
        sample_id="${filename%_1.fastq.gz}"
        mkdir "$seq_reads/$sample_id" 
        mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/paired_unproc/$sample_id"

    else
        # Move only the top file to the processing directory
        mv "$seq_reads/${files[0]}" "$current_dir/unpaired_unproc/"
    fi
done

Process is interrupted.


In [5]:
%%bash
#Remove old manifest and Generate new Manifests
#OUTDATED
shopt -s extglob

current_dir=$(pwd)
paired_unproc="$current_dir/paired_unproc"
unpaired_unproc="$current_dir/unpaired_unproc"

# Enable extended pattern matching
rm "${paired_unproc}/manifest.tsv"
rm "${unpaired_unproc}/manifest.tsv"

#generate manifest in paired folder

# Get the list of folder names in the specified directory and store in an array
samples=($(find "$paired_unproc" -maxdepth 1 -type d -exec basename {} \;))

#generate new manifest
cp "${current_dir}/DataFrame_2_Pruned_paired.tsv" "${paired_unproc}/manifest.tsv"

for sample in "${samples[@]}"; do
    sample_path="${paired_unproc}/${sample}"
    reads=($(ls -1 "$sample_path"))
    
    awk -v row="$row_index" -v col="$column_index" -v val="$new_value" -F'\t' 'BEGIN {OFS=FS} { if (NR == row) $col = val; print }' "$tsv_file" > modified.tsv
done

#generate manifest in unpaired folder



paired_unproc
ERR1351807


rm: cannot remove '/home/miloscola/augene/training_data/paired_unproc/manifest.tsv': No such file or directory
rm: cannot remove '/home/miloscola/augene/training_data/unpaired_unproc/manifest.tsv': No such file or directory


In [2]:
%%bash
#import qiime tools
qiime tools import --show-importable-formats

#qiime tools import --show-importable-types

AlignedDNAFASTAFormat
AlignedDNASequencesDirectoryFormat
AlphaDiversityDirectoryFormat
AlphaDiversityFormat
BIOMV100DirFmt
BIOMV100Format
BIOMV210DirFmt
BIOMV210Format
BooleanSeriesDirectoryFormat
BooleanSeriesFormat
Bowtie2IndexDirFmt
CasavaOneEightLanelessPerSampleDirFmt
CasavaOneEightSingleLanePerSampleDirFmt
DADA2StatsDirFmt
DADA2StatsFormat
DNAFASTAFormat
DNASequencesDirectoryFormat
DeblurStatsDirFmt
DeblurStatsFmt
DifferentialDirectoryFormat
DifferentialFormat
DistanceMatrixDirectoryFormat
EMPPairedEndCasavaDirFmt
EMPPairedEndDirFmt
EMPSingleEndCasavaDirFmt
EMPSingleEndDirFmt
ErrorCorrectionDetailsDirFmt
FastqGzFormat
FirstDifferencesDirectoryFormat
FirstDifferencesFormat
HeaderlessTSVTaxonomyDirectoryFormat
HeaderlessTSVTaxonomyFormat
ImportanceDirectoryFormat
ImportanceFormat
LSMatFormat
MultiplexedPairedEndBarcodeInSequenceDirFmt
MultiplexedSingleEndBarcodeInSequenceDirFmt
NewickDirectoryFormat
NewickFormat
OrdinationDirectoryFormat
OrdinationFormat
PairedDNASequencesDirectory

In [None]:
%%bash
#single for loop run test

# Enable extended pattern matching
shopt -s extglob

current_dir=$(pwd)
seq_reads="seq_reads"
seq_reads="$current_dir/$seq_reads"

files=($(ls -1 "$seq_reads" | head -n 2 | sort))
    
# Move both files to the processing directory
mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/processing/Sample1/"
echo "Moved ${files[0]} and ${files[1]} to processing directory."

if [[ "${files[0]}" == *_1.fastq.gz && "${files[1]}" == *_2.fastq.gz ]]; then
    # Move both files to the processing directory
    #mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/processing/"
        
    #echo "Moved ${files[0]} and ${files[1]} to processing directory."
        
    echo "if condition triggered"
        
    #for gzip_file in $current_dir/processing/*.gz; do
    #    # Unzip each file
    #    gunzip -f "$gzip_file"
    #done
    
#    T=$(printf '\t')
    
#    cat > $current_dir/processing/manifest/manifest.tsv << EOF
#sample-id $T forward-absolute-filepath $T reverse-absolute-filepath
#sample-1 $T $current_dir/processing/Sample1/${files[0]} $T $current_dir/processing/Sample1/${files[1]}
#EOF
    
#    cat <<EOF > "$current_dir/processing/manifest/manifest.tsv"
#sample-id"\t"forward-absolute-filepath"\t"reverse-absolute-filepath
#sample-1"\t""$current_dir/processing/Sample1/${files[0]}""\t""$current_dir/processing/Sample1/${files[1]}"
#EOF
        
    qiime tools import \
      --type 'SampleData[PairedEndSequencesWithQuality]' \
      --input-path "$current_dir/processing/manifest/manifest.tsv" \
      --output-path "$current_dir/qza_files/paired-end-demux.qza" \
      --input-format PairedEndFastqManifestPhred33V2 
        
    #rm $current_dir/processing/manifest/manifest.tsv
          
    #re-zip files
    #for file in $current_dir/processing/*; do
    #   # Re-zip each file
    #    gzip -f "$file"
    #done
        
    mv $current_dir/processing/Sample1/* $current_dir/processed/
        
    echo "initial processing complete"
        
    qiime dada2 denoise-paired \
      --i-demultiplexed-seqs qza_files/paired-end-demux.qza \
      --p-trim-left-f 13 \
      --p-trim-left-r 13 \
      --p-trunc-len-f 150 \
      --p-trunc-len-r 150 \
      --o-representative-sequences "$current_dir/qza_files/rep-seqs-dada2.qza" \
      --o-table "$current_dir/qza_files/table-dada2.qza" \
      --o-denoising-stats "$current_dir/qza_files/stats-dada2.qza"
        
    echo "denoising complete"
        
    qiime feature-table summarize \
      --i-table "$current_dir/qza_files/table-dada2.qza" \
      --o-visualization "$current_dir/qza_files/table-dada2.qzv" \
      --m-sample-metadata-file "$current_dir/qza_files/sample-metadata.tsv"
        
    echo "feature table complete"
    
    #FIX COMMAND INPUTS
    qiime feature-classifier classify-sklearn \
      --i-classifier "$current_dir/classifiers/gg-13-8-99-515-806-nb-classifier.qza" \
      --i-reads "$current_dir/qza_files/rep-seqs-dada2.qza" \
      --o-classification "$current_dir/qza_files/taxonomy.qza"
    
    #FIX COMMAND INPUTS
    #qiime metadata tabulate \
    #--m-input-file "$current_dir/qza_files/taxonomy.qza" \
    #--o-visualization "$current_dir/qza_files/taxonomy.qzv"

        
    #qiime taxa barplot \
    #  --i-table "$current_dir/qza_files/table-dada2.qza" \
    #  --i-taxonomy "$current_dir/qza_files/taxonomy.qza" \
    #  --m-metadata-file "$current_dir/qza_files/sample-metadata.tsv" \
    #  --o-visualization "$current_dir/qza_files/taxa-bar-plots.qzv"
        
    #echo "bar plot complete"
        
else
    # Move only the top file to the processing directory
    echo "else condition triggered"
    break 
    #TODO: add single read seq code
    #mv "${files[0]}" processing/
    #echo "Moved ${files[0]} to processing directory."
fi

In [1]:
%%bash

# Enable extended pattern matching
shopt -s extglob

current_dir=$(pwd)
seq_reads="seq_reads"
seq_reads="$current_dir/$seq_reads"

# Enter the loop
while [ "$(ls -A $seq_reads)" ]; do
    files=($(ls -1 "$seq_reads" | head -n 2 | sort))
    
    # Move both files to the processing directory
    mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/processing/"
    echo "Moved ${files[0]} and ${files[1]} to processing directory."

    if [[ "${files[0]}" == *_1.fastq.gz && "${files[1]}" == *_2.fastq.gz ]]; then
        # Move both files to the processing directory
        #mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/processing/"
        
        #echo "Moved ${files[0]} and ${files[1]} to processing directory."
        
        echo "if condition triggered"
        
        for gzip_file in $current_dir/processing/*.gz; do
            # Unzip each file
            gunzip -f "$gzip_file"
            # Remove the original gzip file
            rm "$gzip_file"
        done
        
        qiime tools import \
          --type 'SampleData[PairedEndSequencesWithQuality]' \
          --input-path "$current_dir/processing/" \
          --output-path "$current_dir/qza_files/paired-end-demux.qza" \
          --input-format CasavaOneEightSingleLanePerSampleDirFmt
          
        #re-zip files
        for file in $current_dir/processing/*; do
            # Re-zip each file
            gzip -f "$file"
            #remove unzipped file
            rm "$file"
        done
        
        mv "$current_dir/processing/*" "$current_dir/processed/"
        
        echo "initial processing complete"
        
        qiime dada2 denoise-paired \
          --i-demultiplexed-seqs qza_files/paired-end-demux.qza \
          --p-trim-left-f 13 \
          --p-trim-left-r 13 \
          --p-trunc-len-f 150 \
          --p-trunc-len-r 150 \
          --o-representative-sequences "$current_dir/qza_files/rep-seqs-dada2.qza" \
          --o-table "$current_dir/qza_files/table-dada2.qza" \
          --o-denoising-stats "$current_dir/qza_files/stats-dada2.qza"
        
        echo "denoising complete"
        
        qiime feature-table summarize \
          --i-table "$current_dir/qza_files/table-dada2.qza" \
          --o-visualization "$current_dir/qza_files/table-dada2.qzv" \
          --m-sample-metadata-file "$current_dir/qza_files/sample-metadata.tsv"
        
        echo "feature table complete"
        
        qiime taxa barplot \
          --i-table "$current_dir/qza_files/table-dada2.qza" \
          --i-taxonomy "$current_dir/qza_files/taxonomy.qza" \
          --m-metadata-file "$current_dir/qza_files/sample-metadata.tsv" \
          --o-visualization "$current_dir/qza_files/taxa-bar-plots.qzv"
        
        echo "bar plot complete"
        
    else
        # Move only the top file to the processing directory
        echo "else condition triggered"
        break 
        #TODO: add single read seq code
        #mv "${files[0]}" processing/
        #echo "Moved ${files[0]} to processing directory."
    fi
done

Process is interrupted.


In [1]:
%%bash

qiime info

System versions
Python version: 3.6.10
QIIME 2 release: 2020.8
QIIME 2 version: 2020.8.0
q2cli version: 2020.8.0

Installed plugins
alignment: 2020.8.0
composition: 2020.8.0
cutadapt: 2020.8.0
dada2: 2020.8.0
deblur: 2020.8.0
demux: 2020.8.0
diversity: 2020.8.0
diversity-lib: 2020.8.0
emperor: 2020.8.0
feature-classifier: 2020.8.0
feature-table: 2020.8.0
fragment-insertion: 2020.8.0
gneiss: 2020.8.0
longitudinal: 2020.8.0
metadata: 2020.8.0
phylogeny: 2020.8.0
quality-control: 2020.8.0
quality-filter: 2020.8.0
sample-classifier: 2020.8.0
taxa: 2020.8.0
types: 2020.8.0
vsearch: 2020.8.0

Application config directory
/home/miloscola/anaconda3/envs/qiime2/var/q2cli

Getting help
To get help with QIIME 2, visit https://qiime2.org


In [10]:
%%bash

# Enable extended pattern matching
shopt -s extglob

current_dir=$(pwd)
seq_reads="seq_reads"
seq_reads="$current_dir/$seq_reads"

files=($(ls -1 "$seq_reads" | head -n 2 | sort))
    
# Move both files to the processing directory
mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/processing/Sample1/"
echo "Moved ${files[0]} and ${files[1]} to processing directory."

if [[ "${files[0]}" == *_1.fastq.gz && "${files[1]}" == *_2.fastq.gz ]]; then
    # Move both files to the processing directory
    #mv "$seq_reads/${files[0]}" "$seq_reads/${files[1]}" "$current_dir/processing/"
        
    #echo "Moved ${files[0]} and ${files[1]} to processing directory."
        
    echo "if condition triggered"
        
    #for gzip_file in $current_dir/processing/*.gz; do
    #    # Unzip each file
    #    gunzip -f "$gzip_file"
    #done
    
    T=$(printf '\t')
    
    cat > $current_dir/processing/manifest/manifest.tsv << EOF
sample-id $T forward-absolute-filepath $T reverse-absolute-filepath
sample-1 $T $current_dir/processing/Sample1/${files[0]} $T $current_dir/processing/Sample1/${files[1]}
EOF
    
#    cat <<EOF > "$current_dir/processing/manifest/manifest.tsv"
#sample-id"\t"forward-absolute-filepath"\t"reverse-absolute-filepath
#sample-1"\t""$current_dir/processing/Sample1/${files[0]}""\t""$current_dir/processing/Sample1/${files[1]}"
#EOF
        
    qiime tools import \
        --type 'SampleData[PairedEndSequencesWithQuality]' \
        --input-path "$current_dir/processing/manifest/manifest.tsv" \
        --output-path "$current_dir/qza_files/paired-end-demux.qza" \
        --input-format PairedEndFastqManifestPhred33V2 
        
    #rm $current_dir/processing/manifest/manifest.tsv
          
    #re-zip files
    #for file in $current_dir/processing/*; do
    #   # Re-zip each file
    #    gzip -f "$file"
    #done
        
    mv $current_dir/processing/Sample1/* $current_dir/processed/
        
    echo "initial processing complete"
        
    qiime dada2 denoise-paired \
      --i-demultiplexed-seqs qza_files/paired-end-demux.qza \
      --p-trim-left-f 13 \
      --p-trim-left-r 13 \
      --p-trunc-len-f 150 \
      --p-trunc-len-r 150 \
      --o-representative-sequences "$current_dir/qza_files/rep-seqs-dada2.qza" \
      --o-table "$current_dir/qza_files/table-dada2.qza" \
      --o-denoising-stats "$current_dir/qza_files/stats-dada2.qza"
        
    echo "denoising complete"
        
    qiime feature-table summarize \
      --i-table "$current_dir/qza_files/table-dada2.qza" \
      --o-visualization "$current_dir/qza_files/table-dada2.qzv" \
      --m-sample-metadata-file "$current_dir/qza_files/sample-metadata.tsv"
        
    echo "feature table complete"
        
    qiime taxa barplot \
      --i-table "$current_dir/qza_files/table-dada2.qza" \
      --i-taxonomy "$current_dir/qza_files/taxonomy.qza" \
      --m-metadata-file "$current_dir/qza_files/sample-metadata.tsv" \
      --o-visualization "$current_dir/qza_files/taxa-bar-plots.qzv"
        
    echo "bar plot complete"
        
else
    # Move only the top file to the processing directory
    echo "else condition triggered"
    break 
    #TODO: add single read seq code
    #mv "${files[0]}" processing/
    #echo "Moved ${files[0]} to processing directory."
fi

Moved ERR1351843_R1_001.fastq.gz and ERR1351843_R2_001.fastq.gz to processing directory.
if condition triggered
Imported /home/miloscola/augene/training_data/processing/manifest/manifest.tsv as PairedEndFastqManifestPhred33V2 to /home/miloscola/augene/training_data/qza_files/paired-end-demux.qza
initial processing complete
denoising complete
feature table complete
bar plot complete


Plugin error from dada2:

  An error was encountered while running DADA2 in R (return code 1), please inspect stdout and stderr to learn more.

Debug info has been saved to /tmp/qiime2-q2cli-err-zwl7r5vc.log
There was an issue with loading the file /home/miloscola/augene/training_data/qza_files/sample-metadata.tsv as metadata:

  Metadata file path doesn't exist, or the path points to something other than a file. Please check that the path exists, has read permissions, and points to a regular file (not a directory): /home/miloscola/augene/training_data/qza_files/sample-metadata.tsv

  There may be more errors present in the metadata file. To get a full report, sample/feature metadata files can be validated with Keemei: https://keemei.qiime2.org

  Find details on QIIME 2 metadata requirements here: https://docs.qiime2.org/2020.8/tutorials/metadata/

There was an issue with loading the file /home/miloscola/augene/training_data/qza_files/sample-metadata.tsv as metadata:

  Metadata file 

In [None]:
#TODO:
#0) standardize file names -DONE
#1) make seprate folders for each sample **account for single and paired reads
#2) make a manifest with sample names corresponding 
#   to the sample names in the metadata TSV for all files -DONE
#3) run pipeline with all files all at once

Greengeens 2

In [1]:
!pip install q2-greengenes2
!pip install redbiom

Processing /home/miloscola/.cache/pip/wheels/ad/64/10/9d66020ccd16f0754e1059f75c0fb0162dba4792f4fdf7a8bf/q2_greengenes2-2024.1-py3-none-any.whl
Collecting iow
  Using cached iow-1.0.7.tar.gz (1.2 MB)
Building wheels for collected packages: iow


  Building wheel for iow (setup.py) ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /home/miloscola/anaconda3/envs/qiime2/bin/python3.6 -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-r9m06ags/iow/setup.py'"'"'; __file__='"'"'/tmp/pip-install-r9m06ags/iow/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /tmp/pip-wheel-m6go0kak
       cwd: /tmp/pip-install-r9m06ags/iow/
  Complete output (36 lines):
  running bdist_wheel
  running build
  running build_py
  make: Entering directory '/tmp/pip-install-r9m06ags/iow/bp/BitArray'
  make: 'libbitarr.a' is up to date.
  make: Leaving directory '/tmp/pip-install-r9m06ags/iow/bp/BitArray'
  creating build
  creating build/lib.linux-x86_64-3.6
  creating build/lib.linux-x86_64-3.6/bp
  copying bp/_cli.py -> build/lib.linux-x86_64-3.6/bp
  copying



In [None]:
def qiime2_profile(sub_num, sub_path, master_manifest_path, profile_o_path, summary_o_folder, classifier_map, trim_left_f=13, trim_left_r=13, trunc_len_f=150, trunc_len_r=150, n_threads=8):
    """
    calculates microbiome profiles from a submission of reads and stores the results in a .qza file in the specified folder

    Parameters
    ----------
    sub_num : str
        ID of the submission containing the reads.
    sub_path : str
        path to the submission.
    master_manifest_path : str
        path to the master meta data file (should .tsv file) Manifest must 
        have the following columns: Run or id, Library_Layout, Region_16S, 
        Submission
    profile_o_path : str
        path to the output folder for microbiome profile
    summary_o_path : str:
        path to the output folder for read summary table
    classifier_map : dict
        map of 16s rRNA region to classifier built to classify that region
    trim_left_f : int
        number of nucleotides to trim from the 5' end of the forward read before denoising
    trim_left_r : int
        number of nucleotides to trim from the 5' end of the reverse read before denoising
    trim_left_f : int
        specifies the length at which the forward read will be truncated
    trim_left_r : int
        specifies the length at which the reverse read will be truncated

    Returns
    -------
    none
    """
    
    #define variables
    current_dir = os.getcwd()
    summary_o_path = os.path.join(summary_o_folder, sub_num)
    manifest_path = os.path.join(summary_o_path, f"{sub_num}-manifest.tsv")
    
    #string representing weather reads are paired or single
    p_s = None           
    #string representing what 16s regions were used
    region_16s = None           

    #validate inputs
    if not type(sub_num) == str:
        raise TypeEror("sub_num must be a string")
    if not os.path.exists(sub_path):
        raise FileNotFoundError(f"{sub_num} submission path is invalid")
    if not os.path.exists(master_manifest_path):
        raise FileNotFoundError("master manifest path is invalid")
    if not type(classifier_map) == dict:
        raise TypeError("classifier_map must be a dict")
    for classifier in classifier_map.values():
        if not os.path.exists(classifier):
            raise FileNotFoundError(f"classifier map contains invalid paths. {classifier} is not a valid path")
    
    #if output folders do not exist, create them
    os.makedirs(profile_o_path, exist_ok=True)
    os.makedirs(summary_o_folder, exist_ok=True)
    os.makedirs(summary_o_path, exist_ok=True)
    
    #generate manifest
    manifest = pd.read_csv(master_manifest_path, sep='\t', encoding='latin1')
    
    #Rename "Run" to 'id' to support compatability with qiime2
    if 'Run' in manifest.columns:
        manifest.rename(columns={'Run': 'id'}, inplace=True)
    
    #prune manifest
    manifest = manifest[manifest['Submission'] == sub_num]
    reads = os.listdir(sub_path)
    len_manifest = len(manifest)
    len_reads = len(reads)
    manifest = manifest[manifest['id'].isin(reads)]
    len_filtered_manifest = len(manifest)
    
    #check that names and number of samples match on manifest and in folder
    if manifest.empty:
        raise ValueError("Pruned manifest is empty. Submission number may not be in manifest or reads may be improperly named.")
    if len_manifest > len_reads:
        print("Warning: more entries in manifest than in folder")
    if len_manifest < len_reads:
        print("Warning: more entries in folder than in manifest")
    if len_manifest == len_reads and len_filtered_manifest < len_reads:
        print("Warning: mismatch between some names on manifest and in folder")
        
    #check that the submission does not have mixed single and paired reads
    unique_layouts = manifest['Library_Layout'].unique().tolist()
    
    if len(unique_layouts) > 1:
        raise ValueError("Multiple layouts detected. Submission may contain single and paired reads.")
    p_s = unique_layouts[0]
    
    #check that the submission does not have samples with different 16s regions
    unique_16s = manifest['Region_16S'].unique().tolist()
    if len(unique_layouts) > 1:
        raise ValueError("Multiple regions detected. Submission may contain reads from different 16s regions.")
    region_16s = unique_16s[0]
    
    #add file paths to manifest
    if p_s == 'PAIRED':
        manifest['forward-absolute-filepath'] = 0
        manifest['reverse-absolute-filepath'] = 0
        for sample in manifest['id'].tolist():
            sample_path = os.path.join(sub_path, sample)
            reads = os.listdir(sample_path)
            #check for correct number of reads
            if len(reads) != 2:
                raise ValueError(f"Incorrect number of files for {sample}. Should be 2. Got {len(reads)}.")
            #add read paths to manifest
            manifest.loc[manifest['id'] == sample, 'forward-absolute-filepath'] = os.path.join(sample_path, reads[0])
            manifest.loc[manifest['id'] == sample, 'reverse-absolute-filepath'] = os.path.join(sample_path, reads[1])
    elif p_s == 'SINGLE':
        manifest['absolute-filepath'] = 0
        for sample in manifest['id'].tolist():
            sample_path = os.path.join(sub_path, sample)
            reads = os.listdir(sample_path)
            #check for correct number of reads
            if len(reads) != 1:
                raise ValueError(f"incorrect number of files for {sample}. Should be 1. Got {len(reads)}.")
            #add read path to manifest
            manifest.loc[manifest['id'] == sample, 'absolute-filepath'] = os.path.join(sample_path, reads[0])
    else:
        raise ValueError("invalid value for library layout")
        
    #save the manifest
    manifest.to_csv(manifest_path, sep='\t', index=False)

    ## execute qiime2 pipeline ##
    
    print(f"Processing {sub_num}")
       
    # Execute import command
    if p_s == 'PAIRED':
        demux = qiime2.Artifact.import_data(
            'SampleData[PairedEndSequencesWithQuality]', 
            manifest_path, 
            view_type='PairedEndFastqManifestPhred33V2'
        )
    elif p_s == 'SINGLE':
        demux = qiime2.Artifact.import_data(
            'SampleData[SequencesWithQuality]', 
            manifest_path, 
            view_type='SingleEndFastqManifestPhred33V2'
        )

    # Execute denoise command
    denoising_stats_output = os.path.join(summary_o_path, f"{sub_num}-denoise-stats.qza") 

    if p_s == 'PAIRED':
        denoise_results = denoise_paired(
            demultiplexed_seqs=demux,
            trim_left_f=trim_left_f,
            trim_left_r=trim_left_r,
            trunc_len_f=trunc_len_f,
            trunc_len_r=trunc_len_r, 
            n_threads=n_threads
        )
    elif p_s == 'SINGLE':
        denoise_results = denoise_single(
            demultiplexed_seqs=demux,
            trim_left=trim_left_f,
            trunc_len=trunc_len_f, 
            n_threads=n_threads
        )

    denoise_results.denoising_stats.save(denoising_stats_output)
    
    #temporay? Keep for longer?
    denoise_results.representative_sequences.save(os.path.join(summary_o_path, f"{sub_num}-rep_seqs"))
    denoise_results.table.save(os.path.join(summary_o_path, f"{sub_num}-freq_table"))
    denoise_results.denoising_stats.save(denoising_stats_output)
    print(f"{sub_num} denoising complete")

    # Execute summary command
    visualization_output = os.path.join(summary_o_path, f"{sub_num}-summary")
    metadata = qiime2.Metadata.load(manifest_path)

    summary_result = summarize(
        table=denoise_results.table,
        sample_metadata=metadata
    )
    summary_result.visualization.save(visualization_output)
    print(f"{sub_num} summary table complete")

    # Execute classify command
    classifier = qiime2.Artifact.load(classifier_map[region_16s])
    taxonomy_output = os.path.join(summary_o_path, f"{sub_num}-taxonomy")

    taxonomy = classify_sklearn(
        reads=denoise_results.representative_sequences,
        classifier=classifier, 
        n_jobs=-1,
    )
    taxonomy.classification.save(taxonomy_output)
    
    print(f"{sub_num} taxonomy classification complete")
    
    """from qiime2 import Artifact
    current_dir = os.getcwd()
    profile_o_path = os.path.join(current_dir, "profile_o")
    taxonomy_path = os.path.join(profile_o_path, f"{sub_num}-taxonomy.qza")
    temp_path = os.path.join(current_dir, "temp")
    denoise_path = os.path.join(temp_path, "denoise.qza")
    manifest=os.path.join(temp_path, "manifest.tsv")
    taxonomy = Artifact.load(taxonomy_path)
    denoise_results = Artifact.load(denoise_path)"""
    
    #get assignments from BLAST
    
    
    #Generate Relitive Frequencies
    rel_freq = relative_frequency(denoise_results.table)[0] #add .table for denoise results
    print(f"{sub_num} relitive frequency table complete")
    
    #Generate Taxonomy Bar Graph
    barplot_path = os.path.join(summary_o_path, f"{sub_num}-barplot") 
    metadata = Metadata.load(manifest_path)
    
    barplot_vis = barplot(
        table=denoise_results.table, 
        taxonomy=taxonomy.classification, 
        metadata=metadata
    )
    barplot_vis.visualization.save(barplot_path)
    print(f"{sub_num} barplot complete")
    
    #generate final output data
    rel_freq_df = rel_freq.view(pd.DataFrame)
    rel_freq_df = rel_freq_df.transpose()
    taxonomy_df = taxonomy.classification.view(pd.DataFrame)
    
    #display_table(rel_freq_df)
    #display_table(taxonomy_df)
    
    merged_df = pd.merge(rel_freq_df, taxonomy_df, left_index=True, right_index=True, how='inner')
    merged_df = merged_df.drop(columns=['Confidence']) #used to also drop feature ID, will see if it works
    #sum accross multiple OTUs that are the same species
    merged_df = merged_df.groupby('Taxon').sum()
    
    #DEBUG: 
    #display_table(merged_df)
    
    
    #print("\n========== fel_freq_df ==========\n")
    #print(set(rel_freq_df.index))
    #print("\n========== taxon_df ==========\n")
    #print(set(taxonomy_df['Feature ID']))
    #print("\n========== join ==========\n")
    #print(set(rel_freq_df.index) & set(taxonomy_df['Feature ID']))
    
    #save final output data
    profile_output = os.path.join(profile_o_path, f"{sub_num}-profile.csv")
    merged_df.to_csv(profile_output, index=True)
    print(f"==================== {sub_num} processing complete ====================")


#TODO: double check sequence assignments with blast
def display_table(table):
    display(table.head())
    num_rows, num_cols = table.shape
    print("Number of rows:", num_rows)
    print("Number of columns:", num_cols)

In [None]:
#Run Pipeline
#generate file paths
current_dir = os.getcwd()
classifier_folder = os.path.join(current_dir, "classifier")
master_manifest_path = os.path.join(current_dir, "DataFrame_2_Pruned.tsv")
profile_o_path = os.path.join(current_dir, "profile_o")
summary_o_path = os.path.join(current_dir, "summary_o")
gg_nb = os.path.join(classifier_folder, "gg-13-8-99-nb-classifier.qza") 

#only run the pipeline on unprocessed reads
sub_nums = set(os.listdir(os.path.join(current_dir, "sorted_reads")))
completed = set(os.listdir(profile_o_path))
completed = [x[:-len("-profile.csv")] for x in completed]
sub_nums.difference_update(completed)


classifier_map = {
    "V4":gg_nb, 
    "V1-V3":gg_nb, 
    "V1-V2":gg_nb, 
    "V3-V4":gg_nb, 
    "V1-V2 and v4":gg_nb, 
    "V4-V5":gg_nb, 
    "V1-V4":gg_nb
}

#BUG: wrong set being used to join taxon df and freq df

print(f"Processing submissions {sub_nums}\n")
for sub_num in sub_nums:
    sub_path = os.path.join(current_dir, "sorted_reads", sub_num)
    try:
        qiime2_profile(sub_num, sub_path, master_manifest_path, profile_o_path, summary_o_path, classifier_map, trim_left_f=0, trim_left_r=0, trunc_len_f=0, trunc_len_r=0, )
    except Exception as e:
        print(f"Error processing {sub_num}: {e}")
        print(f"==================== {sub_num} processing failed ====================")