# Clean Cami Output
This notebook will read gold standard taxonomic output from the CamiSim files, the convert them to the standardized relative abundance tables.

In [4]:
import numpy as np
import pandas as pd
import os
import re

path = os.path.abspath("/Volumes/TBHD_share/valencia/pipelines/cami_marine/data/simulation_short_read")

## Movement of Short Read Files
The original short read files were in a structure with sampleID/reads/anonymous_reads.fq.gz

We want to change these to sampleID.fastq.gz, since we want to know what the sample is.

In [5]:
def find_and_rename_fq_files(path: str):
    for root, dirs, files in os.walk(path):
        for f in files:
            if f.endswith(".fq.gz"):
                joined = os.path.join(root, f)
                new_file_name = joined.split("/")[-3] + ".fastq.gz"
                new_file_name = ("_").join(new_file_name.split("_")[2:])
                
                new_path = os.path.join(root, new_file_name)
                print(joined)
                print(new_path)

                os.rename(joined, new_path)

find_and_rename_fq_files(path)

## Parsing of Gold Standard

In [6]:
# Return files where the name contains the pattern "taxonomic_profile_{0-9}.txt"
def get_files(path, pattern):
    files = []
    for file in os.listdir(path):
        if re.match(pattern, file):
            files.append(file)
    return files

files = get_files(path, "taxonomic_profile_[0-9].txt")

In [7]:
def split_col_and_replace(df: pd.DataFrame, col_name: str, sep: str = "|"):
    df[col_name] = df[col_name].str.split(sep).str[-1]
    return df

def get_summary(df: pd.DataFrame):
    # Summarize the total RA percentage.
    pct = df["PERCENTAGE"].sum()
    print(f"Total RA percentage: {pct}")

    # Summarize the number of unique species.
    unique_species = df["TAXPATHSN"].unique()
    print(f"Number of unique species: {len(unique_species)}")

    # Get the top 10 species.
    top_10 = df.sort_values(by="PERCENTAGE", ascending=False).head(10)
    print("Top 10 species by RA percentage:")
    display(top_10)

    # Get the bottom 10 species.
    bottom_10 = df.sort_values(by="PERCENTAGE", ascending=True).head(10)
    print("Bottom 10 species by RA percentage:")
    display(bottom_10)

def make_df(files: list, rank: str):
    for f in files:
        print(f)
        df = pd.read_csv(os.path.join(path, f), sep='\t', header=3, index_col=0, usecols=[0,1,3,4])

        df = df.loc[df["RANK"] == "species"].drop(columns=["RANK"])
        split_col_and_replace(df, "TAXPATHSN")

        # Remove rows where the percentage is 0.0000
        df = df.loc[df["PERCENTAGE"] != 0.0000]

        # Get the summary.
        get_summary(df)

        break

make_df(files, "species")

taxonomic_profile_9.txt
Total RA percentage: 92.69010000000002
Number of unique species: 274
Top 10 species by RA percentage:


Unnamed: 0_level_0,TAXPATHSN,PERCENTAGE
@@TAXID,Unnamed: 1_level_1,Unnamed: 2_level_1
45202.0,unidentified plasmid,37.7568
32644.0,unidentified,20.7057
1335757.0,Spiribacter curvatus,2.3293
314275.0,Alteromonas mediterranea,2.1011
29542.0,Pelobacter acetylenicus,1.9202
54248.0,Hyperthermus butylicus,1.5043
133539.0,Nitrosococcus halophilus,1.3588
84980.0,Desulfotalea psychrophila,1.2617
1214906.0,unidentified virus,1.2479
62322.0,Shewanella baltica,0.8977


Bottom 10 species by RA percentage:


Unnamed: 0_level_0,TAXPATHSN,PERCENTAGE
@@TAXID,Unnamed: 1_level_1,Unnamed: 2_level_1
1250205.0,Gramella sp. MAR_2010_147,0.0016
2746.0,Halomonas elongata,0.0022
1920883.0,Donghicola sp. JLT3646,0.0026
42444.0,Marinovum algicola,0.0034
1850250.0,Rhodobacter sp. LPB0142,0.005
59600.0,Cellulophaga algicola,0.007
911045.0,Pseudovibrio sp. FO-BEG1,0.0087
307121.0,Micromonospora krabiensis,0.0088
2724.0,Hirschia baltica,0.0119
1526571.0,Lacimicrobium alkaliphilum,0.0121
