# Data Preparation

### Counts data

In [1]:
from moonstone.parsers.counts.taxonomy import SunbeamKraken2Parser


# Path to the Kraken2 taxonomy counts table used throughout this tutorial.
krakenfile = "moonstone_tuto_kraken2_file.tsv"

# Parse the file with moonstone's SunbeamKraken2Parser; the parsed table is
# exposed through `parser.dataframe`.
parser = SunbeamKraken2Parser(krakenfile)

# Drop the NCBI taxonomy ID column so that only the per-sample columns remain.
counts_dataframe = parser.dataframe.drop(columns='NCBI_taxonomy_ID')

In [2]:
# Display the parsed counts table (taxa as rows, samples as columns).
counts_dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,SAMPLE_1,SAMPLE_2,SAMPLE_3,SAMPLE_4,SAMPLE_5,SAMPLE_6,SAMPLE_7,SAMPLE_8,SAMPLE_9,SAMPLE_10
kingdom,phylum,class,order,family,genus,species,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Bacteria,Bacteria (kingdom),Bacteria (kingdom),Bacteria (kingdom),Bacteria (kingdom),Bacteria (kingdom),Bacteria (kingdom),8.5,6.0,52.8,1.1,4.2,9.9,5.5,3.0,7.8,6.1
Bacteria,Actinobacteria,Actinomycetia,Bifidobacteriales,Bifidobacteriaceae,Gardnerella,Gardnerella_vaginalis,27.9,2.9,0.0,22.0,2.8,21.5,0.0,25.5,7.3,23.0
Bacteria,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus_faecalis,6.1,4.8,0.0,3.0,3.1,0.0,0.0,0.0,0.0,0.0
Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus (genus),9.8,9.1,19.2,6.7,0.0,7.0,3.2,0.0,2.8,8.3
Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus_crispatus,8.5,53.6,894.0,14.6,42.0,18.0,78.1,0.0,58.2,16.1
Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia_coli,6.3,14.0,56.4,6.0,11.1,12.3,7.0,7.8,2.8,8.6
Bacteria,Proteobacteria,Betaproteobacteria,Neisseriales,Neisseriaceae,Neisseria,Neisseria_animalis,0.7,0.0,0.0,1.9,0.0,0.0,5.0,5.1,0.0,2.7
Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas_monteilii,1.6,5.3,40.8,5.5,3.2,0.4,4.5,2.6,4.7,6.3


In [3]:
# Column-wise sum: total count per sample, used to compare the samples' sizes.
counts_dataframe.sum()

SAMPLE_1       69.4
SAMPLE_2       95.7
SAMPLE_3     1063.2
SAMPLE_4       60.8
SAMPLE_5       66.4
SAMPLE_6       69.1
SAMPLE_7      103.3
SAMPLE_8       44.0
SAMPLE_9       83.6
SAMPLE_10      71.1
dtype: float64

#### Normalization of data

The number of classified reads in SAMPLE_3 is roughly 10 to 24 times greater than in the other samples. To keep future analyses from being biased towards SAMPLE_3, we need to normalize the counts.

Moonstone helps you normalize your data with several different methods (see the [list of normalization methods](https://moonstone.readthedocs.io/en/latest/api_docs/normalization.html)).

(To better understand what each normalization method entails, you can watch this [YouTube video](https://www.youtube.com/watch?v=UFB993xufUU&t=683s).)

In [5]:
from moonstone.normalization.counts.geometric_mean import (
    GeometricMeanNormalization
)

# Instantiate the geometric-mean normalizer on the raw counts table.
geom_mean_norm = GeometricMeanNormalization(counts_dataframe)
# The normalized counts table is read from the `normalized_df` attribute.
counts_dataframe_normalized = geom_mean_norm.normalized_df

In [6]:
# Per-sample totals after normalization, to compare with the raw per-sample totals.
counts_dataframe_normalized.sum()

SAMPLE_1      68.635840
SAMPLE_2      73.590681
SAMPLE_3     126.625161
SAMPLE_4      76.402948
SAMPLE_5      81.216336
SAMPLE_6      59.961868
SAMPLE_7     118.107261
SAMPLE_8      58.882472
SAMPLE_9      92.671242
SAMPLE_10     66.123165
dtype: float64

### Metadata

In [7]:
import pandas as pd

# CSV file holding the per-sample metadata.
metadata_file = "metadata_file.csv"

# Load the metadata, using the SAMPLE_ID column as the row index.
metadata_dataframe = pd.read_csv(
    metadata_file,
    sep=",",
    index_col="SAMPLE_ID",
)

In [8]:
# Display the metadata table (one row per sample).
metadata_dataframe

Unnamed: 0_level_0,SMOKER,GROUP
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
SAMPLE_1,yes,A
SAMPLE_2,yes,C
SAMPLE_3,no,A
SAMPLE_4,no,C
SAMPLE_5,no,B
SAMPLE_6,no,C
SAMPLE_7,yes,A
SAMPLE_8,no,B
SAMPLE_9,yes,A
SAMPLE_10,no,A


# Data visualization/exploration

In [9]:
from moonstone.plot.counts import PlotTaxonomyCounts

# Build the plotting helper from the counts table; the plotting cells below call its methods.
# NOTE(review): this passes the raw `counts_dataframe`, not the `counts_dataframe_normalized`
# table computed earlier in the notebook — confirm which one the plots are meant to use.
instance = PlotTaxonomyCounts(counts_dataframe)

In [11]:
# Bar chart of the most prevalent species.
fig1 = instance.plot_most_prevalent_taxa(
    taxa_level="species",
    taxa_number=3,
    mean_threshold=6,
    mode="bar",
    # False removes every row labelled "xxx (higher taxa)",
    # such as "Lactobacillus (genus)" at the species level.
    higher_classification=False,
    ascending=False,
)

In [12]:
# Box plot of the most abundant species.
fig2 = instance.plot_most_abundant_taxa(
    taxa_level="species",
    mode="boxplot",
    average_relative_abundance_threshold=5,
    prevalence_threshold=None,
    higher_classification=False,
    ascending=False,
    # The figure is written to this file. A static image can be generated
    # instead simply by changing the file extension.
    output_file="Most_abundant_species.html",
)

In [13]:
# Per-sample composition plot restricted to the most abundant species.
fig3 = instance.plot_sample_composition_most_abundant_taxa(
    taxa_level="species",
    # The top `taxa_number` species are represented individually;
    # all the others are grouped under "Others".
    taxa_number=3,
    # Cluster samples according to their composition in top species/"Others"
    # (True is the default).
    cluster_samples=True,
    # Set the color of a given species (or of "Others").
    colors={"Others": "#d1dae8"},
    # Series or dataframe of metadata to add at the bottom of the graph.
    color_df=metadata_dataframe["GROUP"],
    sep_series=metadata_dataframe["SMOKER"].replace({"yes": "smoker", "no": "non smoker"}),
    sep_how="labels",
    output_file=None,
    # All of moonstone's plot and graph methods accept custom plotting options.
    # They rely on plotly's fig.update_X({...}) methods: the first-level key is X
    # (here "layout") and its value is the dictionary of parameters you would
    # pass to the corresponding fig.update_X method.
    plotting_options={"layout": {"yaxis_title": "Relative abundance"}},
)