<a href="https://colab.research.google.com/github/matthewbegun/MXN500/blob/main/MXN500_2024_WOR_02_Py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MXN500 Workshop 2 (Python)

## ENV

Python packages required for the workshop:
- pandas
- plotnine
- seaborn
- scikit-misc
- scikit-bio
- plus dependencies

If any are not installed use the `!pip` command to install them first.

Ensure the `Ecologyv2.csv` file is uploaded to `drive/MyDrive/mxn500/data/`, the path read in Activity 1.

In [None]:
# need scikit-misc for curve fitting
!pip install scikit-misc scikit-bio

In [None]:
## Week 2 Workshop
## Visualisation of Ecology data
# pandas is our python equivalent to tidyverse
import pandas as pd

# plotnine is the python implementation of ggplot2
import plotnine as p9
from plotnine import ggplot, aes, geom_histogram, geom_point, geom_bar, geom_col, geom_smooth, facet_wrap, xlab, ylab, theme_bw

# common plotting package is seaborn
import seaborn as sns

# display all outputs: with "all", every expression statement in a cell is
# rendered (not only the last one), so cells that build both a plotnine and a
# seaborn figure show both
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Activity 1 – Reading in data
# Look for the CSV in the Google Drive folder first, then fall back to a copy
# uploaded next to the notebook, so either way of providing the file works.
from pathlib import Path

ECOLOGY_CSV_CANDIDATES = [
  Path("drive/MyDrive/mxn500/data/Ecologyv2.csv"),  # Google Drive copy
  Path("Ecologyv2.csv"),                            # uploaded to the working directory
]
# If neither exists, keep the first path so read_csv reports that path as missing
ecology_csv = next(
  (path for path in ECOLOGY_CSV_CANDIDATES if path.exists()),
  ECOLOGY_CSV_CANDIDATES[0],
)

Ecology = pd.read_csv(ecology_csv)
Ecology

In [None]:
# Activity 2 – Building a plot
# Wrapping the chain in parentheses lets each `+ layer` sit on its own line
# without Python treating the line breaks as the end of the statement.
(
  ggplot(data=Ecology, mapping=aes(x="DO"))
  + geom_histogram(bins=30)
)

# The same 30-bin histogram drawn with seaborn
sns.histplot(data=Ecology, x="DO", bins=30)


In [None]:
# let's fix the binwidths
(
  ggplot(Ecology, aes(x="DO",))
  + geom_histogram(binwidth=1)
)

# seaborn version: histplot also accepts binwidth, so ask for 1-unit-wide bins
# directly instead of hard-coding a bin count that only suits one data range
# (the bin edges may still be placed differently from plotnine's)
sns.histplot(Ecology, x='DO', binwidth=1)


In [None]:
# Next, give both axes readable labels
(
  ggplot(data=Ecology, mapping=aes(x="DO"))
  + geom_histogram(binwidth=1)
  + p9.labs(x="Dissolved oxygen (mg/L)", y="Number of observations")
)

In [None]:
# Atyidae, first attempt: a plain bar chart of the column
(
  ggplot(data=Ecology, mapping=aes(x="Atyidae"))
  + geom_bar()
)

In [None]:
# Tidy up the Atyidae plot: 20-wide bins with matching axis breaks, and a
# grey-outlined light blue fill that moves back towards seaborn's default colours
(
  ggplot(data=Ecology, mapping=aes(x="Atyidae"))
  + geom_histogram(binwidth=20, color="grey", fill="lightskyblue")
  + p9.scale_x_continuous(breaks=range(0, 161, 20))
  + theme_bw()
)

In [None]:
# Fill each bar according to its computed count (after_stat) instead of a
# fixed colour - this one looks really different in python vs R
(
  ggplot(data=Ecology, mapping=aes(x="Atyidae"))
  + geom_histogram(mapping=aes(fill=p9.after_stat("count")), bins=30)
)

In [None]:
# Scatter plots next: Turbidity against DO, two continuous variables
(
  ggplot(data=Ecology, mapping=aes(x="Turbidity", y="DO"))
  + geom_point()
  + xlab("Turbidity")
)

In [None]:
# Same scatter with a loess trend line on top
# (the loess method needs scikit-misc - see the pip install at the start)
(
  ggplot(data=Ecology, mapping=aes(x="Turbidity", y="DO"))
  + geom_point()
  + geom_smooth(method="loess")
  + xlab("Turbidity")
)

In [None]:
# Colour the points by Location. The brewer scale is asked for a qualitative
# palette by name, so the kind of colour scheme is spelled out in the code -
# explicit is better than implicit!
(
  ggplot(data=Ecology, mapping=aes(x="Turbidity", y="DO"))
  + geom_point(mapping=aes(color="Location"), size=2)
  + p9.scale_color_brewer(type="qualitative", palette="Dark2")
)

In [None]:
# List the column labels so we can pick out the first and last columns of the
# run that holds the species counts (used for the label slice in the next step)
Ecology.columns

In [None]:
# The species counts look like one contiguous run of columns.
# .loc slices by label and includes both end labels; the leading ":" keeps every row.
species_columns = slice("Parastacidae", "Plecoptera")
Ecology_species = Ecology.loc[:, species_columns]


In [None]:
# Taxon richness comes from scikit-bio's alpha_diversity. Installing scikit-bio
# downgrades a few other packages, which could cause issues in a larger notebook.
# NOTE(review): the 'observed_otus' metric name may differ between scikit-bio
# releases - confirm against the installed version.
from skbio.diversity import alpha_diversity

# One row of counts per sample, passed as a plain array
species_counts = Ecology_species.values
Ecology['TaxonRichness'] = alpha_diversity('observed_otus', species_counts)

# Show the richness next to the site it belongs to
Ecology[['Site', 'TaxonRichness']]

In [None]:
# Taxon richness per Site as columns filled by DO, one panel per Location.
# My colors are different from R (and the plotnine documentation)
site_richness_mapping = aes(x="Site", y="TaxonRichness", fill="DO")
(
  ggplot(data=Ecology, mapping=site_richness_mapping)
  + geom_col()
  + facet_wrap("Location", nrow=1, scales="free_x")
  + ylab("Taxon Richness (species)")
  + theme_bw()
)

In [None]:
# What's each line doing?
(
  ggplot(
    data=Ecology,               # use the Ecology dataframe
    mapping=aes(
      x="Site",                 # Site along the x-axis
      y="TaxonRichness",        # TaxonRichness up the y-axis
      fill="DO",                # colour of each column maps to DO
    ),
  )
  + geom_col()                  # these are the columns
  + theme_bw()                  # change the background to white
  + facet_wrap(
      "Location",               # one facet for each of the locations
      nrow=1,                   # keep all the plots in a single row
      scales="free_x",          # a different x-axis on each plot
    )
  + ylab("Taxon Richness (species)")  # change the y-axis label
)



In [None]:
# Activity 5 – Making a better graph
# DO and TaxonRichness are both continuous, so swap the faceted column chart
# (Site on x, TaxonRichness on y, DO as fill, one panel per Location) for a
# scatterplot with Location as the point colour.
do_richness_mapping = aes(x="DO", y="TaxonRichness", color="Location")
(
  ggplot(data=Ecology, mapping=do_richness_mapping)
  + geom_point()
  + xlab("Dissolved Oxygen (mg/L)")
  + ylab("Taxon Richness")
  + p9.scale_color_brewer(type="qualitative", palette="Dark2")
  + theme_bw()
)