Plotting with matplotlib - 2
========================

In [None]:
# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# other imports
import numpy as np
import pandas as pd
from scipy import stats

What about other plot types?
---

Such as boxplots, heatmaps, complex plots *a la* ggplot.
Find a gallery with code here: https://matplotlib.org/gallery/index.html

**However:** seaborn covers them much better, leveraging the power of pandas `DataFrame`s

In [None]:
# tidy example dataset that most cells below plot from
tips = sns.load_dataset(name='tips')

In [None]:
# peek at the first rows of the table
tips.head()

In [None]:
# summary of the columns and their dtypes
tips.info()

In [None]:
# tip vs. total bill, one panel per value of `time`;
# `smoker` drives both colour and marker style, `size` drives marker size
sns.relplot(
    data=tips,
    x="total_bill", y="tip",
    col="time",
    hue="smoker", hue_order=['No', 'Yes'],
    style="smoker",
    size="size",
    palette=['xkcd:indigo', 'xkcd:grass green'],
    # figure layout
    height=4, aspect=1,
);

Statistical relationships: `relplot`
---

In [None]:
# `size` picks the column mapped to marker size,
# `sizes` sets the (smallest, largest) marker size used
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    size="size",
    sizes=(1, 100),
);

In [None]:
# a second example dataset, used for the line plots below
fmri = sns.load_dataset(name="fmri")

In [None]:
# peek at the first rows of the table
fmri.head()

In [None]:
# default kind: one marker per row of the table
sns.relplot(data=fmri, x="timepoint", y="signal");

In [None]:
# kind="line" switches from a scatterplot to a lineplot; when several
# observations share an x value they are aggregated
# (by default: mean and 95% confidence interval)
sns.relplot(
    data=fmri,
    x="timepoint",
    y="signal",
    kind="line",
);

In [None]:
sns.relplot(x="timepoint", y="signal",
            # lineplot instead of scatterplot
            kind="line",
            # aggregate with the median and shade +/- one standard deviation;
            # `errorbar=` replaces the `ci=` argument, which is deprecated
            # since seaborn 0.12 (on older versions use ci='sd' instead)
            errorbar='sd', estimator="median",
            data=fmri);

In [None]:
# one panel per `region`, one coloured line per `event`
sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    kind="line",
    col="region",
    hue="event",
);

In [None]:
# same facets as before, but estimator=None turns the aggregation off
sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    kind="line",
    col="region",
    hue="event",
    estimator=None,
);

In [None]:
# no aggregation, and `units` names the column that identifies each
# individual line (here: one per subject)
sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    kind="line",
    col="region",
    hue="event",
    units="subject",
    estimator=None,
);

**How do I access individual axes?**

In [None]:
# keep the returned grid object: its `.axes` array holds one matplotlib
# Axes per panel, which can then be customised like any other Axes
rp = sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    kind="line",
    col="region",
    hue="event",
)
for ax in rp.axes.flat:
    ax.set(xlabel='Changing stuff', ylabel='Also here')
    # crazy thing: base-2 logarithmic y axis
    ax.set_yscale('log', base=2)

In [None]:
# the array of Axes objects held by the grid
rp.axes

In [None]:
# layout of the grid of panels, as the shape of the axes array
rp.axes.shape

In [None]:
# facets can be laid out along rows as well as columns
sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    kind="line",
    estimator=None,
    row="event", col="region",
    hue="subject", palette='hsv',
    height=3,
);

Categorical data: `catplot`
---

Categorical scatterplots:

    stripplot() (with kind="strip"; the default)

    swarmplot() (with kind="swarm")

Categorical distribution plots:

    boxplot() (with kind="box")

    violinplot() (with kind="violin")

    boxenplot() (with kind="boxen")

Categorical estimate plots:

    pointplot() (with kind="point")

    barplot() (with kind="bar")

    countplot() (with kind="count")


In [None]:
# default categorical plot (kind="strip")
sns.catplot(
    data=tips,
    x="day",
    y="total_bill",
    # options to try:
    # jitter=False,
    # jitter=0.4,
);

In [None]:
# swarm variant of the categorical scatterplot
sns.catplot(
    data=tips,
    x="day",
    y="total_bill",
    kind='swarm',
    # jitter=0.4,
);

In [None]:
# colour by `sex`; `order` / `hue_order` fix the order of the categories
sns.catplot(
    data=tips,
    x="day", y="total_bill",
    kind='swarm',
    order=['Sun', 'Sat', 'Fri', 'Thur'],
    hue='sex', hue_order=['Female', 'Male'],
);

In [None]:
# a numeric column (`size`) used as the categorical axis
sns.catplot(
    data=tips,
    x="size",
    y="total_bill",
    kind="swarm",
    # orient='h',
);

In [None]:
# swarm plot split by `sex` -- the cells that follow draw the same
# variables with kind='box', 'boxen' and 'violin'
sns.catplot(
    data=tips,
    kind='swarm',
    x="day", order=['Sun', 'Sat', 'Fri', 'Thur'],
    y="total_bill",
    hue='sex', hue_order=['Female', 'Male'],
);

In [None]:
# box plot version; `notch=True` is an extra option passed with kind='box'
sns.catplot(
    data=tips,
    kind='box',
    notch=True,
    x="day", order=['Sun', 'Sat', 'Fri', 'Thur'],
    y="total_bill",
    hue='sex', hue_order=['Female', 'Male'],
);

In [None]:
# boxen plot version of the same comparison
sns.catplot(
    data=tips,
    kind='boxen',
    x="day", order=['Sun', 'Sat', 'Fri', 'Thur'],
    y="total_bill",
    hue='sex', hue_order=['Female', 'Male'],
);

In [None]:
# violin plot version, with a sequential palette
sns.catplot(
    data=tips,
    kind='violin',
    x="day", order=['Sun', 'Sat', 'Fri', 'Thur'],
    y="total_bill",
    hue='sex', hue_order=['Female', 'Male'],
    palette='Blues',
    # options to try:
    # split=True,
    # bw=0.1,  # bandwidth argument for the underlying kde
    # palette='jet',
);

In [None]:
# violins without inner annotation ...
g = sns.catplot(
    data=tips,
    x="day", y="total_bill",
    kind="violin",
    inner=None,
)
# ... with the individual observations drawn onto the same axes
sns.swarmplot(
    data=tips,
    x="day", y="total_bill",
    color="k", size=3,
    ax=g.ax,
);

In [None]:
# horizontal layout: the categorical variable goes on y
g = sns.catplot(
    data=tips,
    x="total_bill", y="day",
    orient='h',
    kind="box",
    color='xkcd:pale grey',
    height=3, aspect=2,
)
# semi-transparent points drawn onto the same axes as the boxes
sns.swarmplot(
    data=tips,
    x="total_bill", y="day",
    orient='h',
    color="k", alpha=0.7, size=6,
    ax=g.ax,
);

In [None]:
# bar plot of an estimate of `total_bill` per day and sex
sns.catplot(
    data=tips,
    kind='bar',
    x="day", y="total_bill",
    hue='sex',
    # option to try (on seaborn < 0.12 spell it ci='sd'):
    # errorbar='sd', estimator=np.median,
    height=5, aspect=0.7,
);

In [None]:
# point plot version of the same estimate
sns.catplot(
    data=tips,
    kind='point',
    x="day", y="total_bill",
    hue='sex',
    height=5, aspect=0.7,
);

Linear regressions: `lmplot`
---

In [None]:
# scatter of tip vs. total bill with a fitted regression
sns.lmplot(
    data=tips,
    x="total_bill",
    y="tip",
);

In [None]:
# example dataset with a `dataset` column plus `x` / `y` values
anscombe = sns.load_dataset(name="anscombe")
anscombe.head(n=5)

In [None]:
# one regression panel per `dataset`, wrapped onto rows of two panels,
# without a confidence band
sns.lmplot(
    data=anscombe,
    x="x", y="y",
    col='dataset', col_wrap=2,
    ci=None,
    height=4, aspect=1,
    # options to try:
    # order=2,
    # robust=True,
    # lowess=True,
);

In [None]:
# a separate regression per value of `smoker`
sns.lmplot(
    data=tips,
    x="total_bill", y="tip",
    hue="smoker",
    # ci=None,
);

Univariate and bivariate distributions: `histplot`, `jointplot`, `pairplot`
---

In [None]:
# random normal sample; the generator is seeded so that a fresh
# "Restart & Run All" draws the same values and produces the same figure
x = np.random.default_rng(seed=0).normal(size=100)

In [None]:
# histogram of the sample with 20 bins, then strip spines with despine()
sns.histplot(
    data=x,
    bins=20,
    # option to try:
    # kde=True,
    # (histplot has no `rug=` option; add `sns.rugplot(x)` for a rug)
)
sns.despine();

In [None]:
# random gamma sample (shape parameter 6); seeded so that a fresh
# "Restart & Run All" draws the same values and produces the same figure
x = np.random.default_rng(seed=0).gamma(6, size=200)

In [None]:
# histogram with a kernel density estimate drawn on top
sns.histplot(data=x, kde=True);

In [None]:
# generate a bivariate normal sample; the generator is seeded so that a
# fresh "Restart & Run All" draws the same 200 points
mean, cov = [0, 1], [(1, .5), (.5, 1)]
data = np.random.default_rng(seed=0).multivariate_normal(mean, cov, 200)
# two named columns so the sample can be addressed as x / y when plotting
df = pd.DataFrame(data, columns=["x", "y"])

In [None]:
# joint view of the two columns
sns.jointplot(
    data=df,
    x="x",
    y="y",
    # options to try:
    # kind='hex',
    # kind='kde',
);

In [None]:
# yet another example dataset; show five randomly chosen rows
iris = sns.load_dataset(name="iris")
iris.sample(n=5)

In [None]:
# pairwise plots across the columns of the table
sns.pairplot(
    data=iris,
    # option to try:
    # hue='species',
);

Heatmaps, clustermaps
---

- Heatmap `sns.heatmap` example: http://seaborn.pydata.org/examples/many_pairwise_correlations.html
- Clustermap `sns.clustermap` example: http://seaborn.pydata.org/examples/structured_heatmap.html

In [None]:
# example dataset with `year`, `month` and `passengers` columns
df = sns.load_dataset(name='flights')

In [None]:
# bare last expression: display the table
df

In [None]:
# heatmaps need a wide (pivoted) table rather than the tidy one:
# one row per year, one column per month, passenger counts as values
# (the variable keeps the name `long` because later cells refer to it)
long = df.pivot_table(
    values='passengers',
    index='year',
    columns='month',
)

In [None]:
# create the figure explicitly so its size can be chosen
fig, ax = plt.subplots(figsize=(7, 5))

sns.heatmap(data=long,
            cmap='viridis',
            # draw on the axes created above rather than relying on
            # matplotlib's implicit "current axes"
            ax=ax,
#             vmin=0, vmax=650,
#             robust=True,
#             square=True,
#             linewidths=1,
#             annot=True,
            );

In [None]:
# clustered version of the heatmap; clustermap builds its own figure,
# hence the `figsize` argument
sns.clustermap(
    data=long,
    cmap='viridis',
    figsize=(6, 6),
    # options to try:
    # vmin=0, vmax=650,
    # robust=True,
    # linewidths=1,
    # annot=True,
);

In [None]:
# rebind `df` to the tips example dataset
df = sns.load_dataset(name='tips')

In [None]:
# bare last expression: display the table
df

In [None]:
# wide table: one row per day, one column per `size`, aggregated tip as values
long = df.pivot_table(
    values='tip',
    index='day',
    columns='size',
)

In [None]:
# NOTE(review): presumably some day/size combinations are missing from the
# pivot, in which case this call may fail on the NaNs; the following cells
# fill them with 0 and retry -- confirm this cell is meant as a demonstration
sns.clustermap(
    data=long,
    cmap='viridis',
    figsize=(4, 4),
    # options to try:
    # robust=True,
    # linewidths=1,
    # annot=True,
);

In [None]:
# replace missing (NaN) cells with 0; `fillna` returns a new frame instead
# of mutating the pivot in place through a boolean mask
long = long.fillna(0)

In [None]:
# cluster the rows only: col_cluster=False switches column clustering off
sns.clustermap(
    data=long,
    cmap='viridis',
    col_cluster=False,
    figsize=(4, 4),
    # options to try:
    # robust=True,
    # linewidths=1,
    # annot=True,
);

In [None]:
# pairwise correlation between the columns of the pivoted table
corr = long.corr()

In [None]:
# correlation matrix as a clustered heatmap, colour scale centred on 0
sns.clustermap(
    data=corr,
    cmap='vlag', center=0,
    figsize=(4, 4),
    # options to try:
    # vmin=-1, vmax=1,
    # robust=True,
    # linewidths=1,
    # annot=True,
);

Last: changing styles
---

In [None]:
# names of the matplotlib styles that can be passed to plt.style.context / plt.style.use
plt.style.available

In [None]:
# Draw the same regression plot once per available matplotlib style,
# using the style name as the panel title.
for style in plt.style.available:
    with plt.style.context(style):
        lp = sns.lmplot(x="total_bill", y="tip", hue="smoker",
                        ci=None,
                        data=tips)
        lp.ax.set_title(style)
        # Render inside the `with` block so that style settings which are
        # only read at draw time still apply to this figure.
        plt.show()
        # Release the figure: one is created per style, and leaving them
        # all open wastes memory.
        plt.close(lp.ax.figure)

---


Exercises
---------

Using the data from this URL: https://evocellnet.github.io/ecoref/data/phenotypic_data.tsv

Make a histogram with the distribution of s-scores across all conditions. Play with the number of bins or other distribution plot types to find the best representation.

Make a boxplot to show the distribution of s-scores for 10 random conditions

Make a barplot with the number of growth defect phenotypes for each condition

Can you think of another plot type to show the data from a single condition?

Pick one of the following datasets (which you can load using the `load_dataset` function) and create a plot showing the correlation between numerical variables. Which variable pair has the highest correlation and what is its value?

Available datasets:

* `geyser`: duration of eruptions of the ["Old-faithful" geyser](https://en.wikipedia.org/wiki/Old_Faithful)
* `titanic`: characteristics of survivors vs. victims of the Titanic disaster
* `taxis`: various taxi rides, when they happened, fares, distance, etc...

**Note:** the last dataset also has variables with dates. These have a special type in pandas, which could be fun to work with for advanced users