[communicate-plots](https://aeturrell.github.io/python4DS/communicate-plots.html)

In [2]:
from lets_plot import *
from lets_plot.mapping import as_discrete
import pandas as pd
import numpy as np

# Initialise lets-plot's HTML output so that plots built in later cells are
# displayed inline in the notebook (see the lets-plot docs for options).
LetsPlot.setup_html()

In [3]:
# Download the ggplot2 `mpg` dataset from the Rdatasets mirror.
# The first CSV column is used as the DataFrame index rather than as data.
mpg = pd.read_csv(
    filepath_or_buffer="https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/mpg.csv",
    index_col=0,
)

In [4]:
# Baseline, unlabelled scatter plot of `hwy` against `displ`.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point()
)

使用标题显示我们发现的数据间的关系，而不是再次描述图表中的内容。

- add a title that summarises the main finding you’d like the viewer to take away (as opposed to one just describing the obvious!)
- add a subtitle that provides more info on the y-axis, and make the x-label more understandable
- remove the y-axis label that is at an awkward viewing angle
- add a caption with the source of the data

In [5]:
# Scatter coloured by vehicle class with a LOESS trend line. The labels are
# chosen so the title states the finding and the subtitle explains the y-axis.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(colour="class"))
    + geom_smooth(method="loess", se=False, size=1)
    + labs(
        x="Engine displacement (litres)",
        y="",  # empty y-axis title: the subtitle carries that information
        title="Fuel efficiency generally decreases with engine size",
        subtitle="Highway fuel efficiency (miles per gallon)",
        caption="Source: fueleconomy.gov",
    )
)

In [6]:
# Alternatively, tailor every label to our own needs: explicit axis titles,
# a renamed colour legend, and a subtitle that explains the exception.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(colour="class"))
    + geom_smooth(method="loess", se=False, size=1)
    + labs(
        title="Fuel efficiency generally decreases with engine size",
        subtitle="Two seaters (sports cars) are an exception because of their light weight",
        caption="Source: fueleconomy.gov",
        x="Engine displacement (L)",
        y="Highway fuel economy (mpg)",
        colour="Car type",  # title shown above the colour legend
    )
)

# Annotations

以自定义的方式对数据进行标签显示，标签的数据来源有两类：
- 来自于数据本身，使用 `geom_text` 进行显示
- 直接使用 `geom_label` 进行显示

除了上面两种之外，还有 `geom_hline`、`geom_vline`、`geom_rect`、`geom_segment`，这些都可以用来在图上添加注释（参考线、矩形区域、线段或箭头）。

In [7]:
# Readable names for the drive-train codes stored in `mpg["drv"]`.
mapping = {"4": "4-wheel drive", "f": "front-wheel drive", "r": "rear-wheel drive"}

# One row per drive type: mean `hwy` and mean `displ` (rounded to two
# decimals) plus the readable name. These means later serve as the x/y
# positions of the text labels.
label_info = (
    mpg.groupby("drv", as_index=False)
    .agg(hwy=("hwy", "mean"), displ=("displ", "mean"))
    .assign(drive_type=lambda per_drive: per_drive["drv"].map(mapping))
    .round(2)
)
label_info

Unnamed: 0,drv,hwy,displ,drive_type
0,4,19.17,4.0,4-wheel drive
1,f,28.16,2.56,front-wheel drive
2,r,21.0,5.18,rear-wheel drive


In [8]:
# Colour points and LOESS fits by drive type, then label each group directly
# with bold text positioned at the group means held in `label_info`.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy", color="drv"))
    + geom_point(alpha=0.5)
    + geom_smooth(method="loess", se=False)
    + geom_text(
        mapping=aes(x="displ", y="hwy", label="drive_type"),
        data=label_info,
        size=8,
        fontface="bold",
        hjust="left",
        vjust="bottom",
    )
    + theme(legend_position="none")  # direct labels replace the legend
)

In [9]:
# geom_label, mode 1: the label text comes from a column of the data.

# Rows worth calling out: highway mileage above 40, or above 20 despite an
# engine displacement larger than 5.
potential_outliers = mpg.query("hwy > 40 | (hwy > 20 & displ > 5)")

(
    ggplot(mpg, aes(x="displ", y="hwy"))
    + geom_point(color="black")
    + geom_smooth(se=False, method="loess", color="black")
    # Re-draw the selected rows in red on top of the black points.
    + geom_point(data=potential_outliers, color="red")
    + geom_label(
        aes(label="model"),
        data=potential_outliers,
        color="red",
        # Jitter nudges the labels apart; the fixed seed keeps their placement
        # identical on every Restart & Run All instead of changing per run.
        position=position_jitter(seed=42),
        fontface="bold",
        size=5,
        hjust="left",
        vjust="bottom",
    )
    + theme(legend_position="none")
)

In [10]:
# geom_label, mode 2: a free-form note placed at fixed plot coordinates.

import textwrap

# Wrap the note to at most 30 characters per line so it spans several lines.
trend_text = textwrap.fill(
    "Larger engine sizes tend to have lower fuel economy.",
    width=30,
)

(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point()
    # The note itself, anchored by its left edge at (3.5, 38).
    + geom_label(x=3.5, y=38, label=trend_text, color="red", hjust="left")
    # A red arrow running from (2, 40) to (5, 25).
    + geom_segment(
        x=2,
        y=40,
        xend=5,
        yend=25,
        color="red",
        arrow=arrow(type="closed"),
    )
)

# Scales

In [11]:
# Default scales: no scale_* call is given for x, y or colour.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(color="class"))
)

`lets-plot` 自动以以下方式进行处理：
```
(
    ggplot(mpg, aes(x="displ", y="hwy")) +
    geom_point(aes(color="class")) +
    scale_x_continuous() +
    scale_y_continuous() +
    scale_color_discrete()
)
```

当然用户也可以对上述的 `scale_` 进行替换。主要有以下两种：
- breaks
- labels

In [12]:
# Override the y-axis ticks: one every 5 units, from 15 up to (not including) 40.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy", color="drv"))
    + geom_point()
    + scale_y_continuous(breaks=np.arange(start=15, stop=40, step=5))
)

In [13]:
# Relabel the colour legend for drive type.
#
# Each label is paired with an explicit break so the text is tied to its
# `drv` code. With `labels` alone, labels are matched by position to the
# scale's values, and for a plain string column that order is presumably the
# order of first appearance in the data (NOTE(review): confirm against the
# lets-plot docs), which would attach "4-wheel" to the wrong group.
(
    ggplot(mpg, aes(x="displ", y="hwy", color="drv"))
    + geom_point()
    + scale_color_discrete(
        breaks=["4", "f", "r"],
        labels=["4-wheel", "front", "rear"],
    )
)

改变刻度标签的显示格式可以使用 `format=` 参数。以下以钻石数据为例。

In [14]:
# Load the ggplot2 `diamonds` dataset and, in the same step, turn `cut` and
# `color` into ordered categoricals with an explicit category order.
diamonds = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv",
    index_col=0,
).astype(
    {
        "cut": pd.CategoricalDtype(
            categories=["Fair", "Good", "Very Good", "Premium", "Ideal"],
            ordered=True,
        ),
        "color": pd.CategoricalDtype(categories=list("DEFGHIJ"), ordered=True),
    }
)

In [15]:
# Box plot of price per cut, flipped so the cuts run along the vertical axis.
# The price axis gets ticks every 6000 (0 to 18000) and its tick labels are
# rendered with the "$.2s" format string.
(
    ggplot(data=diamonds, mapping=aes(x="cut", y="price"))
    + geom_boxplot()
    + coord_flip()
    + scale_y_continuous(
        breaks=np.arange(start=0, stop=19000, step=6000),
        format="$.2s",
    )
)

In [16]:
# Another use of `breaks`: when there are only a few observations, place a
# tick at each one to highlight exactly where they fall.
presidential = (
    pd.read_csv(
        "https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/presidential.csv",
        index_col=0,
    )
    # Parse both term boundaries as timestamps.
    .astype({"start": "datetime64[ns]", "end": "datetime64[ns]"})
    # Derive an id by offsetting the row index by 33.
    .assign(id=lambda terms: 33 + terms.index)
)
presidential.head()

Unnamed: 0_level_0,name,start,end,party,id
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Eisenhower,1953-01-20,1961-01-20,Republican,34
2,Kennedy,1961-01-20,1963-11-22,Democratic,35
3,Johnson,1963-11-22,1969-01-20,Democratic,36
4,Nixon,1969-01-20,1974-08-09,Republican,37
5,Ford,1974-08-09,1977-01-20,Republican,38


In [17]:
# One horizontal segment per row, running from `start` to `end` at height
# `id`, with an x-axis tick placed at every `start` value.
(
    ggplot(data=presidential, mapping=aes(x="start", y="id"))
    + geom_point()
    + geom_segment(aes(xend="end", yend="id"))
    + scale_x_datetime(breaks=presidential["start"])
)

另外一个控制 `legend` 的就是 `theme`。`theme` 可以控制非数据的部分。

In [18]:
# Shared scatter; the four panels differ only in where the legend is placed.
base = ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) + geom_point(aes(color="class"))

# Legend at the side of the panel.
p1 = base + theme(legend_position="right")  # the default
p2 = base + theme(legend_position="left")

# Legend above or below the panel, laid out over three rows.
p3 = base + theme(legend_position="top") + guides(color=guide_legend(nrow=3))
p4 = base + theme(legend_position="bottom") + guides(color=guide_legend(nrow=3))

# Setting legend_position to "none" hides the legend altogether.

gggrid(plots=[p1, p2, p3, p4], ncol=2)

In [19]:
# 2-D binned counts of price against carat, with both axes on a log10 scale.
(
    ggplot(data=diamonds, mapping=aes(x="carat", y="price"))
    + geom_bin2d()
    + scale_x_log10()
    + scale_y_log10()
)

In [20]:
# Seeded generator so the sample below is reproducible.
prng = np.random.default_rng(1837)  # prng = pseudo-random number generator

# 1000 draws from a 2-D standard normal, one column per axis.
df_rnd = pd.DataFrame(
    data=prng.standard_normal(size=(1000, 2)),
    columns=["x", "y"],
)

# Binned counts with a fixed aspect ratio, filled with the "plasma" palette.
(
    ggplot(data=df_rnd, mapping=aes(x="x", y="y"))
    + geom_bin2d()
    + coord_fixed()
    + scale_fill_viridis(option="plasma")
    + labs(title="Plasma, continuous")
)

## zooming

In [21]:
# Zoom, approach 1: filter the data before plotting.
# Keep rows with 5 <= displ <= 6 and 10 <= hwy <= 25 (both ends inclusive).
mpg_condition = mpg["displ"].between(5, 6) & mpg["hwy"].between(10, 25)

(
    ggplot(data=mpg.loc[mpg_condition], mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(color="drv"))
    + geom_smooth(method="loess")
)

In [22]:
# Zoom, approach 2: plot the full data but set limits on both scales.
# NOTE(review): scale limits are expected to discard out-of-range points
# before the smoother is fitted; confirm against the lets-plot docs.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(color="drv"))
    + geom_smooth(method="loess")
    + scale_x_continuous(limits=(5, 6))
    + scale_y_continuous(limits=(10, 25))
)

In [23]:
# Zoom, approach 3: plot the full data and restrict the visible window
# through the coordinate system rather than through the scales.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(color="drv"))
    + geom_smooth(method="loess")
    + coord_cartesian(xlim=(5, 6), ylim=(10, 25))
)

# Themes

In [24]:
# Apply a complete built-in theme (`theme_grey`) to a class-coloured scatter
# with a smoothed trend line and no confidence band.
(
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(aes(color="class"))
    + geom_smooth(se=False)
    + theme_grey()
)

In [25]:
# Density of `displ` per drive type, styled by overriding individual theme
# elements instead of applying a complete theme.
(
    ggplot(data=mpg, mapping=aes(x="displ", color="drv"))
    + geom_density(size=2)
    + ggtitle("Density of drives")
    + theme(
        # Axes: thick axis lines, long ticks, no y-axis title.
        axis_line=element_line(size=4),
        axis_ticks_length=10,
        axis_title_y="blank",
        # Panel: light grey background with a black border and black grid.
        panel_background=element_rect(color="black", fill="#eeeeee", size=2),
        panel_grid=element_line(color="black", size=1),
        # Legend: position and justification both set to [1, 1].
        legend_position=[1, 1],
        legend_justification=[1, 1],
    )
)

# Layout

In [26]:
# Two independent plots combined into a single grid figure.
p1 = (
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point()
    + labs(title="Plot 1")
)
p2 = (
    ggplot(data=mpg, mapping=aes(x="drv", y="hwy"))
    + geom_boxplot()
    + labs(title="Plot 2")
)
gggrid(plots=[p1, p2])

# Saving plots

In [27]:
# Write `p1` to "chart.svg" in the current working directory; the returned
# value is displayed as the cell output.
ggsave(plot=p1, filename="chart.svg", path=".")

'E:\\sourcecode\\py\\ml_py_learning\\data_viz\\py_4_ds\\chart.svg'