In [None]:
import altair as alt
from vega_datasets import data
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Chapter 3

### Exercise 1
Check the contents of the Vega dataset co2_concentration.json. Plot it with the x variable ‘Date’ as quantitative and the y variable ‘CO2’ as quantitative. What happens? Now try using the date as a temporal variable. What happens if you use the date as a categorical variable?

In [None]:
# Load the CO2 concentration dataset and preview its first rows.
co2 = data.co2_concentration()
co2.head(n=5)

In [None]:
# Date encoded as quantitative. Author's observation: nothing is plotted,
# because the date is not a quantitative value.
alt.Chart(co2).mark_line().encode(
    alt.X("Date:Q"),
    alt.Y("CO2:Q"),
)

In [None]:
# Date encoded as temporal. Author's observation: the line shows a zig-zag pattern.
alt.Chart(co2).mark_line().encode(
    alt.X("Date:T"),
    alt.Y("CO2:Q"),
)

In [None]:
# Date encoded as nominal (categorical). Author's observation: the plot becomes very long.
alt.Chart(co2).mark_line().encode(
    alt.X("Date:N"),
    alt.Y("CO2:Q"),
)

### Exercise 2
Create a scatterplot using the dataset gapminder.json with the X axis as the population, and the Y axis as the life expectancy. Now use the X axis as the life expectancy, and the Y axis as the fertility rate.

In [None]:
# Reference the gapminder dataset by URL; later cells reuse `gapminder`.
gapminder = data.gapminder.url

# Scatterplot: population on X, life expectancy on Y.
alt.Chart(gapminder).mark_point().encode(
    alt.X("pop:Q"),
    alt.Y("life_expect:Q"),
)

In [None]:
# Scatterplot: life expectancy on X, fertility rate on Y.
alt.Chart(gapminder).mark_point().encode(
    alt.X("life_expect:Q"),
    alt.Y("fertility:Q"),
)

### Exercise 3
Compare the previous chart with another one where the marks are depicted as circles. Make another experiment with the charts as squares.

In [None]:
# Use the dedicated circle mark (filled circles). mark_point(shape="circle")
# is just the point mark's default shape, so it would reproduce the previous
# chart unchanged and leave nothing to compare.
alt.Chart(gapminder).mark_circle().encode(
    x="life_expect:Q",
    y="fertility:Q",
)

In [None]:
# Same scatterplot with the point marks drawn as squares.
alt.Chart(gapminder).mark_point(shape="square").encode(
    alt.X("life_expect:Q"),
    alt.Y("fertility:Q"),
)

### Exercise 4
With the previous example, now encode in the circle mark the population of the country. Compare the sizes using circles, squares, and points.

In [None]:
# Filled circles whose size encodes the population.
alt.Chart(gapminder).mark_point(filled=True, shape="circle").encode(
    alt.X("life_expect:Q"),
    alt.Y("fertility:Q"),
    alt.Size("pop:Q"),
)

### Exercise 5
The last plot shows all the data in a single chart, but this includes every year. In order to make sense of them, color code the years.

In [None]:
# Same chart, with the year mapped to color.
alt.Chart(gapminder).mark_point(filled=True, shape="circle").encode(
    alt.X("life_expect:Q"),
    alt.Y("fertility:Q"),
    alt.Size("pop:Q"),
    alt.Color("year:Q"),
)

# Chapter 5.1

### Exercise 1
Use the Vega dataset wheat.json. Plot it with the x variable ‘year’ as temporal and the y variable ‘wheat’ as quantitative. Use a line plot, with a dashed pink line of width 3.

In [None]:
# Reference the wheat dataset by URL; later cells reuse `wheat`.
wheat = data.wheat.url

# Dashed (7 on, 3 off) pink line of width 3.
alt.Chart(wheat).mark_line(color="pink", strokeWidth=3, strokeDash=[7, 3]).encode(
    alt.X("year:T"),
    alt.Y("wheat:Q"),
)

### Exercise 2
Use the same dataset to produce a scatterplot with gray triangles filled with blue.

In [None]:
# Triangle marks: gray mark color with a blue fill.
alt.Chart(wheat).mark_point(shape="triangle", fill="blue", color="gray").encode(
    alt.X("year:T"),
    alt.Y("wheat:Q"),
)

### Exercise 3
Increase the size of the previous triangles.

In [None]:
# Same triangles, drawn with an explicit mark size of 50.
alt.Chart(wheat).mark_point(
    shape="triangle",
    size=50,
    fill="blue",
    color="gray",
).encode(
    alt.X("year:T"),
    alt.Y("wheat:Q"),
)

### Exercise 4
With the cars dataset, render a scatterplot of the acceleration versus the horsepower, with the points encoded as red crosses with black stroke, and their size proportional to the horsepower.


In [None]:
# Reference the cars dataset by URL; later cells reuse `cars`.
cars = data.cars.url

# Cross marks with a red fill and black mark color, sized by horsepower.
alt.Chart(cars).mark_point(shape="cross", fill="red", color="black").encode(
    alt.X("Acceleration:Q"),
    alt.Y("Horsepower:Q"),
    alt.Size("Horsepower:Q"),
)

### Exercise 5
Modify the previous plot so that the marks are circles with an opacity of 0.25 and the outline is the same color as the fill color.

In [None]:
cars = data.cars.url

# Filled red circles at 25% opacity, sized by horsepower.
alt.Chart(cars).mark_point(
    shape="circle",
    filled=True,
    color="red",
    opacity=0.25,
).encode(
    alt.X("Acceleration:Q"),
    alt.Y("Horsepower:Q"),
    alt.Size("Horsepower:Q"),
)

# Chapter 5.2

### Exercise 1
Use the cars Vega dataset and plot a bar chart that counts the number of cars with each quantity of cylinders.

In [None]:
# Bar chart: number of records for each cylinder count.
alt.Chart(cars).mark_bar().encode(
    x="Cylinders:N",
    y=alt.Y("count():Q"),
)

### Exercise 2
Use the cars Vega dataset and plot a bar chart that shows the maximum displacement of the cars from each origin.

In [None]:
# Bar chart: maximum displacement per origin.
alt.Chart(cars).mark_bar().encode(
    alt.X("Origin:N"),
    alt.Y("max(Displacement):Q"),
)

### Exercise 4
Use the cars Vega dataset and plot a chart that shows the average miles per gallon of the cars from each origin.

In [None]:
# Bar chart: mean miles per gallon per origin.
alt.Chart(cars).mark_bar().encode(
    alt.X("Origin:N"),
    alt.Y("mean(Miles_per_Gallon):Q"),
)

### Exercise 5
Use the stocks Vega dataset and plot a bar chart that shows the average stock price for each company.

In [None]:
# Reference the stocks dataset by URL; the next cell reuses `stocks`.
stocks = data.stocks.url

# The exercise asks for the *average* price per company, so aggregate with
# mean() (the previous max() showed each company's peak price instead).
alt.Chart(stocks).mark_bar().encode(
    x="symbol:N",
    y="mean(price):Q",
)

### Exercise 6
Use the stocks Vega dataset and plot a chart that shows the average stock price for each company per year.

In [None]:
# Derive a `year` field from each record's date, then plot the mean price
# per year with one line per company symbol.
alt.Chart(stocks).transform_calculate(
    year="year(datum.date)"
).mark_line().encode(
    alt.X("year:N"),
    alt.Y("mean(price):Q"),
    alt.Color("symbol:N"),
)

### Exercise 7
With the cars Vega dataset, plot a histogram of the cars with different miles per gallon, stacked per origin, inverting the current sorting.

In [None]:
# Histogram of miles per gallon (binned), stacked by origin with the
# color sort order set to descending.
alt.Chart(cars).mark_bar().encode(
    alt.X("Miles_per_Gallon:Q", bin=True),
    alt.Y("count():Q"),
    alt.Color("Origin:N", sort="descending"),
)

### Exercise 8
Create a line plot that shows the evolution of the barley yield at the different sites from the barley dataset.

In [None]:
# Reference the barley dataset by URL; later cells reuse `barley`.
barley = data.barley.url

# Aggregate to one mean yield per site and year. The dataset also has a
# `variety` column (see Exercise 9), so each site/year presumably holds several
# rows; plotting the raw yield would join them into vertical zig-zags
# instead of a single evolution line per site.
alt.Chart(barley).mark_line().encode(
    x="year:T",
    y="mean(yield):Q",
    color="site:N"
)

### Exercise 9
Create a bar chart with the maximum yield value of each variety using the barley dataset.

In [None]:
# Bar chart: maximum yield for each variety.
alt.Chart(barley).mark_bar().encode(
    alt.X("variety:N"),
    alt.Y("max(yield):Q"),
)

### Exercise 10
Create a bar chart with the average yield value for each site using the barley dataset.

In [None]:
# Bar chart: mean yield for each site.
alt.Chart(barley).mark_bar().encode(
    alt.X("site:N"),
    alt.Y("mean(yield):Q"),
)

### Exercise 11
Create a line chart using the cars dataset, with the average displacement of the cars per year, with different lines for each origin.


In [None]:
# Line chart: mean displacement per year, one line per origin.
alt.Chart(cars).mark_line().encode(
    alt.X("Year:T"),
    alt.Y("mean(Displacement):Q"),
    alt.Color("Origin:N"),
)

### Exercise 12
Create a line chart using the cars dataset, with the average miles per gallon per year.

In [None]:
# Line chart: mean miles per gallon per year.
alt.Chart(cars).mark_line().encode(
    alt.X("Year:T"),
    alt.Y("mean(Miles_per_Gallon):Q"),
)

### Exercise 13
Improve the previous chart by adding the confidence intervals of the miles per gallon. Plot both charts together.

In [None]:
# Mean miles per gallon per year.
main = alt.Chart(cars).mark_line().encode(
    alt.X("Year:T"),
    alt.Y("mean(Miles_per_Gallon):Q"),
)

# Semi-transparent band between the ci0 and ci1 aggregates of miles per gallon.
ci = alt.Chart(cars).mark_area(opacity=0.5).encode(
    alt.X("Year:T"),
    alt.Y("ci0(Miles_per_Gallon):Q"),
    alt.Y2("ci1(Miles_per_Gallon):Q"),
)

# Layer both charts in one view (equivalent to `main + ci`).
alt.layer(main, ci)

### Exercise 14
Create another line chart, also with the cars dataset, with the average horsepower per year, with confidence intervals, separated per origin, and render both of them separately.

In [None]:
# Mean horsepower per year, one line per origin.
main = alt.Chart(cars).mark_line().encode(
    alt.X("Year:T"),
    alt.Y("mean(Horsepower):Q"),
    alt.Color("Origin:N"),
)

# Band between the ci0 and ci1 aggregates of horsepower, one band per origin.
ci = alt.Chart(cars).mark_area(opacity=0.5).encode(
    alt.X("Year:T"),
    alt.Y("ci0(Horsepower):Q"),
    alt.Y2("ci1(Horsepower):Q"),
    alt.Color("Origin:N"),
)

# Stack the two charts vertically (equivalent to `main & ci`).
alt.vconcat(main, ci)

### Exercise 15
Render both charts on top of each other. Resolve the ambiguities.

In [None]:
# Mean horsepower per year, one line per origin.
main = alt.Chart(cars).mark_line().encode(
    alt.X("Year:T"),
    alt.Y("mean(Horsepower):Q"),
    alt.Color("Origin:N"),
)

# Band between the ci0 and ci1 aggregates of horsepower, one band per origin.
ci = alt.Chart(cars).mark_area(opacity=0.5).encode(
    alt.X("Year:T"),
    alt.Y("ci0(Horsepower):Q"),
    alt.Y2("ci1(Horsepower):Q"),
    alt.Color("Origin:N"),
)

# Layer both charts and give each layer its own color scale.
alt.layer(main, ci).resolve_scale(color="independent")

### Exercise 16
Render the first three columns of the employment dataset with three different colors.

In [None]:
# Load the US employment dataset; later cells reuse `unemployment`.
unemployment = data.us_employment()

# Show the first three columns, giving each one its own background color.
(
    unemployment
    .iloc[:, :3]
    .style
    .set_properties(subset=["month"], **{"background-color": "blue"})
    .set_properties(subset=["nonfarm"], **{"background-color": "red"})
    .set_properties(subset=["private"], **{"background-color": "yellow"})
)

### Exercise 17
Add a rule on the average of the employment values for those data.

In [None]:
# Reshape the first four columns to long form: one row per (month, series).
df = unemployment.iloc[:, :4].melt(
    id_vars="month",
    var_name="Produce",
    value_name="amount",
)

# One line per series.
base = alt.Chart(df).mark_line().encode(
    alt.X("month:T"),
    alt.Y("amount:Q"),
    alt.Color("Produce:N"),
)

# Dashed red horizontal rule at the mean of all plotted amounts.
rule = alt.Chart(df).mark_rule(strokeDash=[4, 2], color="red").encode(
    alt.Y("mean(amount):Q"),
)

alt.layer(base, rule)

### Exercise 18
Plot the monthly average of all the employment types of the employment dataset.

In [None]:
# Long form of every column except `month`: one row per (month, series).
df = unemployment.melt(id_vars="month", var_name="Produce", value_name="amount")

# Mean amount across all series for each year-month; the Y domain is fixed
# to 0-150000 and the line is clipped to that range.
alt.Chart(df).mark_line(clip=True).encode(
    alt.X("yearmonth(month):T"),
    alt.Y("mean(amount):Q", scale=alt.Scale(domain=(0, 150000))),
)

### Exercise 19
Plot as a rule the average of all the employment types of the employment dataset.

In [None]:
# Zero-width line: contributes only the temporal X axis to the layered chart.
base = alt.Chart(df).mark_line(strokeWidth=0).encode(
    alt.X("month:T"),
)

# Dashed red rule at the overall average amount, on a fixed 0-150000 Y domain.
rule = alt.Chart(df).mark_rule(strokeDash=[4, 2], color="red").encode(
    alt.Y("average(amount):Q", scale=alt.Scale(domain=(0, 150000))),
)

alt.layer(base, rule)

### Exercise 20
Use the dataset “Nivell academic de la població per sexe de la ciutat de Barcelona” of year 2018 from the BCN Open Data web, and show the number of people in the city of each academic degree.

In [None]:
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here.

# Lift Altair's row limit for in-memory DataFrames before charting.
alt.data_transformers.disable_max_rows()

# 2018 file of the BCN Open Data education-level-by-sex dataset. The URL gets
# its own name so that `academic` only ever refers to the DataFrame.
academic_url = "https://opendata-ajuntament.barcelona.cat/resources/bcn/EstadisticaPadro/pad/2018/2018_pad_mdbas_niv-educa-esta_sexe.csv"

# Raw table; later cells filter `academic` again, so it is left untouched.
academic = pd.read_csv(academic_url)

# Drop rows whose count is the ".." placeholder
# (presumably marks unavailable values; confirm against the dataset docs).
df = academic[academic["Valor"] != ".."]

# Total number of people per education level.
alt.Chart(df).mark_bar().encode(
    x="NIV_EDUCA_esta:N",
    y="sum(Valor):Q"
)


### Exercise 21
Show the number of people holding each academic degree (“Nivell academic”) of the Horta-Guinardó neighborhood for a concrete month.

In [None]:
# Keep rows with a real count (not "..") that belong to Horta-Guinardó.
df = academic[
    (academic["Valor"] != "..")
    & (academic["Nom_Districte"] == "Horta-Guinardó")
]

# List the distinct reference dates present in the filtered data.
df["Data_Referencia"].unique()

# Not possible to do as we only have one date

### Exercise 22
Compare the higher academic degrees (high school and university) of each neighborhood in Barcelona for a certain year.

In [None]:
# Education-level codes to compare
# (presumably high school and university; confirm against the dataset docs).
values = [4, 5]

# Drop ".." placeholder counts, then keep only the selected levels.
df = academic[academic["Valor"] != ".."]
df = df[df["NIV_EDUCA_esta"].isin(values)]

# One small bar chart per district, one bar per education level. The color
# range lists exactly one color per domain value (the previous range carried
# a third, unused color).
alt.Chart(df).mark_bar().encode(
    x=alt.X("NIV_EDUCA_esta:N", axis=alt.Axis(title='', labels=False)),
    y="sum(Valor):Q",
    color=alt.Color("NIV_EDUCA_esta:N",
                   scale=alt.Scale(
            domain=values,
            range=["blue", "green"])),
    column="Nom_Districte:N"
)

### Exercise 23
Plot the evolution of the academic degree for men and women in Barcelona.

In [None]:
def l(year):
    """Return the URL of the education-level-by-sex CSV for the given year."""
    root = "https://opendata-ajuntament.barcelona.cat/resources/bcn/EstadisticaPadro/pad"
    return f"{root}/{year}/{year}_pad_mdbas_niv-educa-esta_sexe.csv"

first_year = 1997

# Download one CSV per year, from first_year + 1 through 2023, then stack
# the yearly tables row-wise into a single DataFrame.
dfs = list(map(pd.read_csv, map(l, range(first_year + 1, 2024))))

df = pd.concat(dfs)

In [None]:
# Education-level codes included in the chart.
education = [4, 5, 6]

# Drop ".." placeholder counts, then keep only the selected levels.
dff = df[df["Valor"] != ".."]
dff = dff[dff["NIV_EDUCA_esta"].isin(education)]

# Stacked area of the total count per year, split by sex.
# The year is taken with the `year(...)` time unit on the temporal field
# Data_Referencia. The earlier version calculated a plain number (e.g. 2018)
# and encoded it as temporal, which a time scale reads as milliseconds since
# 1970 rather than as a calendar year.
alt.Chart(dff).mark_area().encode(
    x=alt.X("year(Data_Referencia):T", axis=alt.Axis(title='Year')),
    y="sum(Valor):Q",
    color=alt.Color("SEXE:N")
)

### Exercise 24
Plot the evolution of the academic degree for men and women in a selected neighborhood in Barcelona.

In [None]:
# District to show, and the education-level codes included in the chart.
neighborhood = "Horta-Guinardó"
education = [4, 5, 6]

# Drop ".." placeholder counts, keep the selected levels and the district.
dff = df[df["Valor"] != ".."]
dff = dff[dff["NIV_EDUCA_esta"].isin(education)]
dff = dff[dff["Nom_Districte"] == neighborhood]

# Stacked area of the total count per year, split by sex.
# The year is taken with the `year(...)` time unit on the temporal field
# Data_Referencia. The earlier version calculated a plain number (e.g. 2018)
# and encoded it as temporal, which a time scale reads as milliseconds since
# 1970 rather than as a calendar year.
alt.Chart(dff).mark_area().encode(
    x=alt.X("year(Data_Referencia):T", axis=alt.Axis(title='Year')),
    y="sum(Valor):Q",
    color=alt.Color("SEXE:N")
).properties(
    title=f"Academic degree for men and women in {neighborhood}"
)

### Exercise 25
Show how the university degree has evolved in Barcelona.

In [None]:
# Not solved separately: same approach as Exercise 23, keeping only the
# university-level value of NIV_EDUCA_esta.