# Musée National des Beaux-Arts du Québec

## 1. Setup

### Loading csv file

In [191]:
using DataFrames, CSV, CairoMakie, Makie, CategoricalArrays, StatsBase

CairoMakie.activate!()

df = DataFrame(CSV.File("./collection-beauxarts.csv"));

select!(df,
    :"Numéro inventaire" => :id,
    :"Département/Collection" => :departement,
    :"Collection  " => :collection,
    :Catégorie => :category,
    :Objet => :object,
    :Artiste => :artist,
    :Titre => :title,
    :Date => :date,
    :"Lieu de production" => :creation_place,
    :Culture => :culture,
    :Technique => :technics,
    :Dimensions => :dimensions,
    :"Nombre objets" => :nb_objects,
    :Sujet => :subject,
    :Description => :description,
    :"Date de mise à jour" => :last_updated
);
names(df)

16-element Vector{String}:
 "id"
 "departement"
 "collection"
 "category"
 "object"
 "artist"
 "title"
 "date"
 "creation_place"
 "culture"
 "technics"
 "dimensions"
 "nb_objects"
 "subject"
 "description"
 "last_updated"

## 2. Data cleaning

In [192]:
# create new columns based on original ones
select!(df, :, 
    :id => ByRow(s -> (
            res = match(r"\d{4}", string(s));
            isnothing(res) ? missing : parse(Int, res.match)
        )
    ) => :acquisition_date,

    :id => ByRow(s -> (
        res = collect(eachmatch(r"(\w+)?(.)?\d{4}.\d+", string(s), overlap=true));
        length(res) > 0 ? res[1].match : missing
    )) => :uid,

    :date => ByRow(s -> (
            res = collect(eachmatch(r"\d{4}", string(s), overlap=true));
            length(res) > 0 ? parse(Int, res[1].match) : missing
            
        )
    ) => :creation_early,

    :date => ByRow(s -> (
            res = collect(eachmatch(r"\d{4}", string(s), overlap=true));
            length(res) > 1 ? parse(Int, res[2].match) : missing
        )
    ) => :creation_late
);

dropmissing!(df, [:acquisition_date, :category])

sort!(df, [:id])
unique!(df, :uid)

df = df[df.collection .!== "Archives privées", :]

# Collection permanente
collections = groupby(df, :collection)

colp = df[df.collection .== "Collection permanente", :]
unique!(colp, :uid)

size(df)

(28141, 20)

## 3. Basic stats (collection permanente)

### Number of artworks by collection 

In [193]:
sort(unique(combine(groupby(df, [:collection]), nrow => :nb_artworks)), :nb_artworks, rev=true)

Row,collection,nb_artworks
Unnamed: 0_level_1,String31,Int64
1,Collection permanente,24732
2,Collection de prêt...,1876
3,Collection d'étude,1529
4,Archives privées,4


In [194]:
acq_collection_y = unique(combine(groupby(df, [:acquisition_date, :collection]), :acquisition_date, :collection, nrow => :nb_artworks))

f = Figure(resolution=(1000, 1000))

for (i, c) in enumerate(unique(acq_collection_y.collection))
    ax = Axis(f[i,1])
    ax.title = c
    xlims!(ax, (1930, 2020))
    ylims!(ax, (0, 4000))
    data = acq_collection_y[(acq_collection_y.collection .=== c), :]
    barplot!(data.acquisition_date, data.nb_artworks, offset=1)
end

f

In [195]:
sort(unique(combine(groupby(collections[("Collection permanente",)], [:departement]), nrow => :nb_artworks)), :nb_artworks, rev=true)

Row,departement,nb_artworks
Unnamed: 0_level_1,String,Int64
1,Art ancien (des origines à 1900),6619
2,Art contemporain (1950-2000),6340
3,Art moderne (1900-1950),6191
4,Arts décoratifs,2682
5,Art inuit,2189
6,Art actuel (2000 à ce jour),711


In [213]:
categories_sorted = sort(unique(combine(groupby(collections[("Collection permanente", )], [:category]), nrow => :nb_artworks)), :nb_artworks, rev=true)
top_categories = Array(categories_sorted.category[1:7])
categories_sorted

Row,category,nb_artworks
Unnamed: 0_level_1,String,Int64
1,Dessin,5995
2,Photographie,5670
3,Estampe,3406
4,Sculpture,3223
5,Peinture,2752
6,Orfèvrerie,967
7,Graphisme,656
8,Céramique,436
9,Livre/Album,218
10,Mobilier,212


In [197]:
sort(unique(combine(groupby(collections[("Collection permanente", )], [:technics]), nrow => :nb_artworks)), :nb_artworks, rev=true)

Row,technics,nb_artworks
Unnamed: 0_level_1,String?,Int64
1,Épreuve à l'albumine argentique,2573
2,Épreuve à la gélatine argentique,2084
3,Huile sur toile,1520
4,Mine de plomb sur papier,1040
5,Argent,947
6,Fusain sur papier,525
7,Serpentinite,401
8,Encre sur papier,386
9,Mine de plomb sur papier Ingres,369
10,Sérigraphie,336


## 4. Basic plots

https://juliadatascience.io/cairomakie

https://docs.makie.org/stable/tutorials/basic-tutorial/

In [198]:
hist_acq = combine(groupby(unique(df), :acquisition_date, skipmissing=true), nrow => :nb_artwork)
hist_crea = combine(groupby(unique(df), :creation_early, skipmissing=true), nrow => :nb_artwork)

f = Figure(backgroundcolor = RGBf(0.98, 0.98, 0.98),
    resolution = (1000, 700))

axa, ba = barplot(f[1,1], hist_crea[:, :creation_early], hist_crea[:, :nb_artwork], xlabel="acquisition year")
axb, bb = barplot(f[1,2], hist_acq[:, :acquisition_date], hist_acq[:, :nb_artwork])

axa.xlabel = "Creation year"
axa.ylabel = "# of artworks"
axb.xlabel = "Acquisition year"
axb.ylabel = "# of artworks"

# f

"# of artworks"

In [267]:
dpt_y_count = unique(combine(groupby(colp, [:acquisition_date, :departement]), nrow))

f = Figure(resolution=(2000, 1000))
i = 1
j = 1

for (idx, g) in enumerate(groupby(dpt_y_count, :departement))
    ax = Axis(f[i, j])
    xlims!(ax, (1930, 2020))
    ylims!(ax, (0, 1000))
    ax.title = g.departement[1]
    j === 1 ? ax.ylabel = "acquisitions" : 0
    barplot!(g.acquisition_date, g.nrow)

    i += 1
    if i === 4
        j += 1
        i = 1
    end
end

f

## Category vs acquisition/creation

https://docs.makie.org/stable/examples/plotting_functions/rainclouds/#example_13222721075935979891

In [282]:
using Makie
colors = Makie.wong_colors()

f = Figure(resolution=(2000, 1000))

i = 1
j = 1

for (idx, g) in enumerate(groupby(colp[in(top_categories).(colp.category), :], :departement))
    ax = Axis(f[i, j])
    ax.xlabel = "acquisition date"
    ax.ylabel = "creation date"
    ax.title = g.departement[1]

    println(levelcode.(categorical(g.category)))

    scatter!(ax, g.acquisition_date, g.creation_early, colors=colors[levelcode.(categorical(g.category))])
    
    i += 1
    if i % 3 === 0
        i = 1
        j += 1
    end
end
f

In [204]:
# Top categories

top_cat = unique(
    sort(
        combine(
            groupby(
                acq, [:category]), :category, :nb_acquisition => sum => :nb_acquisition), :nb_acquisition, rev=true))

UndefVarError: UndefVarError: acq not defined

## Number of acquisitions per year by categories

### Bar chart

In [205]:
f = Figure(resolution=(1000, 1000)) 

for i in 1:3, j in 1:3
    ax = Axis(f[i, j])
    xlims!(ax, (1930, 2020))
    ylims!(ax, (0, 500))
    cat = reshape(unique(top_cat.category[1:9]), 3, 3)[i,j]
    ax.title = cat
    ax.ylabel = "Number of acquisitions"
    ax.xlabel = "Acquisition year"
    ax.xticklabelrotation = 45
    g = filter(r -> r.category == cat, acq)
    barplot!(g.acquisition_date, g.nb_acquisition)
end

f

UndefVarError: UndefVarError: top_cat not defined

### Box plot

In [206]:
f = Figure(resolution=(1000, 1000)) 
ax = Axis(f[1, 1])
# ax.ylabel = "Number of acquisitions"
# ax.xlabel = "Acquisition year"

boxplot!(levelcode.(categorical(top_cat)), filter(r -> r.category in unique(top_cat.category), acq).nb_acquisition, show_notch=true)
f

UndefVarError: UndefVarError: acq not defined

### Compute z-score

In [207]:
rain_data = dropmissing(df[:, [:acquisition_date, :category]])
rain_data = filter(r -> r.category in top_cat, rain_data)

f = Figure(resolution=(1000, 1000))

colors = Makie.wong_colors()

rainclouds!(Axis(f[1, 1]), rain_data[:, :category,], rain_data[:, :acquisition_date];
    gap=0.1,
    orientation = :horizontal,
    xlabel = "Année d'acquisition", ylabel = "Catégorie", title = "Catagory vs acquisition",
    clouds=violin,
    markersize=1,
    plot_boxplots = true, cloud_width=2,)

f

UndefVarError: UndefVarError: top_cat not defined

In [208]:
ldata = filter(row -> row.category in top_cat, combine(
        groupby(
            dropmissing(df, [:acquisition_date]), 
            [:category, :acquisition_date]
        ), 
        [:acquisition_date, :category], nrow
    ))

unique!(ldata)
categories = ldata.category


f = Figure(resolution=(1000, 1000))
Axis(
    f[1, 1], 
    title = "Acquisitions per categories",
)

for (i, g) in enumerate(groupby(ldata, [:category]))
    scatter!(g.acquisition_date, (g.nrow) ./ sizeof(g), linestyle = nothing, linewidth = 1)
end

f

UndefVarError: UndefVarError: top_cat not defined

## Artits

- Combien de nouveaux artistes entre dans la collection par année ? 
- Artistes réccurents ?

Questions en réunion : 
- sujet/type d'oeuvre en fonction du genre de l'individu ? 
- paysages vs genre
- technique par département par année
- années de création vs années d'acquisition / genre
