# Préparation des figures de statistiques descriptives

### Todo : 
Produire les figures, en global et par institution :
1. Ratio Femme/reste pour les acquisitions par année depuis 1945 : **DONE**
2. Age moyen à l'acquisition des hommes et des femmes (abandonné : deux bars a cote, mettre aussi le Q1 et Q3): **DONE**
3. Ratio oeuvres d'artistes Français/reste dans les acquisitions par nationalité (une courbe pour les hommes, une pour les femmes) : **DONE**
4. Sortir sous forme de matrice colorée le ratio H/F dans les 29 domaines en fonction du temps : **DONE**
5. Sous forme de courbe, une par mode d'acquisition, le ratio de femme

### Important pour le rapport :
- sortir les figures en ratio 2:1
- format .png, background transparent

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LinearSegmentedColormap
import re
import unicodedata
# %matplotlib inline

In [2]:
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',200)
global color_h, color_f, color_b, PLOTDIM, FREQYEARS
color_h = "gold"
color_f = "royalblue"
color_b = "lightgray"
PLOTDIM = (9,6)
FREQYEARS = 10

In [3]:
matplotlib.font_manager._rebuild()
plt.rcParams['font.family'] = 'Roboto'
plt.rcParams['font.sans-serif'] = 'Roboto'
plt.rcParams['font.weight'] = 500
# matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')

In [4]:
# Les données sont stockées dans le dossier "data", c'est plus clean
authors  = pd.read_csv('../data/ALL_AUTHORS (live_work + clean nat).csv',sep=',', low_memory=False)
art = pd.read_csv('../data/ALL_ARTWORKS.csv',low_memory=False)

In [5]:
def validname(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    value = re.sub('[^\w\s-]', '', value).strip().lower()
    value = re.sub('[-\s]+', '-', value)
    return value

### Etape préliminaire 1 : transformer les collections mal orthographiées

In [6]:
# Transform bad collections
art.loc[art['collection'] == "Musée national d'art moderne / Centre de cr��ation industrielle", 'collection'] = "Musée national d'art moderne / Centre de création industrielle"
art.loc[art['collection'] == "Mus��e d'Art moderne et contemporain de la Ville de Strasbourg", 'collection'] = "Musée d'Art moderne et contemporain de la Ville de Strasbourg"
art.loc[art['collection'] == "Mus��e national d'art moderne / Centre de création industrielle", 'collection'] = "Musée national d'art moderne / Centre de création industrielle"
art.loc[art['collection'] == "La Piscine, Mus��e d'art et d'industrie André Diligent, Roubaix", 'collection'] = "La Piscine, Musée d'art et d'industrie André Diligent, Roubaix"
art.loc[art['collection'] == "Mus��e d'art contemporain de Lyon", 'collection'] = "Musée d'art contemporain de Lyon"
#print(art['collection'].value_counts())
print(len(art['collection'].unique()))

59


### Etape préliminaire 2 : créer les sous-dossiers s'ils n'y sont pas déjà

In [7]:
# Create subfolders to store figures by museum:
if not os.path.exists('./figures/_global'):
    os.makedirs('./figures/_global')
if not os.path.exists('./figures/_fracs'):
    os.makedirs('./figures/_fracs')     
for museum in art['collection'].dropna().unique():
    n = validname(museum)
    if not os.path.exists('./figures/' + n):
        os.makedirs('./figures/' + n)

### Etape préliminaire 3 : "donner un genre" aux oeuvres

In [8]:
authors.sample(20)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Id artist,name,name extended,type,Birth year,Death year,Birth city,Birth state,Birth country,Death city,Death state,Death country,Gender,Nationality (original),ID artworks,average_year,artworks_creation_years,acquisition_year,name_lower,live_and_work,nationality_clean
31705,31705,31705,9000000000081047,Raphaël Boccanfuso,,artiste,1964.0,,Suresnes,Hauts-de-Seine,France,,,,masculin,française et suisse,290000000038291|400000000000741|14000000008496...,1999.0,1997|1998|1999|1999|2000|2005|2011|2011|2013,2015|2000|2010|2010|2015|2015|2012|2013|2014,raphaël boccanfuso,"Vit et travaille à Paris (Paris, France)",française|suisse
26979,26979,26979,9000000000075309,"Mathilde Thomas-Soyer (Mathilde Thomas, dit)",,artiste,1860.0,1940.0,Troyes,Aube,France,,,,féminin,française,200000000022015|140000000040409|14000000004616...,1894.0,1883,1903|1883|1892|1883,"mathilde thomas-soyer (mathilde thomas, dit)",0,française
9040,9040,9040,9000000000086312,Deborah Hay,,artiste,1941.0,,New York,New York,États-Unis,,,,féminin,américaine,150000000082934,1953.0,1966,2008,deborah hay,"Vit et travaille à Austin (Texas, États-Unis)",américaine
19118,19118,19118,250000000000739,Louis Oppenheim,,artiste,1879.0,1936.0,Cobourg,,Empire fédéral allemand,Berlin,,Empire fédéral allemand,masculin,allemande (avant 1949),250000000014081,1907.0,,,louis oppenheim,0,allemande
25730,25730,25730,9000000000067196,Oscar Spielmann,,artiste,1901.0,1973.0,Brno,,Tchécoslovaquie,,,,masculin,tchécoslovaque,140000000036959|150000000019076,1930.0,1923|1925,1955|1939,oscar spielmann,0,tchécoslovaque
11680,11680,11680,200000000001746,Rachid Koraichi,,artiste,1947.0,,Ain Beida,,Algérie,,,,masculin,algérienne,200000000006675,1947.0,,1976,rachid koraichi,"Vit et travaille à Paris (Ile-de-France, France)",algérienne
34235,34235,34235,9000000000068772,Hervé Carrier,,artiste,1932.0,2000.0,Martigues,Bouches-du-Rhône,France,Veynes,Hautes-Alpes,France,masculin,française,60000000006365|60000000065456|80000000000315|8...,1979.0,1973|1974|1974|1977|1978|1978|1979|1980|1979|1...,,hervé carrier,0,française
29235,29235,29235,9000000000081077,Carlo Bartoli,,artiste,1931.0,,Milan,,Italie,,,,masculin,italienne,150000000037880|150000000037739,1957.0,1970|1970,2000|2000,carlo bartoli,Vit et travaille à Monza (Italie),italienne
23820,23820,23820,9000000000070873,Stephen Neil Sack,,artiste,1955.0,,,,,,,,masculin,américaine,460000000001540|460000000001541|46000000000154...,1978.0,1983|1983|1983|1986,1986|1986|1986|1986,stephen neil sack,Vit et travaille à Bruxelles (Belgique),américaine
3141,3141,3141,140000000006766,Marius Dorier,,artiste,,,,,,,,,masculin,française,140000000028760,,,1947,marius dorier,0,française


In [9]:
art['Gender'] = None
counter = 0
done = 0
length = len(art)
done_percentage = 0

for _,i in art.iterrows():
    counter += 1
    if pd.isnull(i['authors']) == False:
        c = re.split(r'\|', i['authors'])#split
        # If several authors: we see if they all have same gender, else "groupe"
        if len(c) > 1:
            gender_list = [authors[authors['Id artist'] == int(b)]['Gender'].values[0] for b in c]
            if all("masculin" == g for g in gender_list):
                var = "masculin"
            elif all("féminin" == g for g in gender_list):
                var = "féminin"
            else:
                var = "groupe"
        # If one author: trivial
        else:
            b = c[0]
            var = authors[authors['Id artist'] == int(b)]['Gender'].values[0]

        art.at[_,'Gender'] = var
        done += 1
    
    if counter % 100000==0:
        print(f'percentage: {round(100*counter/length, 3)}%, total = {counter}, done = {done}, done percentage = {round(100*done/length, 3)}%')
        # print(i['Gender'])

percentage: 26.667%, total = 100000, done = 100000, done percentage = 26.667%
percentage: 53.334%, total = 200000, done = 200000, done percentage = 53.334%
percentage: 80.002%, total = 300000, done = 299996, done percentage = 80.001%


In [10]:
art.sample(20)

Unnamed: 0,_id,ensemble_id,nb_elements,related,type,recap_inventory,recap_title,recap_nature,title_notice,title_list,title_ensemble,collection_department,dimensions_additional,inscriptions,expositions_without_current,expositions,bibliography,copyright,author_in_common,is_dissoc,collection,acquisition_mode,recap_copyright,date_creation,acquisition_year,domain,domain_leaf,domain_deno_for_grid,domain_description_mst,comments,recap_description,recap_authors,authors_notice,authors_list,dimensions,recap_dimensions,acquisition,inventory,inventory_for_grid,key_words_thema,rights_management_leaf,default_tooltip_ua_description,authors_name_complement,authors_site,authors_documents,authors_video,authors_nationality,authors_birth_death,live_and_work,author_bibliography,recap_nationality,recap_name_complement,recap_birth_death,recap_live_and_work,recap_author_bibliography,recap_live_work,nb_images,medias,recap_image_unavailable,authors,localisation_if_deposit,number_provisory,key_words_icono,image_unavailable,title_serial,key_words_movement,title_attributed,creation_stage,domain_deno,deposit_number,deposit_number_for_grid,tirage_design,number_exhibition,realisation_location,recap_multi,dimensions_without_margin,tirage,collaborators_design,authors_live_work,collaborators,tirage_photo,title_old,number_entry,production_circumstances,subtitle,number_identification,number_artist_studio,title_other,text_notes,title_old_by_artist,number_document,title_collection,number_depositary_or_loaner,recap_bibliography,ensemble,old_owners,recap_title_trad,number_artist,trans_title_attributed,number_isbn,number_succession,trans_subtitle,number_issn,mnam_mnr,trans_title_serial,number_catalogue,number_frame,trans_title_ensemble,trans_title_other,trans_title_old,old_attributions,acquisition_mode_clean,Gender
64748,150000000028921,,,,individual,,,,Femme drapée levant les bras,Femme drapée levant les bras,,Cabinet d'art graphique,,MO.D.B.DR. : 31-7-39/ J.G.,,,Pradel (Marie-Noëlle).- La donation Gonzalez a...,Domaine public,a,,Musée national d'art moderne / Centre de créat...,Don,,1939,1964.0,Dessin,Dessin,Dessin,"Mine graphite, plume et lavis d'encre de Chine...",,,,Julio GONZÁLEZ,GONZÁLEZ Julio,"31 x 19,6 cm",,Don de Mme Roberta González en 1964,AM 3216 D,AM 3216 D,nu,..Domaine public,"Mine graphite, plume et lavis d'encre de Chine...",,,,,espagnole,"Barcelone (Espagne), 1876 - Arcueil (Val-de-Ma...",,,,,,,,,1.0,9000000069065,,9000000000066151,,,bras levé,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Don,masculin
58609,140000000076720,,,,individual,,,,Roi Louis-Philippe,Roi Louis-Philippe,,Fonds historique,,,,,,Domaine public,a,,Centre national des arts plastiques/Fonds nati...,Achat par commande,,1842,1842.0,Peinture,Peinture,Peinture,Portrait en piedHuile sur toile,,,,E. GÉRAULT,GÉRAULT E.,,,Achat par commande à l'artiste en 1842,FNAC PFH-2600Centre national des arts plastiques,FNAC PFH-2600Centre national des arts plastiques,,..Domaine public,Portrait en piedHuile sur toile,,,,,,"- ?, ?",,,,,,,,,,,,140000000004913,<b>En dépôt depuis 1842 : Mairie d'Evaux-les-B...,,,no_image,,,,Copie d'après Winterhalter,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Achat par commande,masculin
43610,200000000004933,,,,individual,,,,Femme au bar,Femme au bar,,,,,,,,© droits réservés,a,,Fonds municipal d'art contemporain de la Ville...,Achat,,s.d.,1954.0,Peinture,Peinture,Peinture,Huile sur toile,,,,Raymond FEUILLATTE,FEUILLATTE Raymond,81 x 65 cm,,Achat en 1954,CMP9869,CMP9869,,..Type de droits non identifié,Huile sur toile,,,,,française,"Neuilly-sur-Seine (Seine), 1901 - Neuilly-sur-...",,,,,,,,,1.0,90000000144940,,9000000000066021,<b>En dépôt depuis le 16/03/2005 : Bureau de l...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Achat,masculin
202762,140000000104656,1303.0,468.0,140000000097300|140000000104488|14000000010452...,separable,FNAC 01-435 (1 à 468),Boring photographs,Ensemble,Boring Photographs,Boring Photographs,,Photographie,,,,,,© Martin Parr / Magnum Photos,n,,Centre national des arts plastiques/Fonds nati...,Achat,© Martin Parr / Magnum Photos,mai 2000,2001.0,Photographie,Photographie,Photographie,Photographie couleur,,Ensemble de 468 photographies couleur2000Photo...,Martin PARR,Martin PARR,PARR Martin,"10,1 x 14,9 cm",10 x 15 cm (chaque),Achat à la Galerie du Jour Agnès b. en 2001,FNAC 01-435 (120)Centre national des arts plas...,FNAC 01-435 (120)Centre national des arts plas...,,Magnum Photos (Paris),Photographie couleur,,<a href='http://www.martinparr.com' rel='link'...,,,britannique,"Epsom (Royaume-Uni), 1952",Vit et travaille à Bristol (Royaume-Uni) et à ...,,britannique,,"Epsom (Royaume-Uni), 1952",Vit et travaille à Bristol (Royaume-Uni) et à ...,,Vit et travaille à Bristol (Royaume-Uni) et à ...,1.0,90000000211114,recap_no_image,9000000000069243,,,,,,,,,,,,,,,,,,,,,2/12,,,,,,,,,,,,,,Appartient à un album de 468 photographies,,,,,,,,,,,,,,,,,Achat,masculin
341538,150000000031597,,,,individual,,,,Petite baie de La Ciotat,Petite baie de La Ciotat,,Arts Plastiques,,S.B.G. : Braque,Les sources du XXe siècle: les arts en Europe ...,Les sources du XXe siècle: les arts en Europe ...,Leymarie (Jean).- Le Fauvisme.- Genève : Skira...,"© Adagp, Paris",a,,Musée national d'art moderne / Centre de créat...,Donation,,juin 1907,1965.0,Peinture,Peinture,Peinture,Huile sur toile,,,,Georges BRAQUE,BRAQUE Georges,36 x 48 cm,,Donation de Mme Georges Braque en 1965,AM 4298 P,AM 4298 P,"paysage, marine",Adagp (Paris),Huile sur toile,,,,,française,"Argenteuil (Val-d'Oise, France), 1882 - Paris ...",,,,,,,,,2.0,90000000307269|9000000064639,,9000000000065625,,,mer,,,Fauvisme,,,,,,,,"La Ciotat, juin 1907",,,,,,,,,,,,,,,,,,,,,,"BOURDON Marcelle, Paris (France)|Sidney Janis ...",,,,,,,,,,,,,,,,Donation,masculin
174546,70000000008063,37.0,430.0,70000000007610|70000000007613|70000000007616|7...,separable,2012-5 (1 à 430),Ensemble de 443 papiers gouachés et découpés,Ensemble,Papier gouaché et découpé,Papier gouaché et découpé,,,,"R. : au crayon de papier : ""233""","Henri Matisse, La couleur découpée. Une donati...","Henri Matisse, La couleur découpée. Une donati...",,© Succession H. Matisse,a,(dissociable),"Musée Matisse, Le Cateau-Cambrésis",Don,voir © sur les éléments,1945 - 1954,2012.0,Dessin,Dessin,Dessin,Papier gouaché de couleur jaunePapier gouaché,,Ensemble de 443 éléments en papier gouaché et ...,Henri MATISSE,Henri MATISSE,MATISSE Henri,"10,4 x 9,3 cm",,Don de la Famille Matisse en 2012,2012-5 (152),2012-5 (152),,les Héritiers Matisse (Issy-les-Moulineaux),Papier gouaché de couleur jaunePapier gouaché,,,,,française,"Le Cateau-Cambrésis (Nord, France), 1869 - Nic...",,,française,,"Le Cateau-Cambrésis (Nord, France), 1869 - Nic...",,,,1.0,90000000320318,recap_no_image,9000000000066694,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Don,masculin
116006,140000000055831,,,,individual,,,,Diplôme pour le Salon des Armées,Diplôme pour le Salon des Armées,,Fonds historique,,,,,,Domaine public,a,,Centre national des arts plastiques/Fonds nati...,Achat,,s.d.,1917.0,Dessin,Dessin,Dessin,,,,,Jean-Paul LAURENS,LAURENS Jean-Paul,,,Achat à l'artiste en 1917,FNAC 5587Centre national des arts plastiques,FNAC 5587Centre national des arts plastiques,,..Domaine public,,,,,,française,"Fourquevaux (Haute-Garonne, France), 1838 - Pa...",,,,,,,,,,,,9000000000074562,<b>En dépôt depuis le 20/01/1917 : Direction G...,,,no_image,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Achat,masculin
211297,160000000021897,,,,individual,,,,Pourquoi la journée vole,Pourquoi la journée vole,,,,,,,,"© Succession Picasso, © droits réservés",a,,Musée national Picasso-Paris,Dation,,1960,1979.0,"Publication, livre, reliure","Publication, livre, reliure","Publication, livre, reliure","Exemplaire sur papier Vélin d'Arches, avec une...",,,,"Pablo PICASSO, René CHAR","PICASSO Pablo, CHAR René","10,9 x 21,2 x 0,8 cm",,Dation en 1979,MP3572,MP3572,,Picasso Administration (Paris),"Exemplaire sur papier Vélin d'Arches, avec une...",,,,,,,,,,,,,,,,,,160000000217584|9000000000066930,,,,no_image,,,,,,,,,,,,,,,,Ecrivain : René CHAREditeur : Edition Pierre A...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Dation,masculin
124228,140000000109899,2133.0,21.0,140000000109899|140000000109900|14000000010990...,separable,FNAC PFH-5786 (1 à 21),Coins de Paris,ensemble,"Feuille de titre ""Petite série d'eaux-fortes :...","Feuille de titre ""Petite série d'eaux-fortes :...",,Fonds historique,,S.DR. sur la planche : A. LEPERE,,,,Domaine public,n,,Centre national des arts plastiques/Fonds nati...,Inscription à l'inventaire,Domaine public,s.d.,,Estampe,Estampe,Estampe,Eau-forte,,Série de 20 gravures,Auguste Louis LEPÈRE,Auguste Louis LEPÈRE,LEPÈRE Auguste Louis,"50,4 x 32,3 cm19,7 x 23,4 cm (hors marge)",,Inscription à l'inventaire,FNAC PFH-5786 (1)Centre national des arts plas...,FNAC PFH-5786 (1)Centre national des arts plas...,,..Domaine public,Eau-forte,,,,,française,"Paris (France), 1849 - Domme (Dordogne, France...",,,française,,"Paris (France), 1849 - Domme (Dordogne, France...",,,,,,recap_no_image,9000000000077083,<b>En dépôt depuis 1913 : Musée des Beaux-Arts...,,,element_no_image,,,,,,,,,,,,"19,7 x 23,4",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Inscription à l'inventaire,masculin
66550,520000000000070,,,,individual,,,,"Sec équarri, abouté en ligne courbe fermée","Sec équarri, abouté en ligne courbe fermée",,,,,[mac] 2000Les collections du [mac] de 1960 à 2...,[mac] 2000Les collections du [mac] de 1960 à 2...,,"© Adagp, Paris",a,,"[mac] Musée d'art contemporain, Marseille",Achat,,1970 - 1975,1977.0,Sculpture,Sculpture,Sculpture,Bois,,,,"Toni GRAND (Antoine GRAND, dit)","GRAND Toni (GRAND Antoine, dit)",diamètre: 210 cm,,Achat à la Galerie Eric Fabre en 1977,C.77.12,C.77.12,,Adagp (Paris),Bois,,,,,française,"Gallargues-le-Montueux (Gard, France), 1935 - ...",,,,,,,,,2.0,90000000511094|90000000268317,,9000000000067578,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Achat,masculin


In [11]:
art['Gender'].value_counts()

masculin         293730
féminin           47270
groupe            11602
non renseigné      6703
Name: Gender, dtype: int64

In [12]:
art['Gender'].count()/len(art) # 4,2% de NaN

0.9581671075649614

In [13]:
fracs = []
for museum in art['collection'].dropna().unique():
    if "frac" in museum.lower():
        fracs.append(museum)
len(fracs)

18

## Partie 1 : Proportion du nombre de femmes dans les acquisitions

In [127]:
# Function to plot the proportion of femmes in any subset of Arworks
def get_ratioF(art, subset, subset_name, path):
    art = art[(art['acquisition_year'] >= 1945) & (art['acquisition_year'] <= 2017)]
    subset = subset[(subset['acquisition_year'] >= 1945) & (subset['acquisition_year'] <= 2017)]
    fig, ax = plt.subplots()
    # We always plot the general evolution on the background
    f = art[art['Gender']=='féminin'].groupby('acquisition_year').count()['_id']
    total = art.groupby('acquisition_year').count()['_id']
    ratio_general = f/total
    # ratio_general.plot(figsize=PLOTDIM, color=color_b, linewidth=2, ax=ax)
    # We plot the proportion in the subset of artworks of interest
    total1 = subset.groupby('acquisition_year').count()['_id']
    f1 = subset[subset['Gender']=='féminin'].groupby('acquisition_year').count()
    if len(f1) == 0:
        f1 = total1.copy()
        f1[:] = 0
    else:
        f1 = f1['_id']
    ratio = f1/total1
    ratio = ratio.reindex(range(1945, 2018))
    ratio.plot(figsize=PLOTDIM, color=color_f, marker='.', linewidth=2, ax=ax)
    # Then we customize the plot
    # ax.set_title("Proportion d'artistes femmes dans les acquisitions, " + subset_name, fontsize=20)
    # ax.set_xlabel("Année d'acquisition", fontsize=20)
    # ax.set_ylabel("Proportion d'artistes femmes", fontsize=20)
    ax.set_xlim(left = 1945,right = 2017)
    ax.set_ylim(bottom = -0.01,top = 1.01)
    ax.tick_params(labelsize = 15)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(FREQYEARS))
    ax.set_xlabel('')
    ax.set_yticklabels(['{:,.0%}'.format(x) for x in ax.get_yticks()])
    plt.grid()
    plt.savefig(path, bbox_inches='tight',format="png", dpi=300, transparent=True)
    plt.close()

In [128]:
get_ratioF(art, art, "ensemble des collections", "./figures/_global/ratio_overall.png")
get_ratioF(art, art[art['collection'].isin(fracs)], "ensemble des 18 FRAC", "./figures/_fracs/ratio_overall.png")

In [106]:
for museum in art['collection'].dropna().unique():
    # print("Musée en cours:", museum)
    subset = art[art['collection'] == museum]
    get_ratioF(art, subset, museum, "./figures/" + validname(museum) + "/ratio_overall.png")
    del subset
print('Done')

Done


### Aparté

In [17]:
# A study on "is it a good idea to group by separable artworks" => NO
uniquegroups = art[art['type'] == 'separable'].groupby('related').first()
# uniquegroups = art[art['type'] == 'separable'].groupby('related').agg(lambda x:x.value_counts().index[0])
# matisse = art[art['related'] == '70000000007610|70000000007613|70000000007616|70000000007619|70000000007622|70000000007625|70000000007628|70000000007631|70000000007634|70000000007637|70000000007640|70000000007643|70000000007646|70000000007649|70000000007652|70000000007655|70000000007658|70000000007661|70000000007664|70000000007667|70000000007670|70000000007673|70000000007676|70000000007679|70000000007682|70000000007685|70000000007688|70000000007691|70000000007694|70000000007697|70000000007700|70000000007703|70000000007706|70000000007709|70000000007712|70000000007715|70000000007718|70000000007721|70000000007724|70000000007727|70000000007730|70000000007733|70000000007736|70000000007739|70000000007742|70000000007745|70000000007748|70000000007751|70000000007754|70000000007757|70000000007760|70000000007763|70000000007766|70000000007769|70000000007772|70000000007775|70000000007778|70000000007781|70000000007784|70000000007787|70000000007790|70000000007793|70000000007796|70000000007799|70000000007802|70000000007805|70000000007808|70000000007811|70000000007814|70000000007817|70000000007820|70000000007823|70000000007826|70000000007829|70000000007832|70000000007835|70000000007838|70000000007841|70000000007844|70000000007847|70000000007850|70000000007853|70000000007856|70000000007859|70000000007862|70000000007865|70000000007868|70000000007871|70000000007874|70000000007877|70000000007880|70000000007883|70000000007886|70000000007889|70000000007892|70000000007895|70000000007898|70000000007901|70000000007904|70000000007907|70000000007910|70000000007913|70000000007916|70000000007919|70000000007922|70000000007925|70000000007928|70000000007931|70000000007934|70000000007937|70000000007940|70000000007943|70000000007946|70000000007949|70000000007952|70000000007955|70000000007958|70000000007961|70000000007964|70000000007967|70000000007970|70000000007973|70000000007976|70000000007979|70000000007982|70000000007985|70000000007988|70000000007991|70000000007994|70000000007997|70000000008000|70000000008003|70000000008006|70000000008009|70000000008012|70000000008015|70000000008018|70000000008021|70000000008024|70000000008027|70000000008030|70000000008033|70000000008036|70000000008039|70000000008042|70000000008045|70000000008048|70000000008051|70000000008054|70000000008057|70000000008060|70000000008063|70000000008071|70000000008074|70000000008077|70000000008080|70000000008083|70000000008086|70000000008089|70000000008092|70000000008095|70000000008098|70000000008101|70000000008104|70000000008107|70000000008110|70000000008113|70000000008116|70000000008119|70000000008122|70000000008125|70000000008128|70000000008131|70000000008134|70000000008137|70000000008140|70000000008143|70000000008146|70000000008149|70000000008152|70000000008155|70000000008158|70000000008161|70000000008164|70000000008167|70000000008170|70000000008173|70000000008176|70000000008179|70000000008182|70000000008185|70000000008188|70000000008191|70000000008194|70000000008197|70000000008200|70000000008203|70000000008206|70000000008209|70000000008212|70000000008215|70000000008218|70000000008221|70000000008224|70000000008227|70000000008230|70000000008233|70000000008236|70000000008239|70000000008242|70000000008245|70000000008248|70000000008251|70000000008254|70000000008257|70000000008260|70000000008263|70000000008266|70000000008269|70000000008272|70000000008275|70000000008278|70000000008281|70000000008284|70000000008287|70000000008290|70000000008293|70000000008296|70000000008299|70000000008302|70000000008305|70000000008308|70000000008311|70000000008314|70000000008317|70000000008320|70000000008323|70000000008326|70000000008329|70000000008332|70000000008335|70000000008338|70000000008341|70000000008344|70000000008347|70000000008350|70000000008353|70000000008356|70000000008359|70000000008362|70000000008365|70000000008368|70000000008371|70000000008374|70000000008377|70000000008380|70000000008383|70000000008386|70000000008389|70000000008392|70000000008395|70000000008398|70000000008401|70000000008404|70000000008407|70000000008410|70000000008413|70000000008416|70000000008419|70000000008422|70000000008425|70000000008428|70000000008431|70000000008434|70000000008437|70000000008440|70000000008443|70000000008446|70000000008449|70000000008452|70000000008455|70000000008458|70000000008461|70000000008464|70000000008467|70000000008470|70000000008473|70000000008476|70000000008479|70000000008482|70000000008485|70000000008488|70000000008491|70000000008494|70000000008497|70000000008500|70000000008503|70000000008506|70000000008509|70000000008512|70000000008515|70000000008518|70000000008521|70000000008524|70000000008533|70000000008536|70000000008539|70000000008542|70000000008545|70000000008548|70000000008551|70000000008554|70000000008557|70000000008560|70000000008563|70000000008566|70000000008569|70000000008572|70000000008575|70000000008578|70000000008581|70000000008584|70000000008587|70000000008590|70000000008593|70000000008596|70000000008599|70000000008602|70000000008605|70000000008608|70000000008611|70000000008614|70000000008617|70000000008620|70000000008623|70000000008626|70000000008629|70000000008632|70000000008635|70000000008638|70000000008641|70000000008644|70000000008647|70000000008650|70000000008653|70000000008656|70000000008659|70000000008662|70000000008665|70000000008668|70000000008671|70000000008674|70000000008677|70000000008680|70000000008683|70000000008686|70000000008689|70000000008692|70000000008695|70000000008698|70000000008701|70000000008704|70000000008707|70000000008710|70000000008713|70000000008716|70000000008719|70000000008722|70000000008725|70000000008728|70000000008731|70000000008734|70000000008737|70000000008740|70000000008743|70000000008746|70000000008749|70000000008752|70000000008755|70000000008758|70000000008761|70000000008764|70000000008767|70000000008770|70000000008773|70000000008776|70000000008779|70000000008782|70000000008785|70000000008788|70000000008791|70000000008794|70000000008797|70000000008800|70000000024504|70000000024507|70000000024510|70000000024513|70000000024516|70000000024519|70000000024522|70000000024525|70000000024528|70000000024531|70000000024534|70000000024537|70000000024540|70000000024543|70000000024546|70000000024549|70000000024552|70000000024555|70000000024558|70000000024561|70000000024564|70000000024567|70000000024570|70000000024573|70000000024576|70000000024579|70000000024582|70000000024585|70000000024588|70000000024591|70000000024594|70000000024597|70000000024600|70000000024603|70000000024606|70000000024609']
counterex = art[art['related'] == '10000000001440|10000000001441|10000000001442|10000000001443|10000000001444|10000000001445|10000000001439|10000000001446|10000000001447|10000000001448|10000000001449|10000000001450|10000000001451|10000000001452|10000000001453|10000000001454|10000000001481']
#counterex

<h2>Partie 2 : Age moyen à l'acquisition, par genre

In [18]:
# Computing age at acquisition
art['age_at_acquisition'] = None
counter = 0
done = 0
length = len(art)
done_percentage = 0

for _,i in art.iterrows():
    counter += 1
    if pd.isnull(i['authors']) == False:
        c = re.split(r'\|', i['authors'])#split
        # If several authors: we take the average age
        if len(c) > 1:
            ages_list = [authors[authors['Id artist'] == int(b)]['Birth year'].values[0] for b in c]
            var = i['acquisition_year'] - np.nanmean(ages_list)
        # If one author: trivial
        else:
            b = c[0]
            var = i['acquisition_year'] - authors[authors['Id artist'] == int(b)]['Birth year'].values[0]

        art.at[_,'age_at_acquisition'] = var
        done += 1
    
    if counter % 100000==0:
        print(f'percentage: {round(100*counter/length, 3)}%, total = {counter}, done = {done}, done percentage = {round(100*done/length, 3)}%')

art['age_at_acquisition'] = pd.to_numeric(art['age_at_acquisition'])
art['age_at_acquisition'].count()/len(art) #18% de NaN

  from ipykernel import kernelapp as app


percentage: 26.667%, total = 100000, done = 100000, done percentage = 26.667%
percentage: 53.334%, total = 200000, done = 200000, done percentage = 53.334%
percentage: 80.002%, total = 300000, done = 299996, done percentage = 80.001%


0.8275989887784273

In [19]:
art['age_at_acquisition'].sample(10)

234686      NaN
281741     48.0
24750      91.0
235858     88.0
239341      NaN
123267     65.0
318407    132.0
74999       NaN
262697     77.0
258181     40.0
Name: age_at_acquisition, dtype: float64

In [107]:
# Function to plot the age at acquisition (separated between hommes and femmes)
def get_age_acquisition(art, subset, subset_name, path):
    art = art[(art['acquisition_year'] >= 1945) & (art['acquisition_year'] <= 2017)]
    subset = subset[(subset['acquisition_year'] >= 1945) & (subset['acquisition_year'] <= 2017)]
    fig, ax = plt.subplots()
    total = subset.groupby('acquisition_year').mean()['age_at_acquisition']
    # Femmes
    f = subset[subset['Gender']=='féminin'].groupby('acquisition_year').mean()
    if len(f) > 0:
        f = f['age_at_acquisition']
        f = f.reindex(range(1945, 2018))
        f.plot(figsize=PLOTDIM, color=color_f, marker='.', linewidth=2, ax=ax)
    # Hommes
    h = subset[subset['Gender']=='masculin'].groupby('acquisition_year').mean()
    if len(h) > 0:
        h = h['age_at_acquisition']
        h = h.reindex(range(1945, 2018))
        h.plot(figsize=PLOTDIM, color=color_h, marker='.', linewidth=2, ax=ax)
    # Then we customize the plot
    # ax.set_title("Age moyen de l'artiste à l'acquisition, par genre, " + subset_name, fontsize=20)
    # ax.set_xlabel("Année d'acquisition", fontsize=20)
    # ax.set_ylabel("Age moyen de l'artiste", fontsize=20)
    ax.set_xlim(left = 1945,right = 2017)
    ax.set_ylim(bottom = -1,top = 141)
    ax.tick_params(labelsize = 15)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(FREQYEARS))
    ax.set_xlabel('')
    plt.grid()
    plt.savefig(path, bbox_inches='tight',format="png", dpi=300, transparent=True)
    plt.close()

In [108]:
get_age_acquisition(art, art, "ensemble des collections", "./figures/_global/age_acquisition.png")
get_age_acquisition(art, art[art['collection'].isin(fracs)], "ensemble des 18 FRAC", "./figures/_fracs/age_acquisition.png")

In [109]:
for museum in art['collection'].dropna().unique():
    # print("Musée en cours:", museum)
    subset = art[art['collection'] == museum]
    get_age_acquisition(art, subset, museum, "./figures/" + validname(museum) + "/age_acquisition.png")
    del subset
print('Done')

Done


## Partie 3 : ratio d'artiste Français dans les acquisitions, par genre

In [23]:
art['nationality'] = None
counter = 0
done = 0
length = len(art)
done_percentage = 0

for _,i in art.iterrows():
    counter += 1
    if pd.isnull(i['authors']) == False:
        c = re.split(r'\|', i['authors'])#split
        # If several authors: we see if they all have same gender, else "groupe"
        if len(c) > 1:
           # nat_list = [authors[authors['Id artist'] == int(b)]['nationality_clean'].values[0] for b in c if ]
            nat_list = []
            for b in c:
                if pd.notnull(authors[authors['Id artist'] == int(b)]['nationality_clean'].values[0]):
                    nat_list.append(authors[authors['Id artist'] == int(b)]['nationality_clean'].values[0])
            if any("français" in g for g in nat_list):
                var = "français"
            else:
                var = "groupe"
        # If one author: trivial
        else:
            b = c[0]
            var = authors[authors['Id artist'] == int(b)]['nationality_clean'].values[0]

        art.at[_,'nationality'] = var
        done += 1
    
    if counter % 100000==0:
        print(f'percentage: {round(100*counter/length, 3)}%, total = {counter}, done = {done}, done percentage = {round(100*done/length, 3)}%')

percentage: 26.667%, total = 100000, done = 100000, done percentage = 26.667%
percentage: 53.334%, total = 200000, done = 200000, done percentage = 53.334%
percentage: 80.002%, total = 300000, done = 299996, done percentage = 80.001%


In [24]:
authors['nationality_clean'].value_counts()

française                         17026
américaine                         1536
allemande                          1051
italienne                           757
britannique                         612
suisse                              498
belge                               493
espagnole                           418
japonaise                           368
néerlandaise                        321
russe                               229
polonaise                           203
autrichienne                        192
chinoise                            144
suédoise                            134
canadienne                          124
brésilienne                         120
tchécoslovaque                      116
argentine                           107
danoise                             101
roumaine                             91
grecque                              90
hongroise                            88
soviétique                           84
russe|française                      79


In [110]:
def get_ratioNat(art, subset, subset_name, path):
    art = art[(art['acquisition_year'] >= 1945) & (art['acquisition_year'] <= 2017)]
    subset = subset[(subset['acquisition_year'] >= 1945) & (subset['acquisition_year'] <= 2017)]
    fig, ax = plt.subplots()
    total_f = subset[(subset['Gender'] == 'féminin')].groupby('acquisition_year').count()['_id']
    total_h = subset[(subset['Gender'] == 'masculin')].groupby('acquisition_year').count()['_id']
    # Same thing for men 
    h = subset[(subset['Gender'] == 'masculin') & (subset['nationality'].str.contains('français'))].groupby('acquisition_year').count()
    if len(h) == 0:
        h = total_h.copy()
        h[:] = 0
    else:
        h = h['_id']
    ratio2 = h/total_h
    ratio2 = ratio2.reindex(range(1945, 2018))
    ratio2.plot(figsize=PLOTDIM, color=color_h, marker='.', linewidth=2, ax=ax)
    # We plot the proportion of French in the subset of Femmes
    f = subset[(subset['Gender'] == 'féminin') & (subset['nationality'].str.contains('français'))].groupby('acquisition_year').count()
    if len(f) == 0:
        f = total_f.copy()
        f[:] = 0
    else:
        f = f['_id']
    ratio = f/total_f
    ratio = ratio.reindex(range(1945, 2018))
    if len(ratio) > 0:
        ratio.plot(figsize=PLOTDIM, color=color_f, marker='.', linewidth=2, ax=ax)
    
    # Then we customize the plot
    # ax.set_title("Proportion d'artistes français dans les acquisitions, " + subset_name, fontsize=20)
    # ax.set_xlabel("Année d'acquisition", fontsize=20)
    # ax.set_ylabel("Proportion d'artistes français", fontsize=20)
    ax.set_xlim(left = 1945,right = 2017)
    ax.set_ylim(bottom = -0.01,top = 1.01)
    ax.tick_params(labelsize = 15)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(FREQYEARS))
    ax.set_xlabel('')
    ax.set_yticklabels(['{:,.0%}'.format(x) for x in ax.get_yticks()])
    plt.grid()
    plt.savefig(path, bbox_inches='tight',format="png", dpi=300, transparent=True)
    plt.close()

In [111]:
get_ratioNat(art, art, "ensemble des collections", "./figures/_global/ratio_nationality.png")
get_ratioNat(art, art[art['collection'].isin(fracs)], "ensemble des 18 FRAC", "./figures/_fracs/ratio_nationality.png")

In [112]:
for museum in art['collection'].dropna().unique():
    # print("Musée en cours:", museum)
    subset = art[art['collection'] == museum]
    get_ratioNat(art, subset, museum, "./figures/" + validname(museum) + "/ratio_nationality.png")
    del subset
print('Done')

Done


## Partie 4 : ratio H/F dans les 29 domaines

In [28]:
my_cm = LinearSegmentedColormap.from_list("colormap", [color_h, color_f], N=100)

In [113]:
def get_ratio_domains(art, subset, subset_name, path):
    art = art[(art['acquisition_year'] >= 1945) & (art['acquisition_year'] <= 2017)]
    subset = subset[(subset['acquisition_year'] >= 1945) & (subset['acquisition_year'] <= 2017)]
    overall = subset.groupby('acquisition_year').count()['_id']
    overall_f = subset[subset['Gender']=='féminin'].groupby('acquisition_year').count()['_id']
    overall = pd.Series(overall_f/overall, name="Tous domaines")
    total = subset.groupby(['domain', 'acquisition_year']).count()['_id']
    # We need the fraction of Femmes in each year/domain
    f = subset[(subset['Gender'] == 'féminin')].groupby(['domain', 'acquisition_year']).count()
    if len(f) == 0:
        f = total.copy()
        f[:] = 0
    else:
        f = f['_id']
    ratio = f/total
    if len(ratio) > 0:
        ratio = ratio.reset_index(name='Value')
        names = sorted(list(ratio['domain'].unique()))
        all_names = sorted(list(art['domain'].dropna().unique()))
        ratio = ratio.pivot_table(index='domain', columns='acquisition_year', values='Value', aggfunc='sum')
        ratio.index = names
        ratio = ratio.reindex(columns=range(1945, 2018)).reindex(index=all_names)
        ratio = ratio.append(overall)
        mratio = np.matrix(ratio)
        
        # Then we customize the plot
        fig, ax = plt.subplots(figsize=PLOTDIM)
        im = ax.matshow(mratio, cmap=my_cm, vmin=0, vmax=1)
        #fig.set_size_inches(PLOTDIM)
        divider = make_axes_locatable(ax)
        truc = divider.append_axes("right", size="2%", pad=-2.8)
        cbar = plt.colorbar(im, cax=truc)
        cbar.ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%','100%'], fontsize=13)
        # ax.set_title("Proportion d'artistes femmes par domaine, " + subset_name, fontsize=20, y=1)
        # ax.set_xlabel("Année d'acquisition", fontsize=20)
        ax.set_xticklabels(['50', '60', '70', '80', '90', '00', '10'], fontsize=13)
        ax.xaxis.set_ticks_position('bottom')
        ax.set_xticks(np.arange(5, 71, 10))
        domains = list(ratio.index)
        domains = [d if not "Domaine mixte" in d else "Domaine mixte" for d in domains]
        ax.set_yticks(np.arange(0, len(domains), 1))
        ax.set_yticklabels(domains, fontsize=10)
        ax.set_aspect(2.5)
    plt.savefig(path, bbox_inches='tight',format="png", dpi=300, transparent=True)
    plt.close()

In [114]:
get_ratio_domains(art, art, "ensemble des collections", "./figures/_global/ratio_domains.png")
get_ratio_domains(art, art[art['collection'].isin(fracs)], "ensemble des 18 FRAC", "./figures/_fracs/ratio_domains.png")

In [115]:
for museum in art['collection'].dropna().unique():
    # print("Musée en cours:", museum)
    subset = art[art['collection'] == museum]
    get_ratio_domains(art, subset, museum, "./figures/" + validname(museum) + "/ratio_domains.png")
    del subset
print('Done')

Done


In [32]:
m = []
n = []
for x in art['collection'].dropna().unique():
    m.append(x)
    n.append(validname(x))
pd.DataFrame({"musée": m, "dossier": n}).to_csv("./figures/musées.csv", encoding='utf-8', index=False)

Du bordel pour Vincent

In [33]:
k = 0
l = 0
for _, j in authors.iterrows():
    if not pd.isnull(j['acquisition_year']):
        c = re.split(r'\|', j['acquisition_year'])
        if max([int(i) for i in c]) >= 1945:
            k += 1
        if max([int(i) for i in c]) >= 1900:
            l += 1
print(len(authors), l, k)

35958 27272 22027


## Partie 5 : Sous forme de courbe, une par mode d'acquisition, le ratio de femme

In [34]:
len(art['acquisition_mode_clean'].dropna().unique())

22

In [116]:
def get_ratio_acquisition(art, subset, subset_name, path):
    art = art[(art['acquisition_year'] >= 1945) & (art['acquisition_year'] <= 2017)]
    subset = subset[(subset['acquisition_year'] >= 1945) & (subset['acquisition_year'] <= 2017)]
    overall = subset.groupby('acquisition_year').count()['_id']
    overall_f = subset[subset['Gender']=='féminin'].groupby('acquisition_year').count()['_id']
    overall = pd.Series(overall_f/overall, name="Tous modes")
    total = subset.groupby(['acquisition_mode_clean', 'acquisition_year']).count()['_id']
    # We need the fraction of Femmes in each year/mode d'acquisition
    f = subset[(subset['Gender'] == 'féminin')].groupby(['acquisition_mode_clean', 'acquisition_year']).count()
    if len(f) == 0:
        f = total.copy()
        f[:] = 0
    else:
        f = f['_id']
    ratio = f/total
    if len(ratio) > 0:
        ratio = ratio.reset_index(name='Value')
        names = sorted(list(ratio['acquisition_mode_clean'].unique()))
        all_names = sorted(list(art['acquisition_mode_clean'].dropna().unique()))
        ratio = ratio.pivot_table(index='acquisition_mode_clean', columns='acquisition_year',
                                  values='Value', aggfunc='sum')
        ratio.index = names
        ratio = ratio.reindex(columns=range(1945, 2018)).reindex(index=all_names)
        ratio = ratio.append(overall)
        mratio = np.matrix(ratio)
        
        # Then we customize the plot
        fig, ax = plt.subplots(figsize=PLOTDIM)
        im = ax.matshow(mratio, cmap=my_cm, vmin=0, vmax=1)
        #fig.set_size_inches(PLOTDIM)
        divider = make_axes_locatable(ax)
        truc = divider.append_axes("right", size="2%", pad=-4.05)
        cbar = plt.colorbar(im, cax=truc)
        cbar.ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%','100%'], fontsize=10)
        # ax.set_title("Proportion d'artistes femmes par mode d'acquisition, " + subset_name, fontsize=20, y=1)
        # ax.set_xlabel("Année d'acquisition", fontsize=20)
        ax.set_xticklabels(['50', '60', '70', '80', '90', '00', '10'], fontsize=10)
        ax.xaxis.set_ticks_position('bottom')
        ax.set_xticks(np.arange(5, 71, 10))
        domains = list(ratio.index)
        domains = [d if not "mixte" in d else "Mode d'acquisition mixte" for d in domains]
        domains = [d if not "non renseigné par commande" in d else "Non renseigné, par commande" for d in domains]
        domains = [d if not "non renseigné" in d else "Non renseigné" for d in domains]
        domains = [d if not "Douanes" in d else "Saisie des Douanes" for d in domains]
        ax.set_yticks(np.arange(0, len(domains), 1))
        ax.set_yticklabels(domains, fontsize=10)
        ax.set_aspect(4.2)
    plt.savefig(path, bbox_inches='tight',format="png", dpi=300, transparent=True)
    plt.close()

In [117]:
get_ratio_acquisition(art, art, "ensemble des collections", "./figures/_global/ratio_acquisitions.png")
get_ratio_acquisition(art, art[art['collection'].isin(fracs)], "ensemble des 18 FRAC", "./figures/_fracs/ratio_acquisitions.png")

In [118]:
for museum in art['collection'].dropna().unique():
    # print("Musée en cours:", museum)
    subset = art[art['collection'] == museum]
    get_ratio_acquisition(art, subset, museum, "./figures/" + validname(museum) + "/ratio_acquisitions.png")
    del subset
print('Done')

Done
