# Exhibitions

In [1]:
from collections import Counter

import pandas as pd

## Loading data

In [2]:
%%time

# Read the exhibitions spreadsheet and replace every missing cell with an
# empty string, so blanks can later be detected by comparing against ''.
exhibitions_df = pd.read_excel('data/src/20180410_EXHIBITIONS.xlsx').fillna('')

CPU times: user 25.5 s, sys: 248 ms, total: 25.8 s
Wall time: 25.8 s


In [3]:
# (number of rows, number of columns) of the loaded frame.
exhibitions_df.shape

(72746, 26)

In [4]:
# Preview the first rows to get a feel for the columns and their values.
exhibitions_df.head()

Unnamed: 0,ID,post_type,post_title,place_t,place_r,place_c,start_y,start_m,start_d,end_y,...,xplace_t,xplace_r,xplace_c,xstart_y,xstart_m,xstart_d,xend_y,xend_m,xend_d,xgender
0,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,España,España,España,1985.0,1.0,1.0,,,,Femenino
1,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,2017.0,1.0,1.0,,,,
2,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,,,,,,,
3,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,,,,,,,
4,108998,exposición,"""European Masterworks. Paintings from the Coll...",Nashville,Tennessee,Estados Unidos,2001,4,8,2001,...,Francia,Francia,Francia,1840.0,11.0,14.0,1926.0,12.0,5.0,Masculino


## Cleaning data

### Blank values in important columns

In [5]:
# Columns that are checked for blank ('') values in the cleaning cells below.
important_columns = ['ID', 'post_type', 'post_title', 'xkey', 'xid', 'xpost_type']

In [6]:
# For each important column, print its name and the number of blank ('')
# cells; columns without any blank cell print nothing.
for column in important_columns:
    blank_total = (exhibitions_df[column] == '').sum()
    if blank_total > 0:
        print(column, blank_total)

xkey 4
xid 4
xpost_type 123


In [7]:
# Show the important columns of the rows whose xkey is blank.
exhibitions_df.loc[exhibitions_df.xkey == '', important_columns]

Unnamed: 0,ID,post_type,post_title,xkey,xid,xpost_type
28829,105470,exposición,"FAKE. No es verdad, no es mentira.",,,
43194,109392,exposición,Los colegios exponen,,,
66713,105469,exposición,Testigos de la ciudad,,,
68122,109234,exposición,Trampa para Incautos,,,


In [8]:
# For every row with a blank xkey, print its exhibition ID together with the
# number of rows in the whole frame that share that ID.
# The per-ID counts are computed once up front instead of once per iteration.
id_counts = exhibitions_df.ID.value_counts()
for ide in exhibitions_df.loc[exhibitions_df.xkey == '', 'ID']:
    print(ide, id_counts[ide])

105470 1
109392 1
105469 1
109234 1


In [9]:
# Keep only the rows that have a non-blank xkey.
has_xkey = exhibitions_df.xkey != ''
exhibitions_df = exhibitions_df.loc[has_xkey]

### Values for 'xkey' and 'xpost_type'

In [10]:
# Distinct (xkey, xpost_type) combinations present in the data, ordered by xkey.
key_type_pairs = exhibitions_df[['xkey', 'xpost_type']].drop_duplicates()
key_type_pairs.sort_values(by='xkey')

Unnamed: 0,xkey,xpost_type
50190,actor comisario,
13,actor comisario,actor
29974,actor que participa como artista,entidad
301,actor que participa como artista,
0,actor que participa como artista,actor
1,catálogo,catálogo
8925,catálogo,
2313,coleccionista prestatario de obras,actor
3253,coleccionista prestatario de obras,
9,coleccionista prestatario de obras,entidad


In [11]:
# Number of rows with xkey 'actor que participa como artista' and a blank xpost_type.
exhibitions_df.loc[
    (exhibitions_df.xkey == 'actor que participa como artista')
    & (exhibitions_df.xpost_type == '')
].shape[0]

47

In [12]:
# Number of rows with xkey 'actor que participa como artista' and xpost_type 'actor'.
exhibitions_df.loc[
    (exhibitions_df.xkey == 'actor que participa como artista')
    & (exhibitions_df.xpost_type == 'actor')
].shape[0]

44371

In [13]:
# Number of rows with xkey 'actor que participa como artista' and xpost_type 'entidad'.
exhibitions_df.loc[
    (exhibitions_df.xkey == 'actor que participa como artista')
    & (exhibitions_df.xpost_type == 'entidad')
].shape[0]

2

In [14]:
# Number of rows with xkey 'coleccionista prestatario de obras' and a blank xpost_type.
exhibitions_df.loc[
    (exhibitions_df.xkey == 'coleccionista prestatario de obras')
    & (exhibitions_df.xpost_type == '')
].shape[0]

8

In [15]:
# Number of rows with xkey 'coleccionista prestatario de obras' and xpost_type 'actor'.
exhibitions_df.loc[
    (exhibitions_df.xkey == 'coleccionista prestatario de obras')
    & (exhibitions_df.xpost_type == 'actor')
].shape[0]

222

In [16]:
# Number of rows with xkey 'coleccionista prestatario de obras' and xpost_type 'entidad'.
exhibitions_df.loc[
    (exhibitions_df.xkey == 'coleccionista prestatario de obras')
    & (exhibitions_df.xpost_type == 'entidad')
].shape[0]

853

In [17]:
# Repair the 'xpost_type' column in two vectorized steps (no row-by-row loop).

# Post type assigned to a row whose xpost_type is blank, keyed by its xkey.
XPOST_TYPE_BY_XKEY = {
    'actor comisario': 'actor',
    'actor que participa como artista': 'actor',
    'catálogo': 'catálogo',
    'coleccionista prestatario de obras': 'entidad',
    'empresa que realiza la museografía': 'empresa',
    'entidad organizadora': 'entidad',
    'entidad patrocinadora': 'entidad',
    'exposición de la que depende': 'exposición',
    'fuente de información': 'entidad',
}

# Step 1: fill blank post types from the xkey of the same row.
# An xkey that is not in the mapping maps to NaN; fillna('') keeps such rows
# blank, exactly as they were before.
missing_type = exhibitions_df.xpost_type == ''
inferred_type = exhibitions_df.loc[missing_type, 'xkey'].map(XPOST_TYPE_BY_XKEY)
exhibitions_df.loc[missing_type, 'xpost_type'] = inferred_type.fillna('')

# Step 2: every artist participation is typed 'actor', overriding any other
# value that was already present (e.g. 'entidad').
is_artist = exhibitions_df.xkey == 'actor que participa como artista'
exhibitions_df.loc[is_artist, 'xpost_type'] = 'actor'

### Gender

In [18]:
# Distribution of xgender among rows whose xpost_type is not 'actor'.
exhibitions_df.loc[exhibitions_df.xpost_type != 'actor', 'xgender'].value_counts()

    24377
Name: xgender, dtype: int64

In [19]:
# Distribution of xgender among rows whose xpost_type is 'actor'.
exhibitions_df.loc[exhibitions_df.xpost_type == 'actor', 'xgender'].value_counts()

Masculino       34031
Femenino        10936
No declarado     3348
                   50
Name: xgender, dtype: int64

In [20]:
# Actors with a blank gender are labelled 'No declarado'.
# A single boolean-mask assignment replaces the row-by-row iterrows() loop;
# rows that are not actors, or that already have a gender, are left untouched.
actor_without_gender = (
    (exhibitions_df.xpost_type == 'actor') &
    (exhibitions_df.xgender == '')
)
exhibitions_df.loc[actor_without_gender, 'xgender'] = 'No declarado'

### Test: verify the cleaning results

In [21]:
# Re-run the blank check: any important column that still contains blank ('')
# cells is printed with its blank count; no output means none remain.
for column in important_columns:
    is_blank = exhibitions_df[column] == ''
    if is_blank.any():
        print(column, is_blank.sum())

In [22]:
# Distinct (xkey, xpost_type) combinations after cleaning, ordered by xkey.
cleaned_key_type_pairs = exhibitions_df[['xkey', 'xpost_type']].drop_duplicates()
cleaned_key_type_pairs.sort_values(by='xkey')

Unnamed: 0,xkey,xpost_type
13,actor comisario,actor
0,actor que participa como artista,actor
1,catálogo,catálogo
9,coleccionista prestatario de obras,entidad
2313,coleccionista prestatario de obras,actor
1761,empresa que realiza la museografía,empresa
2,entidad organizadora,entidad
16,entidad patrocinadora,entidad
32,exposición de la que depende,exposición
3,fuente de información,entidad


In [23]:
# Distribution of xgender among 'actor' rows after the blank values were filled.
exhibitions_df.loc[exhibitions_df.xpost_type == 'actor', 'xgender'].value_counts()

Masculino       34031
Femenino        10936
No declarado     3398
Name: xgender, dtype: int64

## Saving data

In [45]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
exhibitions_df.to_csv('data/out/exhibitions.csv', index=False)