# Exhibitions

In [94]:
from collections import Counter
from pprint import pprint
from string import punctuation

from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pd

## Loading data

In [2]:
%%time

# Load the exhibitions table and replace every missing value with an empty
# string in one expression (no in-place mutation), so re-running this cell
# always yields the same frame.
exhibitions_df = pd.read_csv('data/out/exhibitions.csv').fillna('')

CPU times: user 860 ms, sys: 192 ms, total: 1.05 s
Wall time: 1.05 s


In [3]:
exhibitions_df.shape

(72740, 26)

In [4]:
exhibitions_df.head()

Unnamed: 0,ID,post_type,post_title,place_t,place_r,place_c,start_y,start_m,start_d,end_y,...,xplace_t,xplace_r,xplace_c,xstart_y,xstart_m,xstart_d,xend_y,xend_m,xend_d,xgender
0,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,España,España,España,1985.0,1.0,1.0,,,,Femenino
1,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,2017.0,1.0,1.0,,,,NO APLICA
2,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,,,,,,,NO APLICA
3,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,,,,,,,NO APLICA
4,108998,exposición,"""European Masterworks. Paintings from the Coll...",Nashville,Tennessee,Estados Unidos,2001,4,8,2001,...,Francia,Francia,Francia,1840.0,11.0,14.0,1926.0,12.0,5.0,Masculino


## Processing data

In [8]:
def add_blank(x):
    """Return ``x`` with the separator ``' - '`` appended.

    Parameters
    ----------
    x : str
        Text to which the separator is appended.

    Returns
    -------
    str
        ``x`` followed by ``' - '``.
    """
    separator = ' - '
    return x + separator

# Append the ' - ' separator to every title so that concatenating titles
# keeps them apart.
# NOTE: this overwrites the column in place, so re-running the cell appends
# one more separator to each title every time.
exhibitions_df['post_title'] = exhibitions_df['post_title'].apply(add_blank)

In [9]:
exhibitions_df.head()

Unnamed: 0,ID,post_type,post_title,place_t,place_r,place_c,start_y,start_m,start_d,end_y,...,xplace_t,xplace_r,xplace_c,xstart_y,xstart_m,xstart_d,xend_y,xend_m,xend_d,xgender
0,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver -",Granada,Andalucía,España,2017,3,1,2017,...,España,España,España,1985.0,1.0,1.0,,,,Femenino
1,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver -",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,2017.0,1.0,1.0,,,,NO APLICA
2,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver -",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,,,,,,,NO APLICA
3,117998,exposición,"""Elizabeth Blackwell"", Carmen Oliver -",Granada,Andalucía,España,2017,3,1,2017,...,Granada,Andalucía,España,,,,,,,NO APLICA
4,108998,exposición,"""European Masterworks. Paintings from the Coll...",Nashville,Tennessee,Estados Unidos,2001,4,8,2001,...,Francia,Francia,Francia,1840.0,11.0,14.0,1926.0,12.0,5.0,Masculino


In [48]:
# Tokens dropped by clean(): English and Spanish stopwords plus ASCII punctuation.
stop = stopwords.words('english') + stopwords.words('spanish') + list(punctuation)

# Set view of `stop`: clean() tests every token for membership, and a list
# would be scanned linearly for each one. Rebuilt whenever this cell is re-run.
_stop_set = frozenset(stop)

# Deletion table for the quote/typographic characters removed after tokenizing.
_strip_table = str.maketrans('', '', "'`“”‘’¡¿…–")


def clean(s):
    """Normalize a text into a space-separated string of lower-case words.

    Steps, in order: lower-case and strip ``s``; tokenize it with
    ``word_tokenize``; drop every token found in ``stop``; delete the
    characters ``'`“”‘’¡¿…–`` from what remains; collapse runs of
    whitespace into single spaces.

    Parameters
    ----------
    s : str
        Raw text to normalize.

    Returns
    -------
    str
        The cleaned text, with tokens separated by single spaces.
    """
    tokens = [w for w in word_tokenize(s.lower().strip()) if w not in _stop_set]
    # Deleting characters can leave empty gaps between tokens, hence the
    # final split/join to normalize whitespace.
    r = ' '.join(tokens).translate(_strip_table)
    return ' '.join(r.split())

### General histogram

In [49]:
# One title per exhibition ID, concatenated into a single string. str.join
# builds the result in one pass and returns '' for an empty selection,
# unlike Series.sum(), which adds the strings pairwise.
all_text = ''.join(exhibitions_df.drop_duplicates('ID').post_title)

In [52]:
all_text[:100]

'"Elizabeth Blackwell", Carmen Oliver - "European Masterworks. Paintings from the Collection of the A'

In [50]:
cleaned_all_text = clean(all_text)

In [53]:
cleaned_all_text[:100]

'elizabeth blackwell carmen oliver european masterworks paintings collection art gallery ontario face'

In [54]:
general_counter = Counter(cleaned_all_text.split())

In [56]:
general_counter.most_common(20)

[('picasso', 687),
 ('exposición', 464),
 ('arte', 411),
 ('colección', 330),
 ('pintura', 275),
 ('museo', 165),
 ('título', 165),
 ('facticio', 159),
 ('art', 134),
 ('años', 133),
 ('fotografía', 129),
 ('obra', 122),
 ('josé', 111),
 ('obras', 81),
 ('luz', 77),
 ('málaga', 76),
 ('miradas', 74),
 ('vida', 74),
 ('paisajes', 72),
 ('artes', 72)]

### Histogram by city

In [65]:
# Rows whose country column is 'España', reduced to one row per exhibition ID.
sp_exhibitions = (
    exhibitions_df
    .loc[exhibitions_df['place_c'] == 'España']
    .drop_duplicates(subset='ID')
)

In [102]:
sp_exhibitions.place_t.value_counts()[:10]

Málaga       1277
Madrid       1109
Córdoba       904
Sevilla       432
Barcelona     409
Valencia      359
Huelva        320
Bilbao        195
Almería       168
Granada       157
Name: place_t, dtype: int64

In [104]:
# Names of the 10 most frequent cities, most frequent first. The index is
# read directly because Series.iteritems() no longer exists in pandas >= 2.0.
top_sp_cities = sp_exhibitions.place_t.value_counts().head(10).index.tolist()

In [105]:
top_sp_cities

['Málaga',
 'Madrid',
 'Córdoba',
 'Sevilla',
 'Barcelona',
 'Valencia',
 'Huelva',
 'Bilbao',
 'Almería',
 'Granada']

In [85]:
# Keep only the exhibitions held in one of the top cities.
in_top_city = sp_exhibitions['place_t'].isin(top_sp_cities)
top_sp_cities_exhibitions = sp_exhibitions[in_top_city]

In [106]:
# Series indexed by city holding that city's titles concatenated into one
# string; str.join builds each string in a single pass instead of relying
# on sum() over string values.
text_by_city_sr = top_sp_cities_exhibitions.groupby('place_t').post_title.agg(''.join)

In [109]:
# Print the 20 most common cleaned title words for each top city.
for city in top_sp_cities:
    # Section header: the city name over a dashed rule.
    print(city, '-' * 80, sep='\n')
    text = text_by_city_sr[city]
    cleaned_text = clean(text)
    counter = Counter(cleaned_text.split())
    pprint(counter.most_common(20))
    # Closing rule followed by one blank line.
    print('=' * 80, end='\n\n')

Málaga
--------------------------------------------------------------------------------
[('picasso', 104),
 ('exposición', 47),
 ('málaga', 44),
 ('arte', 36),
 ('colección', 34),
 ('pintura', 29),
 ('obra', 23),
 ('años', 18),
 ('retratos', 13),
 ('obras', 12),
 ('certamen', 12),
 ('fundación', 11),
 ('ruiz', 11),
 ('fotografía', 11),
 ('fotografías', 11),
 ('ii', 10),
 ('cine', 10),
 ('siglo', 9),
 ('mundo', 9),
 ('artes', 9)]

Madrid
--------------------------------------------------------------------------------
[('arte', 51),
 ('picasso', 31),
 ('exposición', 30),
 ('museo', 23),
 ('colección', 21),
 ('españa', 20),
 ('prado', 19),
 ('miradas', 19),
 ('siglo', 18),
 ('juan', 17),
 ('colecciones', 16),
 ('mundo', 16),
 ('diseño', 15),
 ('obras', 15),
 ('art', 13),
 ('años', 13),
 ('pintura', 13),
 ('madrid', 13),
 ('retrato', 12),
 ('pinturas', 12)]

Córdoba
--------------------------------------------------------------------------------
[('exposición', 172),
 ('título', 149),
 ('f