# Replica project

### Quantitative analysis of how interesting morphograph clusters are

In [11]:
# loading the metadata
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import pickle
import sys
from tqdm import tqdm
from deep_translator import GoogleTranslator
from sklearn.feature_extraction.text import TfidfVectorizer


sys.path.insert(0, "../model/")
from utils import *

sys.path.insert(0, "../web_annotation/")
from utils_clusters import *
from metrics_clusters import *
from interest_metrics import *

data_dir = '/scratch/students/schaerf/'

data_dir = '../data/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
new = True
translate = False

In [78]:
positives = update_morph(data_dir, '', new=new).groupby('uid').first().reset_index()
print(positives.shape)
positives.head()

(2280, 27)


Unnamed: 0,uid,uid_connection,index,level_0,AuthorOriginal,Description,Author,AuthorBirth,path,BeginDate,...,img1,img2,type,annotated,cluster_file,cluster,set,new_cluster,new set,old_cluster
0,00046fb9752f41e8a74ab3a2b1168039,0dde0c4734c34ad2a91e5470d8cd1295,0.0,435,"GARNIER, Etienne-Barthélemy",Diana and Her Nymphs,"GARNIER, Etienne-Barthélemy","(b. 1759, Paris, d. 1849, Paris)",../data/WGA/images/g/garnie/diana_ny.jpg,1751.0,...,00046fb9752f41e8a74ab3a2b1168039,d00e4250fe934c53b065725419483521,POSITIVE,2016-05-17 16:24:53+00:00,Original,72,val,72,val,72.0
1,00097e39496f431186a6eac236d809d8,5737dcc62a334c8894a7c6fcc801c74e,234.0,2666,"STROZZI, Bernardo",Berenice,"STROZZI, Bernardo","(b. 1581, Genova, d. 1644, Venezia)",../data/WGA/images/s/strozzi/2/berenice.jpg,1601.0,...,00097e39496f431186a6eac236d809d8,d60902bab566424a94cbcfcd99e0558d,POSITIVE,2018-07-11 17:33:37.897066+00:00,Original,821,val,821,val,822.0
2,003597f46a3f4db5ad35b1e8cbcee1cd,f41acc297f1f4594bbd22426b2bbfa2b,235.0,7029,"GRECO, El",Christ Healing the Blind,"GRECO, El","(b. 1541, Candia, d. 1614, Toledo)",../data/WGA/images/g/greco_el/02/0201grec.jpg,1562.0,...,003597f46a3f4db5ad35b1e8cbcee1cd,fb8fcaa505bf4b8897773da49eb257ec,POSITIVE,2017-09-22 15:58:57.418057+00:00,Original,344,train,344,train,344.0
3,004b4ecffec840ce8bbc4a768efe2c87,004b4ecffec840cea54692e70d9ade6e,,179,"GOLT ZIUS, Hendrick",Tre pose per lo studio di un ritratto d 'uomo ...,GOLTZIUS Hendrick,1558.0,../data/15B/15B_13.jpg,1576.0,...,004b4ecffec840ce8bbc4a768efe2c87,faffda2772514cdba54692e70d9ade6e,POSITIVE,14-06-2022_20:12:15,01-06-2022/clusters_optics_0.13_01-06-2022_19,879,no set,879,train,
4,00a59b7eb3ae41af81d3ccaaedb2007c,3d7c3002b7ab4e578523e2ee841f6f0e,238.0,1917,"CALVAERT, Dionisio.",Gesù Cristo flagellato. 150 x 67.,CALVAERT Dionisio,1535.0,../data/111B/111B_407.jpg,1553.0,...,00a59b7eb3ae41af81d3ccaaedb2007c,a15f8efc3926415da7727b63ab819ee5,POSITIVE,2018-03-03 19:31:18.759602+00:00,Original,133,train,133,train,133.0


In [79]:
# scores for iconography
if translate:
    positives.loc[:, 'Description (EN)'] = [catch_transl(lambda : GoogleTranslator(source='auto', target='en').translate(word)) 
                                      for word in tqdm(list(positives.loc[:, 'Description'].astype(str)))]

else:
    with open(data_dir + 'uid2desc.pkl', 'rb') as infile:
        uid2endesc = pickle.load(infile)

    positives.loc[:, 'Description (EN)'] = [catch_transl_2(uid, word, uid2endesc) 
                                for uid, word in tqdm(zip(list(positives.loc[:, 'uid'].astype(str)), list(positives.loc[:, 'Description'].astype(str))))]

    uid2endesc = {uid:desc for uid,desc in zip(positives['uid'], positives['Description (EN)'])}

    with open(data_dir + 'uid2desc.pkl', 'wb') as outfile:
        pickle.dump(uid2endesc, outfile)


2280it [00:00, 625115.25it/s]


In [80]:
positives['Description (EN - ref)'] = positives['Description (EN)'].astype(str).str.split(r'[^S-]\.').apply(lambda x: x[0]).apply(lambda x: x.replace('0123456789', ''))
positives['Description (EN - ref)']

0                                    Diana and Her Nymphs
1                                                Berenice
2                                Christ Healing the Blind
3       Three poses for the study of a portrait of a m...
4                                    Jesus Christ scourge
                              ...                        
2275                Portrait of Francis I, King of France
2276                     ff7c7f6e16d645869e828e016234db4d
2277    The Grand Canal Seen from Rialto toward the North
2278                                Study for a Fruit Bow
2279                                   Death of Cleopatra
Name: Description (EN - ref), Length: 2280, dtype: object

In [81]:
def cluster_text(text, range_try=(100,102), hyperparam=False):
    vectorizer = TfidfVectorizer(stop_words={'english'})
    X = vectorizer.fit_transform(text)


    #import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    
    if hyperparam:
        Sum_of_squared_distances = []
        K = range(range_try[0],range_try[1])
        for k in K:
            km = KMeans(n_clusters=k, max_iter=200, n_init=10)
            km = km.fit(X)
            Sum_of_squared_distances.append(km.inertia_)
        plt.plot(K, Sum_of_squared_distances, 'bx-')
        plt.xlabel('k')
        plt.ylabel('Sum_of_squared_distances')
        plt.title('Elbow Method For Optimal k')
        plt.show()

    print('How many clusters do you want to use?')
    true_k = int(input())
    
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
    model.fit(X)

    labels=model.labels_
    clusters=pd.DataFrame(list(zip(text,labels)),columns=['title','cluster'])
    #print(clusters.sort_values(by=['cluster']))

    for i in range(true_k):
        print(clusters[clusters['cluster'] == i])
        
    return clusters

In [82]:
clusters = cluster_text(positives['Description (EN - ref)'].values)

How many clusters do you want to use?
                                                title  cluster
166                                 Judgment of Paris        0
285                             The Judgment of Paris        0
407                                        Still-Life        0
538                              The Judgment of Pari        0
708                               Kneeling male pigur        0
773                                        Still-Life        0
785                                        Male pigur        0
942                           Still Life with a Skull        0
1120                                Judgment of Paris        0
1300                          Still-life with Peaches        0
1308  Still-Life with Ham, Bread and Precious Vessels        0
1529                                 Fruit Still-Life        0
1561                                       Still-Life        0
1602                            The Judgment of Paris        0
1752             

In [83]:
clusters['cluster_iconography'] = clusters['cluster']

In [84]:
positives = positives.merge(clusters[['cluster_iconography']], left_index=True, right_index=True)

In [85]:
positives.head()

Unnamed: 0,uid,uid_connection,index,level_0,AuthorOriginal,Description,Author,AuthorBirth,path,BeginDate,...,annotated,cluster_file,cluster,set,new_cluster,new set,old_cluster,Description (EN),Description (EN - ref),cluster_iconography
0,00046fb9752f41e8a74ab3a2b1168039,0dde0c4734c34ad2a91e5470d8cd1295,0.0,435,"GARNIER, Etienne-Barthélemy",Diana and Her Nymphs,"GARNIER, Etienne-Barthélemy","(b. 1759, Paris, d. 1849, Paris)",../data/WGA/images/g/garnie/diana_ny.jpg,1751.0,...,2016-05-17 16:24:53+00:00,Original,72,val,72,val,72.0,Diana and Her Nymphs,Diana and Her Nymphs,46
1,00097e39496f431186a6eac236d809d8,5737dcc62a334c8894a7c6fcc801c74e,234.0,2666,"STROZZI, Bernardo",Berenice,"STROZZI, Bernardo","(b. 1581, Genova, d. 1644, Venezia)",../data/WGA/images/s/strozzi/2/berenice.jpg,1601.0,...,2018-07-11 17:33:37.897066+00:00,Original,821,val,821,val,822.0,Berenice,Berenice,7
2,003597f46a3f4db5ad35b1e8cbcee1cd,f41acc297f1f4594bbd22426b2bbfa2b,235.0,7029,"GRECO, El",Christ Healing the Blind,"GRECO, El","(b. 1541, Candia, d. 1614, Toledo)",../data/WGA/images/g/greco_el/02/0201grec.jpg,1562.0,...,2017-09-22 15:58:57.418057+00:00,Original,344,train,344,train,344.0,Christ Healing the Blind,Christ Healing the Blind,30
3,004b4ecffec840ce8bbc4a768efe2c87,004b4ecffec840cea54692e70d9ade6e,,179,"GOLT ZIUS, Hendrick",Tre pose per lo studio di un ritratto d 'uomo ...,GOLTZIUS Hendrick,1558.0,../data/15B/15B_13.jpg,1576.0,...,14-06-2022_20:12:15,01-06-2022/clusters_optics_0.13_01-06-2022_19,879,no set,879,train,,Three poses for the study of a portrait of a m...,Three poses for the study of a portrait of a m...,28
4,00a59b7eb3ae41af81d3ccaaedb2007c,3d7c3002b7ab4e578523e2ee841f6f0e,238.0,1917,"CALVAERT, Dionisio.",Gesù Cristo flagellato. 150 x 67.,CALVAERT Dionisio,1535.0,../data/111B/111B_407.jpg,1553.0,...,2018-03-03 19:31:18.759602+00:00,Original,133,train,133,train,133.0,Jesus Christ scourged. 150 x 67.,Jesus Christ scourge,30


In [86]:
scores_iconography = {cluster: np.around(content['cluster_iconography'].nunique() - (content.shape[0] * 0.1),2) for cluster, content in positives.groupby('cluster') if content.shape[0] > 1}
list(scores_iconography.items())[:20]

[(0, 0.8),
 (1, 3.5),
 (2, 0.7),
 (3, 1.5),
 (4, 0.7),
 (5, 0.8),
 (7, 0.7),
 (9, 0.7),
 (10, 3.0),
 (13, 2.2),
 (14, 2.0),
 (15, 1.8),
 (17, 1.3),
 (19, 1.8),
 (20, 0.8),
 (22, 0.5),
 (24, 1.5),
 (25, 1.2),
 (26, 0.8),
 (27, 0.8)]

In [87]:
positives[positives['cluster'] == list(scores_iconography.keys())[np.argmin(list(scores_iconography.values()))]]['Description'].value_counts()

Putto in fasce.    8
Putto in fasce?    1
Putto in fasce,    1
Name: Description, dtype: int64

In [88]:
positives[positives['cluster'] == list(scores_iconography.keys())[np.argmax(list(scores_iconography.values()))]]['Description'].value_counts()

St Jerome                             1
Contemplazione della Morte. 61x50.    1
L 'alchimista.                        1
San Girolamo nello studio             1
S. Girolamo.                          1
San Gerolamo                          1
Ritratto di un filosofo.              1
Name: Description, dtype: int64

In [114]:
positives['scores_iconography'] = positives['cluster'].apply(lambda x: scores_iconography[x] if x in scores_iconography.keys() else 0)

## Authors

In [89]:
positives.Author.value_counts()

TIZIANO Vecellio                                 85
GRECO, El                                        64
RAFFAELLO Sanzio                                 57
CANALETTO                                        50
STROZZI Bernardo                                 41
                                                 ..
SEBASTIANO del PIOMBO MARLE-VENTURA VAN FONDO     1
TURCHI Alessandro ) (copia da)                    1
GHIRLANDAIO Ridolfo                               1
MARATTA Carlo (?)                                 1
SCOREL, Jan van                                   1
Name: Author, Length: 970, dtype: int64

In [90]:
positives['AuthorClean'] = positives['Author'].str.split().apply(lambda x: x[0].strip(',.;')) #.str.split('(').apply(lambda x: x[0])
positives['AuthorClean'].value_counts()

TIZIANO      134
BASSANO      101
RAFFAELLO     84
GRECO         65
STROZZI       58
            ... 
COUSIN         1
LA             1
PUPINI         1
METSIS         1
GRASSI         1
Name: AuthorClean, Length: 509, dtype: int64

In [91]:
scores_authors = {cluster: np.around(content['AuthorClean'].nunique() - (content.shape[0] * 0.01),2) for cluster, content in positives.groupby('cluster') if content.shape[0] > 1}
list(scores_authors.items())[:20]

[(0, 0.98),
 (1, 1.95),
 (2, 0.97),
 (3, 2.95),
 (4, 0.97),
 (5, 0.98),
 (7, 0.97),
 (9, 0.97),
 (10, 1.9),
 (13, 2.92),
 (14, 2.9),
 (15, 0.98),
 (17, 1.93),
 (19, 1.98),
 (20, 1.98),
 (22, 0.95),
 (24, 2.95),
 (25, 2.92),
 (26, 1.98),
 (27, 0.98)]

In [92]:
positives[positives['cluster'] == list(scores_authors.keys())[np.argmax(list(scores_authors.values()))]]['AuthorClean'].value_counts()

CRANACH       6
TIZIANO       5
BORDONE       3
GIORGIONE     2
GARNIER       1
LIBERI        1
HEEMSKERCK    1
COUSIN        1
LEFEBVRE      1
SPADARINO     1
PALMA         1
LE            1
GIROLAMO      1
SCOREL        1
Name: AuthorClean, dtype: int64

In [93]:
positives[positives['cluster'] == list(scores_authors.keys())[np.argmin(list(scores_authors.values()))]]['AuthorClean'].value_counts()

BASSANO    10
Name: AuthorClean, dtype: int64

In [115]:
positives['scores_authors'] = positives['cluster'].apply(lambda x: scores_authors[x] if x in scores_authors.keys() else 0)
    

### Author attributions

In [94]:
positives['AuthorAttr'] = positives['AuthorOriginal'].str.split('(').apply(lambda x: x[1] if len(x)>1 else 'Original').str.split(')').apply(lambda x: x[0]).apply(lambda x: x.replace('-)', '')).apply(lambda x: x.strip(') '))
positives['AuthorAttr'].value_counts()

Original         1859
copia da -         45
attr.              31
inc.               23
scuola di-         22
                 ... 
copia ua -.         1
i                   1
scuola des          1
imitatore-i -       1
copia ua-           1
Name: AuthorAttr, Length: 127, dtype: int64

In [95]:
clusters = cluster_text(positives['AuthorAttr'].values)

How many clusters do you want to use?
         title  cluster
0     Original        0
1     Original        0
2     Original        0
3     Original        0
4     Original        0
...        ...      ...
2275  Original        0
2276  Original        0
2277  Original        0
2278  Original        0
2279  Original        0

[1859 rows x 2 columns]
             title  cluster
16    bottega di -        1
28              Il        1
39         replica        1
52              I1        1
62              il        1
...            ...      ...
2224           Van        1
2247         pinx.        1
2256          inc.        1
2267          inc.        1
2269             ?        1

[260 rows x 2 columns]
        title  cluster
18      attr.        2
50      attr.        2
105     attr.        2
143     attr.        2
161     attr.        2
255     attr.        2
277     attr.        2
293     attr.        2
412     attr.        2
420   attr. a        2
445     attr.        2
491     attr.

In [96]:
clusters['cluster_attribution'] = clusters['cluster']

In [97]:
positives = positives.merge(clusters[['cluster_attribution']], left_index=True, right_index=True)
positives.head()

Unnamed: 0,uid,uid_connection,index,level_0,AuthorOriginal,Description,Author,AuthorBirth,path,BeginDate,...,set,new_cluster,new set,old_cluster,Description (EN),Description (EN - ref),cluster_iconography,AuthorClean,AuthorAttr,cluster_attribution
0,00046fb9752f41e8a74ab3a2b1168039,0dde0c4734c34ad2a91e5470d8cd1295,0.0,435,"GARNIER, Etienne-Barthélemy",Diana and Her Nymphs,"GARNIER, Etienne-Barthélemy","(b. 1759, Paris, d. 1849, Paris)",../data/WGA/images/g/garnie/diana_ny.jpg,1751.0,...,val,72,val,72.0,Diana and Her Nymphs,Diana and Her Nymphs,46,GARNIER,Original,0
1,00097e39496f431186a6eac236d809d8,5737dcc62a334c8894a7c6fcc801c74e,234.0,2666,"STROZZI, Bernardo",Berenice,"STROZZI, Bernardo","(b. 1581, Genova, d. 1644, Venezia)",../data/WGA/images/s/strozzi/2/berenice.jpg,1601.0,...,val,821,val,822.0,Berenice,Berenice,7,STROZZI,Original,0
2,003597f46a3f4db5ad35b1e8cbcee1cd,f41acc297f1f4594bbd22426b2bbfa2b,235.0,7029,"GRECO, El",Christ Healing the Blind,"GRECO, El","(b. 1541, Candia, d. 1614, Toledo)",../data/WGA/images/g/greco_el/02/0201grec.jpg,1562.0,...,train,344,train,344.0,Christ Healing the Blind,Christ Healing the Blind,30,GRECO,Original,0
3,004b4ecffec840ce8bbc4a768efe2c87,004b4ecffec840cea54692e70d9ade6e,,179,"GOLT ZIUS, Hendrick",Tre pose per lo studio di un ritratto d 'uomo ...,GOLTZIUS Hendrick,1558.0,../data/15B/15B_13.jpg,1576.0,...,no set,879,train,,Three poses for the study of a portrait of a m...,Three poses for the study of a portrait of a m...,28,GOLTZIUS,Original,0
4,00a59b7eb3ae41af81d3ccaaedb2007c,3d7c3002b7ab4e578523e2ee841f6f0e,238.0,1917,"CALVAERT, Dionisio.",Gesù Cristo flagellato. 150 x 67.,CALVAERT Dionisio,1535.0,../data/111B/111B_407.jpg,1553.0,...,train,133,train,133.0,Jesus Christ scourged. 150 x 67.,Jesus Christ scourge,30,CALVAERT,Original,0


In [98]:
scores_attributions = {cluster: np.around(content['cluster_attribution'].nunique() - (content.shape[0] * 0.05),2) for cluster, content in positives.groupby('cluster') if content.shape[0] > 1}
list(scores_attributions.items())[:20]

[(0, 0.9),
 (1, 1.75),
 (2, 0.85),
 (3, 1.75),
 (4, 0.85),
 (5, 0.9),
 (7, 0.85),
 (9, 1.85),
 (10, 2.5),
 (13, 0.6),
 (14, 0.5),
 (15, 0.9),
 (17, 0.65),
 (19, 0.9),
 (20, 0.9),
 (22, 0.75),
 (24, 1.75),
 (25, 1.6),
 (26, 0.9),
 (27, 0.9)]

In [99]:
positives[positives['cluster'] == list(scores_attributions.keys())[np.argmax(list(scores_attributions.values()))]]['AuthorAttr'].value_counts()

Original        4
copia da-       2
attr.           1
cerchia di -    1
da -            1
Name: AuthorAttr, dtype: int64

In [100]:
positives[positives['cluster'] == list(scores_attributions.keys())[np.argmin(list(scores_attributions.values()))]]['AuthorAttr'].value_counts()

Original    10
Name: AuthorAttr, dtype: int64

In [116]:
positives['scores_attributions'] = positives['cluster'].apply(lambda x: scores_attributions[x] if x in scores_attributions.keys() else 0)
    

### Time difference

In [101]:
extras = pd.read_csv(data_dir + 'morphograph/Cini_AllVariationsMerged_20210421.csv', sep=';')
extras.columns

Index(['Author', 'AuthorULAN', 'AuthorULANLabel', 'AuthorNationality',
       'BiographyLabel', 'AuthorBirth', 'AuthorDeath', 'AuthorBirthLong',
       'AuthorBirthLat', 'AuthorDeathLong', 'AuthorDeathLat',
       'AuthorBirthCity', 'AuthorDeathCity', 'CountModifiers',
       'CountModifiers_Contact', 'CountModifiers_Neighbours', 'CountWorks',
       'CountWorks_Modified', 'CountWorks_Contact', 'CountWorks_Neighbours',
       'PercWorks_Modified', 'PercWorks_Contact', 'PercWorks_Neighbours'],
      dtype='object')

In [119]:
positives_extra = positives.merge(extras[['Author', 'AuthorULAN', 'AuthorULANLabel', 'AuthorNationality',  'BiographyLabel', 
                                          'AuthorDeath', 'AuthorBirthLong', 'AuthorBirthLat', 'AuthorDeathLong', 'AuthorDeathLat',
                                           'CountModifiers',]], left_on='Author', right_on='Author', how='left')
positives_extra.columns                          

Index(['uid', 'uid_connection', 'index', 'level_0', 'AuthorOriginal',
       'Description', 'Author', 'AuthorBirth', 'path', 'BeginDate', 'ImageURL',
       'City', 'Country', 'AuthorBirthCity', 'AuthorDeathCity', 'Drawer',
       'Type', 'img1', 'img2', 'type', 'annotated', 'cluster_file', 'cluster',
       'set', 'new_cluster', 'new set', 'old_cluster', 'Description (EN)',
       'Description (EN - ref)', 'cluster_iconography', 'AuthorClean',
       'AuthorAttr', 'cluster_attribution', 'scores_iconography',
       'scores_authors', 'scores_attributions', 'AuthorULAN',
       'AuthorULANLabel', 'AuthorNationality', 'BiographyLabel', 'AuthorDeath',
       'AuthorBirthLong', 'AuthorBirthLat', 'AuthorDeathLong',
       'AuthorDeathLat', 'CountModifiers'],
      dtype='object')

In [120]:
positives_extra['BeginDate'].value_counts()

1501.0    158
1523.0     82
1546.0     73
1599.0     59
1493.0     42
         ... 
1698.0      1
1769.0      1
1849.0      1
1791.0      1
1587.0      1
Name: BeginDate, Length: 317, dtype: int64

In [121]:
scores_times = {cluster: np.around(content[content['BeginDate'].notnull()]['BeginDate'].max() - content[content['BeginDate'].notnull()]['BeginDate'].min() + (content.shape[0] * 0.05), 2) for cluster, content in positives_extra.groupby('cluster') if content[content['BeginDate'].notnull()].shape[0] > 1}
list(scores_times.items())[:20]

[(0, 1.1),
 (1, 0.25),
 (2, 20.15),
 (3, 31.25),
 (4, 22.15),
 (5, 12.1),
 (7, 22.15),
 (9, 12.15),
 (10, 59.5),
 (13, 54.4),
 (14, 50.5),
 (15, 9.1),
 (17, 50.35),
 (19, 157.1),
 (20, 44.1),
 (22, 30.25),
 (24, 171.25),
 (25, 99.4),
 (26, 13.1),
 (27, 1.1)]

In [122]:
positives_extra[positives_extra['cluster'] == list(scores_times.keys())[np.argmax(list(scores_times.values()))]]['BeginDate'].value_counts()

1503.0    2
1820.0    1
1497.0    1
Name: BeginDate, dtype: int64

In [123]:
positives_extra[positives_extra['cluster'] == list(scores_times.keys())[np.argmin(list(scores_times.values()))]]['BeginDate'].value_counts()

1470.0    2
Name: BeginDate, dtype: int64

In [124]:
positives_extra['scores_times'] = positives_extra['cluster'].apply(lambda x: scores_times[x]  if x in scores_times.keys() else 0)
    

### Places

In [125]:
scores_place = {cluster: np.around(content[content['AuthorDeathLat'].notnull()]['AuthorDeathLat'].max() - content[content['AuthorDeathLat'].notnull()]['AuthorDeathLat'].min() + content[content['AuthorDeathLong'].notnull()]['AuthorDeathLong'].max() - content[content['AuthorDeathLong'].notnull()]['AuthorDeathLong'].min() + (content.shape[0] * 0.05), 2) for cluster, content in positives_extra.groupby('cluster') if content[content['AuthorDeathLat'].notnull()].shape[0] > 1}
list(scores_place.items())[:20]

[(1, 0.25),
 (10, 1.42),
 (17, 0.35),
 (25, 0.4),
 (33, 0.55),
 (35, 0.15),
 (48, 2.26),
 (50, 1.6),
 (51, 4.3),
 (52, 0.35),
 (54, 0.6),
 (63, 0.35),
 (71, 0.1),
 (72, 1.3),
 (82, 0.15),
 (83, 0.1),
 (92, 22.84),
 (93, 0.25),
 (94, 0.2),
 (107, 0.15)]

In [126]:
positives_extra[positives_extra['cluster'] == list(scores_place.keys())[np.argmax(list(scores_place.values()))]][['City', 'Country']]#['BeginDate'].value_counts()

Unnamed: 0,City,Country
144,LONDRA,Italian
179,LONDRA,Dutch
181,FIRENZE,Netherlandish
302,LONDRA,Dutch
502,FIRENZE,Netherlandish
643,FIRENZE,Flemish
659,LONDR A,
692,NAPOLI,Dutch
1075,LONDRA,Netherlandish
1671,PA DO V A,


In [127]:
positives_extra[positives_extra['cluster'] == list(scores_place.keys())[np.argmin(list(scores_place.values()))]][['City', 'Country', ]]#['BeginDate'].value_counts()

Unnamed: 0,City,Country
1196,,Italian
2036,,Italian


In [128]:
positives_extra['scores_places'] = positives_extra['cluster'].apply(lambda x: scores_place[x]  if x in scores_place.keys() else 0)

In [129]:
# scores_iconography, scores_authors, scores_attributions, scores_times, scores_place
clu2count = {cl:group.shape[0] for cl, group in positives_extra.groupby('cluster')}

In [136]:
positives_extra['scores_count'] = positives_extra['cluster'].apply(lambda x: clu2count[x])

In [138]:
positives_extra.columns

Index(['uid', 'uid_connection', 'index', 'level_0', 'AuthorOriginal',
       'Description', 'Author', 'AuthorBirth', 'path', 'BeginDate', 'ImageURL',
       'City', 'Country', 'AuthorBirthCity', 'AuthorDeathCity', 'Drawer',
       'Type', 'img1', 'img2', 'type', 'annotated', 'cluster_file', 'cluster',
       'set', 'new_cluster', 'new set', 'old_cluster', 'Description (EN)',
       'Description (EN - ref)', 'cluster_iconography', 'AuthorClean',
       'AuthorAttr', 'cluster_attribution', 'scores_iconography',
       'scores_authors', 'scores_attributions', 'AuthorULAN',
       'AuthorULANLabel', 'AuthorNationality', 'BiographyLabel', 'AuthorDeath',
       'AuthorBirthLong', 'AuthorBirthLat', 'AuthorDeathLong',
       'AuthorDeathLat', 'CountModifiers', 'scores_times', 'scores_places',
       'count', 'scores_count'],
      dtype='object')

In [140]:
positives_extra[['uid', 'cluster', 'Description (EN)',
       'Description (EN - ref)', 'cluster_iconography', 'AuthorClean',
       'AuthorAttr', 'cluster_attribution', 'scores_iconography',
       'scores_authors', 'scores_attributions', 'AuthorDeathLong',
       'AuthorDeathLat', 'CountModifiers', 'scores_times', 'scores_places',
       'scores_count']].head()

Unnamed: 0,uid,cluster,Description (EN),Description (EN - ref),cluster_iconography,AuthorClean,AuthorAttr,cluster_attribution,scores_iconography,scores_authors,scores_attributions,AuthorDeathLong,AuthorDeathLat,CountModifiers,scores_times,scores_places,scores_count
0,00046fb9752f41e8a74ab3a2b1168039,72,Diana and Her Nymphs,Diana and Her Nymphs,46,GARNIER,Original,0,3.4,13.74,0.7,,,,251.3,1.3,26
1,00097e39496f431186a6eac236d809d8,821,Berenice,Berenice,7,STROZZI,Original,0,1.7,0.97,1.85,,,,2.15,0.0,3
2,003597f46a3f4db5ad35b1e8cbcee1cd,344,Christ Healing the Blind,Christ Healing the Blind,30,GRECO,Original,0,0.8,0.98,0.9,,,,8.1,0.0,2
3,004b4ecffec840ce8bbc4a768efe2c87,879,Three poses for the study of a portrait of a m...,Three poses for the study of a portrait of a m...,28,GOLTZIUS,Original,0,0.8,0.98,0.9,,,,0.1,0.0,2
4,00a59b7eb3ae41af81d3ccaaedb2007c,133,Jesus Christ scourged. 150 x 67.,Jesus Christ scourge,30,CALVAERT,Original,0,1.3,6.93,0.65,11.351,44.507,1.0,62.35,4.09,7


In [141]:
positives_extra[['uid', 'cluster', 'Description (EN)',
       'Description (EN - ref)', 'cluster_iconography', 'AuthorClean',
       'AuthorAttr', 'cluster_attribution', 'scores_iconography',
       'scores_authors', 'scores_attributions', 'AuthorDeathLong',
       'AuthorDeathLat', 'CountModifiers', 'scores_times', 'scores_places',
       'scores_count']].to_csv(data_dir + 'interest_scores.csv')