# Current version: 0.2.1 (unfinished)

Since 2024, updates to the dataset have been handled and stored in a separate file, and this is that file. (Previously, all Art500k dataset processing was done in *art500k.csv*, which has since been renamed to *art500k_initial*.)

In [2]:
import numpy as np
import pandas as pd

# Artists table as published on the repository's main branch
url_v_latest = "https://raw.githubusercontent.com/me9hanics/PainterPalette/main/datasets/artists.csv"
artists = pd.read_csv(filepath_or_buffer=url_v_latest)

## 2024.01.07: Use measures to find artists with multiple names (aliases)

If we take a look at popular artists in the dataset, for example Rembrandt:

In [4]:
# Saved snapshot of the artists table (datasets/saves/artists_0_1.csv);
# loading it rebinds `artists`.
url_v_01_09 = "https://raw.githubusercontent.com/me9hanics/PainterPalette/main/datasets/saves/artists_0_1.csv"
artists = pd.read_csv(filepath_or_buffer=url_v_01_09)

Considered measures:

* Fuzzy string matching (Levenshtein distance) between artist names. 
* Basic string containment (other artists' names that contain a one-word artist name, e.g. Rembrandt).
* Token-Based Matching (TBM) (Jaccard similarity) between artist names.
* Named Entity Recognition (NER) (spaCy) to find artist names from text, then apply Coreference Resolution to link pronouns and other expressions to the correct entities.


Other considerations: <br>
* Phonetic matching: This could be helpful when an artist's name is spelled differently in different languages, e.g. "Č" (Czech) / "Ch" (English) / "cs" (Hungarian). Even if this is the case for some instances, we should find these with the Levenshtein distance search. <br>
* Online available resources for aliases, web scraping, etc.
* Custom rules (e.g. "... and his workshop", "... and his circle", etc.)

In [None]:
import pandas as pd
from fuzzywuzzy import fuzz, process

def fuzzy_match(target, choices, threshold=50):
    """
    Perform fuzzy string matching and return the best match above the threshold.

    Parameters
    ----------
    target : str
        The string to look up.
    choices : collection of str
        Candidate strings, handed unchanged to ``process.extractBests``.
    threshold : int, default 50
        A candidate qualifies only if its ``fuzz.ratio`` score is strictly
        greater than this value.

    Returns
    -------
    tuple or None
        The first qualifying result tuple exactly as produced by
        ``process.extractBests`` (its second element is the score), or
        ``None`` when no candidate scores above ``threshold``. Returning
        ``None`` lets callers test the result with a plain ``if match:``
        instead of having to catch an ``IndexError``.
    """
    results = process.extractBests(target, choices, scorer=fuzz.ratio)
    # Only the score (index 1) is inspected, so this works regardless of
    # how many elements each result tuple carries.
    return next((result for result in results if result[1] > threshold), None)


# Best fuzzy match per clean artist name; names without a match are left out.
matches = {}

for clean_artist in df_clean['artist']:
    match = fuzzy_match(clean_artist, df_dirty['artist'])
    if not match:
        continue
    print(f'"{clean_artist}" matches "{match}"')
    matches[clean_artist] = match


In [6]:
import spacy

# Toy input: several spellings/variants of a few painters' names
data = {
    'author_name': [
        'Rembrandt',
        'Rembrandt van Rijn',
        'Rembrandt Peale',
        'Michelangelo',
        'Michelangelo Buonarroti',
        'Michelangelo Merisi da Caravaggio',
        'Caravaggio',
        'Caravaggio, Michelangelo Merisi da',
        'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)',
        'Leonardo',
        'Leonardo da Vinci',
    ],
}

df = pd.DataFrame(data)

# English spaCy pipeline; its NER component supplies the PERSON entities below
nlp = spacy.load("en_core_web_sm")

# Bidirectional alias map: full name <-> PERSON entity found inside that name
aliases = {}

for name in df['author_name']:
    doc = nlp(name)
    for ent in doc.ents:
        # Keep only PERSON entities that differ from the full input string,
        # so a name is never recorded as an alias of itself.
        if ent.label_ != 'PERSON' or ent.text == name:
            continue
        if name not in aliases:
            aliases[name] = set()
        aliases[name].add(ent.text)
        if ent.text not in aliases:
            aliases[ent.text] = set()
        aliases[ent.text].add(name)

# Sets -> lists (the order inside each list follows set iteration order)
aliases = {key: [*value] for key, value in aliases.items()}

for key in aliases:
    value = aliases[key]
    print(f'{key}: {value}')


Michelangelo Merisi da Caravaggio: ['Michelangelo Merisi da']
Michelangelo Merisi da: ['Michelangelo Merisi da Caravaggio', 'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)']
Caravaggio, Michelangelo Merisi da: ['Michelangelo Merisi', 'Caravaggio']
Caravaggio: ['Caravaggio, Michelangelo Merisi da', 'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)']
Michelangelo Merisi: ['Caravaggio, Michelangelo Merisi da']
Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole): ['Michelangelo Merisi da', 'Caravaggio']


In [10]:
import spacy
import pandas as pd

# Toy input: several spellings/variants of a few painters' names
data = {
    'author_name': [
        'Rembrandt',
        'Rembrandt van Rijn',
        'Rembrandt Peale',
        'Michelangelo',
        'Michelangelo Buonarroti',
        'Michelangelo Merisi da Caravaggio',
        'Caravaggio',
        'Caravaggio, Michelangelo Merisi da',
        'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)',
        'Leonardo',
        'Leonardo da Vinci',
    ],
}

df = pd.DataFrame(data)

# English spaCy pipeline; its NER component supplies the PERSON entities below
nlp = spacy.load("en_core_web_sm")

# Bidirectional alias map: full name <-> PERSON entity found inside that name.
# Unlike a variant that skips self-matches, an entity equal to the whole name
# is kept here, so such a name ends up listed as its own alias.
aliases = {}

for name in df['author_name']:
    doc = nlp(name)
    for ent in doc.ents:
        if ent.label_ != 'PERSON':
            continue
        if name not in aliases:
            aliases[name] = set()
        aliases[name].add(ent.text)
        if ent.text not in aliases:
            aliases[ent.text] = set()
        aliases[ent.text].add(name)

# Sets -> lists (the order inside each list follows set iteration order)
aliases = {key: [*value] for key, value in aliases.items()}
aliases

{'Rembrandt van Rijn': ['Rembrandt van Rijn'],
 'Rembrandt Peale': ['Rembrandt Peale'],
 'Michelangelo': ['Michelangelo'],
 'Michelangelo Buonarroti': ['Michelangelo Buonarroti'],
 'Michelangelo Merisi da Caravaggio': ['Michelangelo Merisi da'],
 'Michelangelo Merisi da': ['Michelangelo Merisi da Caravaggio',
  'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)'],
 'Caravaggio, Michelangelo Merisi da': ['Michelangelo Merisi', 'Caravaggio'],
 'Caravaggio': ['Caravaggio, Michelangelo Merisi da',
  'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)'],
 'Michelangelo Merisi': ['Caravaggio, Michelangelo Merisi da'],
 'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)': ['Michelangelo Merisi da',
  'Caravaggio'],
 'Leonardo': ['Leonardo'],
 'Leonardo da Vinci': ['Leonardo da Vinci']}

In [9]:
class UnionFind:
    """Disjoint-set (union-find) structure over hashable items.

    Items do not need to be declared up front: the first time an item is
    passed to ``find`` (directly or through ``union``) it is registered as
    a singleton set.
    """

    def __init__(self):
        # Maps every registered item to its parent; a root maps to itself.
        self.parent = {}

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``.

        The root of ``y``'s set becomes the root of the merged set.
        """
        # y is resolved (and, if new, registered) before x, so a pair of
        # unseen items lands in ``self.parent`` in the order y, x. Code that
        # iterates ``self.parent`` observes this insertion order.
        root_y = self.find(y)
        root_x = self.find(x)
        self.parent[root_x] = root_y

    def find(self, x):
        """Return the root of the set containing ``x``.

        An unseen ``x`` is registered as its own root. Every item visited on
        the way to the root is re-pointed directly at the root (path
        compression). The walk is iterative, so arbitrarily long parent
        chains cannot hit Python's recursion limit.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # Pass 1: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Pass 2: point every node on the path straight at the root.
        node = x
        while node != root:
            next_node = parent[node]
            parent[node] = root
            node = next_node

        return root

# Toy input: several spellings/variants of a few painters' names
data = {
    'author_name': [
        'Rembrandt',
        'Rembrandt van Rijn',
        'Rembrandt Peale',
        'Michelangelo',
        'Michelangelo Buonarroti',
        'Michelangelo Merisi da Caravaggio',
        'Caravaggio',
        'Caravaggio, Michelangelo Merisi da',
        'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)',
        'Leonardo',
        'Leonardo da Vinci',
    ],
}

df = pd.DataFrame(data)

# English spaCy pipeline; its NER component supplies the PERSON entities below
nlp = spacy.load("en_core_web_sm")

# Link every full name with each PERSON entity found inside it
uf = UnionFind()

for name in df['author_name']:
    doc = nlp(name)
    for ent in doc.ents:
        if ent.label_ != 'PERSON':
            continue
        uf.union(name, ent.text)

# Collect the registered names by the root of their set
groups = {}
for name in uf.parent:
    parent = uf.find(name)
    groups.setdefault(parent, []).append(name)

for group in groups.values():
    print(group)

['Rembrandt van Rijn']
['Rembrandt Peale']
['Michelangelo']
['Michelangelo Buonarroti']
['Michelangelo Merisi da', 'Michelangelo Merisi da Caravaggio', 'Caravaggio', 'Caravaggio, Michelangelo Merisi da', 'Michelangelo Merisi', 'Caravaggio, Michelangelo Merisi da (Italian, Milan or Caravaggio 1571-1610 Porto Ercole)']
['Leonardo']
['Leonardo da Vinci']


This seems to leave