# Alignment of MusicBrainz and Wikidata instruments

In [None]:
# Run the shared start-up script inside this notebook's namespace (-i), so the
# names it defines stay available to the cells below.
# NOTE(review): the helpers used later (sparql, sql, display_df, df_to_html, pd)
# are not defined in this notebook, so they presumably come from startup.py — confirm.
%run -i ../startup.py
# Kind of MusicBrainz entity this notebook deals with.
# NOTE(review): not referenced by the cells below; presumably read by the shared
# helpers — verify.
ENTITY_TYPE = 'instrument'

## Instruments from Wikidata

Wikidata entities which are musical instruments or families of musical instruments:

In [None]:
# Wikidata items attached to "musical instrument" (Q34379) or to Q1254773
# (per the notebook text: families of musical instruments), together with
# their optional P1762 value (bound to ?HornbostelSachs).
# NOTE(review): `wdt:P31*` is a zero-or-more path over "instance of"; if the
# intent was to follow subclasses, `wdt:P31/wdt:P279*` may be what is wanted — confirm.
wd_musical_instruments = sparql("""
SELECT ?instrument ?instrumentLabel ?HornbostelSachs
WHERE {
  { ?instrument wdt:P31* wd:Q34379 . }
  UNION
  { ?instrument wdt:P31 wd:Q1254773 . }
  OPTIONAL
  { ?instrument wdt:P1762 ?HornbostelSachs . }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
""").rename(columns={'instrument': 'wd', 'instrumentLabel': 'name'})
# Use the same column names ('wd', 'name') as the other frames in this notebook
# so they can be joined on them later.
wd_musical_instruments.head()

Wikidata entities that carry a MusicBrainz instrument ID (property P1330):

In [None]:
# Every Wikidata item with a P1330 statement, one row per (item, mbid) pair,
# ordered by English label. The label column is renamed to 'name' to match the
# MusicBrainz-side frame.
links_from_wd = sparql("""
SELECT (?instrument AS ?wd) ?mbid ?instrumentLabel
WHERE {
  ?instrument wdt:P1330 ?mbid .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?instrumentLabel)
""").rename(columns={'instrumentLabel': 'name'})

display_df(links_from_wd.head())

### Wikidata instruments with several MusicBrainz links

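The first cell lists Wikidata entities that carry more than one MusicBrainz instrument ID; the second lists MusicBrainz instruments referenced by more than one Wikidata entity. Both probably need cleanup.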

In [None]:
# Wikidata entities that appear in more than one (wd, mbid) link row.
# `duplicated(keep=False)` marks every occurrence of a repeated value in a
# single pass, instead of rebuilding and rescanning the whole column per row.
set(links_from_wd.wd[links_from_wd.wd.duplicated(keep=False)])

In [None]:
# MusicBrainz IDs that appear in more than one (wd, mbid) link row.
# `duplicated(keep=False)` marks every occurrence of a repeated value in a
# single pass, instead of rebuilding and rescanning the whole column per row.
set(links_from_wd.mbid[links_from_wd.mbid.duplicated(keep=False)])

## Instruments from MusicBrainz with Wikidata links

In [None]:
# MusicBrainz instruments that have a URL relationship pointing at wikidata.org,
# one row per (url, instrument) pair.
links_from_mb = sql("""
SELECT
    url.url AS wd,
    instrument.gid AS mbid,
    instrument.name
FROM url
JOIN l_instrument_url AS llu ON llu.entity1 = url.id
JOIN instrument              ON llu.entity0 = instrument.id
WHERE
    url.url LIKE '%%wikidata.org%%'
ORDER BY instrument.name
;
""").assign(
    # Keep only the last path segment of the URL (the part after the final '/').
    wd=lambda frame: frame['wd'].map(lambda url: url.rsplit('/', 1)[-1]),
    # Convert each gid to its string form so it can be compared with the
    # values coming from Wikidata.
    mbid=lambda frame: frame['mbid'].map(str),
)
display_df(links_from_mb.head())

### MusicBrainz instruments with several Wikidata links

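The first cell lists Wikidata entities linked from more than one MusicBrainz instrument; the second lists MusicBrainz instruments that have more than one Wikidata link. Both probably need cleanup.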

In [None]:
# Wikidata IDs that appear in more than one MusicBrainz link row.
# `duplicated(keep=False)` marks every occurrence of a repeated value in a
# single pass, instead of rebuilding and rescanning the whole column per row.
set(links_from_mb.wd[links_from_mb.wd.duplicated(keep=False)])

In [None]:
# MusicBrainz instruments that appear in more than one Wikidata link row.
# `duplicated(keep=False)` marks every occurrence of a repeated value in a
# single pass, instead of rebuilding and rescanning the whole column per row.
set(links_from_mb.mbid[links_from_mb.mbid.duplicated(keep=False)])

## Data alignment

In [None]:
# Full outer join of both link sets on the (wd, mbid) pair. The `_merge`
# indicator column records whether a pair was found on the Wikidata side only
# ('left_only'), on the MusicBrainz side only ('right_only'), or on both.
merge = links_from_wd.merge(
    links_from_mb,
    how='outer',
    on=['wd', 'mbid'],
    suffixes=('_wd', '_mb'),
    indicator=True,
)
display_df(merge.head())

In [None]:
# Pairs recorded in MusicBrainz but absent from Wikidata: rows that only the
# right-hand (MusicBrainz) frame contributed to the outer join.
links_to_add_to_wd = merge.loc[
    merge['_merge'] == 'right_only',
    ['name_mb', 'mbid', 'wd'],
]
display_df(links_to_add_to_wd)

24 links in MB that are not in WD

In [None]:
# Pairs recorded in Wikidata but absent from MusicBrainz: rows that only the
# left-hand (Wikidata) frame contributed to the outer join.
links_to_add_to_mb = merge.loc[
    merge['_merge'] == 'left_only',
    ['name_wd', 'wd', 'mbid'],
]
display_df(links_to_add_to_mb)

9 links in WD that are not in MB

Some of these mismatches are caused by redirects on the Wikidata side: Q54995817 redirects to Q4138014, and Q16033036 redirects to Q3181140.

## Instruments from MusicBrainz without Wikidata links

In [None]:
# MusicBrainz instruments whose id is not among those having a wikidata.org
# URL relationship (the sub-select mirrors the query used for links_from_mb).
no_links_from_mb = sql("""
SELECT
    gid AS mbid,
    name
FROM
    instrument
WHERE
    id NOT IN (
        SELECT
            instrument.id
        FROM url
        JOIN l_instrument_url AS llu ON llu.entity1 = url.id
        JOIN instrument              ON llu.entity0 = instrument.id
        WHERE
            url.url LIKE '%%wikidata.org%%'
    )
;
""").assign(
    # Convert each gid to its string form, as done for links_from_mb.
    mbid=lambda frame: frame['mbid'].map(str),
)
display_df(no_links_from_mb)

## Alignment suggestions

### Exact match

Exact match between instrument names in WD and MB:

In [None]:
# Candidate alignments: unlinked MusicBrainz instruments whose name is
# exactly equal to the label of a Wikidata instrument (inner join on 'name').
no_links_merge = no_links_from_mb.merge(
    wd_musical_instruments,
    how='inner',
    on='name',
    indicator=False,
)
display_df(no_links_merge)

### With fuzzy-matching library

Using fuzzy-matching to find close instrument names:

In [None]:
import fuzzymatcher

# Fuzzy left join of the unlinked MusicBrainz instruments against the
# Wikidata instrument labels, both sides compared on their 'name' column.
match = fuzzymatcher.fuzzy_left_join(
    no_links_from_mb,
    wd_musical_instruments[['wd', 'name']],
    left_on='name',
    right_on='name',
)
# Keep only the columns that are shown in the notebook and in the report.
match = match[['best_match_score', 'mbid', 'name_left', 'name_right', 'wd']]
# Discard weak candidates, then list the strongest first.
# NOTE(review): 0.09 looks like an empirically chosen cut-off — confirm.
match = match[match['best_match_score'] > 0.09]
match = match.sort_values(by='best_match_score', ascending=False)

In [None]:
# Show the retained fuzzy-match candidates; index=False is forwarded to the
# display helper (presumably to hide the frame's index — confirm in startup.py).
display_df(match, index=False)

### With record linkage library

In [None]:
import recordlinkage

# Indexation step: build candidate (MusicBrainz row, Wikidata row) pairs with
# a sorted-neighbourhood index on 'name', and report how many were produced.
indexer = recordlinkage.SortedNeighbourhoodIndex('name', window=9)
pairs = indexer.index(no_links_from_mb, wd_musical_instruments[['wd', 'name']])
print(len(pairs))

# Comparison step: score each candidate pair by Jaro-Winkler similarity of the
# two names, using a 0.9 threshold.
compare_cl = recordlinkage.Compare()
compare_cl.string('name', 'name', method='jarowinkler',
                  threshold=0.9, label='name')
features = compare_cl.compute(pairs, no_links_from_mb, wd_musical_instruments[['wd', 'name']])
# Pairs whose feature row sums to more than zero are the retained matches.
matched = features[features.sum(axis=1) > 0]
print(matched.shape)

# Classification step: look both sides of every retained pair back up in the
# source frames to build readable rows.
linkage = [
    [
        no_links_from_mb.loc[mb_idx, 'mbid'],
        no_links_from_mb.loc[mb_idx, 'name'],
        wd_musical_instruments.loc[wd_idx, 'name'],
        wd_musical_instruments.loc[wd_idx, 'wd'],
    ]
    for mb_idx, wd_idx in matched.index
]

display_df(pd.DataFrame(linkage, columns=('mbid', 'name_left', 'name_right', 'wd')),
           index=False)

## Report

In [None]:
import jinja2

# HTML report summarising the alignment results computed above. The template
# reads its values (the data frames, `df_to_html`, `date`,
# `MB_DATABASE_VERSION`) from the notebook namespace passed to `render`.
template = jinja2.Template("""
<!doctype html>

<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Alignment of MusicBrainz and Wikidata Instruments</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  </head>

  <body style="margin: 20px;">
    <h1>Alignment of MusicBrainz and Wikidata Instruments</h1>

    <p>Latest MB database update: {{ MB_DATABASE_VERSION }}</p>
    <p>Latest update: {{ date.today().isoformat() }}</p>
    
    <ol>
      <li><a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a></li>
      <li><a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a></li>
      <li><a href="#alignment">Missing alignment suggestions</a></li>
    </ol>
    
    <h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
    {{ df_to_html(links_to_add_to_mb) }}

    <h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
    {{ df_to_html(links_to_add_to_wd) }}

    <h2 id="alignment">Missing alignment suggestions</h2>
    
    <h3>Alignment on exact names</h3>
    {{ df_to_html(no_links_merge) }}
    
    <h3>Alignment on fuzzy matching</h3>
    {{ df_to_html(match) }}    
  
  </body>
</html>
""")

report_html = (
    template.render(**globals())
    # Turn escaped angle brackets back into real markup.
    # NOTE(review): presumably the table cells contain HTML links that the
    # table renderer escaped — confirm against df_to_html.
    .replace('&lt;', '<').replace('&gt;', '>')
    # Swap the default table class for Bootstrap table styling.
    .replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
    # Style the opening tag only: replacing the bare word 'thead' also rewrote
    # the closing tag into '</thead class="thead-light">'.
    .replace('<thead>', '<thead class="thead-light">')
)

# The page declares charset utf-8, so write it as UTF-8 explicitly rather than
# in the platform default encoding (instrument names may be non-ASCII).
with open('../docs/wd-instruments-report.html', 'w', encoding='utf-8') as f:
    f.write(report_html)