# Mix'n'match Mismatch Generation

This notebook is used to generate mismatches for [Mismatch Finder](https://www.wikidata.org/wiki/Wikidata:Mismatch_Finder) via a request to [Mix'n'match](https://meta.wikimedia.org/wiki/Mix%27n%27match) data stores. Data will be formatted for upload given the [directions for creating a mismatch file](https://github.com/wmde/wikidata-mismatch-finder/blob/main/docs/UserGuide.md#creating-a-mismatches-import-file).

In [None]:
#!pip install jupyter-black
#!pip install tensorflow

In [None]:
# %load_ext jupyter_black

In [None]:
import ast
import json
import sys
import urllib
import urllib.error
import urllib.request

import numpy as np
import pandas as pd

PATH_TO_UTILS = "."  # change based on your directory structure
# Register the directory only once, so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if PATH_TO_UTILS not in sys.path:
    sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

## Get Data

In [None]:
# Mix'n'match API call that lists all issues in "time_mismatch" mode.
# The adjacent literals are concatenated into a single URL string.
mnm_mismatch_request_url = (
    "https://mix-n-match.toolforge.org/api.php"
    "?query=all_issues"
    "&mode=time_mismatch"
)

In [None]:
# Download the issue list and decode the JSON body into Python objects.
with urllib.request.urlopen(mnm_mismatch_request_url) as response:
    mnm_mismatch_data = json.loads(response.read())

In [None]:
# Number of records under the "data" key, printed with thousands separators.
print(format(len(mnm_mismatch_data["data"]), ","))

In [None]:
# Peek at the first two raw records returned by the API.
mnm_mismatch_data["data"][0:2]

In [None]:
# Derive one expanded record per raw API entry.
# Every record is rebuilt from copies, so the raw response held in
# `mnm_mismatch_data` is left untouched and this cell can be re-run safely
# (popping "prop"/"q" from the original dicts would raise KeyError on a
# second run).
mnm_mismatch_data_expanded = []
for raw_entry in mnm_mismatch_data["data"]:
    # Copy the nested dict and rename its "prop" and "q" keys to "pid" and "qid".
    time_mismatch = dict(raw_entry["time_mismatch"])
    time_mismatch["pid"] = time_mismatch.pop("prop")
    time_mismatch["qid"] = time_mismatch.pop("q")

    # Shallow copy of the entry without its "issue_id" key.
    expanded_entry = {
        key: value for key, value in raw_entry.items() if key != "issue_id"
    }
    expanded_entry["source"] = (
        f"https://mix-n-match.toolforge.org/#/entry/{raw_entry['entry_id']}"
    )
    expanded_entry["time_mismatch"] = time_mismatch
    expanded_entry["item_id"] = time_mismatch["qid"]

    mnm_mismatch_data_expanded.append(expanded_entry)

In [None]:
# Peek at the first two expanded records.
mnm_mismatch_data_expanded[0:2]

In [None]:
import pandas as pd
from numpy import NAN
from tqdm import tqdm

In [None]:
# Build one row per Mix'n'match time mismatch, using the column names found in
# the dict appended at the bottom of the loop.
# Each iteration performs up to two HTTP requests: one to Wikidata for the
# item's statements and one to Mix'n'match for the entry's external URL.
# Entries that cannot be resolved are reported and skipped, so that a single
# bad record does not abort the whole (long-running) loop.
acc = []
for entry in tqdm(mnm_mismatch_data_expanded):
    data = entry["time_mismatch"]
    item_id = entry["item_id"]

    # The v1 route is requested directly: the old v0 route answers with a
    # 308 redirect, which urllib only follows from Python 3.11 onwards, see
    # https://stackoverflow.com/questions/67723860/python-urllib-request-urlopen-http-error-308-permanent-redirect
    req = (
        "https://www.wikidata.org/w/rest.php/wikibase/v1/entities/items/"
        f"{item_id}?_fields=statements"
    )
    try:
        with urllib.request.urlopen(req) as url:
            wd_props = json.load(url)["statements"]
    except urllib.error.HTTPError as e:
        print("Skipped", req)
        print(e)
        continue

    if "P1220" not in wd_props:  # https://www.wikidata.org/wiki/Property:P1220
        # No mix'n'match id, meaning it is automatic, not manual match & should be skipped
        # NOTE(review): confirm that P1220 is the intended property for this check.
        continue

    # The mismatching statement may no longer be present on the item; without
    # it there is no statement GUID to report, so the entry is skipped.
    statements = wd_props.get(data["pid"])
    if not statements:
        print("Skipped", req)
        print(f"No {data['pid']} statement found on {item_id}")
        continue

    # JSON object keys are always strings, so the id is normalised before it
    # is used to index the "entries" mapping of the response.
    entry_id = str(entry["entry_id"])
    mnm_req = (
        f"https://mix-n-match.toolforge.org/api.php?query=get_entry&entry={entry_id}"
    )
    try:
        with urllib.request.urlopen(mnm_req) as url:
            ext_url = json.load(url)["data"]["entries"][entry_id]["ext_url"]
    except urllib.error.HTTPError as e:
        print("Skipped", mnm_req)
        print(e)
        continue

    acc.append(
        {
            "item_id": item_id,
            # GUID of the first statement listed for the property.
            "statement_guid": statements[0]["id"],
            "property_id": data["pid"],
            "wikidata_value": data["wd_time"],
            # np.nan is the canonical spelling of the missing-value marker.
            "meta_wikidata_value": np.nan,
            "external_value": data["mnm_time"],
            "external_url": ext_url,
            "type": "statement",
        }
    )
pd.DataFrame(acc)