From 7259a132334ae2e1b217b7b1b4fa330cf6df9b49 Mon Sep 17 00:00:00 2001
From: Clara Andrew-Wani
Date: Wed, 29 Sep 2021 18:40:26 -0400
Subject: [PATCH] Add algorithm_v3, which uses pandas UDFs instead of Spark UDFs

The output format changes slightly: from an array of objects to an array of
JSON strings. This change will need to be accounted for in the ETL pipeline.
---
 algorithm_v3.ipynb | 649 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 649 insertions(+)
 create mode 100644 algorithm_v3.ipynb

diff --git a/algorithm_v3.ipynb b/algorithm_v3.ipynb
new file mode 100644
index 0000000..f591b53
--- /dev/null
+++ b/algorithm_v3.ipynb
@@ -0,0 +1,649 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import math\n",
+    "import numpy as np\n",
+    "import random\n",
+    "import requests\n",
+    "#from bs4 import BeautifulSoup\n",
+    "import json\n",
+    "import os\n",
+    "# from wmfdata.spark import get_session\n",
+    "import getpass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!which python"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qids_and_properties={}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Pass in directory to place output files\n",
+    "output_dir = 'Output'\n",
+    "\n",
+    "if not os.path.exists(output_dir):\n",
+    "    os.makedirs(output_dir)\n",
+    "\n",
+    "# Pass in the full snapshot date\n",
+    "snapshot = '2021-07-26'\n",
+    "\n",
+    "# Allow the passing of a single language as a parameter\n",
+    "language = 'kowiki'\n",
+    "\n",
+    "# The Spark session type determines the resource pool\n",
+    "# to initialise on YARN\n",
+    "spark_session_type = 'regular'\n",
+    "\n",
+    "# Name of placeholder images parquet file\n",
+    "image_placeholders_file = 'image_placeholders'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# \"\"\"\n",
+    "# Will be used for findspark.init().\n",
+    "# \"\"\"\n",
+    "# SPARK_HOME = os.environ.get(\"SPARK_HOME\", \"/usr/lib/spark2\")\n",
+    "\n",
+    "# import findspark\n",
+    "# findspark.init(SPARK_HOME)\n",
+    "\n",
+    "# import pyspark\n",
+    "# import pyspark.sql\n",
+    "# from pyspark.sql import SparkSession\n",
+    "# spark = SparkSession.builder.master(\"yarn\").appName(\"ImageRec-DEV Training\").config(\"spark.submit.deployMode\",\"cluster\").getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We use wmfdata boilerplate to init a spark session.\n",
+    "# Under the hood the library uses findspark to initialise\n",
+    "# Spark's environment. pyspark imports will be available\n",
+    "# after initialisation.\n",
+    "from wmfdata.spark import get_session\n",
+    "\n",
+    "# Use the session type passed in via the parameters cell above\n",
+    "spark = get_session(type=spark_session_type, app_name=\"ImageRec-DEV Training\", extra_settings={\"spark.executor.memory\": \"16g\", \"spark.sql.execution.arrow.enabled\": \"true\"}, ship_python_env=True)\n",
+    "\n",
+    "import pyspark\n",
+    "import pyspark.sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# languages=['enwiki','arwiki','kowiki','cswiki','viwiki','frwiki','fawiki','ptwiki','ruwiki','trwiki','plwiki','hewiki','svwiki','ukwiki','huwiki','hywiki','srwiki','euwiki','arzwiki','cebwiki','dewiki','bnwiki'] #language editions to consider\n",
+    "#val=100 #threshold above which we consider images as non-icons\n",
+    "\n",
+    "languages=[language]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reg = r'^([\\w]+-[\\w]+)'\n",
+    "short_snapshot = re.match(reg, snapshot).group()\n",
+    "short_snapshot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reg = r'.+?(?=wiki)'\n",
+    "label_lang = re.match(reg, language).group()\n",
+    "label_lang"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(languages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_placeholders = spark.read.parquet(image_placeholders_file)\n",
+    "image_placeholders.createOrReplaceTempView(\"image_placeholders\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_threshold(wiki_size):\n",
+    "    # change th to optimize precision vs recall; recommended val for accuracy = 5\n",
+    "    sze, th, lim = 50000, 15, 4\n",
+    "    if (wiki_size >= sze):\n",
+    "        # if wiki_size > base size, scale threshold by (log of ws/bs) + 1\n",
+    "        return (math.log(wiki_size/sze, 10)+1)*th\n",
+    "    # else scale th down by the ratio bs/ws, with a minimum possible value of th/lim\n",
+    "    return max((wiki_size/sze) * th, th/lim)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "val={}\n",
+    "total={}\n",
+    "for wiki in languages:\n",
+    "    querytot=\"\"\"SELECT COUNT(*) as c\n",
+    "    FROM wmf_raw.mediawiki_page\n",
+    "    WHERE page_namespace=0\n",
+    "    AND page_is_redirect=0\n",
+    "    AND snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
+    "    AND wiki_db='\"\"\"+wiki+\"\"\"'\"\"\"\n",
+    "    wikisize = spark.sql(querytot).toPandas()\n",
+    "    val[wiki]=get_threshold(int(wikisize['c']))\n",
+    "    total[wiki]=int(wikisize['c'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "val"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wikisize"
+   ]
+  },
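+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick sanity check of `get_threshold` at a few illustrative wiki sizes (the sizes below are hypothetical examples, not real article counts):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative only: how the icon threshold scales with wiki size\n",
+    "for ws in [1000, 50000, 500000, 5000000]:\n",
+    "    print(ws, round(get_threshold(ws), 2))"
+   ]
+  },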
pp.wiki_db ='\"\"\"+wiki+\"\"\"'\n", + " AND pp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", + " AND pp_propname in ('page_image','page_image_free')),\n", + " all_image_pageids as(\n", + " SELECT pageid \n", + " FROM image_pageids \n", + " UNION\n", + " SELECT pageid\n", + " FROM pageimage_pageids\n", + " ),\n", + " noimage_pages as \n", + " (\n", + " SELECT wipl.item_id,p.page_id,p.page_title,page_len\n", + " FROM wmf_raw.mediawiki_page p \n", + " JOIN wmf.wikidata_item_page_link wipl\n", + " ON p.page_id=wipl.page_id\n", + " LEFT ANTI JOIN all_image_pageids\n", + " on all_image_pageids.pageid=wipl.page_id\n", + " WHERE p.page_namespace=0 \n", + " AND page_is_redirect=0 AND p.wiki_db='\"\"\"+wiki+\"\"\"' \n", + " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", + " AND wipl.snapshot='\"\"\"+snapshot+\"\"\"'\n", + " AND wipl.page_namespace=0\n", + " AND wipl.wiki_db='\"\"\"+wiki+\"\"\"'\n", + " ORDER BY page_len desc\n", + " ),\n", + " qid_props AS \n", + " (\n", + " SELECT we.id,label_val, \n", + " MAX(CASE WHEN claim.mainSnak.property = 'P18' THEN claim.mainSnak.datavalue.value ELSE NULL END) AS hasimage,\n", + " MAX(CASE WHEN claim.mainSnak.property = 'P373' THEN REPLACE(REPLACE(claim.mainSnak.datavalue.value,'\\\"',''),' ','_') ELSE NULL END) AS commonscategory,\n", + " MAX(CASE WHEN claim.mainSnak.property = 'P31' THEN claim.mainSnak.datavalue.value ELSE NULL END) AS instanceof\n", + " FROM wmf.wikidata_entity we\n", + " JOIN noimage_pages\n", + " ON we.id=noimage_pages.item_id\n", + " LATERAL VIEW explode(labels) t AS label_lang,label_val\n", + " LATERAL VIEW OUTER explode(claims) c AS claim\n", + " WHERE typ='item'\n", + " AND t.label_lang='\"\"\"+label_lang+\"\"\"'\n", + " AND snapshot='\"\"\"+snapshot+\"\"\"'\n", + " AND claim.mainSnak.property in ('P18','P31','P373')\n", + " GROUP BY id,label_val\n", + " ),\n", + " category_image_list AS\n", + " (\n", + " SELECT cl_to,concat_ws(';',collect_list(mp.page_title)) as category_imagelist\n", + " from qid_props\n", + " left join wmf_raw.mediawiki_categorylinks mc\n", + " on qid_props.commonscategory=mc.cl_to\n", + " join wmf_raw.mediawiki_page mp\n", + " on mp.page_id=mc.cl_from\n", + " LEFT ANTI JOIN image_placeholders\n", + " on image_placeholders.page_title = mp.page_title\n", + " WHERE mp.wiki_db ='commonswiki'\n", + " AND mp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", + " AND mp.page_namespace=6\n", + " AND mp.page_is_redirect=0\n", + " AND mc.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", + " AND mc.wiki_db ='commonswiki'\n", + " AND mc.cl_type='file'\n", + " group by mc.cl_to\n", + " ),\n", + " qid_props_with_image_list AS\n", + " (\n", + " SELECT id, label_val, hasimage, commonscategory, instanceof,category_imagelist\n", + " from qid_props\n", + " left join category_image_list\n", + " on qid_props.commonscategory=category_image_list.cl_to\n", + " ),\n", + " lan_page_images AS\n", + " (\n", + " SELECT nip.item_id,nip.page_id,nip.page_title,nip.page_len,collect_list(concat(pp.wiki_db,': ',pp.pp_value)) as lan_images\n", + " FROM noimage_pages nip\n", + " LEFT JOIN wmf.wikidata_item_page_link wipl\n", + " LEFT JOIN wmf_raw.mediawiki_page_props pp\n", + " LEFT JOIN wmf_raw.mediawiki_page mp\n", + " ON nip.item_id=wipl.item_id\n", + " AND wipl.page_id=pp.pp_page\n", + " AND wipl.wiki_db=pp.wiki_db\n", + " AND mp.page_title=pp.pp_value\n", + " LEFT ANTI JOIN image_placeholders\n", + " ON image_placeholders.page_title = pp.pp_value\n", + " WHERE wipl.wiki_db !='\"\"\"+wiki+\"\"\"'\n", + " AND wipl.snapshot='\"\"\"+snapshot+\"\"\"'\n", + 
" AND wipl.page_namespace=0\n", + " AND pp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", + " AND pp_propname in ('page_image','page_image_free')\n", + " AND mp.wiki_db ='commonswiki'\n", + " AND mp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", + " AND mp.page_namespace=6\n", + " AND mp.page_is_redirect=0\n", + " GROUP BY nip.item_id,nip.page_id,nip.page_title,nip.page_len\n", + " ),\n", + " joined_lan_page_images AS\n", + " (\n", + " SELECT nip.item_id,nip.page_id,nip.page_title,nip.page_len, lpi.lan_images\n", + " from noimage_pages nip\n", + " LEFT JOIN lan_page_images lpi\n", + " on nip.item_id=lpi.item_id\n", + " )\n", + " SELECT * from joined_lan_page_images\n", + " LEFT JOIN qid_props_with_image_list\n", + " on qid_props_with_image_list.id=joined_lan_page_images.item_id\n", + " \n", + " \"\"\"\n", + " qid_props = spark.sql(queryd).cache()\n", + " qids_and_properties[wiki]=qid_props" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below I am just creating different tables according to whether an image is retrieved from a specific source (Wikidata image, Commons Category, or interlingual links)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hasimage={}\n", + "# commonscategory={}\n", + "# lanimages={}\n", + "# allimages={}\n", + "# for wiki in languages:\n", + "# print(wiki)\n", + "# hasimage[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['hasimage'].astype(str).ne('None')]\n", + "# commonscategory[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['category_imagelist'].astype(str).ne('None')]\n", + "# lanimages[wiki]=qids_and_properties[wiki][qids_and_properties[wiki]['lan_images'].astype(str).ne('None')]\n", + "# print(\"number of unillustrated articles: \"+str(len(qids_and_properties[wiki])))\n", + "# print(\"number of articles items with Wikidata image: \"+str(len(hasimage[wiki])))\n", + "# print(\"number of articles items with Wikidata Commons Category: \"+str(len(commonscategory[wiki])))\n", + "# print(\"number of articles items with Language Links: \"+str(len(lanimages[wiki])))\n", + "# ####\n", + "# allimages[wiki]=qids_and_properties[wiki]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below the two functions to select images depending on the source:\n", + "* `select_image_language` takes as input the list of images from articles in multiple languages and selects the one which is used more often across languages (after some major filtering)\n", + "* `select_image_category` selects at random one of the images in the Commons category linked to the Wikidata item." 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Below is the priority assignment process:\n",
+    "* If the article has a Wikidata image (not a flag, as this is likely a duplicate), give it priority 1\n",
+    "* Choose up to 3 images among the ones from related Wikipedia articles in other languages, using the `select_image_language` function, and give priority 2.x, where `x` is a ranking given by the number of languages using that image\n",
+    "* If the article has an associated Commons category, call the `select_image_category` function, randomly selecting up to 3 images from that category\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import functions as F\n",
+    "from pyspark.sql import Column\n",
+    "from pyspark.sql.functions import udf, pandas_udf, PandasUDFType\n",
+    "from pyspark.sql.types import ArrayType, MapType, StringType\n",
+    "import itertools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rewrite helper functions as pandas UDFs\n",
+    "\n",
+    "# Returns a JSON-encoded dict mapping image name -> list of wikis using it,\n",
+    "# so the declared return type is a string (parsed downstream), not an array\n",
+    "@pandas_udf(StringType(), PandasUDFType.SCALAR)\n",
+    "def select_image_language_udf(imagelist: pd.Series) -> pd.Series:\n",
+    "    results=[]\n",
+    "\n",
+    "    for row in imagelist.values:\n",
+    "        if row is not None:\n",
+    "            languages={} #contains which languages cover a given image\n",
+    "            counts={} #contains counts of image occurrences across languages\n",
+    "            for image in row:\n",
+    "                data=image.strip().split(' ') #this contains the language and image name data\n",
+    "                if len(data)==2: #if we actually have 2 fields\n",
+    "                    iname=data[1].strip()\n",
+    "                    lan=data[0].strip()[:-1]\n",
+    "                    if iname not in counts: #if this image does not exist in our counts yet, initialize counts\n",
+    "                        substring_list=['.svg','flag','noantimage','no_free_image','image_manquante',\n",
+    "                                        'replace_this_image','disambig','regions','map','default',\n",
+    "                                        'defaut','falta_imagem_','imageNA','noimage','noenzyimage']\n",
+    "\n",
+    "                        if any(map(iname.lower().__contains__, substring_list)): #if the image name is not valid\n",
+    "                            continue\n",
+    "                        # urll = 'https://commons.wikimedia.org/wiki/File:'+iname.replace(' ','_')+'?uselang='+language\n",
+    "                        #page = requests.get(urll)\n",
+    "                        #if page.status_code == 404:\n",
+    "                        #    print (urll)\n",
+    "                        #    continue\n",
+    "                        counts[iname]=1\n",
+    "                        languages[iname]=[lan] #record the first covering language as well\n",
+    "                    else:\n",
+    "                        counts[iname]+=1\n",
+    "                        languages[iname].append(lan)\n",
+    "            results.append(json.dumps(languages))\n",
+    "        else:\n",
+    "            results.append(None)\n",
+    "    return pd.Series(results)\n",
+    "\n",
+    "@pandas_udf(returnType=ArrayType(StringType()))\n",
+    "def select_commons_images_udf(commons_column: pd.Series) -> pd.Series:\n",
+    "    results=[]\n",
+    "    for row in commons_column.values:\n",
+    "        if row is not None:\n",
+    "            commons_images=[]\n",
+    "            # keep only entries that look like filenames (contain a '.')\n",
+    "            data=[d for d in row.strip().split(';') if d.find('.')!=-1]\n",
+    "            # sample without replacement so the up-to-3 picks are distinct\n",
+    "            for image in random.sample(data, min(len(data), 3)):\n",
+    "                rating=3\n",
+    "                note='image was found in the Commons category linked in the Wikidata item'\n",
+    "                commons_images.append(json.dumps({'image':image,'rating':rating,'note':note}))\n",
+    "            results.append(commons_images)\n",
+    "        else:\n",
+    "            results.append(None)\n",
+    "    return pd.Series(results)\n",
+    "\n",
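+    "# Rating scheme recap (see the markdown above): 1 = image from the\n",
+    "# Wikidata item (P18), 2.0-2.2 = images from articles in other languages\n",
+    "# ranked by language coverage, 3 = random picks from the linked Commons\n",
+    "# category.\n",
+    "\n",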
+    "@pandas_udf(returnType=ArrayType(StringType()))\n",
+    "def select_wikipedia_images_udf(wikipedia_column: pd.Series) -> pd.Series:\n",
+    "    results=[]\n",
+    "    for row in wikipedia_column.values:\n",
+    "        if row is not None:\n",
+    "            wikipedia=json.loads(row)\n",
+    "            wikipedia_images=[]\n",
+    "            # rank images by how many languages use them\n",
+    "            index=np.argsort([len(l) for l in list(wikipedia.values())])\n",
+    "\n",
+    "            for i in range(min(len(wikipedia),3)):\n",
+    "                image=list(wikipedia.keys())[index[-(i+1)]]\n",
+    "                rating=2+(float(i)/10)\n",
+    "                note='image was found in the following Wikis: '+', '.join(wikipedia[image])\n",
+    "                wikipedia_images.append(json.dumps({'image':image,'rating':rating,'note':note}))\n",
+    "            results.append(wikipedia_images)\n",
+    "        else:\n",
+    "            results.append(None)\n",
+    "    return pd.Series(results)\n",
+    "\n",
+    "@pandas_udf(returnType=ArrayType(StringType()))\n",
+    "def select_wikidata_images_udf(wikidata_column: pd.Series) -> pd.Series:\n",
+    "    results=[]\n",
+    "\n",
+    "    for row in wikidata_column.values:\n",
+    "        # skip flags: they are largely duplicated across items\n",
+    "        if row is not None and row.lower().find('flag') ==-1:\n",
+    "            image=row[1:-1] #strip the surrounding quotes from the raw Wikidata value\n",
+    "            rating=1\n",
+    "            note='image was in the Wikidata item'\n",
+    "            results.append([json.dumps({'image':image,'rating':rating,'note':note})])\n",
+    "        else:\n",
+    "            results.append(None)\n",
+    "    return pd.Series(results)\n",
+    "\n",
+    "@pandas_udf(returnType=ArrayType(StringType()))\n",
+    "def select_top_candidates_udf(wikidata_col: pd.Series, wikipedia_col: pd.Series, commons_col: pd.Series) -> pd.Series:\n",
+    "    results=[]\n",
+    "\n",
+    "    for wikidata, wikipedia, commons in zip(wikidata_col.values, wikipedia_col.values, commons_col.values):\n",
+    "        top_candidates=[]\n",
+    "\n",
+    "        # take up to 3 candidates, preferring Wikidata, then Wikipedia, then Commons\n",
+    "        for source in (wikidata, wikipedia, commons):\n",
+    "            if source is None:\n",
+    "                continue\n",
+    "            for image in source:\n",
+    "                if len(top_candidates) < 3:\n",
+    "                    top_candidates.append(image)\n",
+    "                else:\n",
+    "                    break\n",
+    "        results.append(top_candidates)\n",
+    "\n",
+    "    return pd.Series(results)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for wiki in languages:\n",
+    "    qids_and_properties[wiki].withColumn(\n",
+    "        \"wikipedia_imagelist\", select_image_language_udf(F.col(\"lan_images\"))\n",
+    "    ).withColumn(\n",
+    "        \"wikipedia_images\", select_wikipedia_images_udf(F.col(\"wikipedia_imagelist\"))\n",
+    "    ).withColumn(\n",
+    "        \"commons_images\", select_commons_images_udf(F.col(\"category_imagelist\"))\n",
+    "    ).withColumn(\n",
+    "        \"wikidata_images\", select_wikidata_images_udf(F.col(\"hasimage\"))\n",
+    "    ).withColumn(\n",
+    "        \"top_candidates\", select_top_candidates_udf(F.col(\"wikidata_images\"), F.col(\"wikipedia_images\"), F.col(\"commons_images\"))\n",
+    "    ).select(\n",
+    "        \"item_id\",\n",
+    "        \"page_id\",\n",
+    "        \"page_title\",\n",
+    "        \"instanceof\",\n",
+    "#        \"wikipedia_imagelist\",\n",
+    "#        \"commons_images\",\n",
+    "#        \"wikipedia_images\",\n",
+    "#        \"wikidata_images\",\n",
+    "        \"top_candidates\"\n",
+    "    ).write.save(f'{wiki}_{snapshot}_wd_image_candidates_test', format=\"parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.stop()"
+   ]
+  }
+ ],
+ "metadata": {
"celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + }, + "toc-showtags": true + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file