Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Load Languages from Wikidata #111

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/general_helpers.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ getSitelinkData('https://de.wikipedia.org/wiki/Kernfusion')
// => {
// lang: 'de',
// project: 'wikipedia',
// key: 'dewiki',wwwwwwwwwwwwwwwww
// key: 'dewiki',
// title: 'Kernfusion',
// url: 'https://de.wikipedia.org/wiki/Kernfusion'
// }
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"prepublishOnly": "git checkout main",
"prepack": "npm run build && npm run lint && npm test",
"postpublish": "./scripts/postpublish",
"update-sitelinks-languages": "./scripts/sitelinks_languages/update_sitelinks_languages",
"update-wikimedia-constants": "./scripts/update_wikimedia_constants.ts",
"update-toc": "./scripts/update_toc",
"watch": "tsc --watch"
},
Expand Down
17 changes: 0 additions & 17 deletions scripts/sitelinks_languages/generate_sitelinks_languages.ts

This file was deleted.

5 changes: 0 additions & 5 deletions scripts/sitelinks_languages/get_sitelinks_sites

This file was deleted.

20 changes: 0 additions & 20 deletions scripts/sitelinks_languages/update_sitelinks_languages

This file was deleted.

61 changes: 61 additions & 0 deletions scripts/update_wikimedia_constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env ts-node
import { writeFile } from 'node:fs/promises'
import { uniq } from '../src/utils/utils.js'

/**
 * One entry of the `parameters` array found in the paraminfo API response,
 * reduced to the two fields this script reads.
 */
interface Parameter {
// Parameter name; this script looks up the entries named 'sites' and 'languages'
name: string
// Used by this script as the list of string values associated with the parameter
type: string[]
}

/**
 * Serializes a list of strings as the source text of a multi-line,
 * single-quoted array literal with a dangling comma after the last element.
 * The input first goes through `uniq` (presumably removing duplicates).
 */
function stringifyArray (input: string[]) {
  const entries = uniq(input)
  const json = JSON.stringify(entries, null, 2)
  // Prevent linting errors: single quotes rather than double quotes
  const singleQuoted = json.replace(/"/g, '\'')
  // Every element but the last is followed by a comma, and line breaks inside
  // elements are escaped by JSON.stringify, so the first quote directly
  // followed by a line break closes the last element: that is where the
  // dangling comma goes. A single (non-global) replacement is thus enough.
  const withDanglingComma = singleQuoted.replace(/'\n/, '\',\n')
  return withDanglingComma
}

/**
 * Serializes a flat string-to-string record as the source text of an object
 * literal, with one `key: 'value',` entry per line.
 *
 * Keys that are plain identifiers are emitted bare, any other key (containing
 * a hyphen, starting with a digit, etc.) is emitted as a quoted string so that
 * the generated file stays syntactically valid. Backslashes, single quotes
 * and line breaks are escaped in quoted keys and in values.
 *
 * @param input the record to serialize
 * @returns the object literal source text, without a trailing line break
 */
function stringifySimpleRecord (input: Record<string, string>) {
  const quote = (text: string) => {
    const escaped = text
      .replace(/[\\']/g, '\\$&')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r')
    return '\'' + escaped + '\''
  }
  const isPlainIdentifier = (key: string) => /^[A-Za-z_$][\w$]*$/.test(key)
  const formatKey = (key: string) => isPlainIdentifier(key) ? key : quote(key)

  const lines = Object.entries(input).map(([ key, value ]) => ' ' + formatKey(key) + ': ' + quote(value) + ',\n')
  return '{\n' + lines.join('') + '}'
}

// NOTE(review): the returned promise is not handled here; on current Node.js
// versions a rejection then terminates the process with a failure exit code
// (confirm against the Node.js version used to run this script).
doit()

/**
 * Regenerates `./src/helpers/wikimedia_constants.ts` (path relative to the
 * current working directory) from the `sites` and `languages` parameters that
 * the Wikidata `wbgetentities` API module reports through `action=paraminfo`.
 *
 * The generated module exports:
 * - `specialSites`: sites ending in `wiki` whose prefix is not a known language, mapped to that prefix
 * - `sites`: all sites
 * - `languages`: all languages
 * - `languagesWithWiki`: languages for which a `<lang>wiki` site exists
 *
 * @throws if the request fails, if the response does not have the expected
 * shape, or if the file can not be written
 */
async function doit () {
  const response = await fetch('https://www.wikidata.org/w/api.php?action=paraminfo&modules=wbgetentities&format=json')
  if (!response.ok) {
    throw new Error(`paraminfo request failed: ${response.status} ${response.statusText}`)
  }
  const data = await response.json()

  // Optional chaining so that an unexpected response shape ends up in the
  // explicit 'paraminfo format changed' error rather than in a TypeError
  const parameters = data?.paraminfo?.modules?.[0]?.parameters as Parameter[] | undefined
  if (!Array.isArray(parameters)) throw new Error('paraminfo format changed')

  const sites = parameters.find(o => o.name === 'sites')?.type
  const languages = parameters.find(o => o.name === 'languages')?.type
  if (!sites || !languages) throw new Error('paraminfo format changed')

  // Sets for constant-time membership checks in the loops below
  const knownSites = new Set(sites)
  const knownLanguages = new Set(languages)

  const specialSites: Record<string, string> = {}
  for (const site of sites) {
    const project = site.match(/^(.+)wiki$/)?.[1]
    if (!project) continue
    // Site names use underscores where language codes use hyphens
    if (!knownLanguages.has(project.replace(/_/g, '-'))) {
      specialSites[site] = project
    }
  }

  const languagesWithWiki = languages.filter(o => knownSites.has(o.replace(/-/g, '_') + 'wiki'))
  console.log('languages', languages.length, 'with a wiki', languagesWithWiki.length)

  const output = [
    "// Generated by 'npm run update-wikimedia-constants'",
    [
      'export type Site = typeof sites[number]',
      'export type Language = typeof languages[number]',
      'export type LanguageWithWiki = typeof languagesWithWiki[number]',
    ].join('\n'),
    'export const specialSites = ' + stringifySimpleRecord(specialSites) + ' as const',
    'export const sites = ' + stringifyArray(sites) + ' as const',
    'export const languages = ' + stringifyArray(languages) + ' as const',
    'export const languagesWithWiki = ' + stringifyArray(languagesWithWiki) + ' as const',
  ].join('\n\n') + '\n'

  // Awaited so that a write failure rejects doit() instead of being left as
  // a detached promise
  await writeFile('./src/helpers/wikimedia_constants.ts', output, 'utf-8')
}
26 changes: 13 additions & 13 deletions src/helpers/rank.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import type { Claims, PropertyClaims } from '../types/claim.js'
import { typedEntries } from '../utils/utils.js'
import type { Claim, Claims, PropertyClaims, Rank } from '../types/claim.js'

export function truthyPropertyClaims (propertyClaims: PropertyClaims): PropertyClaims {
const aggregate = propertyClaims.reduce(aggregatePerRank, {})
const aggregate: Partial<Record<Rank, Claim[]>> = {}
for (const claim of propertyClaims) {
const { rank } = claim
aggregate[rank] ??= []
aggregate[rank].push(claim)
}

// on truthyness: https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Truthy_statements
return aggregate.preferred || aggregate.normal || []
}
Expand All @@ -10,17 +17,10 @@ export function nonDeprecatedPropertyClaims (propertyClaims: PropertyClaims): Pr
return propertyClaims.filter(claim => claim.rank !== 'deprecated')
}

const aggregatePerRank = (aggregate, claim) => {
const { rank } = claim
aggregate[rank] || (aggregate[rank] = [])
aggregate[rank].push(claim)
return aggregate
}

export function truthyClaims (claims: Claims): Claims {
const truthClaimsOnly = {}
Object.keys(claims).forEach(property => {
truthClaimsOnly[property] = truthyPropertyClaims(claims[property])
})
const truthClaimsOnly: Claims = {}
for (const [ property, value ] of typedEntries(claims)) {
truthClaimsOnly[property] = truthyPropertyClaims(value)
}
return truthClaimsOnly
}
14 changes: 7 additions & 7 deletions src/helpers/simplify_text_attributes.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
import type { WmLanguageCode } from '../types/options.js'
import { typedEntries } from '../utils/utils.js'
import type { Aliases, Descriptions, Glosses, Labels, Lemmas, Representations, SimplifiedAliases, SimplifiedDescriptions, SimplifiedGlosses, SimplifiedLabels, SimplifiedLemmas, SimplifiedRepresentations } from '../types/terms.js'

type InValue<T> = { readonly value: T }

function singleValue<V> (data: Partial<Readonly<Record<WmLanguageCode, InValue<V>>>>) {
const simplified: Partial<Record<WmLanguageCode, V>> = {}
for (const [ lang, obj ] of Object.entries(data)) {
function singleValue<K extends string, V> (data: Partial<Readonly<Record<K, InValue<V>>>>) {
const simplified: Partial<Record<K, V>> = {}
for (const [ lang, obj ] of typedEntries(data)) {
simplified[lang] = obj != null ? obj.value : null
}
return simplified
}

function multiValue<V> (data: Partial<Readonly<Record<WmLanguageCode, ReadonlyArray<InValue<V>>>>>) {
const simplified: Partial<Record<WmLanguageCode, readonly V[]>> = {}
for (const [ lang, obj ] of Object.entries(data)) {
function multiValue<K extends string, V> (data: Partial<Readonly<Record<K, ReadonlyArray<InValue<V>>>>>) {
const simplified: Partial<Record<K, readonly V[]>> = {}
for (const [ lang, obj ] of typedEntries(data)) {
simplified[lang] = obj != null ? obj.map(o => o.value) : []
}
return simplified
Expand Down
103 changes: 56 additions & 47 deletions src/helpers/sitelinks.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import { fixedEncodeURIComponent, isOfType, rejectObsoleteInterface, replaceSpaceByUnderscores } from '../utils/utils.js'
import { languages } from './sitelinks_languages.js'
import { specialSites } from './special_sites.js'
import type { EntityId } from '../types/entity.js'
import type { Url, WmLanguageCode } from '../types/options.js'
import type { Site } from '../types/sitelinks.js'
import { fixedEncodeURIComponent, isAKey, isOfType, rejectObsoleteInterface, replaceSpaceByUnderscores } from '../utils/utils.js'
import { languages, sites, specialSites } from './wikimedia_constants.js'
import type { Language, Site } from './wikimedia_constants.js'
import type { Url } from '../types/options.js'

type ValueOf<T> = T[keyof T]
type SpecialSiteProjectName = ValueOf<typeof specialSites>

const wikidataBase = 'https://www.wikidata.org/wiki/'

export interface GetSitelinkUrlOptions {
site: Site
site: Site | SpecialSiteProjectName
title: string
}

Expand All @@ -18,9 +19,14 @@ export function getSitelinkUrl ({ site, title }: GetSitelinkUrlOptions): Url {
if (!site) throw new Error('missing a site')
if (!title) throw new Error('missing a title')

if (isAKey(siteUrlBuilders, site)) {
return siteUrlBuilders[site](title)
}

const shortSiteKey = site.replace(/wiki$/, '')
const specialUrlBuilder = siteUrlBuilders[shortSiteKey] || siteUrlBuilders[site]
if (specialUrlBuilder) return specialUrlBuilder(title)
if (isAKey(siteUrlBuilders, shortSiteKey)) {
return siteUrlBuilders[shortSiteKey](title)
}

const { lang, project } = getSitelinkData(site)
title = fixedEncodeURIComponent(replaceSpaceByUnderscores(title))
Expand All @@ -29,20 +35,21 @@ export function getSitelinkUrl ({ site, title }: GetSitelinkUrlOptions): Url {

/**
 * Creates a URL builder for the wiki hosted at `<subdomain>.wikimedia.org`:
 * the returned function maps a page title to that page's URL.
 * The title is inserted as-is, without any encoding.
 */
const wikimediaSite = (subdomain: string) => {
  const base = `https://${subdomain}.wikimedia.org/wiki/`
  return (title: string) => `${base}${title}`
}

const siteUrlBuilders = {
const siteUrlBuilders: Readonly<Record<SpecialSiteProjectName, (s: string) => string>> = {
commons: wikimediaSite('commons'),
mediawiki: (title: string) => `https://www.mediawiki.org/wiki/${title}`,
mediawiki: title => `https://www.mediawiki.org/wiki/${title}`,
meta: wikimediaSite('meta'),
sources: title => `https://wikisource.org/wiki/${title}`,
species: wikimediaSite('species'),
wikidata: (entityId: EntityId) => {
wikidata: entityId => {
const prefix = prefixByEntityLetter[entityId[0]]
let title = prefix ? `${prefix}:${entityId}` : entityId
// Required for forms and senses
title = title.replace('-', '#')
return `${wikidataBase}${title}`
},
wikimania: wikimediaSite('wikimania'),
} as const
}

const prefixByEntityLetter = {
E: 'EntitySchema',
Expand All @@ -53,7 +60,7 @@ const prefixByEntityLetter = {
const sitelinkUrlPattern = /^https?:\/\/([\w-]{2,10})\.(\w+)\.org\/\w+\/(.*)/

export interface SitelinkData {
lang: WmLanguageCode
lang: Language
project: Project
key: string
title?: string
Expand All @@ -67,57 +74,59 @@ export function getSitelinkData (site: Site | Url): SitelinkData {
if (!matchData) throw new Error(`invalid sitelink url: ${url}`)
let [ lang, project, title ] = matchData.slice(1)
title = decodeURIComponent(title)
let key: string
if (lang === 'commons') {
return { lang: 'en', project: 'commons', key: 'commons', title, url }
}

if (!isOfType(projectNames, project)) {
throw new Error(`project is unknown: ${project}`)
}

// Known case: wikidata, mediawiki
if (lang === 'www') {
lang = 'en'
key = project
} else if (lang === 'commons') {
lang = 'en'
project = key = 'commons'
} else {
// Support multi-parts language codes, such as be_x_old
lang = lang.replace(/-/g, '_')
key = `${lang}${project}`.replace('wikipedia', 'wiki')
return { lang: 'en', project, key: project, title, url }
}
// @ts-expect-error

if (!isOfType(languages, lang)) {
throw new Error(`sitelink language not found: ${lang}. Updating wikibase-sdk to a more recent version might fix the issue.`)
}

// Support multi-parts language codes, such as be_x_old
const sitelang = lang.replace(/-/g, '_')
const key = `${sitelang}${project}`.replace('wikipedia', 'wiki')

return { lang, project, key, title, url }
} else {
const key = site
const specialProjectName = specialSites[key]
if (specialProjectName) {
return { lang: 'en', project: specialProjectName, key }
if (isAKey(specialSites, site)) {
const project = specialSites[site]
return { lang: 'en', project, key: site }
}

let [ lang, projectSuffix, rest ] = key.split('wik')
let [ lang, projectSuffix, rest ] = site.split('wik')

// Detecting cases like 'frwikiwiki' that would return [ 'fr', 'i', 'i' ]
if (rest != null) throw new Error(`invalid sitelink key: ${key}`)
if (rest != null) throw new Error(`invalid sitelink key: ${site}`)

// Support sites such as be_x_oldwiki, which refers to be-x-old.wikipedia.org
lang = lang.replace(/_/g, '-')

if (!isOfType(languages, lang)) {
throw new Error(`sitelink lang not found: ${lang}. Updating wikibase-sdk to a more recent version might fix the issue.`)
throw new Error(`sitelink language not found: ${lang}. Updating wikibase-sdk to a more recent version might fix the issue.`)
}

// Support keys such as be_x_oldwiki, which refers to be-x-old.wikipedia.org
lang = lang.replace(/_/g, '-')
if (!isAKey(projectsBySuffix, projectSuffix)) {
throw new Error(`sitelink project not found: ${site}`)
}

const project = projectsBySuffix[projectSuffix]
if (!project) throw new Error(`sitelink project not found: ${project}`)

// @ts-expect-error
return { lang, project, key }
return { lang, project, key: site }
}
}

export const isSitelinkKey = (site: string): boolean => {
try {
// relies on getSitelinkData validation
getSitelinkData(site)
return true
} catch (err) {
return false
}
}
/**
 * Type guard narrowing a string to `Site`.
 * Delegates the check to `isOfType(sites, site)`.
 */
export const isSite = (site: string): site is Site => {
  return isOfType(sites, site)
}

/** @deprecated use isSite */
export const isSitelinkKey = isSite

const projectsBySuffix = {
i: 'wikipedia',
Expand Down