Skip to content

Commit

Permalink
fix: fix short tokens in getEmojiByShortcode (#90)
Browse files Browse the repository at this point in the history
* fix: fix short tokens in getEmojiByShortcode

* fix: add comment
  • Loading branch information
nolanlawson committed Dec 24, 2020
1 parent 7c3f9fa commit 992ac10
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 8 deletions.
38 changes: 37 additions & 1 deletion src/database/idbInterface.js
Expand Up @@ -11,6 +11,7 @@ import { mark, stop } from '../shared/marks'
import { extractTokens } from './utils/extractTokens'
import { getAllIDB, getAllKeysIDB, getIDB } from './idbUtil'
import { findCommonMembers } from './utils/findCommonMembers'
import { normalizeTokens } from './utils/normalizeTokens'

export async function isEmpty (db) {
return !(await get(db, STORE_KEYVALUE, KEY_URL))
Expand All @@ -22,6 +23,23 @@ export async function hasData (db, url, eTag) {
return (oldETag === eTag && oldUrl === url)
}

async function doFullDatabaseScanForSingleResult (db, predicate) {
// TODO: we could do batching here using getAll(). Not sure if it's worth the extra code though.
return dbPromise(db, STORE_EMOJI, MODE_READONLY, (emojiStore, cb) => {
emojiStore.openCursor().onsuccess = e => {
const cursor = e.target.result

if (!cursor) { // no more results
cb()
} else if (predicate(cursor.value)) {
cb(cursor.value)
} else {
cursor.continue()
}
}
})
}

export async function loadData (db, emojiData, url, eTag) {
mark('loadData')
try {
Expand Down Expand Up @@ -83,7 +101,12 @@ export async function getEmojiByGroup (db, group) {
}

export async function getEmojiBySearchQuery (db, query) {
const tokens = extractTokens(query)
const tokens = normalizeTokens(extractTokens(query))

if (!tokens.length) {
return []
}

return dbPromise(db, STORE_EMOJI, MODE_READONLY, (emojiStore, cb) => {
// get all results that contain all tokens (i.e. an AND query)
const intermediateResults = []
Expand Down Expand Up @@ -112,8 +135,21 @@ export async function getEmojiBySearchQuery (db, query) {
})
}

// This could have been implemented as an IDB index on shortcodes, but it seemed wasteful to do that
// when we can already query by tokens and this will give us what we're looking for 99.9% of the time
export async function getEmojiByShortcode (db, shortcode) {
const emojis = await getEmojiBySearchQuery(db, shortcode)

// In very rare cases (e.g. the shortcode "v" as in "v for victory"), we cannot search because
// there are no usable tokens (too short in this case). In that case, we have to do an inefficient
// full-database scan, which I believe is an acceptable tradeoff for not having to have an extra
// index on shortcodes.

if (!emojis.length) {
const predicate = _ => ((_.shortcodes || []).includes(shortcode.toLowerCase()))
return (await doFullDatabaseScanForSingleResult(db, predicate)) || null
}

return emojis.filter(_ => {
const lowerShortcodes = _.shortcodes.map(_ => _.toLowerCase())
return lowerShortcodes.includes(shortcode.toLowerCase())
Expand Down
13 changes: 13 additions & 0 deletions src/database/utils/normalizeTokens.js
@@ -0,0 +1,13 @@
import { MIN_SEARCH_TEXT_LENGTH } from '../../shared/constants'

// This is an extra step in addition to extractTokens(). The difference here is that we expect
// the input to have already been run through extractTokens(). This is useful for cases like
// emoticons, where we don't want to do any tokenization (because it makes no sense to split up
// ">:)" by the colon) but we do want to lowercase it to have consistent search results, so that
// the user can type ':P' or ':p' and still get the same result.
export function normalizeTokens (str) {
return str
.filter(Boolean)
.map(_ => _.toLowerCase())
.filter(_ => _.length >= MIN_SEARCH_TEXT_LENGTH)
}
9 changes: 3 additions & 6 deletions src/database/utils/transformEmojiData.js
@@ -1,21 +1,18 @@
import { MIN_SEARCH_TEXT_LENGTH } from '../../shared/constants'
import { mark, stop } from '../../shared/marks'
import { extractTokens } from './extractTokens'
import { normalizeTokens } from './normalizeTokens'

// Transform emoji data for storage in IDB
export function transformEmojiData (emojiData) {
mark('transformEmojiData')
const res = emojiData.map(({ annotation, emoticon, group, order, shortcodes, skins, tags, emoji, version }) => {
const tokens = [...new Set(
[
normalizeTokens([
...(shortcodes || []).map(extractTokens).flat(),
...tags.map(extractTokens).flat(),
...extractTokens(annotation),
emoticon
]
.filter(Boolean)
.map(_ => _.toLowerCase())
.filter(_ => _.length >= MIN_SEARCH_TEXT_LENGTH)
])
)].sort()
const res = {
annotation,
Expand Down
8 changes: 8 additions & 0 deletions test/spec/database/getEmojiBySearchQuery.test.js
Expand Up @@ -175,6 +175,7 @@ describe('getEmojiBySearchQuery', () => {
expect((await db.getEmojiBySearchQuery(' :wink: ')).map(_ => _.annotation)).toStrictEqual(['winking face'])
expect((await db.getEmojiBySearchQuery(':)')).map(_ => _.annotation)).toStrictEqual(['slightly smiling face'])
expect((await db.getEmojiBySearchQuery(' :) ')).map(_ => _.annotation)).toStrictEqual(['slightly smiling face'])
expect((await db.getEmojiBySearchQuery(';)')).map(_ => _.annotation)).toStrictEqual(['winking face'])

await db.delete()
})
Expand All @@ -187,4 +188,11 @@ describe('getEmojiBySearchQuery', () => {
}
await db.delete()
})

test('search queries that result in no tokens', async () => {
const db = new Database({ dataSource: ALL_EMOJI })

expect((await db.getEmojiBySearchQuery(';;;;'))).toStrictEqual([])
expect((await db.getEmojiBySearchQuery('B&'))).toStrictEqual([])
})
})
22 changes: 21 additions & 1 deletion test/spec/database/getEmojiByShortcode.test.js
@@ -1,4 +1,4 @@
import { ALL_EMOJI, basicAfterEach, basicBeforeEach } from '../shared'
import { ALL_EMOJI, basicAfterEach, basicBeforeEach, truncatedEmoji } from '../shared'
import Database from '../../../src/database/Database'

describe('getEmojiByShortcode', () => {
Expand Down Expand Up @@ -29,4 +29,24 @@ describe('getEmojiByShortcode', () => {

await db.delete()
})

test('all shortcodes are queryable', async () => {
const db = new Database({ dataSource: ALL_EMOJI })

for (const emoji of truncatedEmoji) {
for (const shortcode of emoji.shortcodes) {
expect((await db.getEmojiByShortcode(shortcode)).unicode).toEqual(emoji.emoji)
// test uppercase too
expect((await db.getEmojiByShortcode(shortcode.toUpperCase())).unicode).toEqual(emoji.emoji)
}
}
})

test('short nonexistent shortcodes', async () => {
const db = new Database({ dataSource: ALL_EMOJI })

expect(await db.getEmojiByShortcode('z')).toEqual(null)
expect(await db.getEmojiByShortcode('1')).toEqual(null)
expect(await db.getEmojiByShortcode(' ')).toEqual(null)
})
})

0 comments on commit 992ac10

Please sign in to comment.