fix: fix short tokens in getEmojiByShortcode (#90)

* fix: fix short tokens in getEmojiByShortcode * fix: add comment
nolanlawson · Dec 24, 2020 · 992ac10 · 992ac10
1 parent 7c3f9fa
commit 992ac10
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 8 deletions.
diff --git a/src/database/idbInterface.js b/src/database/idbInterface.js
@@ -11,6 +11,7 @@ import { mark, stop } from '../shared/marks'
 import { extractTokens } from './utils/extractTokens'
 import { getAllIDB, getAllKeysIDB, getIDB } from './idbUtil'
 import { findCommonMembers } from './utils/findCommonMembers'
+import { normalizeTokens } from './utils/normalizeTokens'
 
 export async function isEmpty (db) {
   return !(await get(db, STORE_KEYVALUE, KEY_URL))
@@ -22,6 +23,23 @@ export async function hasData (db, url, eTag) {
   return (oldETag === eTag && oldUrl === url)
 }
 
+async function doFullDatabaseScanForSingleResult (db, predicate) {
+  // TODO: we could do batching here using getAll(). Not sure if it's worth the extra code though.
+  return dbPromise(db, STORE_EMOJI, MODE_READONLY, (emojiStore, cb) => {
+    emojiStore.openCursor().onsuccess = e => {
+      const cursor = e.target.result
+
+      if (!cursor) { // no more results
+        cb()
+      } else if (predicate(cursor.value)) {
+        cb(cursor.value)
+      } else {
+        cursor.continue()
+      }
+    }
+  })
+}
+
 export async function loadData (db, emojiData, url, eTag) {
   mark('loadData')
   try {
@@ -83,7 +101,12 @@ export async function getEmojiByGroup (db, group) {
 }
 
 export async function getEmojiBySearchQuery (db, query) {
-  const tokens = extractTokens(query)
+  const tokens = normalizeTokens(extractTokens(query))
+
+  if (!tokens.length) {
+    return []
+  }
+
   return dbPromise(db, STORE_EMOJI, MODE_READONLY, (emojiStore, cb) => {
     // get all results that contain all tokens (i.e. an AND query)
     const intermediateResults = []
@@ -112,8 +135,21 @@ export async function getEmojiBySearchQuery (db, query) {
   })
 }
 
+// This could have been implemented as an IDB index on shortcodes, but it seemed wasteful to do that
+// when we can already query by tokens and this will give us what we're looking for 99.9% of the time
 export async function getEmojiByShortcode (db, shortcode) {
   const emojis = await getEmojiBySearchQuery(db, shortcode)
+
+  // In very rare cases (e.g. the shortcode "v" as in "v for victory"), we cannot search because
+  // there are no usable tokens (too short in this case). In that case, we have to do an inefficient
+  // full-database scan, which I believe is an acceptable tradeoff for not having to have an extra
+  // index on shortcodes.
+
+  if (!emojis.length) {
+    const predicate = _ => ((_.shortcodes || []).includes(shortcode.toLowerCase()))
+    return (await doFullDatabaseScanForSingleResult(db, predicate)) || null
+  }
+
   return emojis.filter(_ => {
     const lowerShortcodes = _.shortcodes.map(_ => _.toLowerCase())
     return lowerShortcodes.includes(shortcode.toLowerCase())

diff --git a/src/database/utils/normalizeTokens.js b/src/database/utils/normalizeTokens.js
@@ -0,0 +1,13 @@
+import { MIN_SEARCH_TEXT_LENGTH } from '../../shared/constants'
+
+// This is an extra step in addition to extractTokens(). The difference here is that we expect
+// the input to have already been run through extractTokens(). This is useful for cases like
+// emoticons, where we don't want to do any tokenization (because it makes no sense to split up
+// ">:)" by the colon) but we do want to lowercase it to have consistent search results, so that
+// the user can type ':P' or ':p' and still get the same result.
+export function normalizeTokens (str) {
+  return str
+    .filter(Boolean)
+    .map(_ => _.toLowerCase())
+    .filter(_ => _.length >= MIN_SEARCH_TEXT_LENGTH)
+}
diff --git a/src/database/utils/transformEmojiData.js b/src/database/utils/transformEmojiData.js
@@ -1,21 +1,18 @@
-import { MIN_SEARCH_TEXT_LENGTH } from '../../shared/constants'
 import { mark, stop } from '../../shared/marks'
 import { extractTokens } from './extractTokens'
+import { normalizeTokens } from './normalizeTokens'
 
 // Transform emoji data for storage in IDB
 export function transformEmojiData (emojiData) {
   mark('transformEmojiData')
   const res = emojiData.map(({ annotation, emoticon, group, order, shortcodes, skins, tags, emoji, version }) => {
     const tokens = [...new Set(
-      [
+      normalizeTokens([
         ...(shortcodes || []).map(extractTokens).flat(),
         ...tags.map(extractTokens).flat(),
         ...extractTokens(annotation),
         emoticon
-      ]
-        .filter(Boolean)
-        .map(_ => _.toLowerCase())
-        .filter(_ => _.length >= MIN_SEARCH_TEXT_LENGTH)
+      ])
     )].sort()
     const res = {
       annotation,

diff --git a/test/spec/database/getEmojiBySearchQuery.test.js b/test/spec/database/getEmojiBySearchQuery.test.js
@@ -175,6 +175,7 @@ describe('getEmojiBySearchQuery', () => {
     expect((await db.getEmojiBySearchQuery(' :wink: ')).map(_ => _.annotation)).toStrictEqual(['winking face'])
     expect((await db.getEmojiBySearchQuery(':)')).map(_ => _.annotation)).toStrictEqual(['slightly smiling face'])
     expect((await db.getEmojiBySearchQuery(' :) ')).map(_ => _.annotation)).toStrictEqual(['slightly smiling face'])
+    expect((await db.getEmojiBySearchQuery(';)')).map(_ => _.annotation)).toStrictEqual(['winking face'])
 
     await db.delete()
   })
@@ -187,4 +188,11 @@ describe('getEmojiBySearchQuery', () => {
     }
     await db.delete()
   })
+
+  test('search queries that result in no tokens', async () => {
+    const db = new Database({ dataSource: ALL_EMOJI })
+
+    expect((await db.getEmojiBySearchQuery(';;;;'))).toStrictEqual([])
+    expect((await db.getEmojiBySearchQuery('B&'))).toStrictEqual([])
+  })
 })
diff --git a/test/spec/database/getEmojiByShortcode.test.js b/test/spec/database/getEmojiByShortcode.test.js
@@ -1,4 +1,4 @@
-import { ALL_EMOJI, basicAfterEach, basicBeforeEach } from '../shared'
+import { ALL_EMOJI, basicAfterEach, basicBeforeEach, truncatedEmoji } from '../shared'
 import Database from '../../../src/database/Database'
 
 describe('getEmojiByShortcode', () => {
@@ -29,4 +29,24 @@ describe('getEmojiByShortcode', () => {
 
     await db.delete()
   })
+
+  test('all shortcodes are queryable', async () => {
+    const db = new Database({ dataSource: ALL_EMOJI })
+
+    for (const emoji of truncatedEmoji) {
+      for (const shortcode of emoji.shortcodes) {
+        expect((await db.getEmojiByShortcode(shortcode)).unicode).toEqual(emoji.emoji)
+        // test uppercase too
+        expect((await db.getEmojiByShortcode(shortcode.toUpperCase())).unicode).toEqual(emoji.emoji)
+      }
+    }
+  })
+
+  test('short nonexistent shortcodes', async () => {
+    const db = new Database({ dataSource: ALL_EMOJI })
+
+    expect(await db.getEmojiByShortcode('z')).toEqual(null)
+    expect(await db.getEmojiByShortcode('1')).toEqual(null)
+    expect(await db.getEmojiByShortcode(' ')).toEqual(null)
+  })
 })