Skip to content

Commit

Permalink
Getting the highlights through to the backend api for each document
Browse files Browse the repository at this point in the history
  • Loading branch information
Annabel Church committed Dec 1, 2016
1 parent f684b7a commit cc87fb2
Show file tree
Hide file tree
Showing 11 changed files with 146 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -25,3 +25,4 @@ dist
npm-debug.log
overview-server*.zip
/overview-server/
*.iml
12 changes: 9 additions & 3 deletions app/controllers/DocumentListController.scala
@@ -1,16 +1,17 @@
package controllers

import play.api.libs.concurrent.Execution.Implicits.defaultContext
import scala.concurrent.Future

import scala.concurrent.Future
import controllers.auth.AuthorizedAction
import controllers.auth.Authorities.userOwningDocumentSet
import controllers.backend.{DocumentBackend,DocumentNodeBackend,DocumentTagBackend}
import controllers.backend.{DocumentBackend, DocumentNodeBackend, DocumentTagBackend, HighlightBackend}

trait DocumentListController extends Controller with SelectionHelpers {
protected val documentBackend: DocumentBackend
protected val documentNodeBackend: DocumentNodeBackend
protected val documentTagBackend: DocumentTagBackend
protected val highlightBackend: HighlightBackend

private val MaxPageSize = 100

Expand All @@ -22,14 +23,18 @@ trait DocumentListController extends Controller with SelectionHelpers {
case Right(selection) => {
for {
page <- documentBackend.index(selection, pr, false)

snippets <- highlightBackend.index(documentSetId, page.items.map(_.id), selectionRequest(documentSetId, request).right.get.q.get)

// In serial so as not to bombard Postgres
nodeIds <- documentNodeBackend.indexMany(page.items.map(_.id))
tagIds <- documentTagBackend.indexMany(page.items.map(_.id))
} yield {
val pageOfItems = page.map { document => (
document,
nodeIds.getOrElse(document.id, Seq()),
tagIds.getOrElse(document.id, Seq())
tagIds.getOrElse(document.id, Seq()),
snippets.getOrElse(document.id, Seq())
)}
Ok(views.json.DocumentList.show(selection.id, pageOfItems))
}
Expand All @@ -42,4 +47,5 @@ object DocumentListController extends DocumentListController {
override val documentBackend = DocumentBackend
override val documentNodeBackend = DocumentNodeBackend
override val documentTagBackend = DocumentTagBackend
override val highlightBackend = HighlightBackend
}
4 changes: 1 addition & 3 deletions app/controllers/HighlightController.scala
Expand Up @@ -24,9 +24,7 @@ trait HighlightController extends Controller {
case Left(_) => Future.successful(BadRequest(jsonError("illegal-arguments", Messages("com.overviewdocs.query.SyntaxError"))))
case Right(query) => {
highlightBackend.index(documentSetId, documentId, query).map { highlights: Seq[Highlight] =>
val json = JsArray(highlights.map { highlight =>
JsArray(Seq(JsNumber(highlight.begin), JsNumber(highlight.end)))
})
val json = Highlight.asJson(highlights)
Ok(json).withHeaders(CACHE_CONTROL -> "no-cache")
}
}
Expand Down
2 changes: 1 addition & 1 deletion app/controllers/backend/DocumentBackend.scala
Expand Up @@ -88,7 +88,7 @@ trait DbDocumentBackend extends DocumentBackend with DbBackend {

protected val indexClient: IndexClient

override def index(selection: Selection, pageRequest: PageRequest, includeText: Boolean) = {
override def index(selection: Selection, pageRequest: PageRequest, includeText: Boolean): Future[Page[DocumentHeader]] = {
selection.getDocumentIds(pageRequest)
.flatMap { (page: Page[Long]) =>
if (page.pageInfo.total == 0) {
Expand Down
9 changes: 7 additions & 2 deletions app/controllers/backend/HighlightBackend.scala
@@ -1,9 +1,8 @@
package controllers.backend

import scala.concurrent.Future

import com.overviewdocs.query.Query
import com.overviewdocs.searchindex.{Highlight,IndexClient,ElasticSearchIndexClient}
import com.overviewdocs.searchindex.{ElasticSearchIndexClient, Highlight, IndexClient, Snippet}

/** Finds highlights of a search term in a document.
*/
Expand All @@ -15,13 +14,19 @@ trait HighlightBackend extends Backend {
* @param q Search string
*/
def index(documentSetId: Long, documentId: Long, q: Query): Future[Seq[Highlight]]

/** Finds snippets of text matching a search query in several documents at once.
 *
 * @param documentSetId Document set ID
 * @param documentIds IDs of the documents to look in
 * @param q Search query
 * @return Snippets keyed by a Long; the ElasticSearch-backed implementation
 *         keys them by document ID
 */
def index(documentSetId: Long, documentIds: Seq[Long], q: Query): Future[Map[Long, Seq[Snippet]]]
}

/** ElasticSearch-backed highlight backend.
*/
trait EsHighlightBackend extends HighlightBackend {
val indexClient: IndexClient

/** Multi-document lookup: hands the arguments straight to `indexClient.highlights`. */
override def index(documentSetId: Long, documentIds: Seq[Long], q: Query): Future[Map[Long, Seq[Snippet]]] =
  indexClient.highlights(documentSetId, documentIds, q)

/** Single-document lookup: hands the arguments straight to `indexClient.highlight`.
 *
 * The return type is spelled out so this public, overloaded member does not
 * rely on inference, matching the multi-document overload.
 */
override def index(documentSetId: Long, documentId: Long, q: Query): Future[Seq[Highlight]] = {
  indexClient.highlight(documentSetId, documentId, q)
}
Expand Down
15 changes: 11 additions & 4 deletions app/views/DocumentList/show.json.scala
@@ -1,13 +1,14 @@
package views.json.DocumentList

import java.util.UUID
import play.api.libs.json.{JsValue,Json}

import play.api.libs.json.{JsValue, Json}
import models.pagination.Page
import com.overviewdocs.models.DocumentHeader
import com.overviewdocs.searchindex.{Highlight, Snippet}

object show {
private def documentToJson(document: DocumentHeader, nodeIds: Seq[Long], tagIds: Seq[Long]) : JsValue = {
private def documentToJson(document: DocumentHeader, nodeIds: Seq[Long], tagIds: Seq[Long], snippets: Seq[Snippet]) : JsValue = {
Json.obj(
"id" -> document.id,
"documentSetId" -> document.documentSetId.toString,
Expand All @@ -16,11 +17,17 @@ object show {
"page_number" -> document.pageNumber,
"url" -> document.viewUrl,
"nodeids" -> nodeIds,
"tagids" -> tagIds
"tagids" -> tagIds,
"snippets" -> snippets.map { snippet =>
Json.obj(
"text" -> snippet.text,
"highlights" -> Highlight.asJson(snippet.highlights)
)
}
)
}

def apply(selectionId: UUID, documents: Page[(DocumentHeader,Seq[Long],Seq[Long])]) = {
def apply(selectionId: UUID, documents: Page[(DocumentHeader,Seq[Long],Seq[Long],Seq[Snippet])]) = {
Json.obj(
"selection_id" -> selectionId.toString,
"total_items" -> documents.pageInfo.total,
Expand Down
4 changes: 2 additions & 2 deletions common/src/main/scala/com/overviewdocs/models/Document.scala
Expand Up @@ -48,15 +48,15 @@ case class Document(
/** URL used for document display. This is ultimately fed to the pdf.js viewer
 * if the document is a PDF.
 *
 * @return a server-relative path for documents stored as files, or the
 *         document's own `url` otherwise
 */
override def viewUrl: Option[String] = {
  url match {
    case None =>
      // No URL recorded: only viewable when a file is attached.
      fileId.map(_ => s"/documents/${id}.pdf") // pdf in blobStorage

    // Bind the contained string instead of calling .get on a shadowing binder.
    case Some(localUrl) if localUrl.startsWith("local://") =>
      Some("/localfiles/" + localUrl.stripPrefix("local://")) // pdf file in local storage

    case _ =>
      url // something else, e.g. Twitter
  }
}
}

/** Text, normalized as NFKC. */
Expand Down
Expand Up @@ -90,7 +90,10 @@ class ElasticSearchIndexClient(val hosts: Seq[String]) extends IndexClient {

/** Builds an absolute `http://` URL for `path` on the first configured host. */
private[searchindex] def hostUrl(path: String): String = {
  val firstHost = hosts.head
  s"http://$firstHost$path"
}
/** Issues a GET for `path` without a request body. */
private def GET(path: String): Future[Response] = {
  val bodyless = Request(path, None)
  GET(bodyless)
}
/** Issues a GET for `path` carrying `body` as the request body.
 *
 * Does not print the body: the previous debugging `System.err.println` wrote
 * every query to stderr on each request.
 */
private def GET(path: String, body: JsValue): Future[Response] = GET(Request(path, Some(body)))
/** Sends `request` through `httpClient` and converts the reply with `Response.fromHttpResponse`. */
private def GET(request: Request): Future[Response] = {
  val pending = httpClient.get(request.toHttpRequest)
  pending.map(raw => Response.fromHttpResponse(raw))
}
Expand Down Expand Up @@ -260,6 +263,98 @@ class ElasticSearchIndexClient(val hosts: Seq[String]) extends IndexClient {
})
}


/** Finds highlighted snippets of text for several documents with one search request.
 *
 * The search backend is asked to wrap every match in the HighlightBegin and
 * HighlightEnd marker characters. Each returned fragment becomes a Snippet
 * whose text has the markers removed and whose Highlights index into that
 * marker-free text.
 *
 * @param documentSetId Document set whose index is searched
 * @param documentIds IDs of the documents to find snippets in
 * @param q Query whose matches are highlighted
 * @return Snippets keyed by document ID. A returned document without any
 *         highlighted fragment maps to an empty Seq; an empty `documentIds`
 *         yields an empty Map without contacting the backend.
 */
override def highlights(documentSetId: Long, documentIds: Seq[Long], q: Query): Future[Map[Long, Seq[Snippet]]] = {
  val HighlightBegin: Char = '\u0001' // something that can't be in any text ever
  val HighlightEnd: Char = '\u0002'

  /** Searches for the begin and end markers and uses them to create a Highlight.
   *
   * @param textWithHighlights Text we're searching in
   * @param cur Index into text
   * @param n How many highlights came before this one
   */
  def findHighlight(textWithHighlights: String, cur: Int, n: Int): Option[Highlight] = {
    val begin = textWithHighlights.indexOf(HighlightBegin, cur)
    if (begin == -1) {
      None
    } else {
      val end = textWithHighlights.indexOf(HighlightEnd, begin)
      if (end == -1) throw new Exception(s"Found begin without end starting at index ${begin} in text: ${textWithHighlights}")
      // Each earlier highlight contributed two marker characters; this one's
      // begin marker accounts for the extra -1 on the end index.
      Some(Highlight(begin - n * 2, end - n * 2 - 1))
    }
  }

  /** Recursively finds Highlights in the given text.
   *
   * @param textWithHighlights Text we're searching in
   * @param cur Index into the text
   * @param n Number of highlights we've found already
   * @param acc Return value we're building
   */
  @scala.annotation.tailrec
  def findHighlightsRec(textWithHighlights: String, cur: Int, n: Int, acc: List[Highlight]): List[Highlight] = {
    findHighlight(textWithHighlights, cur, n) match {
      case None => acc.reverse
      // Resume just past this highlight's end marker (indices converted back to marked-up text).
      case Some(highlight) => findHighlightsRec(textWithHighlights, highlight.end + n * 2 + 2, n + 1, highlight :: acc)
    }
  }

  /** Finds Highlights in the given text.
   *
   * The given text has highlights delimited by the begin and end marker
   * characters. We return Highlights that <em>ignore</em> those markers: that
   * means the indices we return in the Highlights are less than or equal to
   * the indices in the input text.
   *
   * @param textWithHighlights Text we're searching in
   */
  def findHighlights(textWithHighlights: String): Seq[Highlight] = findHighlightsRec(textWithHighlights, 0, 0, Nil)

  /** Removes the marker characters, leaving the text as it appears in the document. */
  def stripMarkers(textWithHighlights: String): String =
    textWithHighlights.filterNot(c => c == HighlightBegin || c == HighlightEnd)

  if (documentIds.isEmpty) {
    // Nothing to look up: skip the round trip.
    Future.successful(Map.empty[Long, Seq[Snippet]])
  } else {
    // Idea noted by the original author (not applied): ?filter_path=hits.hits.highlight.text
    GET(s"/documents_$documentSetId/_search", Json.obj(
      // Ask for one hit per requested document; without an explicit size the
      // search API returns only its default number of hits.
      "size" -> documentIds.size,
      "query" -> Json.obj("ids" -> Json.obj("type" -> "document", "values" -> documentIds.map(_.toString))),
      "highlight" -> Json.obj(
        // NOTE(review): the per-field number_of_fragments below presumably
        // overrides this top-level value for "text" -- confirm, then drop one.
        "number_of_fragments" -> 0,
        "require_field_match" -> false, // Confusing: we use *filters*, not *queries*, so true matches nothing
        "fields" -> Json.obj(
          "text" -> Json.obj(
            "highlight_query" -> repr(q),
            "pre_tags" -> Json.arr(HighlightBegin.toString),
            "post_tags" -> Json.arr(HighlightEnd.toString),
            "number_of_fragments" -> 2
          )
        )
      )
    )).map {
      case Response(statusCode, json) if statusCode >= 200 && statusCode < 300 => {
        import play.api.libs.json._

        val hits = (json \ "hits" \ "hits").as[JsArray]

        hits.value.map { hit =>
          val documentId = (hit \ "_id").as[String].toLong

          // A hit with no highlighted fragment has no "highlight" object, so
          // treat a missing array as "no snippets" instead of failing the page.
          val fragments: Seq[JsValue] =
            (hit \ "highlight" \ "text").asOpt[JsArray].map(_.value).getOrElse(Seq.empty)

          val snippets: Seq[Snippet] = fragments.map { fragment =>
            val marked = fragment.as[String]
            Snippet(stripMarkers(marked), findHighlights(marked))
          }

          documentId -> snippets
        }.toMap
      }
      case Response(_, json) => throw UnexpectedResponse(json)
    }
  }
}

override def highlight(documentSetId: Long, documentId: Long, q: Query): Future[Seq[Highlight]] = {
val HighlightBegin: Char = '\u0001' // something that can't be in any text ever
val HighlightEnd: Char = '\u0002'
Expand Down Expand Up @@ -296,6 +391,7 @@ class ElasticSearchIndexClient(val hosts: Seq[String]) extends IndexClient {
}
}


/** Finds Highlights in the given text.
*
* The given text has highlights delimited by <tt>\u0001</tt> and
Expand Down Expand Up @@ -334,7 +430,7 @@ class ElasticSearchIndexClient(val hosts: Seq[String]) extends IndexClient {
})
}

private def repr(field: Field): String = field match {
private def repr(field: Field): String = field match {
case Field.All => "_all"
case Field.Title => "title"
case Field.Text => "text"
Expand Down
11 changes: 11 additions & 0 deletions common/src/main/scala/com/overviewdocs/searchindex/Highlight.scala
@@ -1,7 +1,18 @@
package com.overviewdocs.searchindex

import play.api.libs.json.{JsArray, JsNumber}

/** A span of a document's text that matched a search query.
 *
 * The span is half-open: it covers the indices [begin,end).
 *
 * @param begin Index of the first highlighted character
 * @param end Index just past the last highlighted character
 */
case class Highlight(
  begin: Int,
  end: Int
)

object Highlight {

  /** Serializes highlights as a JSON array of two-element `[begin, end]` arrays.
   *
   * @param highlights Highlights to serialize, in the order they should appear
   * @return One inner array per highlight
   */
  def asJson(highlights: Seq[Highlight]): JsArray = {
    val pairs = highlights.map { case Highlight(begin, end) =>
      JsArray(Seq(JsNumber(begin), JsNumber(end)))
    }
    JsArray(pairs)
  }

}
Expand Up @@ -67,6 +67,8 @@ trait IndexClient {
*/
def highlight(documentSetId: Long, documentId: Long, q: Query): Future[Seq[Highlight]]

/** Finds snippets of text matching a query in several documents at once.
 *
 * @param documentSetId Document set ID
 * @param documentIds IDs of the documents to look in
 * @param q Search query
 * @return Snippets keyed by a Long; the ElasticSearch implementation keys
 *         them by document ID
 */
def highlights(documentSetId: Long, documentIds: Seq[Long], q: Query): Future[Map[Long, Seq[Snippet]]]

/** Guarantees all past added documents are searchable. */
def refresh: Future[Unit]

Expand Down
@@ -0,0 +1,3 @@
package com.overviewdocs.searchindex

/** A fragment of document text together with the search matches found in it.
 *
 * @param text The fragment's text
 * @param highlights Matches within the fragment; presumably their indices
 *                   refer to positions in `text` -- confirm against the
 *                   code that builds Snippets
 */
case class Snippet(
  text: String,
  highlights: Seq[Highlight]
)

0 comments on commit cc87fb2

Please sign in to comment.