-
Notifications
You must be signed in to change notification settings - Fork 0
/
config_t3s.php
56 lines (46 loc) · 2.3 KB
/
config_t3s.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
<?php
# Build the document analyzer for the documents in data/t3s.
# $strusctx is the context object the analyzer is created from (segmenter: "xml").
# Returns the configured analyzer.
function createDocumentAnalyzer_t3s($strusctx)
{
    $docAnalyzer = $strusctx->createDocumentAnalyzer(["xml"]);

    # Normalizer chain shared by all "word" search index features.
    $wordNormalizers = [["stem", "en"], "lc", ["convdia", "en"]];
    # Text and title are selected separately but indexed with the same settings.
    $contentSelectors = ["/doc/text()", "/doc/title()"];

    # Search index: one "word" feature per content selector.
    foreach ($contentSelectors as $selector) {
        $docAnalyzer->addSearchIndexFeature("word", $selector, "word", $wordNormalizers);
    }
    # Marker feature on "/doc/title~"; it feeds the "title_end" aggregation below.
    $docAnalyzer->addSearchIndexFeature("endtitle", "/doc/title~", "content", "empty");

    # Forward index: keep the original tokens of the same selectors as "orig".
    foreach ($contentSelectors as $selector) {
        $docAnalyzer->addForwardIndexFeature("orig", $selector, "split", "orig");
    }

    # The unmodified title is stored as the document attribute "title".
    $docAnalyzer->defineAttribute("title", "/doc/title()", "content", "orig");

    # Metadata aggregated from the features defined above.
    $docAnalyzer->defineAggregatedMetaData("title_end", ["nextpos", "endtitle"]);
    $docAnalyzer->defineAggregatedMetaData("doclen", ["count", "word"]);

    return $docAnalyzer;
}
# Build the query analyzer matching the t3s document analysis.
# $strusctx is the context object the analyzer is created from.
# Returns the configured analyzer.
function createQueryAnalyzer_t3s($strusctx)
{
    $queryAnalyzer = $strusctx->createQueryAnalyzer();

    # Query fields of type "word" become "word" features, tokenized with "word"
    # and run through the normalizer chain below.
    $wordNormalizers = [["stem", "en"], "lc", ["convdia", "en"]];
    $queryAnalyzer->addElement("word", "word", "word", $wordNormalizers);

    return $queryAnalyzer;
}
# Return the metadata element declarations for t3s as a list of [name, type] pairs.
# NOTE(review): "title_start" is declared here but no aggregation for it is
# visible in this file — confirm where it gets populated.
function metadata_t3s()
{
    $typeByName = [
        "doclen" => "UINT16",
        "title_start" => "UINT8",
        "title_end" => "UINT8",
    ];

    $declarations = [];
    foreach ($typeByName as $name => $type) {
        $declarations[] = [$name, $type];
    }
    return $declarations;
}
# Build the query evaluation scheme for t3s.
# $strusctx is the context object the scheme is created from.
# Returns the configured query evaluation object.
function createQueryEval_t3s($strusctx)
{
    $evalScheme = $strusctx->createQueryEval();

    # Features in the "select" set decide which documents are candidates for ranking.
    $evalScheme->addSelectionFeature("select");

    # Selected documents are ranked with the 'BM25' weighting scheme on the "seek" features.
    # NOTE(review): conventional BM25 tuning uses b within [0, 1] and a larger k1;
    # these two values look swapped — confirm before changing, they are kept as-is.
    $bm25Params = ["k1" => 0.75, "b" => 2.1, "avgdoclen" => 1000];
    $seekFeatures = ["match" => "seek"];
    $evalScheme->addWeightingFunction("BM25", $bm25Params, $seekFeatures);

    # Summarizers extract content from ranked documents for presentation.
    # First the plain attributes: the document title and the document id.
    foreach (["title", "docid"] as $attributeName) {
        $evalScheme->addSummarizer("", "attribute", [["name", $attributeName]]);
    }

    # Then a summarizer that collects the sections enclosing the best matches
    # in a ranked document, read from the "orig" forward index.
    $phraseParams = [["text", "orig"], ["cluster", 0.1], ["maxdf", 1.0]];
    $evalScheme->addSummarizer("summary", "matchphrase", $phraseParams, $seekFeatures);

    return $evalScheme;
}
?>