Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Nov 14, 2023
1 parent 67c95db commit 7906964
Show file tree
Hide file tree
Showing 23 changed files with 246 additions and 822 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test_mysql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,5 @@ jobs:
run: php bin/codecept run --skip-group profile

- name: Run profiling
if: success() || failure()
run: php bin/codecept run -g profile -d
1 change: 1 addition & 0 deletions .github/workflows/test_postgres.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ jobs:
run: php bin/codecept run --skip-group profile

- name: Run profiling
if: success() || failure()
run: php bin/codecept run -g profile -d
1 change: 1 addition & 0 deletions .github/workflows/test_sqlite.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ jobs:
run: php bin/codecept run --skip-group profile

- name: Run profiling
if: success() || failure()
run: php bin/codecept run -g profile -d
89 changes: 36 additions & 53 deletions src/S2/Rose/Entity/FulltextResult.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,11 @@

class FulltextResult
{
/**
* @var int
*/
protected $tocSize = 0;

/**
* @var FulltextQuery
*/
protected $query;

/**
* @var FulltextIndexContent
*/
protected $fulltextIndexContent;
protected int $tocSize = 0;
protected FulltextQuery $query;
protected FulltextIndexContent $fulltextIndexContent;

/**
* @param FulltextQuery $query
* @param FulltextIndexContent $fulltextIndexContent
* @param int $tocSize
*/
public function __construct(FulltextQuery $query, FulltextIndexContent $fulltextIndexContent, $tocSize = 0)
public function __construct(FulltextQuery $query, FulltextIndexContent $fulltextIndexContent, int $tocSize = 0)
{
$this->query = $query;
$this->fulltextIndexContent = $fulltextIndexContent;
Expand All @@ -40,13 +24,8 @@ public function __construct(FulltextQuery $query, FulltextIndexContent $fulltext

/**
* https://i.upmath.me/svg/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09xmin%3D-1.2%2C%20xmax%3D1.2%2C%0A%09ymin%3D0%2C%20ymax%3D1.1%2C%0A%09restrict%20y%20to%20domain%3D-0.1%3A1%2C%0A%09ytick%3D%7B1%7D%2C%0A%09xtick%3D%7B-1%2C1%7D%2C%0A%09axis%20equal%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D-2%3A1%2Csemithick%5D%7Bexp(-(x%2F0.38)%5E2)%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(0.8%2C0.6)%7D%20node%7B%24y%3De%5E%7B-%5Cleft(x%2F0.38%5Cright)%5E2%7D%24%7D%3B%0A%5Cpath%20(axis%20cs%3A0%2C0)%20node%20%5Banchor%3Dnorth%20west%2Cyshift%3D-0.07cm%5D%20%7B0%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
*
* @param int $tocSize
* @param int $foundTocEntriesNum
*
* @return float
*/
public static function frequencyReduction($tocSize, $foundTocEntriesNum)
public static function frequencyReduction(int $tocSize, int $foundTocEntriesNum): float
{
if ($tocSize < 5) {
return 1;
Expand All @@ -57,12 +36,8 @@ public static function frequencyReduction($tocSize, $foundTocEntriesNum)

/**
* Weight ratio for repeating words in the indexed item.
*
* @param int $repeatNum
*
* @return float
*/
protected static function repeatWeightRatio($repeatNum)
protected static function repeatWeightRatio(int $repeatNum): float
{
    // A single occurrence gives a ratio of 1; every further occurrence adds 0.5.
    $ratio = 0.5 * ($repeatNum - 1) + 1;

    // The ratio is capped at 4 so that heavy repetition cannot dominate the score.
    if ($ratio > 4) {
        return 4;
    }

    return $ratio;
}
Expand All @@ -71,12 +46,8 @@ protected static function repeatWeightRatio($repeatNum)
* Weight ratio for entry size (prefer some middle size)
*
* https://i.upmath.me/g/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09ymin%3D0%2C%20ymax%3D5%2C%0A%09xmin%3D0%2C%20xmax%3D1100%2C%0A%09ytick%3D%7B1%2C2%7D%2C%0A%09xtick%3D%7B50%2C200%2C500%2C1000%7D%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%7D%3B%0A%5Caddplot%5Bblue%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(600%2C3)%7D%20node%7B%24y%3D1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%24%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
*
* @param int $totalWordsNum
*
* @return float
*/
protected static function entrySizeWeightRatio($totalWordsNum)
protected static function entrySizeWeightRatio(int $totalWordsNum): float
{
    // Entries shorter than 10 words get the neutral ratio.
    if ($totalWordsNum < 10) {
        return 1;
    }

    // Bell-shaped bonus over sqrt(word count), peaking at sqrt = 18 (324 words),
    // where the ratio reaches its maximum of 1.5; it decays towards 1 on both sides.
    $offset   = sqrt($totalWordsNum) - 18;
    $exponent = $offset ** 2 / 60.0;

    return 1.0 + 1.0 / (1.0 + exp($exponent));
}
Expand All @@ -89,42 +60,54 @@ protected static function entrySizeWeightRatio($totalWordsNum)
*
* @return float
*/
protected static function neighbourWeight($distance)
protected static function neighbourWeight(float $distance): float
{
    // Distance is measured in units of 7: at distance 7 the weight is half of the maximum.
    $scaledDistance = $distance / 7.0;

    // Maximum weight of 30 at zero distance, falling off with the squared scaled distance.
    return 30.0 / (1 + $scaledDistance ** 2);
}

/**
* @param ResultSet $resultSet
*
* @throws ImmutableException
*/
public function fillResultSet(ResultSet $resultSet)
public function fillResultSet(ResultSet $resultSet): void
{
// $queryWordCount = $this->query->getCount();

$wordReductionRatios = [];
foreach ($this->fulltextIndexContent->toArray() as $word => $items) {
$reductionRatio = self::frequencyReduction($this->tocSize, count($items));
foreach ($this->fulltextIndexContent->toArray() as $word => $indexedItems) {
$reductionRatio = self::frequencyReduction($this->tocSize, \count($indexedItems));
$wordReductionRatios[$word] = $reductionRatio;

foreach ($items as $positions) {
$weights = [
'abundance_reduction' => $reductionRatio,
'repeat_multiply' => self::repeatWeightRatio(count($positions['pos'])),
'entry_size' => self::entrySizeWeightRatio($positions['wordCount']),
];
$resultSet->addWordWeight($word, $positions['extId'], $weights, $positions['pos']);
foreach ($indexedItems as $positions) {
$externalId = $positions['extId'];
if (\count($positions['pos']) > 0) {
$weights = [
'abundance_reduction' => $reductionRatio,
'repeat_multiply' => self::repeatWeightRatio(\count($positions['pos'])),
'entry_size' => self::entrySizeWeightRatio($positions['wordCount']),
];
$resultSet->addWordWeight($word, $externalId, $weights, $positions['pos']);
}
if (\count($positions['kpos']) > 0) {
$resultSet->addWordWeight($word, $externalId, [
'keyword' => 15,
'abundance_reduction' => $reductionRatio,
]);
}
if (\count($positions['tpos']) > 0) {
$resultSet->addWordWeight($word, $externalId, [
'title' => 25,
// TODO seems like this was not used before
// 'abundance_reduction' => $reductionRatio,
]);
}
}
}

$referenceContainer = $this->query->toWordPositionContainer();

$this->fulltextIndexContent->iterateWordPositions(
$this->fulltextIndexContent->iterateContentWordPositions(
static function (ExternalId $id, WordPositionContainer $container) use ($referenceContainer, $wordReductionRatios, $resultSet) {
$pairsDistance = $container->compareWith($referenceContainer);
foreach ($pairsDistance as $pairDistance) {
list($word1, $word2, $distance) = $pairDistance;
[$word1, $word2, $distance] = $pairDistance;
$weight = self::neighbourWeight($distance);
if (isset($wordReductionRatios[$word1])) {
$weight *= $wordReductionRatios[$word1];
Expand Down
12 changes: 0 additions & 12 deletions src/S2/Rose/Exception/UnknownKeywordTypeException.php

This file was deleted.

64 changes: 2 additions & 62 deletions src/S2/Rose/Finder.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php
/**
* Fulltext and keyword search
* Fulltext search
*
* @copyright 2010-2023 Roman Parpalak
* @license MIT
Expand All @@ -17,17 +17,13 @@
use S2\Rose\Exception\ImmutableException;
use S2\Rose\Exception\LogicException;
use S2\Rose\Exception\UnknownIdException;
use S2\Rose\Exception\UnknownKeywordTypeException;
use S2\Rose\Snippet\SnippetBuilder;
use S2\Rose\Stemmer\StemmerInterface;
use S2\Rose\Storage\Dto\SnippetQuery;
use S2\Rose\Storage\StorageReadInterface;

class Finder
{
public const TYPE_TITLE = 1;
public const TYPE_KEYWORD = 2;

protected StorageReadInterface $storage;
protected StemmerInterface $stemmer;
protected ?string $highlightTemplate = null;
Expand Down Expand Up @@ -63,19 +59,10 @@ public function find(Query $query, bool $isDebug = false): ResultSet
$resultSet->setHighlightTemplate($this->highlightTemplate);
}

$rawWords = $query->valueToArray();
$cleanedQuery = implode(' ', $rawWords);
$rawWords = $query->valueToArray();
$resultSet->addProfilePoint('Input cleanup');

if (\count($rawWords) > 1) {
$this->findSpacedKeywords($cleanedQuery, $query->getInstanceId(), $resultSet);
$resultSet->addProfilePoint('Keywords with space');
}

if (\count($rawWords) > 0) {
$this->findSimpleKeywords($rawWords, $query->getInstanceId(), $resultSet);
$resultSet->addProfilePoint('Simple keywords');

$this->findFulltext($rawWords, $query->getInstanceId(), $resultSet);
$resultSet->addProfilePoint('Fulltext search');
}
Expand Down Expand Up @@ -108,23 +95,6 @@ public static function fulltextRateExcludeNum(int $tocSize): int
return max($tocSize * 0.5, 20);
}

/**
 * Maps a keyword type to the weight entry it contributes to a result.
 *
 * @return int[]|array ['title' => 25] for TYPE_TITLE, ['keyword' => 15] for TYPE_KEYWORD
 * @throws UnknownKeywordTypeException when $type is neither of the two known types
 */
protected static function getKeywordWeight(int $type): array
{
    switch ($type) {
        case self::TYPE_TITLE:
            return ['title' => 25];

        case self::TYPE_KEYWORD:
            return ['keyword' => 15];
    }

    throw new UnknownKeywordTypeException(sprintf('Unknown type "%s"', $type));
}

/**
* @throws ImmutableException
*/
Expand All @@ -141,36 +111,6 @@ protected function findFulltext(array $words, ?int $instanceId, ResultSet $resul
$fulltextResult->fillResultSet($resultSet);
}

/**
 * Queries the storage's single-keyword index for the given words and their stems
 * and adds a weight to the result set for every entry the index yields.
 *
 * @param string[] $words
 */
protected function findSimpleKeywords(array $words, ?int $instanceId, ResultSet $result): void
{
    // Look up the words as given, followed by the stem of each word.
    $searchTerms = $words;
    foreach ($words as $rawWord) {
        $searchTerms[] = $this->stemmer->stemWord($rawWord);
    }

    $keywordIndex = $this->storage->getSingleKeywordIndexByWords($searchTerms, $instanceId);
    foreach ($keywordIndex as $foundWord => $entries) {
        $entries->iterate(
            static function (ExternalId $externalId, $type, $tocSize, $foundTocEntriesNum) use ($foundWord, $result) {
                $weights = self::getKeywordWeight($type);

                // The frequency-based reduction is added only when both counters are provided.
                if ($tocSize !== null && $foundTocEntriesNum !== null) {
                    $weights['abundance_reduction'] = FulltextResult::frequencyReduction($tocSize, $foundTocEntriesNum);
                }

                $result->addWordWeight($foundWord, $externalId, $weights);
            }
        );
    }
}

/**
 * Queries the storage's multiple-keyword index with the whole query string
 * and adds the type-based weight of every yielded entry to the result set.
 */
protected function findSpacedKeywords(string $string, ?int $instanceId, ResultSet $result): void
{
    // The full string itself is used as the "word" the weight is attributed to.
    $applyWeight = static function (ExternalId $externalId, $type) use ($string, $result) {
        $weights = self::getKeywordWeight($type);
        $result->addWordWeight($string, $externalId, $weights);
    };

    $this->storage->getMultipleKeywordIndexByString($string, $instanceId)->iterate($applyWeight);
}

public function buildSnippets(array $relevanceByExternalIds, ResultSet $resultSet): void
{
$snippetQuery = new SnippetQuery(ExternalIdCollection::fromStringArray(array_keys($relevanceByExternalIds)));
Expand Down
Loading

0 comments on commit 7906964

Please sign in to comment.