In [1]:
import logging
# Module-level logger named after __name__ (appears as "__main__" in the recorded log output).
logger = logging.getLogger(__name__)

In [2]:
# Configure the root logger: INFO and above, with timestamp, level,
# logger name and line number in front of each message.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y/%m/%d %H:%M:%S',
    format='%(asctime)s ***%(levelname)s*** [%(name)s:%(lineno)s] - %(message)s',
)

In [3]:
import lucene
from org.apache.lucene.analysis.cn.smart import SmartChineseAnalyzer
from org.apache.lucene.store import RAMDirectory, FSDirectory
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexReader, DirectoryReader
from org.apache.lucene.document import Document, StringField, TextField, Field
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import ClassicSimilarity, BM25Similarity
from org.apache.pylucene.search.similarities import PythonClassicSimilarity

In [4]:
class SimpleSimilarity(PythonClassicSimilarity):
    """Similarity with every scoring factor flattened to a simple value.

    Overrides the ``PythonClassicSimilarity`` hooks so that field length and
    inverse document frequency contribute a constant, and term frequency
    contributes the raw frequency.
    """

    def lengthNorm(self, numTerms):
        """Return a constant norm of 1.0, ignoring ``numTerms``."""
        return 1.0

    def tf(self, freq):
        """Return the raw term frequency ``freq`` unchanged."""
        return freq

    def sloppyFreq(self, distance):
        """Return a constant 2.0, ignoring ``distance``."""
        return 2.0

    def idf(self, docFreq, numDocs):
        """Return a constant idf of 1.0, ignoring ``docFreq`` and ``numDocs``."""
        return 1.0

    def idfExplain(self, collectionStats, termStats):
        """Return an ``Explanation`` with value 1.0 for the constant idf.

        Both arguments are ignored.
        """
        # `Explanation` is not among the notebook's top-level imports, so the
        # original body raised NameError when this hook was called. Importing
        # it here keeps the class self-contained.
        from org.apache.lucene.search import Explanation

        return Explanation.match(1.0, "inexplicable", [])

In [5]:
# Similarity *class* (not an instance) used for both indexing and searching;
# ChineseRamIndexer and ParagSearcher each instantiate it via mySimilarity().
# As written this selects ClassicSimilarity, so SimpleSimilarity is not used.
mySimilarity = ClassicSimilarity

In [6]:
class ChineseRamIndexer:
    """Builds an in-memory Lucene index of (pid, ptext) documents.

    The index lives in a ``RAMDirectory`` exposed as ``self.indexDir``. The
    writer is configured with ``SmartChineseAnalyzer`` and the similarity
    selected by the module-level ``mySimilarity``.

    Can be used as a context manager so the writer is closed even if adding
    a document raises::

        with ChineseRamIndexer() as indexer:
            indexer.add("p1", "some text")
    """

    def __init__(self):
        """Start the JVM if needed and open an ``IndexWriter`` on a new ``RAMDirectory``."""
        # Only start the JVM when none is running yet; several indexers and
        # searchers may be created in the same process.
        if lucene.getVMEnv() is None:
            lucene.initVM()
        writerConfig = IndexWriterConfig(SmartChineseAnalyzer())
        # CREATE: start from an empty index, removing previously indexed documents.
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writerConfig.setSimilarity(mySimilarity())
        self.indexDir = RAMDirectory()
        self.writer = IndexWriter(self.indexDir, writerConfig)

    def add(self, pid, ptext):
        """Add one document to the index.

        Args:
            pid: Identifier, stored in the ``pid`` ``StringField``.
            ptext: Text, stored in the ``ptext`` ``TextField``.
        """
        doc = Document()
        doc.add(StringField("pid", pid, Field.Store.YES))
        doc.add(TextField("ptext", ptext, Field.Store.YES))
        self.writer.addDocument(doc)

    def close(self):
        """Close the underlying ``IndexWriter``."""
        self.writer.close()

    def __enter__(self):
        """Return the indexer itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the writer on leaving the ``with`` block; exceptions propagate."""
        self.close()
        return False

In [7]:
# Create the in-memory indexer (its constructor also initialises the JVM).
myIndexer = ChineseRamIndexer()

In [8]:
# Index a first document with pid "p4".
myIndexer.add("p4", "測試")

In [9]:
# Index a second document that mixes Chinese and Latin text.
# NOTE(review): this reuses pid "p4" from the previous add() call — confirm whether a distinct pid was intended.
myIndexer.add("p4", "測試smar")

In [10]:
# Close the IndexWriter before a reader is opened on myIndexer.indexDir.
myIndexer.close()

In [21]:
class ParagSearcher:
    """Searches the ``ptext`` field of an existing index and prints the hits.

    Queries are parsed with the classic ``QueryParser`` using
    ``SmartChineseAnalyzer``; scoring uses the similarity selected by the
    module-level ``mySimilarity``.

    Can be used as a context manager so the reader is closed on exit.
    """

    def __init__(self, indexDir):
        """Open a reader and searcher on ``indexDir``.

        Args:
            indexDir: Lucene ``Directory`` holding the index to search.
        """
        # Only start the JVM when none is running yet (the indexer usually
        # has already started it).
        if lucene.getVMEnv() is None:
            lucene.initVM()
        self.analyzer = SmartChineseAnalyzer()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        logger.info('search similarity:{}'.format(self.searcher.getSimilarity()))

    def search(self, query_text, top_n=1, escape=False):
        """Run a query against ``ptext`` and print the top hits.

        For each hit this prints the stored document, its ``pid`` and the
        score explanation. Nothing is returned.

        Args:
            query_text: Query string; surrounding whitespace is stripped.
            top_n: Maximum number of hits to retrieve.
            escape: When true, query-syntax characters in ``query_text`` are
                escaped with ``QueryParser.escape`` so the text is searched
                literally. Defaults to false, i.e. the text is parsed as
                query syntax.
        """
        query_text = query_text.strip()
        if not query_text:
            # Nothing to search for; avoid handing an empty string to the parser.
            logger.warning('empty query text, nothing to search')
            return
        if escape:
            query_text = QueryParser.escape(query_text)
        query = QueryParser("ptext", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        print(scoreDocs)
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            print(doc)
            print(doc['pid'])
            print(self.searcher.explain(query, docIndex))

    def close(self):
        """Close the underlying index reader."""
        self.reader.close()

    def __enter__(self):
        """Return the searcher itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the reader on leaving the ``with`` block; exceptions propagate."""
        self.close()
        return False

In [22]:
# Open a searcher on the directory the indexer wrote to.
searcher = ParagSearcher(myIndexer.indexDir)

2020/05/14 15:31:01 ***INFO*** [__main__:9] - search similarity:ClassicSimilarity


In [23]:
# Query with the default top_n=1; in the recorded output only the term 試 matches, with boost 2.0.
searcher.search("試試看samr")

JArray<object>[<ScoreDoc: doc=0 score=1.4142135 shardIndex=0>]
Document<stored,indexed,tokenized,omitNorms,indexOptions=DOCS<pid:p4> stored,indexed,tokenized<ptext:測試>>
p4
1.4142135 = sum of:
  1.4142135 = weight(ptext:試 in 0) [ClassicSimilarity], result of:
    1.4142135 = score(freq=1.0), product of:
      2.0 = boost
      1.0 = idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:
        2 = docFreq, number of documents containing term
        2 = docCount, total number of documents with field
      1.0 = tf(freq=1.0), with freq of:
        1.0 = freq, occurrences of term within document
      0.70710677 = fieldNorm



In [14]:
# The pair 測試 repeated 14 times; in the recorded output each of the two terms carries boost 14.0.
searcher.search("測試測試測試測試測試測試測試測試測試測試測試測試測試測試")

JArray<object>[<ScoreDoc: doc=0 score=19.79899 shardIndex=0>]
p4
19.79899 = sum of:
  9.899495 = weight(ptext:試 in 0) [ClassicSimilarity], result of:
    9.899495 = score(freq=1.0), product of:
      14.0 = boost
      1.0 = idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:
        2 = docFreq, number of documents containing term
        2 = docCount, total number of documents with field
      1.0 = tf(freq=1.0), with freq of:
        1.0 = freq, occurrences of term within document
      0.70710677 = fieldNorm
  9.899495 = weight(ptext:測 in 0) [ClassicSimilarity], result of:
    9.899495 = score(freq=1.0), product of:
      14.0 = boost
      1.0 = idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:
        2 = docFreq, number of documents containing term
        2 = docCount, total number of documents with field
      1.0 = tf(freq=1.0), with freq of:
        1.0 = freq, occurrences of term within document
      0.70710677 = fieldNorm



In [25]:
# Latin-only query; the recorded output shows no hits.
searcher.search("smr")

JArray<object>[]


In [26]:
# Query for a word that was never indexed; the recorded output shows no hits.
searcher.search("english")

JArray<object>[]


In [17]:
# Release the index reader held by the searcher.
searcher.close()