In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from os import path
import json
import numpy as np

In [2]:
def normalize(text, language='english'):
    tokens = word_tokenize(text)
    stop = set(stopwords.words(language))
    tokens = [token for token in tokens if token not in stop]
    tokens = [token for token in tokens if token.isalnum()]
    tokens = lemmatize(tokens)
    return tokens


In [3]:
def tokenize(text, language='english'):
    stop = set(stopwords.words(language))
    return [token for token in word_tokenize(text) if token not in stop and token.isalnum()]


In [4]:
def stem(tokens, language='english'):
    stemmer = SnowballStemmer(language)
    return [stemmer.stem(token) for token in tokens]


In [5]:
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]


In [6]:
def read_corpus(file):
    corpus = {}
    with open(file) as fin:
        for row in fin:
            jsonLine = json.loads(row)
            corpus[jsonLine['id']] = jsonLine['content']
    return corpus

In [7]:
def read_corpus_lines(file):
    corpus = []
    with open(file, encoding='utf-8') as fin:
        for row in fin:
            corpus.append(row.rstrip('\n'))
    return corpus

In [8]:
def read_document_ids(file):
    ids = []
    with open(file, encoding='utf-8') as fin:
        for row in fin:
            ids.append(row.rstrip('\n'))
    return ids


In [9]:
def tokenizeFile(targetFile, language='english'):
    targetFileName = path.basename(targetFile)
    destinationFileName = "tokenized." + targetFileName
    with open(targetFile) as fin, open(destinationFileName, 'w', newline='') as fout:
        for row in fin:
            # jsonLine = json.loads(row)
            # content = jsonLine['content']
            # jsonLine['content'] = " ".join(normalize(content))
            jsonLine = normalize(row)
            print(" ".join(jsonLine), file=fout)
            # print(json.dumps(jsonLine), file=fout)


In [10]:
# tokenizeFile('.\\plain.raw.txt')

# with open ('dump.tokenized.txt') as fin:
#     lines = fin.read()
#     count = Counter(lines.split())
#     print(len(count))


In [11]:
from rank_bm25 import BM25Okapi

corpus = read_corpus_lines('tokenized.plain.raw.txt')
corpus_tokenized = [document.split(" ") for document in corpus]
bm25 = BM25Okapi(corpus_tokenized)
# corpus = read_corpus('tokenized.raw.dump.jsonl')
# corpusDocsOnly = [doc.split(" ") for doc in list(corpus.values())]
# bm25 = BM25Okapi(corpusDocsOnly)

In [16]:
query = 'encode a string into base64'
tQuery = normalize(query)
bm25.get_top_n(tQuery, corpus_tokenized, n=10)

[['encode',
  'string',
  'base64',
  'form',
  'result',
  'always',
  'multiple',
  '4',
  'byte',
  'length'],
 ['encode',
  'string',
  'base64',
  'form',
  'result',
  'always',
  'multiple',
  '4',
  'byte',
  'length'],
 ['encode',
  'string',
  'base64',
  'form',
  'result',
  'always',
  'multiple',
  '4',
  'byte',
  'length'],
 ['serialize', 'base64', 'encode', 'secret', 'key'],
 ['encode', 'single', 'part', 'use', 'base64', 'binary', 'data'],
 ['encrypt',
  'authenticate',
  'encode',
  'base64',
  'given',
  'cookie',
  'data',
  'returned',
  'byte',
  'string',
  'ready',
  'used',
  'response',
  'header'],
 ['get',
  'base64',
  'encode',
  'bytestring',
  'getencodedbytestring64',
  'foobar',
  'zm9vymfy',
  'getencodedbytestring64'],
 ['get',
  'base64',
  'encode',
  'bytestring',
  'getencodedbytestring64',
  'foobar',
  'zm9vymfy',
  'getencodedbytestring64'],
 ['get',
  'base64',
  'encode',
  'bytestring',
  'getencodedbytestring64',
  'foobar',
  'zm9vymfy',


In [13]:
query = 'angle degree radian simple little library dealing geometric angl'
tQuery = normalize(query)

bm25.get_top_n(query, corpusDocsOnly)

NameError: name 'corpusDocsOnly' is not defined

In [None]:
def get_top_n_indices(bm25, query, n=5):
    scores = bm25.get_scores(query)
    return np.argpartition(scores, -n)[-n:]

In [10]:
import storage.document_store as ds

store = ds.DocumentStore()
store.loadFromJsonl('tokenized.small.dump.jsonl')

ModuleNotFoundError: No module named 'storage'