In [1]:
# imports
import pickle
from tqdm import tqdm

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.regexp import blankline_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter


# lucene imports
import lucene
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory, SimpleFSDirectory
from java.io import File

In [2]:
# start lucene virtual machine
lucene.initVM()

<jcc.JCCEnv at 0x7f0a8c8fb070>

In [3]:
# This class returns a corpus document field content generator (as an iterator)
class MyCorpus:
    """Iterable over the content of one stored field for the documents of a Lucene index.

    Parameters
    ----------
    indexPath : str
        Path of the Lucene index directory.
    fieldname : str
        Name of the field whose value is yielded for every document.
    bad_docid : int or None, optional
        Lucene doc id that is skipped while iterating. The default is the
        enwiki document that was throwing
        `SystemError: invalid maximum character passed to PyUnicode_New`.
        Pass ``None`` to skip nothing.
    """

    def __init__(self, indexPath, fieldname, bad_docid=1053350):
        # Corpus documents directory path
        directory = FSDirectory.open(File(indexPath).toPath())
        self.indexReader = DirectoryReader.open(directory)
        self.numDocs = self.indexReader.numDocs()   # no. docs in English Wikipedia or its index
        self.FIELDNAME = fieldname
        self.bad_docid = bad_docid

    def __iter__(self):
        """Yield the field content of each doc id in ``range(numDocs)``, except ``bad_docid``."""
        # Iterating over range(numDocs) (instead of range(bad_docid) followed by the
        # remainder) keeps the iterator inside the index even when the index holds
        # fewer documents than bad_docid.
        for luceneDocid in range(self.numDocs):
            if luceneDocid == self.bad_docid:
                continue
            yield self.indexReader.document(luceneDocid).get(self.FIELDNAME)

In [None]:
# Location of the Lucene index and the name of the field that holds each doc's content
index_path = './Wikipedia-pages/index-enwiki'
FIELDNAME = 'CONTENT'

# generator object over the content field of the enwiki docs
enwiki_corpus = MyCorpus(indexPath=index_path, fieldname=FIELDNAME)

In [None]:
FIELDNAME = 'CONTENT'       # Lucene index field name for content of the doc
index_path = './Wikipedia-pages/index-enwiki'   # Lucene index directory path

# enwiki doc generator object
enwiki_corpus = MyCorpus(index_path, FIELDNAME)

# Collocation POS filters (universal tagset), keyed by n-gram order.
# Kept as sets of tag tuples and built once, outside the loops, so that a
# candidate's tag sequence is checked with a single membership test.
NGRAM_TAG_FILTERS = {
    1: {('NOUN',)},
    2: {('ADJ','NOUN'), ('NOUN','NOUN')},
    3: {('ADJ','ADJ','NOUN'), ('ADJ','NOUN','NOUN'), ('NOUN','ADJ','NOUN'),
        ('NOUN','NOUN','NOUN'), ('NOUN','ADP','NOUN')},
    4: {('NOUN','VERB','ADP','NOUN'), ('NOUN','VERB','NOUN','NOUN'), ('ADJ','NOUN','ADJ','NOUN'),
        ('ADV','ADJ','NOUN','NOUN'), ('NOUN','ADP','ADJ','NOUN'),
        ('ADJ','NOUN','VERB','NOUN'), ('NOUN','NOUN','ADP','NOUN'), ('NOUN','ADJ','NOUN','NOUN')},
}

# to store ngrams with their frequencies
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()
quadgram_counter = Counter()
ngram_counters = {1: unigram_counter, 2: bigram_counter, 3: trigram_counter, 4: quadgram_counter}


def extract_collocations(tagged_sent, n, allowed_tags):
    """Return the lowercased n-grams of one POS-tagged sentence that pass both filters.

    Parameters
    ----------
    tagged_sent : list of (term, tag) pairs
        One sentence as produced by `nltk.tag.pos_tag_sents`.
    n : int
        N-gram order.
    allowed_tags : set of tuple of str
        Tag sequences (length `n`) that an n-gram must match exactly.

    Returns
    -------
    list of str
        Space-joined, lowercased n-grams whose terms are all alphabetic and
        whose tag sequence is in `allowed_tags`.
    """
    collocations = []
    for gram in ngrams(tagged_sent, n):
        tags = tuple(tag for _, tag in gram)
        # The alphabetic check is applied to the terms for every order. The
        # unigram filter previously tested the POS tag instead of the term,
        # which let non-alphabetic tokens tagged NOUN through.
        if tags in allowed_tags and all(term.isalpha() for term, _ in gram):
            collocations.append(' '.join(term.lower() for term, _ in gram))
    return collocations


# sampling ngrams from one doc at a time and adding to Counters
for doc in tqdm(enwiki_corpus, total=enwiki_corpus.numDocs):
    # NOTE(review): a doc without the requested field presumably comes back as
    # None (Lucene `Document.get`) - skip it rather than crash in the tokenizer.
    if not doc:
        continue

    # text pre-processing:
    # blankline tokenization, then sentence tokenization, then word tokenization
    sents = [sent for para in blankline_tokenize(doc) for sent in sent_tokenize(para)]
    tokenized_sents = [word_tokenize(s) for s in sents]

    # POS tagging
    tagged_sents = nltk.tag.pos_tag_sents(tokenized_sents, tagset='universal')

    # sampling ngrams from each sentence and adding them to the Counters
    for tagged_sent in tagged_sents:
        for n, counter in ngram_counters.items():
            counter.update(extract_collocations(tagged_sent, n, NGRAM_TAG_FILTERS[n]))

# dump every counter to ./counters-dump/<name>_counter.pickle
for name, counter in [('unigram', unigram_counter), ('bigram', bigram_counter),
                      ('trigram', trigram_counter), ('quadgram', quadgram_counter)]:
    with open(f'./counters-dump/{name}_counter.pickle', 'wb') as f:
        pickle.dump(counter, f)

#### Testing query-generation-parallel.py for small number of docs

In [50]:
import pickle

# Load the pickled quadgram dump of doc 1122.
# NOTE(review): the variable name `bigram_doc10` does not match the file being
# loaded (quadgrams of doc 1122); the following cell displays it under that name.
with open('./doc-query-dumps/1000-2000/1122/quadgram_doc1122.pickle', 'rb') as f:
    bigram_doc10 = pickle.load(f)

In [51]:
bigram_doc10

{'a manual of regional',
 'citing a botanical name',
 'dermatology titled nouvelle pratique',
 'disease caused by microsporon',
 'medium of low ph',
 'physician born in nantes',
 'was a french physician'}

In [56]:
with open('./counters-dump/quadgram_counter.pickle', 'rb') as f:
    quad = pickle.load(f)

In [59]:
list(quad)[-20:]

['family of ukrainian industrialists',
 'influence on various facets',
 'manner of poetic expression',
 'influence on ukrainian culture',
 'figure with unmatched significance',
 'poet located throughout ukraine',
 'soviet union as part',
 'series of ornamental textiles',
 'vincent illuzzi of barre',
 'tipperary hill in syracuse',
 'shevchenko park in northeast',
 'taras shevchenko in curitiba',
 'bronze bust by lysenko',
 'lusavorich cathedral in yerevan',
 'cultural garden in rockefeller',
 'anlæg park in copenhagen',
 'ii won worst picture',
 'east across northern pakistan',
 'mountain valleys at altitudes',
 'climate with dry winters']

In [1]:
import json
from collections import Counter

# build a small example Counter from a list of fruit names
fruits = ['apple', 'banana', 'apple', 'orange', 'banana', 'apple']
my_counter = Counter()
my_counter.update(fruits)

# serialise the counter as a JSON object and store it on disk
with open('counter.json', 'w') as f:
    f.write(json.dumps(my_counter))

In [4]:
# parse the JSON object back into a plain dict ...
with open('counter.json', 'r') as f:
    counts_from_json = json.load(f)

# ... and rebuild the Counter from it
loaded_counter = Counter(counts_from_json)
print(loaded_counter)


Counter({'apple': 3, 'banana': 2, 'orange': 1})


In [10]:
# (first, last) id of every chunk of `step` consecutive ids in range(6584626);
# the last chunk is shorter and ends at the final id
step = 1000000
[(start, min(start + step, 6584626) - 1) for start in range(0, 6584626, step)]

[(0, 999999),
 (1000000, 1999999),
 (2000000, 2999999),
 (3000000, 3999999),
 (4000000, 4999999),
 (5000000, 5999999),
 (6000000, 6584625)]

In [1]:
import json

In [7]:
# The file holds one JSON object per line; JSON keys are strings, so each
# object's keys are converted back to int while loading.
with open('./doc-query-dumps/900000-999999/counters_docs_999000-999999.json', 'r') as f:
    loaded = [
        {int(key): value for key, value in json.loads(line).items()}
        for line in f
    ]

In [10]:
# collect the first key of every loaded dict
docids = []
for record in loaded:
    docids.append(list(record)[0])

print(len(docids))

1000


In [15]:
all(docid in range(999000,1000000) for docid in docids)

True

In [16]:
loaded[-1][999999]['quadgram']

{'game developed by bluesky': 1,
 'baseball starring deion sanders': 1,
 'series concluded with world': 1,
 'use of real life': 1}

In [18]:
import pickle

In [19]:
with open('./counters-dumps/0-999999/bigram_counter.pickle', 'rb') as f:
    bigram = pickle.load(f)

In [20]:
len(bigram)

15974145

#### Sum the partial n-gram counters into one total counter per n-gram order

In [1]:
import os
import pickle
from collections import Counter
from tqdm import tqdm

In [2]:
root_dir = './counters-dumps/'
step = 1000000

# One directory per chunk of `step` consecutive ids, named '<first>-<last>';
# the last chunk is shorter and ends at id 6584625.
dir_names = []
for start in range(0, 6584626, step):
    last = min(start + step, 6584626) - 1
    dir_names.append(f'{start}-{last}')

paths = []
for dir_name in dir_names:
    paths.append(os.path.join(root_dir, dir_name))

In [14]:
# Accumulate the unigram counter found in every directory of `paths`.
total_unigram_counter = Counter()
for partial_dir in tqdm(paths):
    partial_pickle = os.path.join(partial_dir, 'unigram_counter.pickle')
    with open(partial_pickle, 'rb') as pickle_file:
        unigram_counter = pickle.load(pickle_file)
    total_unigram_counter += unigram_counter

# Store the merged counter.
merged_pickle = './counters-dumps/total/unigram_counter.pickle'
with open(merged_pickle, 'wb') as pickle_file:
    pickle.dump(total_unigram_counter, pickle_file)

100%|██████████| 7/7 [00:12<00:00,  1.75s/it]


In [15]:
# Accumulate the bigram counter found in every directory of `paths`.
total_bigram_counter = Counter()
for partial_dir in tqdm(paths):
    partial_pickle = os.path.join(partial_dir, 'bigram_counter.pickle')
    with open(partial_pickle, 'rb') as pickle_file:
        bigram_counter = pickle.load(pickle_file)
    total_bigram_counter += bigram_counter

# Store the merged counter.
merged_pickle = './counters-dumps/total/bigram_counter.pickle'
with open(merged_pickle, 'wb') as pickle_file:
    pickle.dump(total_bigram_counter, pickle_file)

100%|██████████| 7/7 [01:58<00:00, 16.98s/it]


In [3]:
# Accumulate the trigram counter found in every directory of `paths`.
total_trigram_counter = Counter()
for partial_dir in tqdm(paths):
    partial_pickle = os.path.join(partial_dir, 'trigram_counter.pickle')
    with open(partial_pickle, 'rb') as pickle_file:
        trigram_counter = pickle.load(pickle_file)
    total_trigram_counter += trigram_counter

# Store the merged counter.
merged_pickle = './counters-dumps/total/trigram_counter.pickle'
with open(merged_pickle, 'wb') as pickle_file:
    pickle.dump(total_trigram_counter, pickle_file)

100%|██████████| 7/7 [02:47<00:00, 23.92s/it]


In [4]:
# Accumulate the quadgram counter found in every directory of `paths`.
total_quadgram_counter = Counter()
for partial_dir in tqdm(paths):
    partial_pickle = os.path.join(partial_dir, 'quadgram_counter.pickle')
    with open(partial_pickle, 'rb') as pickle_file:
        quadgram_counter = pickle.load(pickle_file)
    total_quadgram_counter += quadgram_counter

# Store the merged counter.
merged_pickle = './counters-dumps/total/quadgram_counter.pickle'
with open(merged_pickle, 'wb') as pickle_file:
    pickle.dump(total_quadgram_counter, pickle_file)

100%|██████████| 7/7 [01:00<00:00,  8.59s/it]


#### Azzopardi's tf-cutoff for each n-gram order

In [1]:
from collections import Counter
import pickle

In [2]:
with open('./counters-dumps/0-1692096/unigram_counter.pickle', 'rb') as f:
    unigram_counter = pickle.load(f)

In [3]:
# histogram of term frequencies: tf value -> number of unigrams with that tf
tf_histogram = Counter(unigram_counter.values())
count_values = [(tf, tf_histogram[tf]) for tf in sorted(tf_histogram)]

# print frequency distribution, smallest tf first
for count, freq in count_values:
    print(f'{count} occurs {freq} times')

1 occurs 8614701 times
2 occurs 3148208 times
3 occurs 896845 times
4 occurs 478333 times
5 occurs 284286 times
6 occurs 219760 times
7 occurs 147889 times
8 occurs 125978 times
9 occurs 95193 times
10 occurs 95173 times
11 occurs 65395 times
12 occurs 60820 times
13 occurs 51547 times
14 occurs 44443 times
15 occurs 38597 times
16 occurs 34920 times
17 occurs 29511 times
18 occurs 27405 times
19 occurs 23735 times
20 occurs 22431 times
21 occurs 19630 times
22 occurs 18271 times
23 occurs 16543 times
24 occurs 16026 times
25 occurs 14296 times
26 occurs 14214 times
27 occurs 12400 times
28 occurs 11697 times
29 occurs 10480 times
30 occurs 10499 times
31 occurs 9465 times
32 occurs 9462 times
33 occurs 8310 times
34 occurs 7950 times
35 occurs 7478 times
36 occurs 7484 times
37 occurs 6747 times
38 occurs 6507 times
39 occurs 6183 times
40 occurs 6310 times
41 occurs 5844 times
42 occurs 5681 times
43 occurs 5200 times
44 occurs 5119 times
45 occurs 4930 times
46 occurs 4567 times
47 

In [4]:
unigrams = [unigram for unigram, tf in unigram_counter.items() if tf >= 5]

In [5]:
len(unigrams)

1902575

In [8]:
# Save as JSON
with open('./final-queries/unigram-queries.json', 'w') as f:
    json.dump(unigrams, f)

### Truncating each n-gram list to a fixed number of queries

In [1]:
import pickle
import os
from collections import Counter
import json

In [9]:
# Load the pickled MSMARCO unigram counter.
# NOTE(review): the file is the *unigram* counter, but it is bound to
# `bigram_counter`, and the cells below keep the `bigram_` prefix before the
# result is written to unigram-queries.json. The names do not reflect the
# n-gram order actually being processed.
with open('./MSMARCO/counters-dumps/unigram_counter.pickle', 'rb') as f:
    bigram_counter = pickle.load(f)

In [10]:
bigram_filtered_counter = Counter({bigram:tf for bigram, tf in bigram_counter.items() if tf >= 5})

In [11]:
len(bigram_filtered_counter)

332106

In [12]:
top_counter = bigram_filtered_counter.most_common(2000000)

In [13]:
top_ngrams = [query for query,_ in top_counter]

In [14]:
len(top_ngrams)

332106

In [15]:
# Save as JSON
with open('./MSMARCO/final-queries/unigram-queries.json', 'w') as f:
    json.dump(top_ngrams, f)

#### Combine all n-gram queries into one JSON file of all artificial queries

In [16]:
import os
import json

In [17]:
# all queries in one list
all_queries = []

# Set the directory path
directory = './MSMARCO/final-queries/'

# Loop through all files in the directory.
# os.listdir returns names in arbitrary order, so the names are sorted to make
# the order of the merged queries reproducible from run to run.
for filename in sorted(os.listdir(directory)):
    # Check if the file ends with 'gram-queries.json'
    # (the output file 'all-queries.json' does not match, so a re-run does not
    # merge the previous result into itself)
    if filename.endswith('gram-queries.json'):
        # Load the JSON file
        with open(os.path.join(directory, filename), 'r') as f:
            queries = json.load(f)
        all_queries.extend(queries)

with open('./MSMARCO/final-queries/all-queries.json', 'w') as f:
    json.dump(all_queries, f)

In [18]:
len(all_queries)

2134678