In [98]:
import numpy as np
import nltk

In [6]:
#download the NLTK resources (punkt_tab tokenizer data, wordnet, omw-1.4)
#into a local folder and register that folder on nltk's search path
#NOTE(review): the path is absolute and machine-specific
path='D:/misc/Projects/Python/NLP/misc'
for resource in ('punkt_tab','wordnet','omw-1.4'):
    nltk.download(resource,download_dir=path)
nltk.data.path.append(path)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     D:/misc/Projects/Python/NLP/misc...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     D:/misc/Projects/Python/NLP/misc...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     D:/misc/Projects/Python/NLP/misc...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 2)

In [99]:
#read both input files, lower-cased so that later matching is case-insensitive
with open('keywords.txt','r') as f:
    kws=f.read().lower()
with open('abstracts.txt','r') as f:
    abstracts=f.read().lower()

#remove punctuation from abstracts
punctuation=['"','!','.',',',"'",'(',')',';','``',"''",'?','--',':']
for punct in punctuation:
    abstracts=abstracts.replace(punct,'')

#get all the keywords for each abstract: entries are separated by '-next-',
#keywords inside an entry by commas
#line breaks become a space (not nothing) so that a phrase wrapped over two
#lines keeps its word boundary; split()/join then trims and collapses whitespace
kws=[
    [' '.join(kw.split()) for kw in entry.split(',')]
    for entry in kws.replace('\n',' ').split('-next-')
]

#get text for each abstract
#same reasoning: deleting '\n' outright would fuse the last word of a line
#with the first word of the following line into a single bogus token
abstracts=abstracts.replace('\n',' ').split('-next-')

In [100]:
#single string from all abstracts
#joined with a space: an empty separator would glue the last word of one
#abstract to the first word of the next one and create a token that exists
#in neither abstract
doc=' '.join(abstracts)
#keywords to be used as queries from the keywords file
keywords=[kws[0][1],kws[19][3],kws[2][4],kws[5][2],kws[10][3]]
print(keywords)

tokenized_doc=nltk.tokenize.word_tokenize(doc)
#distinct tokens, so each membership test is a hash lookup instead of a list scan
doc_vocabulary=set(tokenized_doc)
output='\n\n'

#logical query matching: a query scores 1 only when every one of its words
#occurs somewhere in the collection (not necessarily in the same abstract)
for keyword in keywords:
    tokenized_keyword=nltk.tokenize.word_tokenize(keyword)
    check=all(word in doc_vocabulary for word in tokenized_keyword)
    output+=f'{keyword} {int(check)}\n'
#drop the final newline before printing
print(output[:-1])


['deep learning', 'abnormality detection', 'social media forensics', 'text classification', 'rules generation']


deep learning 1
abnormality detection 0
social media forensics 1
text classification 1
rules generation 0


## 3)

In [101]:
from collections import defaultdict

In [102]:
#custom dictionary class to conveniently add new items
class inverted_index_dict:
    """Inverted index: maps a token to the list of document ids that contain it.

    The mapping is exposed through the ``dict`` attribute. Document ids are
    kept in insertion order and each id appears at most once per token.
    """
    def __init__(self) -> None:
        #defaultdict() without a factory behaves like a plain dict:
        #looking up an unknown token raises KeyError
        self.dict=defaultdict()

    def add_item(self,token,doc):
        """Record that ``token`` occurs in document ``doc``.

        Args:
            token: the term used as key of the index.
            doc: the document id to append to the token's list.

        Adding the same (token, doc) pair again is a no-op.
        """
        #setdefault fetches the existing list or installs a new empty one
        #in a single lookup
        postings=self.dict.setdefault(token,[])
        if doc not in postings:
            postings.append(doc)
    
inverted_index=inverted_index_dict()

#index every abstract: each token found in it gets the abstract's id
#('A' followed by the abstract's position) added to its document list
for position,text in enumerate(abstracts):
    label=f'A{position}'
    for tok in nltk.tokenize.word_tokenize(text):
        inverted_index.add_item(tok,label)

#output text: one line per token, the token and a space followed by
#'-> <id>' for every document id recorded for it
items=list(inverted_index.dict.items())
output=''.join(
    f'{term} '+''.join(f'-> {document_id}' for document_id in postings)+'\n'
    for term,postings in items
)

#written to a file because the listing is too large to display inline
with open('inverted_index.txt','w') as f:
    f.write(output)

To find the documents in which a keyword appears, it is enough to look up its token in the inverted-index dictionary.

In [103]:
#example lookups: print the document-id list stored for each of these tokens
keywords_example=['deep','learning','neural','networks']
postings_by_token=inverted_index.dict
for term in keywords_example:
    print(f'{term}-> {postings_by_token[term]}')

deep-> ['A0', 'A1']
learning-> ['A0', 'A1', 'A10', 'A12', 'A15']
neural-> ['A1', 'A7', 'A12', 'A18']
networks-> ['A8']


## 4)

In [104]:
#relaxed matching definition
def relaxed_matching(w1,w2,threshold=0.9):
    """Tell whether enough characters of ``w1`` can be found in ``w2``.

    Every character of ``w1`` is looked up in ``w2`` regardless of position;
    a character of ``w2`` can be used only once, so repeated letters in ``w1``
    need repeated letters in ``w2``. The length of ``w2`` is not taken into
    account, so a long word containing all the letters of ``w1`` matches.

    Args:
        w1: the query word.
        w2: the candidate word to compare against.
        threshold: minimum fraction of the characters of ``w1`` that must be
            found in ``w2`` (default 0.9).

    Returns:
        True when matched characters / len(w1) >= threshold, False otherwise.
        An empty ``w1`` never matches (instead of dividing by zero).
    """
    if not w1:
        return False
    remaining=list(w2)
    matched=0
    for c in w1:
        #explicit membership test instead of a bare except around index()
        if c in remaining:
            #consume the character so it cannot be matched twice
            remaining.remove(c)
            matched+=1
    return matched/len(w1)>=threshold

In [105]:
#single string from all abstracts
#joined with a space so that neighbouring abstracts do not merge their
#boundary words into a single bogus token
doc=' '.join(abstracts)
#keywords to be used as queries from the keywords file
keywords=[kws[0][1],kws[19][3],kws[2][4],kws[5][2],kws[10][3]]

tokenized_doc=nltk.tokenize.word_tokenize(doc)
#distinct tokens in first-occurrence order: scanning them yields the same
#first match as scanning the full token list, with far fewer comparisons
candidate_tokens=list(dict.fromkeys(tokenized_doc))
output='\n\n'

#relaxed query matching: a query scores 1 only when every one of its words
#has at least one relaxed match among the tokens of the collection
for keyword in keywords:
    tokenized_keyword=nltk.tokenize.word_tokenize(keyword)
    found_words=['not found']*len(tokenized_keyword)
    check=True
    for i,word in enumerate(tokenized_keyword):
        for word2 in candidate_tokens:
            if relaxed_matching(word,word2):
                #keep the first matching token for the report
                found_words[i]=word2
                break
        else:
            #loop ended without break: no token matched this query word
            #(also covers an empty token list)
            check=False
    output+=f'Query on:   {tokenized_keyword}\n'
    for i in range(len(found_words)):
        output+=f'{tokenized_keyword[i]} - {found_words[i]}    ||    '
    output+=f'\nquery result: {int(check)}\n\n'
#drop the two trailing newlines before printing
print(output[:-2])




Query on:   ['deep', 'learning']
deep - deep    ||    learning - learning-based    ||    
query result: 1

Query on:   ['abnormality', 'detection']
abnormality - abnormalities    ||    detection - detection    ||    
query result: 1

Query on:   ['social', 'media', 'forensics']
social - accomplishing    ||    media - mediation    ||    forensics - forensics    ||    
query result: 1

Query on:   ['text', 'classification']
text - extraction    ||    classification - classification    ||    
query result: 1

Query on:   ['rules', 'generation']
rules - result    ||    generation - presentation    ||    
query result: 1
