### BSII

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os, re, json
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords


__paths to change__

In [None]:
# input variables: directories holding the raw .naf documents and queries
documents_path = './input/docs-raw-texts/'
queries_path = './input/queries-raw-texts/'

# output variables: files where the inverted index and query results are written
inverted_index_path = './output/inverted_index.json'
bsii_and_path = './output/BSII-AND-queries_results.txt'
bsii_or_path = './output/BSII-OR-queries_results.txt'

### Read documents methods

In [None]:
def get_documents(path: str) -> list:
    """
    read raw text from naf documents located in the directory path

    Files are visited in sorted file-name order and only names ending in
    ".naf" are parsed. When a document has a nafHeader/fileDesc element with
    a non-empty ``title`` attribute, the title is prepended to the raw text,
    separated by one space.

    :param path: directory containing the .naf files (trailing separator optional)
    :return: list with one text string per .naf file
    """
    data = []
    for file in sorted(os.listdir(path)):
        if not file.endswith(".naf"):
            continue
        tree = ET.parse(os.path.join(path, file))
        # findtext returns '' when <raw> is missing or has no text
        text = tree.findtext('raw', default='')
        # Compare with None explicitly: an Element without children is falsy,
        # so `if desc:` would skip a childless <fileDesc> and lose its title.
        header = tree.find('nafHeader')
        desc = header.find('fileDesc') if header is not None else None
        title = desc.attrib.get('title') if desc is not None else None
        if title:
            text = title + ' ' + text
        data.append(text)
    return data

In [None]:
def remove_stopwords(document: str) -> list:
    """
    remove the english stop words from data

    The document is lower-cased and split on whitespace; every token that is
    not in the NLTK English stop-word list is kept, in its original order.

    :param document: text whose tokens are separated by whitespace
    :return: list of lower-cased tokens without stop words
    """
    # A set gives constant-time membership tests; the NLTK list made every
    # lookup linear in the number of stop words.
    stop_words = set(stopwords.words('english'))
    # split() without an argument never yields empty-string tokens, which
    # split(' ') produces for leading, trailing or repeated spaces.
    return [word for word in document.lower().split() if word not in stop_words]

In [None]:
def remove_nonlatin(document: str) -> str:
    """
    replace problematic characters

    Every run of characters other than the ASCII letters a-z / A-Z
    (digits, punctuation, newlines, accented letters, ...) is replaced
    by a single space.

    :param document: raw text
    :return: text containing only ASCII letters and single spaces
    """
    # One raw-string pattern replaces the former three passes; the old
    # non-raw '\s+' literal relied on an invalid escape sequence.
    return re.sub(r'[^a-zA-Z]+', ' ', document)

In [None]:
def preprocessing(document: str) -> list:
    """
    Turn a raw document into a list of index terms.

    Steps, in order:
      1. replace every non-letter character with a space (remove_nonlatin)
      2. stem the resulting sentence with gensim's PorterStemmer
      3. drop the english stop words (remove_stopwords)

    NOTE(review): stop words are filtered after stemming, so a stop word
    whose stem differs from its original form may survive -- confirm that
    this order is intended.
    """
    cleaned = remove_nonlatin(document)
    stemmed = PorterStemmer().stem_sentence(cleaned)
    return remove_stopwords(stemmed)

### Inverted index

In [None]:
def get_inverted_index(documents: pd.Series) -> dict:
    """
    iterate over the words in all the documents and add their index in a dictionary

    Documents are numbered from 1 in iteration order. Each word maps to the
    ascending list of document numbers that contain it; a document number is
    stored only once per word even if the word is repeated in that document.

    :param documents: Series (or any iterable) of word sequences, one per document
    :return: dict mapping word -> list of 1-based document numbers
    """
    inverted_index = {}
    for doc_id, words in enumerate(documents, start=1):
        for word in words:
            postings = inverted_index.setdefault(word, [])
            # doc ids only grow, so a repeat can only be the last entry
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
    return inverted_index

### AND/OR Queries

In [None]:
def BSII_AND(query: list, index: "dict | None" = None) -> str:
    """
    Return the documents that contain every word of the query.

    The posting list of the first word is intersected, word by word, with
    the posting lists of the remaining words using and_merge_algorihtm.
    A word that is missing from the index makes the result empty, and so
    does an empty query.

    :param query: sequence of (preprocessed) query words
    :param index: inverted index to search; defaults to the module-level
                  ``inverted_index``
    :return: comma-separated document labels such as 'd001,d007', or ''
    """
    # len() rather than truthiness: the query may be a numpy array
    if len(query) == 0:
        return ''
    postings_by_term = inverted_index if index is None else index
    # .get() returns None for an unknown word; treat that as "no documents"
    result = postings_by_term.get(query[0]) or []
    for term in query[1:]:
        if not result:
            break
        other = postings_by_term.get(term)
        if not other:
            result = []
            break
        result = and_merge_algorihtm(result, other)
    return ','.join(f'd{doc_id:03}' for doc_id in result)

In [None]:
def and_merge_algorihtm(l1: list, l2: list) -> list:
    """
    Intersect two ascending lists with the two-pointer merge algorithm.

    Both lists are walked in parallel; the pointer at the smaller value is
    advanced, and a value found at both pointers is copied to the result.

    :param l1: first ascending list
    :param l2: second ascending list
    :return: values present in both lists, in ascending order
    """
    common = []
    pos1 = pos2 = 0
    size1 = len(l1)
    size2 = len(l2)
    while pos1 < size1 and pos2 < size2:
        left = l1[pos1]
        right = l2[pos2]
        if left < right:
            pos1 += 1
        elif right < left:
            pos2 += 1
        else:
            # same value on both sides: keep it and move both pointers
            common.append(left)
            pos1 += 1
            pos2 += 1
    return common

In [None]:
def BSII_OR(query: list, index: "dict | None" = None) -> str:
    """
    Return the documents that contain at least one word of the query.

    The posting lists of all query words are united; repeated documents are
    reported once and the result is in ascending document order. Words that
    are missing from the index contribute nothing, and an empty query gives
    an empty result.

    :param query: sequence of (preprocessed) query words
    :param index: inverted index to search; defaults to the module-level
                  ``inverted_index``
    :return: comma-separated document labels such as 'd001,d007', or ''
    """
    # len() rather than truthiness: the query may be a numpy array
    if len(query) == 0:
        return ''
    postings_by_term = inverted_index if index is None else index
    doc_ids = set()
    for term in query:
        # .get() returns None for an unknown word; treat that as "no documents"
        doc_ids.update(postings_by_term.get(term) or ())
    return ','.join(f'd{doc_id:03}' for doc_id in sorted(doc_ids))

### NOT Queries

In [None]:
def BSII_NOT(query, total_docs: int = 331, index: "dict | None" = None) -> str:
    """
    Return the documents that contain none of the words of the query.

    The posting lists of all query words are united and removed from the
    full collection d001..d<total_docs>. Words that are missing from the
    index exclude nothing, and an empty query returns every document.

    :param query: sequence of (preprocessed) query words
    :param total_docs: number of documents in the collection; the default
                       331 is the size that was previously hard-coded
    :param index: inverted index to search; defaults to the module-level
                  ``inverted_index``
    :return: comma-separated document labels in ascending order
    """
    excluded = set()
    # len() rather than truthiness: the query may be a numpy array
    if len(query) > 0:
        postings_by_term = inverted_index if index is None else index
        for term in query:
            # unknown words have no postings, so they exclude no document
            excluded.update(postings_by_term.get(term) or ())
    return ','.join(
        f'd{doc_id:03}'
        for doc_id in range(1, total_docs + 1)
        if doc_id not in excluded
    )

### Process the data

In [None]:
# Step 1: read every .naf document and keep the raw texts in a DataFrame
data = get_documents(documents_path)
documents = pd.DataFrame(data=data, columns=['data'])
documents.head()

In [None]:
# Step 2: preprocess each document, then keep the sorted unique terms
documents['filtered'] = documents['data'].apply(preprocessing)
documents['filtered'] = documents['filtered'].apply(np.unique)
documents.head()

In [None]:
# One-off setup: fetch the NLTK stop-word corpus read by remove_stopwords.
# Requesting the corpus by name avoids the interactive downloader that
# nltk.download() opens when called without arguments.
# NOTE: on a fresh install, run this before the preprocessing step.
import nltk
nltk.download('stopwords')

In [None]:
# Step 3: build the word -> document-numbers mapping
inverted_index = get_inverted_index(documents['filtered'])
# quick look: length of the posting list stored for the word 'also'
len(inverted_index['also'])

In [None]:
# save json file with inverted index
# (the with-statement closes the file, so no explicit close() is needed)
with open(inverted_index_path, "w") as file:
    json.dump(inverted_index, file)

In [None]:
# reload the inverted index from its json file
with open(inverted_index_path, "r") as file:
    json_file = file.read()
# parsing happens after the file has been closed
inverted_index = json.loads(json_file)

In [None]:
# read every .naf query and keep the raw texts in a DataFrame
data_query = get_documents(queries_path)
queries = pd.DataFrame(data=data_query, columns=['data'])
queries.head()

In [None]:
# preprocess the queries the same way as the documents
queries['filtered'] = queries['data'].apply(preprocessing)
queries['filtered'] = queries['filtered'].apply(np.unique)
queries.head()

In [None]:
# evaluate every query with AND semantics
q_and = queries['filtered'].apply(BSII_AND)
print(q_and)

In [None]:
# write one "qNN <document labels>" line per AND query
with open(bsii_and_path, "w") as f:
    # enumerate numbers the results by position, independent of the Series
    # index labels; the with-statement closes the file on exit
    for number, result in enumerate(q_and, start=1):
        f.write(f'q{number:02} {result}\n')

In [None]:
# evaluate every query with OR semantics
q_or = queries['filtered'].apply(BSII_OR)
print(q_or)

In [None]:
# write one "qNN <document labels>" line per OR query
with open(bsii_or_path, "w") as f:
    # enumerate numbers the results by position, independent of the Series
    # index labels; the with-statement closes the file on exit
    for number, result in enumerate(q_or, start=1):
        f.write(f'q{number:02} {result}\n')

In [None]:
# %%
# evaluate every query with NOT semantics
# (the preprocessed column is named 'filtered'; 'filtrada' does not exist)
q_not = queries.filtered.apply(BSII_NOT)
print(q_not)

In [None]:
# %%
# write one "qNN <document labels>" line per NOT query, next to the
# AND/OR result files in ./output/
bsii_not_path = './output/BSII-NOT-queries_results.txt'
with open(bsii_not_path, "w") as f:
    # the with-statement closes the file even if a write fails
    for number, result in enumerate(q_not, start=1):
        f.write(f'q{number:02} {result}\n')