### BSII

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os, re, json
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords


__paths to change__

In [2]:
# input variables
documents_path = './datos/docs-raw-texts/'
queries_path = './datos/queries-raw-texts/'

# output varibles
inverted_index_path = './salida/inverted_index.json'
bsii_and_path = './salida/BSII-AND-queries_results.txt'
bsii_or_path = './salida/BSII-OR-queries_results.txt'

### Read documents methods

In [3]:
def get_documents(path: str) -> list:
    """
    read raw text from naf documents located in the directory path
    """
    data = []
    for file in sorted(os.listdir(path)):
        if file.endswith(".naf"):
            tree = ET.parse(path + file)
            text = tree.find('raw').text
            header = tree.find('nafHeader')
            if header:
                desc = header.find('fileDesc')
                if desc:
                    title = desc.attrib.get('title')
                    text = title + ' ' + text if title else text
            data.append(text)
    return data

In [4]:
def remove_stopwords(document: str) -> list:
    """
    remove the english stop words from data
    """
    lower = document.lower()
    words = lower.split(' ')
    stop_words = stopwords.words('english')
    return [word for word in words if word not in stop_words]

In [5]:
def remove_nonlatin(document: str) -> str:
    """
    replace problematic characters
    """
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    return document

In [6]:
def preprocessing(document: str) -> list:
    """
    clean data by removing non-latin characters or numbers
    stem data sentences
    remove stop words from a document
    """
    porter = PorterStemmer()
    document = remove_nonlatin(document)
    document = porter.stem_sentence(document)
    return remove_stopwords(document)

### Inverted index

In [7]:
def get_inverted_index(documents: pd.Series) -> dict:
    """
    iterate over the words in all the documents and add their index in a dictionary
    """
    inverted_index = {}
    for i in range(len(documents)):
        for j in range(len(documents.iloc[i])):
            word = documents.iloc[i][j]
            if word not in inverted_index:
                inverted_index[word] = []
            inverted_index[word].append(i+1)
    return inverted_index

### AND/OR Queries

In [8]:
def BSII_AND(query: list) -> str:
    """
    iterate over all words in the query
    with each word retrieve all documents where is presented
    comparing with the next word's retrieve documents
    """
    a_list = inverted_index.get(query[0])
    if len(a_list) > 0: 
        for i in range(1, len(query)):
            b_list = inverted_index.get(query[i])
            if not b_list: 
                a_list = []
                break
            a_list = and_merge_algorihtm(a_list, b_list)
    return ','.join([f'd{x:03}' for x in a_list]) if len(a_list) > 0 else ''

In [9]:
def and_merge_algorihtm(l1: list, l2: list) -> list:
    """
    and implementatin for merge algorithm
    """
    answer = []
    i1, i2 = 0, 0
    len_l1, len_l2 = len(l1), len(l2)
    while i1 < len_l1 and i2 < len_l2:
        val1, val2 = l1[i1], l2[i2]
        if val1 == val2:
            answer.append(val1)
            i1 += 1
            i2 += 1
        elif val1 < val2:
            i1 += 1
        else:
            i2 += 1
    return answer

In [10]:
def BSII_OR(query: list) -> str:
    """
    iterate over all query's words
    searching each one in the inverted index dictionary
    removing the repeating items
    """
    a_list = inverted_index.get(query[0])
    if len(a_list) == 0: 
        a_list = []
    for i in range(1, len(query)):
        b_list = inverted_index.get(query[i])
        if not b_list: 
            b_list = []
        a_list = a_list + b_list
    a_list = np.unique(a_list)
    return ','.join([f'd{x:03}' for x in a_list]) if len(a_list) > 0 else ''

### NOT Queries

In [11]:
def BSII_NOT(query):
    """
    iterate over all query's words
    searching each one in the inverted index dictionary
    removing the repeating items
    """
    all_list= list(range(1,332))
    if len(query) == 0:
        return ','.join([f'd{x:03}' for x in all_list])
    if len(query) == 1:
        a_list = inverted_index[query[0]]
        for elem in a_list:
            all_list.remove(elem)
        return ','.join([f'd{x:03}' for x in all_list])
    else:
        a_list = inverted_index[query[0]].copy()
        for i in range(1,len(query)):
            if query[i] in inverted_index:
                b_list = inverted_index[query[i]]
                c_list =  a_list.copy()+b_list.copy()
                a_list = c_list.copy()

        a_list = np.unique(a_list)  
        
        for elem in a_list:
            all_list.remove(elem)
        return ','.join([f'd{x:03}' for x in all_list])     

### Process the data

In [12]:
# Step 1: get documents
data = get_documents(documents_path)
documents = pd.DataFrame(data, columns=['data'])
documents.head()

Unnamed: 0,data
0,William Beaumont and the Human Digestion.\n\nW...
1,Selma Lagerlöf and the wonderful Adventures of...
2,Ferdinand de Lesseps and the Suez Canal.\n\nFe...
3,Walt Disney’s ‘Steamboat Willie’ and the Rise ...
4,Eugene Wigner and the Structure of the Atomic ...


In [13]:
# Step 2: apply the preprocessing function
documents['filtered'] = documents.data.apply(preprocessing)
documents.filtered = documents.filtered.apply(np.unique)
documents.head()

Unnamed: 0,data,filtered
0,William Beaumont and the Human Digestion.\n\nW...,"[accid, acid, activ, affect, ag, alexi, also, ..."
1,Selma Lagerlöf and the wonderful Adventures of...,"[abl, academi, accept, acclaim, accomplish, ac..."
2,Ferdinand de Lesseps and the Suez Canal.\n\nFe...,"[abandon, act, adopt, affair, africa, afterwar..."
3,Walt Disney’s ‘Steamboat Willie’ and the Rise ...,"[aboard, accident, accompani, ad, along, also,..."
4,Eugene Wigner and the Structure of the Atomic ...,"[accept, achiev, ad, administr, albert, along,..."


In [14]:
# Step 3: get inverted index
inverted_index = get_inverted_index(documents.filtered)
len(inverted_index['also'])

288

In [15]:
# save json file with inverted index
with open(inverted_index_path, "w") as file:
    json.dump(inverted_index, file)
    file.close()

In [16]:
# open json file with inverted index
with open(inverted_index_path, "r") as file:
    json_file = file.read()
    inverted_index = json.loads(json_file)

In [17]:
data_query = get_documents(queries_path)
queries = pd.DataFrame(data_query, columns=['data'])
queries.head()

Unnamed: 0,data
0,Fabrication of music instruments
1,famous German poetry
2,Romanticism
3,University of Edinburgh research
4,bridge construction


In [18]:
queries['filtered'] = queries.data.apply(preprocessing)
queries.filtered = queries.filtered.apply(np.unique)
queries.head()

Unnamed: 0,data,filtered
0,Fabrication of music instruments,"[fabric, instrument, music]"
1,famous German poetry,"[famou, german, poetri]"
2,Romanticism,[romantic]
3,University of Edinburgh research,"[edinburgh, research, univers]"
4,bridge construction,"[bridg, construct]"


In [19]:
q_and=queries.filtered.apply(BSII_AND)
print(q_and)

0                                       
1                              d291,d293
2          d105,d147,d152,d283,d291,d318
3                                   d286
4     d026,d029,d069,d257,d297,d303,d329
5                              d004,d034
6               d108,d110,d117,d205,d251
7                         d198,d205,d223
8                                   d231
9                         d176,d250,d277
10                                      
11                                      
12    d132,d150,d176,d184,d229,d250,d277
13                             d121,d271
14                   d192,d194,d203,d210
15                                  d179
16                                      
17                                      
18                   d129,d221,d240,d282
19                                      
20                                      
21                                      
22                             d136,d174
23                        d037,d046,d294
24              

In [20]:
with open(bsii_and_path, "w") as f:
    for i in range(len(q_and)):
        f.write(f'q{i+1:02} {q_and[i]}\n')
    f.close()

In [21]:
q_or=queries.filtered.apply(BSII_OR)
print(q_or)

0     d004,d006,d008,d016,d021,d024,d028,d032,d038,d...
1     d001,d002,d003,d004,d005,d007,d010,d014,d017,d...
2                         d105,d147,d152,d283,d291,d318
3     d001,d003,d004,d005,d006,d007,d008,d009,d010,d...
4     d003,d004,d012,d018,d021,d023,d025,d026,d029,d...
5     d004,d011,d031,d033,d034,d042,d044,d057,d060,d...
6     d001,d002,d003,d005,d006,d007,d008,d009,d012,d...
7     d008,d010,d016,d021,d029,d038,d041,d052,d055,d...
8     d001,d003,d009,d010,d012,d014,d015,d016,d017,d...
9     d003,d010,d011,d012,d015,d021,d034,d035,d048,d...
10    d001,d002,d003,d004,d007,d012,d013,d014,d015,d...
11    d002,d005,d008,d010,d016,d021,d022,d023,d025,d...
12    d021,d024,d034,d049,d056,d060,d071,d076,d077,d...
13    d003,d024,d030,d066,d074,d084,d091,d106,d121,d...
14    d002,d020,d027,d028,d048,d052,d053,d055,d058,d...
15    d001,d004,d009,d015,d021,d028,d035,d051,d055,d...
16    d003,d004,d007,d010,d011,d015,d019,d020,d022,d...
17    d001,d002,d003,d004,d005,d007,d013,d014,d0

In [22]:
with open(bsii_or_path, "w") as f:
    for i in range(len(q_or)):
        f.write(f'q{i+1:02} {q_or[i]}\n')
    f.close()

In [23]:
q_not=queries.filtered.apply(BSII_NOT)
print(q_not)

0     d001,d002,d003,d005,d007,d009,d010,d011,d012,d...
1     d006,d008,d009,d011,d012,d013,d015,d016,d025,d...
2     d001,d002,d003,d004,d005,d006,d007,d008,d009,d...
3     d002,d011,d016,d017,d025,d026,d033,d034,d035,d...
4     d001,d002,d005,d006,d007,d008,d009,d010,d011,d...
5     d001,d002,d003,d005,d006,d007,d008,d009,d010,d...
6     d004,d010,d011,d014,d016,d018,d025,d027,d031,d...
7     d001,d002,d003,d004,d005,d006,d007,d009,d011,d...
8     d002,d004,d005,d006,d007,d008,d011,d013,d019,d...
9     d001,d002,d004,d005,d006,d007,d008,d009,d013,d...
10    d005,d006,d008,d009,d010,d011,d025,d027,d030,d...
11    d001,d003,d004,d006,d007,d009,d011,d012,d013,d...
12    d001,d002,d003,d004,d005,d006,d007,d008,d009,d...
13    d001,d002,d004,d005,d006,d007,d008,d009,d010,d...
14    d001,d003,d004,d005,d006,d007,d008,d009,d010,d...
15    d002,d003,d005,d006,d007,d008,d010,d011,d012,d...
16    d001,d002,d005,d006,d008,d009,d012,d013,d014,d...
17    d006,d008,d009,d010,d011,d012,d015,d016,d0

In [24]:
f = open("salida/BSII-NOT-queries_results.txt", "w")
for i in range(len(q_not)):
    f.write(f'q{i+1:02} {q_not[i]}\n')
f.close() 