In [1]:
import os
import re
import nltk
import numpy as np
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

directory = "../collection/20_newsgroup"

[nltk_data] Downloading package punkt to /home/parzival/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/parzival/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/parzival/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Processing :
### Preprocessing the data

In [10]:
def preprocess_line(text, lemmatizer=WordNetLemmatizer()):
    words = [w.lower() for w in word_tokenize(text)]
    tokens = []
    for token in words:
        tokens.extend(re.split('[^a-zA-Z]', token))
    token_list = [lemmatizer.lemmatize(token) for token in tokens if not token in stopwords.words('english')] 
    return list(filter(lambda token: len(token), token_list))

In [13]:
preprocess_line("graphic")

['graphic']

In [4]:
def preprocess_doc(doc_path):
    
    lemmatizer = WordNetLemmatizer()
    
    with open(doc_path, encoding="utf8", errors='ignore') as f:

        tokens = []
        endOfDoc=0
        isHeader=1
        
        data = list(filter( lambda line: line!="\n",f.readlines()))
        for line in data:
            text = line
            
#             if isHeader==1:
#                 parts = line.split(": ")
#                 try:
#                     text = parts[1]
#                 except IndexError as err:
#                     print(line)
#                     raise IndexError(err)
#                 if str(parts[0])=="Lines":
#                     isHeader=0
            
            tokens.extend(preprocess_line(text,lemmatizer))
            
        return tokens

### Creation of Posting List

In [5]:
def create_posting(newsgroup="comp.graphics"):
    
    doc_id = -1
    doc_name = {}
    post_list = {}
    doc_len = {}
    idf = {}
    tf = {}
    
    working_dir = directory+"/"+newsgroup
    for file in os.listdir(working_dir):
        doc_id += 1
        tf[doc_id] = {}
        doc_len[doc_id] = 0
        
        filename = os.fsdecode(file)
        doc_name[doc_id] = filename
        
        file_path = os.path.join(working_dir,file)
        
        tokens = preprocess_doc(file_path)
        
        doc_len[doc_id] = len(tokens)
        
        for token in tokens:    
            if post_list.get(token) == None:
                post_list[token] = [doc_id]
                tf[doc_id][token] = 0
                idf[token] = 1
            elif tf[doc_id].get(token) == None:
                post_list[token].append(doc_id)
                tf[doc_id][token] = 0
                idf[token] += 1
            tf[doc_id][token] += 1
            
        for j in tf[doc_id].keys():
            tf[doc_id][j] /= doc_len[doc_id]
            
    for i in idf.keys():
        idf[i]=math.log10((doc_id+1)/idf[i])
    
    return {"doc_names": doc_name, "post_list": post_list, "doc_lens": doc_len, "tf":tf, "idf":idf}            

In [6]:
postings = create_posting()

## Boolean Model

In [20]:
 class BooleanModel:
    def __init__(self,posting_list):
        self.postings = posting_list
    
    def __and__(self,token_1,token_2):
        return list(set(token_1) & set(token_2))
            
    
    def __or__(self,token_1,token_2):
        return list(set(token_1).union(set(token_2)))
    
    def __not__(self,token):
        doc_list = set(list(self.postings["doc_names"].keys()))
        print(len(doc_list))
        print(len(token))
        return list(doc_list.difference(set(token)))
    
    def evaluate(self,query):
    
        ops = ('and', 'or', 'not')
        stk = []
        result = []
    
        postfix = self.__parse__(query)
        print(postfix)
    
        for op in postfix:
            if op not in ops:
                stk.append(self.postings["post_list"][op])
                continue
            
            if op == "not":
                word = stk.pop()
                stk.append(self.__not__(word))
            elif op == "and":
                word1 = stk.pop()
                word2 = stk.pop()
                stk.append(self.__and__(word1,word2))
            else:
                word1 = stk.pop()
                word2 = stk.pop()
                stk.append(self.__or__(word1,word2))
                
        return stk[0]
            
    
    def __parse__(self,query):
        words = query.split()
        ops = ('and', 'or', 'not')
        stk = []
        tokenized_string = ""
        postfix = []
        
        for word in words:
            if word not in ops:
                temp = preprocess_line(word)
                for t in temp:
                    tokenized_string += t + " and "
                tokenized_string = tokenized_string[:-4]    
            else:
                tokenized_string += word + " "
        
        tokens = tokenized_string.split()
        
        for word in tokens:
            if word not in ops:              
                postfix.append(word)
                continue
            if word == "not":
                stk.append("not")
            else:
                if len(stk)==0 or stk[-1] != "not":
                    stk.append(word)
                else:
                    stk.reverse()
                    for t in stk:
                        postfix.append(t)
                    stk = []
                    stk.append(word)
                    
        if len(stk) > 0:
            stk.reverse()
            for t in stk:
                postfix.append(t)
                
        return postfix

In [21]:
bm = BooleanModel(postings)

bm.evluate("computer and graphics")

In [22]:
bm.evaluate("computer and graphics and not good")

['computer', 'graphic', 'good', 'not', 'and', 'and']
1000
148


[2,
 4,
 516,
 6,
 8,
 520,
 522,
 17,
 20,
 539,
 540,
 29,
 543,
 39,
 551,
 552,
 554,
 556,
 565,
 57,
 62,
 574,
 576,
 577,
 67,
 580,
 72,
 74,
 79,
 80,
 593,
 596,
 602,
 93,
 96,
 609,
 102,
 104,
 616,
 618,
 620,
 112,
 117,
 630,
 122,
 123,
 127,
 128,
 642,
 131,
 132,
 134,
 646,
 137,
 649,
 650,
 140,
 145,
 148,
 157,
 670,
 671,
 160,
 161,
 163,
 678,
 679,
 681,
 174,
 690,
 696,
 186,
 188,
 703,
 705,
 707,
 198,
 201,
 202,
 713,
 714,
 208,
 731,
 732,
 222,
 735,
 741,
 230,
 742,
 747,
 237,
 238,
 239,
 757,
 758,
 760,
 763,
 767,
 256,
 771,
 774,
 781,
 271,
 274,
 277,
 279,
 285,
 286,
 287,
 798,
 292,
 805,
 807,
 300,
 301,
 302,
 303,
 816,
 306,
 309,
 824,
 825,
 315,
 316,
 834,
 838,
 329,
 335,
 336,
 340,
 343,
 856,
 354,
 866,
 356,
 867,
 358,
 868,
 874,
 365,
 366,
 878,
 881,
 372,
 374,
 887,
 889,
 891,
 380,
 382,
 383,
 895,
 900,
 390,
 902,
 394,
 907,
 397,
 915,
 405,
 919,
 922,
 411,
 929,
 931,
 933,
 937,
 938,
 427,
 429,
 

In [43]:
bm.evaluate("computer or graph")

['computer', 'graph', 'or']


[2,
 4,
 5,
 6,
 8,
 17,
 20,
 26,
 29,
 39,
 42,
 52,
 56,
 57,
 60,
 62,
 65,
 66,
 67,
 69,
 72,
 74,
 79,
 80,
 93,
 96,
 99,
 102,
 104,
 112,
 117,
 122,
 123,
 127,
 128,
 131,
 132,
 134,
 137,
 140,
 145,
 148,
 153,
 157,
 160,
 161,
 163,
 169,
 174,
 177,
 180,
 186,
 188,
 197,
 198,
 201,
 202,
 208,
 222,
 230,
 233,
 235,
 237,
 238,
 239,
 251,
 254,
 256,
 257,
 271,
 274,
 277,
 279,
 285,
 286,
 287,
 292,
 300,
 301,
 302,
 303,
 306,
 309,
 315,
 316,
 329,
 330,
 335,
 336,
 340,
 342,
 343,
 354,
 356,
 358,
 365,
 366,
 372,
 374,
 380,
 382,
 383,
 390,
 393,
 394,
 397,
 398,
 403,
 405,
 407,
 410,
 411,
 420,
 427,
 428,
 429,
 432,
 439,
 440,
 441,
 442,
 444,
 445,
 451,
 455,
 465,
 481,
 490,
 491,
 492,
 493,
 496,
 497,
 498,
 499,
 514,
 516,
 520,
 522,
 530,
 539,
 540,
 543,
 544,
 551,
 552,
 554,
 556,
 565,
 574,
 576,
 577,
 580,
 589,
 593,
 596,
 597,
 599,
 602,
 606,
 609,
 613,
 616,
 618,
 620,
 628,
 630,
 632,
 642,
 646,
 649,
 650,


In [32]:
lg = bm.evaluate("graph")

['graph']


In [62]:
lc = bm.evaluate("not computer")

['computer', 'not']
1000
250


In [63]:
print(len(lg), len(lc))

19 750


In [35]:
set(lg) & set(lc)

{233, 254, 256, 257, 279, 301, 407, 597, 696, 720}

In [36]:
set(lg) | set(lc)

{2,
 4,
 5,
 6,
 8,
 17,
 20,
 26,
 29,
 39,
 42,
 52,
 56,
 57,
 60,
 62,
 65,
 66,
 67,
 69,
 72,
 74,
 79,
 80,
 93,
 96,
 99,
 102,
 104,
 112,
 117,
 122,
 123,
 127,
 128,
 131,
 132,
 134,
 137,
 140,
 145,
 148,
 153,
 157,
 160,
 161,
 163,
 169,
 174,
 177,
 180,
 186,
 188,
 197,
 198,
 201,
 202,
 208,
 222,
 230,
 233,
 235,
 237,
 238,
 239,
 251,
 254,
 256,
 257,
 271,
 274,
 277,
 279,
 285,
 286,
 287,
 292,
 300,
 301,
 302,
 303,
 306,
 309,
 315,
 316,
 329,
 330,
 335,
 336,
 340,
 342,
 343,
 354,
 356,
 358,
 365,
 366,
 372,
 374,
 380,
 382,
 383,
 390,
 393,
 394,
 397,
 398,
 403,
 405,
 407,
 410,
 411,
 420,
 427,
 428,
 429,
 432,
 439,
 440,
 441,
 442,
 444,
 445,
 451,
 455,
 465,
 481,
 490,
 491,
 492,
 493,
 496,
 497,
 498,
 499,
 514,
 516,
 520,
 522,
 530,
 539,
 540,
 543,
 544,
 551,
 552,
 554,
 556,
 565,
 574,
 576,
 577,
 580,
 589,
 593,
 596,
 597,
 599,
 602,
 606,
 609,
 613,
 616,
 618,
 620,
 628,
 630,
 632,
 642,
 646,
 649,
 650,
