In [47]:
import os
import re
import nltk
import numpy as np
import math

from numpy.linalg import norm
from numpy import dot

from collections import Counter 

from math import factorial as fct
from scipy.special import factorial

from decimal import Decimal

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

directory = "../collection/20_newsgroup"

[nltk_data] Downloading package punkt to /home/parzival/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/parzival/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/parzival/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Processing :
### Preprocessing the data

In [3]:
def preprocess_line(text, lemmatizer=WordNetLemmatizer()):
    words = [w.lower() for w in word_tokenize(text)]
    tokens = []
    for token in words:
        tokens.extend(re.split('[^a-zA-Z]', token))
    token_list = [lemmatizer.lemmatize(token) for token in tokens if not token in stopwords.words('english')] 
    return list(filter(lambda token: len(token), token_list))

In [4]:
def preprocess_doc(doc_path):
    
    lemmatizer = WordNetLemmatizer()
    
    with open(doc_path, encoding="utf8", errors='ignore') as f:

        tokens = []
        endOfDoc=0
        isHeader=1
        
        data = list(filter( lambda line: line!="\n",f.readlines()))
        for line in data:
            text = line
            
#             if isHeader==1:
#                 parts = line.split(": ")
#                 try:
#                     text = parts[1]
#                 except IndexError as err:
#                     print(line)
#                     raise IndexError(err)
#                 if str(parts[0])=="Lines":
#                     isHeader=0
            
            tokens.extend(preprocess_line(text,lemmatizer))
            
        return tokens

### Creation of Posting List

In [20]:
def create_posting(newsgroup="comp.graphics"):
    
    doc_id = -1
    doc_name = {}
    post_list = {}
    doc_len = {}
    vocab = {}
    tf = {}
    cnt = 0
    working_dir = directory+"/"+newsgroup
    for file in os.listdir(working_dir):
        doc_id += 1
        tf[doc_id] = {}
        doc_len[doc_id] = 0
        
        filename = os.fsdecode(file)
        doc_name[doc_id] = filename
        
        file_path = os.path.join(working_dir,file)
        
        tokens = preprocess_doc(file_path)
        
        doc_len[doc_id] = len(tokens)
        
        for token in tokens:    
            if post_list.get(token) == None:
                post_list[token] = [doc_id]
                tf[doc_id][token] = 0
                vocab[token] = cnt
                cnt += 1
            elif tf[doc_id].get(token) == None:
                post_list[token].append(doc_id)
                tf[doc_id][token] = 0
            tf[doc_id][token] += 1
            
        for j in tf[doc_id].keys():
            tf[doc_id][j] /= doc_len[doc_id]
    
    return {"doc_names": doc_name, "post_list": post_list, "doc_lens": doc_len, "tf":tf, "vocab":vocab}            

In [21]:
postings = create_posting()

## Language Models

In [93]:
class LanguageModel:
    def __init__(self,posting_list):
        self.postings = posting_list
        self.doc_prob = np.ones((len(posting_list["doc_names"]),1))
        self.total_words = 0
        self.vocab_freq = np.zeros((1,len(posting_list["vocab"])))
        self.doc_lens = np.zeros((len(posting_list["doc_lens"]),1))
        self.term_prob = self.__create_prob_matrix__(posting_list)
        
        
    def evaluate(self,query):
        query_tokens = preprocess_line(query)
        no_docs = len(self.postings["doc_names"])
        vocab_prob = self.vocab_freq / self.total_words
#         pqd = np.prod(np.power(mat/self.doc_lens,self.term_prob)+epsilon,axis=1,keepdims=True)
#         pdq = np.multiply(pqd,self.doc_prob)
        
        token_ids = [self.postings["vocab"][token] for token in query_tokens]
        
        mat_freq = np.zeros((len(self.postings["doc_names"]),len(token_ids)))
        mat_vocab_prob = np.zeros((1,len(token_ids)))
        cnt=0
        
        for ids in token_ids:
            mat_freq[:,cnt] = self.term_prob[:,ids]
            mat_vocab_prob[0][cnt] = vocab_prob[0][ids]
            cnt += 1
        
        mat_prob = mat_freq / self.doc_lens
            
        pqd = np.prod(np.power(mat_prob,mat_freq)+(mat_vocab_prob),axis=1,keepdims=True)
#         pdq = np.multiply(pqd,self.doc_prob)
    
        ranked_order = []
        
        for i in range(no_docs):
            ranked_order.append((pqd[i][0],self.postings["doc_names"][i]))
            
        ranked_order.sort(reverse=True)
        
        return ranked_order
        
    def __create_prob_matrix__(self, posting_list):        
        no_docs = len(posting_list["doc_names"])
        
        mat = np.zeros((no_docs, len(posting_list["vocab"])))
        
        for i in range(no_docs):
            for token in posting_list["tf"][i]:
                mat[i][posting_list["vocab"][token]] = posting_list["tf"][i][token]*posting_list["doc_lens"][i]
                
        mat = np.round(mat)
        self.vocab_freq = np.sum(mat,axis=0,keepdims=True)
        self.doc_lens = np.sum(mat,axis=1,keepdims=True) 
        self.total_words = np.sum(self.vocab_freq,axis=1,keepdims=True)[0][0]

        term_prob = self.vocab_freq / self.total_words
        
#         for i in range(no_docs):
#             self.doc_prob[i][0] = (Decimal(fct(posting_list["doc_lens"][i]))/Decimal(np.prod(factorial(mat[i]))))*Decimal(np.prod(np.power(term_prob,mat[i])))
#             self.doc_prob[i][0] = 
        return mat

In [94]:
lm = LanguageModel(postings)

In [95]:
lm.evaluate("computer graphics is good")

[(0.029589850154527877, '37958'),
 (0.029315690641160744, '39622'),
 (0.028793482044270964, '38639'),
 (0.02830340936103594, '39619'),
 (0.028069511034946492, '38234'),
 (0.027622352470363726, '38654'),
 (0.027408494026432847, '38735'),
 (0.027200745823757127, '38716'),
 (0.026998849683128615, '39013'),
 (0.02680256176862867, '37940'),
 (0.026245104091448826, '38960'),
 (0.026245104091448826, '38443'),
 (0.02606906482497098, '38840'),
 (0.02606906482497098, '38688'),
 (0.02589759800697308, '39023'),
 (0.02589759800697308, '38624'),
 (0.025730527774052048, '39072'),
 (0.025730527774052048, '38656'),
 (0.025730527774052048, '38575'),
 (0.025567687167280917, '39050'),
 (0.025567687167280917, '38775'),
 (0.025408917575679063, '39737'),
 (0.025408917575679063, '38954'),
 (0.025408917575679063, '38761'),
 (0.025408917575679063, '38594'),
 (0.025408917575679063, '38274'),
 (0.025102995679665736, '39053'),
 (0.025102995679665736, '38841'),
 (0.025102995679665736, '38502'),
 (0.0251029956796657