In [19]:
import os
import re
import nltk
import numpy as np
import math

from numpy.linalg import norm
from numpy import dot

from collections import Counter 

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK resources this notebook relies on: presumably 'punkt' for
# word_tokenize, 'stopwords' for the English stopword list and 'wordnet' for
# WordNetLemmatizer (the three NLTK helpers imported at the top).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Root folder of the corpus, relative to the notebook's working directory;
# create_posting() appends the newsgroup name to this path.
directory = "../collection/20_newsgroup"

[nltk_data] Downloading package punkt to /home/parzival/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/parzival/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/parzival/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Processing
### Preprocessing the data

In [7]:
def preprocess_line(text, lemmatizer=None):
    """Turn one piece of raw text into a list of cleaned, lemmatized tokens.

    Pipeline: tokenize with ``word_tokenize``, lower-case every token, split
    each token on any non-letter character, drop English stopwords and empty
    fragments, then lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw text (a line of a document, or a query).
    lemmatizer : object with a ``lemmatize(token)`` method, optional
        Lemmatizer to use. When omitted a ``WordNetLemmatizer`` is created.

    Returns
    -------
    list of str
        Non-empty lemmatized tokens, in the order they appear in ``text``.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Load the stopword list once per call (not once per token) and keep it in
    # a set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))

    fragments = []
    for word in word_tokenize(text):
        # Splitting on non-letters breaks up things like "e-mail" or "abc123"
        # and yields empty strings around the separators.
        fragments.extend(re.split('[^a-zA-Z]', word.lower()))

    # Stopwords are matched on the raw fragment, i.e. before lemmatization.
    return [
        lemmatizer.lemmatize(fragment)
        for fragment in fragments
        if fragment and fragment not in stop_words
    ]

In [4]:
def preprocess_doc(doc_path):
    """Read a document from disk and return all of its preprocessed tokens.

    The file is opened as UTF-8 with undecodable bytes ignored. Lines that
    consist of a bare newline are skipped; every other line is passed through
    ``preprocess_line``. No header stripping is done, so any header lines in
    the file are tokenized like body text.

    Parameters
    ----------
    doc_path : str or path-like
        Path of the document to read.

    Returns
    -------
    list of str
        Tokens of the whole document, in reading order.
    """
    # One lemmatizer is shared by every line of the document.
    lemmatizer = WordNetLemmatizer()

    tokens = []
    with open(doc_path, encoding="utf8", errors='ignore') as f:
        # Stream the file instead of materialising all lines first.
        for line in f:
            if line == "\n":
                continue
            tokens.extend(preprocess_line(line, lemmatizer))

    return tokens

### Creation of Posting List

In [5]:
def create_posting(newsgroup="comp.graphics"):
    """Build the index structures for every document of one newsgroup.

    Each regular file inside ``directory/<newsgroup>`` becomes one document.
    Files are visited in sorted name order so that document ids are the same
    on every run (``os.listdir`` itself returns entries in arbitrary order).
    Sub-directories are skipped.

    Parameters
    ----------
    newsgroup : str
        Name of the sub-folder of the module-level ``directory`` to index.

    Returns
    -------
    dict
        ``"doc_names"``: doc id -> file name.
        ``"post_list"``: token -> list of doc ids containing it (ascending).
        ``"doc_lens"``: doc id -> number of tokens in the document.
        ``"tf"``: doc id -> {token: occurrences / document length}.
        ``"idf"``: token -> log10(number of documents / documents containing
        the token).
    """
    doc_name = {}
    post_list = {}
    doc_len = {}
    doc_freq = {}   # token -> number of documents containing it
    tf = {}

    working_dir = os.path.join(directory, newsgroup)
    filenames = sorted(os.fsdecode(entry) for entry in os.listdir(working_dir))

    doc_id = -1
    for filename in filenames:
        file_path = os.path.join(working_dir, filename)
        if not os.path.isfile(file_path):
            # Only plain files are documents; a directory here would make
            # preprocess_doc fail when it tries to open it.
            continue

        doc_id += 1
        doc_name[doc_id] = filename

        tokens = preprocess_doc(file_path)
        doc_len[doc_id] = len(tokens)

        # Counter keeps first-occurrence order, so the vocabulary order is
        # the order in which tokens are first seen in the corpus.
        counts = Counter(tokens)
        for token in counts:
            post_list.setdefault(token, []).append(doc_id)
            doc_freq[token] = doc_freq.get(token, 0) + 1

        # An empty document has no counts, so no division by zero can occur.
        tf[doc_id] = {
            token: count / doc_len[doc_id] for token, count in counts.items()
        }

    n_docs = doc_id + 1
    idf = {token: math.log10(n_docs / df) for token, df in doc_freq.items()}

    return {"doc_names": doc_name, "post_list": post_list, "doc_lens": doc_len, "tf": tf, "idf": idf}

In [6]:
# Index the default newsgroup ("comp.graphics"); the result holds the document
# names, posting lists, document lengths, tf and idf tables.
postings = create_posting()

## Vector Space Model

In [29]:
class VectorSpaceModel:
    """TF-IDF vector space model with cosine-similarity ranking.

    Parameters
    ----------
    posting_list : dict
        Index with the keys ``"doc_names"`` (doc id -> name), ``"tf"``
        (doc id -> {token: tf}) and ``"idf"`` (token -> idf), as returned by
        ``create_posting``. Document ids are expected to be ``0 .. n_docs-1``.

    Attributes
    ----------
    postings : dict
        The index passed to the constructor.
    word_ids : dict
        token -> column index in the document/query vectors.
    doc_vecs : numpy.ndarray
        ``(n_docs, vocabulary_size)`` matrix of unit-length tf-idf rows
        (all-zero rows are left as zeros).
    """

    def __init__(self, posting_list):
        self.postings = posting_list
        self.word_ids = {}
        self.doc_vecs = self.__create_embeddings__(posting_list)

    def __create_embeddings__(self, posting_list):
        """Build the normalized tf-idf matrix and fill ``self.word_ids``.

        Returns the ``(n_docs, vocabulary_size)`` array; row ``i`` is the
        vector of document id ``i``.
        """
        idf = posting_list["idf"]
        no_docs = len(posting_list["doc_names"])

        # Column index of each vocabulary term follows the idf dict order.
        word_ids = {word: column for column, word in enumerate(idf)}

        doc_vec = np.zeros((no_docs, len(idf)))
        for doc_id in range(no_docs):
            for word, tf in posting_list["tf"][doc_id].items():
                doc_vec[doc_id][word_ids[word]] = tf * idf[word]

            # Scale to unit length so a dot product is a cosine similarity;
            # rows with zero norm (e.g. empty documents) are left untouched.
            length = norm(doc_vec[doc_id])
            if length > 0:
                doc_vec[doc_id] = doc_vec[doc_id] / length

        self.word_ids = word_ids
        return doc_vec

    def evaluate(self, query, top_k=None):
        """Rank all documents by cosine similarity to ``query``.

        Parameters
        ----------
        query : str
            Free-text query; it is preprocessed with ``preprocess_line``.
        top_k : int, optional
            When given, only the first ``top_k`` results are returned.

        Returns
        -------
        list of (score, doc_name)
            Sorted in descending order (ties are ordered by document name,
            also descending, because whole tuples are compared).
        """
        query_vec = self.__get_queryvec__(query)
        # Use the index this model was built from, not a notebook global.
        doc_names = self.postings["doc_names"]

        # (n_docs, vocab) x (vocab, 1) -> (n_docs, 1) column of similarities.
        doc_similarities = np.dot(self.doc_vecs, query_vec)

        ranked_order = [
            (doc_similarities[doc_id][0], doc_names[doc_id])
            for doc_id in range(len(doc_names))
        ]
        ranked_order.sort(reverse=True)

        if top_k is not None:
            return ranked_order[:top_k]
        return ranked_order

    def __get_queryvec__(self, query):
        """Return the unit-length tf-idf column vector ``(vocab, 1)`` of ``query``.

        Tokens that are not in the vocabulary are ignored; if nothing matches,
        the all-zero vector is returned.
        """
        idf = self.postings["idf"]
        query_tokens = preprocess_line(query)

        vec = np.zeros((len(idf), 1))
        for token, freq in Counter(query_tokens).items():
            column = self.word_ids.get(token)
            if column is not None:
                # Term frequency is relative to the full query length,
                # including out-of-vocabulary tokens.
                vec[column] = (freq * idf[token]) / len(query_tokens)

        length = norm(vec)
        if length != 0:
            vec = vec / length

        return vec
        

In [30]:
# Build the document vectors from the `postings` index.
vs = VectorSpaceModel(postings)

In [31]:
# Rank every document against a sample query; the result is a list of
# (score, document name) pairs sorted in descending order.
vs.evaluate("computer graphics is good")

[(0.1203008048411016, '38604'),
 (0.10206637249182507, '38473'),
 (0.08485342531485185, '38336'),
 (0.0816972687069431, '38943'),
 (0.08141298378331963, '39062'),
 (0.07805762305485786, '38782'),
 (0.07512324490951301, '39052'),
 (0.07325057934014358, '38262'),
 (0.07132748576456703, '38567'),
 (0.06663812007326181, '38923'),
 (0.06563179508167616, '38430'),
 (0.06500811014261478, '37926'),
 (0.06492399402896204, '38344'),
 (0.06434113386356696, '38603'),
 (0.06378936267748052, '39032'),
 (0.06364852016335493, '38741'),
 (0.06207484730950387, '38367'),
 (0.060342359902694584, '38449'),
 (0.05924401199151408, '38870'),
 (0.059184687127312195, '37930'),
 (0.058399053338652605, '39669'),
 (0.05767986460394065, '38839'),
 (0.056486346476708096, '37960'),
 (0.0554210043272769, '38956'),
 (0.05429353205662283, '38976'),
 (0.05399006221125538, '39070'),
 (0.05294291664648811, '38848'),
 (0.05257758375802058, '38370'),
 (0.05252051204128778, '39083'),
 (0.051925301364920434, '39058'),
 (0.0514