In [1]:
from math import log, sqrt
from heapq import heappop, heappush, heapify

In [2]:
N = 12202

In [3]:
class Posting:
    """Positional posting for a single word.

    Records how often the word occurs overall, in how many documents it
    occurs, how often it occurs in each document, and at which positions.
    """

    def __init__(self, word):
        """Create an empty posting for ``word``."""
        self.word = word
        self.total_tf = 0  # Total occurrences across all documents
        self.total_df = 0  # Number of distinct documents containing the word
        self.doc_tf = dict()  # doc_id -> raw occurrence count in that document
        self.positions = dict()  # doc_id -> positions, in the order they were added
        self.champion_list = None  # Posting restricted to the top documents, once built

    def add(self, doc_id, position):
        """Record one occurrence of the word at ``position`` in ``doc_id``."""
        self.total_tf += 1
        if doc_id not in self.doc_tf:
            # First occurrence in this document: open its counters.
            self.doc_tf[doc_id] = 0
            self.total_df += 1
            self.positions[doc_id] = []
        self.doc_tf[doc_id] += 1
        self.positions[doc_id].append(position)

    def tf(self, doc_id):
        """Return the logarithmic term frequency ``1 + log10(count)`` for ``doc_id``.

        Returns 0 when the word does not occur in the document.
        """
        if doc_id not in self.doc_tf:
            return 0
        freq = self.doc_tf[doc_id]
        return 1 + log(freq, 10)  # Logarithmic TF

    def idf(self, N):
        """Return ``log10(N / total_df)``, or 0 when no document contains the word.

        Args:
            N: Number of documents in the collection.
        """
        if self.total_df == 0:
            return 0
        return log(N / self.total_df, 10)  # Inverse Document Frequency

    def get_list_of_docs(self):
        """Return the ids of all documents containing the word, in insertion order."""
        return list(self.positions.keys())

    def create_champion_list(self, size, N=None):
        """Build, store and return the champion list of this word.

        The champion list is a new ``Posting`` that holds only the ``size``
        documents in which the word occurs most often, together with their
        positions, inserted in ascending doc_id order. Ties on the occurrence
        count are resolved in favour of the larger doc_id.

        Documents are ranked by their raw in-document count. The idf factor is
        identical for every document of one word, so multiplying by it cannot
        improve the ranking; when it is zero or negative (``N <= total_df``)
        it erased or inverted the ranking.

        Args:
            size: Maximum number of documents to keep; values <= 0 keep none.
            N: Not used for the ranking. Accepted so that callers passing the
                collection size keep working.

        Returns:
            The champion ``Posting``; it is also stored in ``self.champion_list``.
        """
        # Min-heap of (count, doc_id) that never grows beyond `size` entries,
        # so the weakest candidate is always the one that gets evicted.
        heap = []
        for doc_id, freq in self.doc_tf.items():
            heappush(heap, (freq, doc_id))
            if len(heap) > size:
                heappop(heap)

        champion_posting_list = Posting(self.word)
        for doc_id in sorted(doc_id for _, doc_id in heap):
            for position in self.positions[doc_id]:
                champion_posting_list.add(doc_id, position)

        self.champion_list = champion_posting_list
        return self.champion_list

    def getfreq(self):
        """Return the live ``doc_id -> count`` mapping (not a copy)."""
        return self.doc_tf

    def __str__(self):
        return (
            f"total_tf={self.total_tf}, total_df={self.total_df}, word = {self.word}\n"
            f"doc_tf={self.doc_tf}, positions={self.positions}"
        )


In [4]:
# Build a small posting for one word from (doc_id, position) occurrences.
posting_list = Posting("کلمه")
for doc_id, position in [(1, 2), (1, 5), (2, 3), (2, 7), (3, 1), (3, 4), (3, 5)]:
    posting_list.add(doc_id, position)

# Keep only the two best documents, then show the full posting and the
# champion list separated by an empty line.
champion_list = posting_list.create_champion_list(size=2, N=N)
print(posting_list, "", champion_list, sep="\n")


total_tf=7, total_df=3, word = کلمه
doc_tf={1: 2, 2: 2, 3: 3}, positions={1: [2, 5], 2: [3, 7], 3: [1, 4, 5]}

total_tf=5, total_df=2, word = کلمه
doc_tf={2: 2, 3: 3}, positions={2: [3, 7], 3: [1, 4, 5]}


In [5]:
class PostingsList:
    """Inverted index mapping each word to its ``Posting``, plus per-document statistics."""

    def __init__(self):
        self.dic_size = 0  # Number of distinct words currently held in self.map
        self.norm_weight = dict()  # doc_id -> {word: length-normalised tf-idf weight}
        self.map = dict()  # word -> Posting
        self.doc = dict()  # doc_id -> {word: raw count in that document}

        self.champion_lists = dict()  # word -> champion Posting
        self.freq_with_word = []  # (document frequency, word) pairs of the latest ranking

    def add(self, word, doc_id, position):
        """Record one occurrence of ``word`` at ``position`` in ``doc_id``."""
        if word not in self.map:
            posting = Posting(word)
            self.dic_size += 1
            self.map[word] = posting

        if doc_id not in self.doc:
            self.doc[doc_id] = dict()

        if word not in self.doc[doc_id]:
            self.doc[doc_id][word] = 0

        self.doc[doc_id][word] += 1
        self.map[word].add(doc_id, position)

    def tf(self, doc_id, word):
        """Return the posting's tf of ``word`` in ``doc_id``; 0 for an unknown word."""
        if word not in self.map:
            return 0
        return self.map[word].tf(doc_id)

    def idf(self, word, N):
        """Return the posting's idf of ``word`` for a collection of ``N`` documents; 0 for an unknown word."""
        if word not in self.map:
            return 0
        return self.map[word].idf(N)

    def score(self, doc_id, word, N):
        """Return ``tf(doc_id, word) * idf(word, N)``."""
        return self.tf(doc_id, word) * self.idf(word, N)

    def calcute_norm_weight(self, n_docs=None):
        """Compute the length-normalised tf-idf vector of every document.

        The result is stored in ``self.norm_weight[doc_id]``. A document whose
        vector has zero length keeps its unnormalised (all-zero) weights.

        Args:
            n_docs: Collection size used for idf. Defaults to the module-level
                ``N`` so existing calls without arguments behave as before.
        """
        total_docs = N if n_docs is None else n_docs
        for doc_id, word_freqs in self.doc.items():
            norm_vector = {}
            squared_sum = 0
            for word in word_freqs:
                tf_idf_score = self.tf(doc_id, word) * self.idf(word, total_docs)
                norm_vector[word] = tf_idf_score
                squared_sum += tf_idf_score ** 2

            # Divide by the Euclidean length; skipped for an all-zero vector
            # to avoid dividing by zero.
            normalization_factor = sqrt(squared_sum)
            if normalization_factor > 0:
                for word in norm_vector:
                    norm_vector[word] /= normalization_factor

            self.norm_weight[doc_id] = norm_vector

    def get_weight(self, doc_id, word):
        """Return the stored normalised weight of ``word`` in ``doc_id``, or 0 if none is stored."""
        if word not in self.norm_weight.get(doc_id, {}):
            return 0
        return self.norm_weight[doc_id][word]

    def get_list_word(self, word):
        """Return the ids of the documents containing ``word``; [] for an unknown word."""
        if word not in self.map:
            return []
        return self.map[word].get_list_of_docs()

    def create_champion_list(self, size, n_docs=None):
        """Rebuild the champion lists for every word found in more than ``size`` documents.

        Lists from an earlier call are discarded first, so words that no longer
        qualify under the new ``size`` do not keep an outdated list.

        Args:
            size: Number of documents each champion list may hold.
            n_docs: Collection size handed to the postings. Defaults to the
                module-level ``N`` so existing calls behave as before.
        """
        total_docs = N if n_docs is None else n_docs
        self.champion_lists = {}
        for word, posting in self.map.items():
            if posting.total_df > size:
                self.champion_lists[word] = posting.create_champion_list(size, total_docs)

    def give_champion_list(self, word):
        """Return the doc ids in the champion list of ``word``.

        Returns [] when no champion list is stored for the word, which includes
        words found in at most ``size`` documents when the lists were built.
        """
        if word not in self.champion_lists:
            return []
        return self.champion_lists[word].get_list_of_docs()

    def create_freq_word(self):
        """Rebuild ``self.freq_with_word`` as (document frequency, word) pairs of the current words."""
        # Rebuilt from scratch: appending to the previous list would keep
        # duplicates and words that have already been removed.
        self.freq_with_word = [
            (posting.total_df, word) for word, posting in self.map.items()
        ]

    def find_word_with_most_freq_and_del(self, count=50):
        """Remove the ``count`` words with the highest document frequency and return them.

        Words with equal document frequency keep their insertion order. Removed
        words are dropped from ``self.map`` and ``self.champion_lists`` and
        ``self.dic_size`` is reduced accordingly. The per-document counts in
        ``self.doc`` are left untouched; ``tf`` returns 0 for such words.

        Args:
            count: How many words to remove (default 50); values <= 0 remove none.

        Returns:
            The removed words, most frequent first.
        """
        self.create_freq_word()
        self.freq_with_word.sort(reverse=True, key=lambda x: x[0])
        most_frequent_words = [word for _, word in self.freq_with_word[:max(count, 0)]]
        for word in most_frequent_words:
            if self.map.pop(word, None) is not None:
                self.dic_size -= 1
            self.champion_lists.pop(word, None)
        return most_frequent_words

    def __str__(self):
        separator = "-----------------------------------------\n"
        return "".join(
            f"{word}: {posting}\n{separator}" for word, posting in self.map.items()
        )
    


In [6]:
# Index a handful of (word, doc_id, position) occurrences.
posting_list = PostingsList()
for word, doc_id, position in [
    ("کلمه", 1, 2),
    ("کلمه", 1, 4),
    ("کلمه", 1, 5),
    ("کلمه", 2, 7),
    ("کلمه2", 3, 4),
]:
    posting_list.add(word, doc_id, position)

posting_list.create_champion_list(size=2)

# Show the whole index, then the documents containing the first word.
print(posting_list, posting_list.get_list_word('کلمه'), sep="\n")


کلمه: total_tf=4, total_df=2, word = کلمه
doc_tf={1: 3, 2: 1}, positions={1: [2, 4, 5], 2: [7]}
-----------------------------------------
کلمه2: total_tf=1, total_df=1, word = کلمه2
doc_tf={3: 1}, positions={3: [4]}
-----------------------------------------

[1, 2]
