In [1]:
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from collections import OrderedDict
import math
import re
import numpy as np 

In [2]:
#nltk.download()

In [3]:
def tokenization(text):
    tokens = []
    ExpReg = nltk.RegexpTokenizer('(?:[A-Za-z]\.)+|[A-Za-z]+[\-@]\d+(?:\.\d+)?|\d+[A-Za-z]+|\d+(?:[\.\,]\d+)?%?|\w+(?:[\-/]\w+)*')
    tokens = ExpReg.tokenize(text)
    return tokens

In [4]:
def stop_words(tokens):
    nltk_stopwords = nltk.corpus.stopwords.words('english')
    tokens_without_stopwords = []
    tokens_without_stopwords = [word for word in tokens if word.lower() not in nltk_stopwords]
    return tokens_without_stopwords

In [5]:
def normalization_porter(tokens): #stemming
    Porter = nltk.PorterStemmer()
    normalized_words = []
    normalized_words = [Porter.stem(word) for word in tokens]
    return normalized_words

In [6]:
def normalization_lancaster(tokens): #stemming
    Lancaster = nltk.LancasterStemmer()
    normalized_words = []
    normalized_words = [Lancaster.stem(word) for word in tokens]
    return normalized_words

In [7]:
from collections import defaultdict
def index(files, Inverse, Tokenize, PorterStemmer):
    word_file_count = defaultdict(set)
    d = {}
    dict = {}
    for filename in files: 
        with open(os.path.join("Collection", filename), "r") as file:
            text = file.read()

            if Tokenize:
                tokens = tokenization(text)
            else:
                tokens = text.split()

            tokens_without_stopwords = stop_words(tokens)

            if PorterStemmer:
                normalized_words = normalization_porter(tokens_without_stopwords)
            else:
                normalized_words = normalization_lancaster(tokens_without_stopwords)

            words_frequency = nltk.FreqDist(normalized_words)

            file_id = int(re.search(r'\d+', os.path.basename(filename)).group())

            for word in words_frequency.keys():
                if Inverse:
                    if word in d:
                        d[word].append((file_id, words_frequency[word], max(list(words_frequency.values()))))
                    else:
                        d[word] = [(file_id, words_frequency[word], max(list(words_frequency.values())))]
                else:
                    if file_id in d:
                        d[file_id].append((word, words_frequency[word], max(list(words_frequency.values()))))
                    else:
                        d[file_id] = [(word, words_frequency[word], max(list(words_frequency.values())))]

                    word_file_count[word].add(file_id)  


    for key1, values in d.items():
        for (key2, freq, max_freq) in values:
                if Inverse:
                    value = (key2, freq, (freq / max_freq) * np.log10(((len(files) / len(d[key1]))+1)))
                else: 
                    value = (key2, freq, (freq / max_freq) * np.log10(((len(files) / len(word_file_count[key2]))+1)))
                if key1 in dict:
                    dict[key1].append(value)
                else:
                    dict[key1] = [value]
                    
    return dict


In [8]:
def write_dict_to_file(dictionary, filename):
    with open(filename, 'w') as file:
        for key, values in dictionary.items():
            for value in values:
                (files_list, freq, weight) = value
                file.write(f"{key} {files_list} {freq} {weight:.5f}\n")

In [9]:
def write_relevance_to_file(list, filename):
    with open(filename, 'w') as file:
        for key, value in list:
            file.write(f"{key} {value:.5f}\n")

In [10]:
files = os.listdir(os.path.abspath("Collection"))
dict = index(files, Inverse=True, Tokenize=True, PorterStemmer=True)
write_dict_to_file(dict, "InverseTokenPorter.txt")
        
print(dict)
print(len(dict))
print("*************************************************************")

{'research': [(1, 2, 0.19084850188786498), (3, 3, 0.17892047051987342), (6, 2, 0.08674931903993863)], 'success': [(1, 1, 0.16901960800285137)], 'appli': [(1, 1, 0.16901960800285137)], 'larg': [(1, 1, 0.07958800173440753), (2, 1, 0.0497425010840047), (3, 1, 0.0497425010840047), (4, 1, 0.04421555651911529)], 'languag': [(1, 1, 0.09542425094393249), (3, 1, 0.059640156839957804), (4, 1, 0.05301347274662915)], 'model': [(1, 3, 0.1806179973983887), (2, 7, 0.26340124620598354), (3, 1, 0.03762874945799765), (4, 2, 0.06689555459199582), (5, 1, 0.06020599913279624), (6, 11, 0.3010299956639812)], 'llm': [(1, 3, 0.28627275283179743), (3, 2, 0.11928031367991561), (4, 2, 0.1060269454932583)], 'chatgpt': [(1, 1, 0.16901960800285137)], 'rerank': [(1, 5, 0.8450980400142568)], 'inform': [(1, 1, 0.12041199826559248), (6, 1, 0.05473272648436022)], 'retriev': [(1, 1, 0.06848453616444126), (2, 3, 0.12840850530832737), (3, 3, 0.12840850530832737), (5, 2, 0.13696907232888253), (6, 1, 0.031129334620200573)], '

# utils

In [11]:
def preprocessing(files, Tokenize, PorterStemmer):
    docs = {}

    for filename in files: 
        with open(os.path.join("Collection", filename), "r") as file:
            text = file.read()

            if Tokenize:
                tokens = tokenization(text)
            else:
                tokens = text.split()

            tokens_without_stopwords = stop_words(tokens)

            if PorterStemmer:
                normalized_words = normalization_porter(tokens_without_stopwords)
            else:
                normalized_words = normalization_lancaster(tokens_without_stopwords)
            
            file_id = int(re.search(r'\d+', os.path.basename(filename)).group())
            docs[file_id] = normalized_words
    return docs

In [12]:
def preprocess_query(query, Tokenize, PorterStemmer):
    if Tokenize:
        q = tokenization(query)
    else :
        q = query.split()
    if PorterStemmer:
        q = normalization_porter(q)
    else:
        q = normalization_lancaster(q)
    return q

In [13]:
def get_docs_ids(files):
    docs_id = []
    for filename in files: 
        file_id = int(re.search(r'\d+', os.path.basename(filename)).group())
        docs_id.append(file_id)
    return docs_id

# Vectorial Model

In [29]:
def scalar_product(query, file_path):
    terms_by_doc = {}
    with open(file_path, 'r') as file:
        for line in file:
            term, doc_id, freq, weight = line.split()
            if term in query:
                weight = float(weight) 

                if doc_id in terms_by_doc:
                    terms_by_doc[doc_id] += weight
                else:
                    terms_by_doc[doc_id] = weight

    terms_by_doc = sorted(terms_by_doc.items(), key=lambda x: x[1], reverse=True)

    return terms_by_doc

In [15]:
def cosine_measure(query, file_path):
    terms_by_doc = scalar_product(query, file_path)
    weight_by_doc = {}
    result_by_doc = {}
    sum_vi = len(query)

    with open(file_path, 'r') as file:
        for line in file:
            term, doc, freq, weight = line.split()
            weight = float(weight) 
  
            if doc in weight_by_doc:
                weight_by_doc[doc] += weight**2
            else:
                weight_by_doc[doc] = weight**2

    sum_vi = math.sqrt(sum_vi)
    
    for doc, sum_squared in weight_by_doc.items():
        weight_by_doc[doc] = math.sqrt(sum_squared)

    for doc, terms in terms_by_doc.items():
        result_by_doc[doc] = terms / (sum_vi * weight_by_doc[doc])

    result_by_doc= sorted(result_by_doc.items(), key=lambda x: x[1], reverse=True)

    return result_by_doc

In [16]:
def jaccard_measure(query, file_path):
    terms_by_doc = scalar_product(query, file_path)
    weight_by_doc = {}
    result_by_doc = {}
    sum_vi = len(query)

    with open(file_path, 'r') as file:
        for line in file:
            term, doc, freq, weight = line.split()
            weight = float(weight)  
  
            if doc in weight_by_doc:
                weight_by_doc[doc] += weight**2
            else:
                weight_by_doc[doc] = weight**2

    for doc, terms in terms_by_doc.items():
        result_by_doc[doc] = terms / (sum_vi + weight_by_doc[doc] - terms)

    result_by_doc= sorted(result_by_doc.items(), key=lambda x: x[1], reverse=True)

    return result_by_doc

In [17]:
def file(Tokenize, PorterStemmer):
    if Tokenize:
        if PorterStemmer:
            return "InverseTokenPorter.txt"
        else:
            return "InverseTokenLancaster.txt"
    else:
        if PorterStemmer:
            return "InverseSplitPorter.txt"
        else:
            return "InverseSplitLancaster.txt"

In [18]:
def vectorial_model(query, Tokenize, PorterStemmer, SP, cosine, jaccard):
        query = preprocess_query(query, Tokenize, PorterStemmer)
        file_path = file(Tokenize, PorterStemmer)
        if SP:
            result = scalar_product(query, file_path)
        else: 
            if cosine:
                result = cosine_measure(query, file_path)
            else:
                if jaccard:
                    result = jaccard_measure(query, file_path)

        return result

# Probabilistic Model

In [73]:
def n_docs_terms(file_path, query):
    documents_containing_terms = {}

    with open(file_path, 'r') as file:
        for line in file:
            current_term, _, _, _ = line.strip().split()

            if current_term in query:
                if current_term not in documents_containing_terms:
                    documents_containing_terms[current_term] = 1
                else:
                    documents_containing_terms[current_term] += 1
            elif len(documents_containing_terms) == len(query):
                # If we have encountered all terms and the next one is different, stop the loop
                break
    return documents_containing_terms

In [84]:
def BM25(query, file_path, K, B):
    dl = {}
    result = {}
    vocab_len = 0

    with open(file_path, 'r') as file:
        for line in file:
            _, doc_id, freq, _ = line.strip().split()
            dl[doc_id] = dl.get(doc_id, 0) + int(freq)
            vocab_len += int(freq)

    N = len(dl)
    avdl = vocab_len / N
    ni = n_docs_terms(file_path, query)

    with open(file_path, 'r') as file:
        for line in file:
            term, doc, freq, _ = line.split()
            freq = int(freq)
            if term in query:
                if doc in result:
                    result[doc] += ((freq / (K * ((1 - B) + B * (dl[doc] / avdl)) + freq)) * np.log10(((N - ni[term] + 0.5) / (ni[term] + 0.5))))
                else:
                    result[doc] = ((freq / (K * ((1 - B) + B * (dl[doc] / avdl)) + freq)) * np.log10(((N - ni[term] + 0.5) / (ni[term] + 0.5))))
                    
    result = sorted(result.items(), key=lambda x: x[1], reverse=True)

    return result

In [85]:
def probabilistic_model(query, Tokenize, PorterStemmer, K, B):
    query = preprocess_query(query, Tokenize, PorterStemmer)
    file_path = file(Tokenize, PorterStemmer)
    result = BM25(query, file_path, K, B)
        
    return result

# Boolean Model

In [86]:
def boolean_query(query):
    
    if isinstance(query, list):
        query = ' '.join(query)

    # reg expresssion 
    reg_exp = r'\b(?:((?:[A-Za-z]\.)+|[A-Za-z]+[\-@]\d+(?:\.\d+)?|\d+[A-Za-z]+|\d+(?:[\.\,]\d+)?%?|\w+(?:[\-/]\w+)*)\b|and|or|not)\b'

    matches = re.findall(reg_exp, query)

    if not is_valid_boolean_query(matches):
        print("La requête n'est pas valide.")
        return None
    return matches

def is_valid_boolean_query(matches):
    if not matches:
        return False

    operators = {'and', 'or', 'not'}
    for match in matches:
        if match not in operators and not re.match(r'\b\w+\b', match):
            return False
    
    # operator order
    if matches[0] in operators-{'not'} or matches[-1] in operators:
        return False
    
    # NOT & term term 
    for i in range(len(matches) - 1):
        if matches[i] == 'not' and ((not matches[i + 1]) or (matches[i+1] in operators)):
            return False
        if  matches[i] not in operators and matches[i+1] not in operators:
            print(matches[i])
            return False
        
    #  AND OR / OR AND
    for i in range(len(matches) - 2):
        if matches[i] in operators-{'not'} and matches[i + 1] in operators-{'not'}:
            return False
        
    return True


In [87]:
def boolean_query_evaluation(query, dict, docs_id):

    terms_and_operators = boolean_query(query)
    if terms_and_operators == None:
        return None
    else:
        
        result_set = set(docs_id)

        operator_stack = []

        for token in terms_and_operators:
            if token == 'not':
                operator_stack.append('not')
            elif token == 'and':
                operator_stack.append('and')
            elif token == 'or':
                operator_stack.append('or')
            else:
                term_results = set(tup[0] for tup in dict[token]) if token in dict else set()
                if 'not' in operator_stack:
                    term_results = set(docs_id) - term_results
                    operator_stack.remove('not')
                if 'and' in operator_stack:
                    result_set = result_set.intersection(term_results)
                    operator_stack.remove('and')
                elif 'or' in operator_stack:
                    result_set = result_set.union(term_results)
                    operator_stack.remove('or')
                else:
                    result_set = term_results

        return result_set

In [88]:
def boolean_model(query, files, Tokenize, PorterStemmer):
    result_dict = {}
    query = preprocess_query(query, Tokenize, PorterStemmer)
    docs_id = get_docs_ids(files)
    dict = index(files, True, Tokenize, PorterStemmer)
    results = boolean_query_evaluation(query, dict, docs_id)
    if results != None:
        for doc in docs_id:
            if doc in results:
                result_dict[doc] = 'YES'
            else:
                result_dict[doc] = 'NO'
    else:
        result_dict = None

    return result_dict

# Test

In [89]:
query = "Documents ranking"
res = probabilistic_model(query, Tokenize=True, PorterStemmer=True, K=1.50, B=0.75)
#res2 = vectorial_model(query, Tokenize=True, PorterStemmer=True, SP=True, cosine=False, jaccard=False)
#res3 = boolean_model(query, files, Tokenize=True, PorterStemmer=True)
print(res)


avdl= 127.33333333333333
dict des documents de chaque terme: {'document': 5, 'rank': 4}
dl[doc]= 145
doc= 3
dl[doc]= 123
doc= 4
dl[doc]= 137
doc= 5
dl[doc]= 120
doc= 6
[('2', -0.4530776383132547), ('3', -0.45967871691057005), ('6', -0.47714350676623185), ('5', -0.5028401178205029), ('4', -0.5371824829383636)]


# Metrics

In [90]:
def precision(relevant_docs, retrieved_docs, cutoff=None):
    total_retrieved = len(retrieved_docs[:cutoff])
    if total_retrieved == 0:
        return 0
    relevant_retrieved = len(set(relevant_docs) & set(retrieved_docs[:cutoff]))
    return relevant_retrieved / total_retrieved

def recall(relevant_docs, retrieved_docs, cutoff=None):
    total_relevant = len(relevant_docs)
    if total_relevant == 0:
        return 0
    relevant_retrieved = len(set(relevant_docs) & set(retrieved_docs[:cutoff]))
    return relevant_retrieved / total_relevant

def f_score(precision_value, recall_value):
    if precision_value + recall_value > 0:
        return 2 * (precision_value * recall_value) / (precision_value + recall_value)
    else:
        return 0

In [91]:
with open('Queries.txt', 'r') as queries_file:
    queries = [line.strip() for line in queries_file]

with open('Judgements.txt', 'r') as judgments_file:
    judgments = [tuple(map(int, line.strip().split())) for line in judgments_file]

print(queries)
print(judgments)

['Have recent language models delivered competitive results in information retrieval?', 'What recent studies explore query expansion through embeddings?']
[(1, 1), (1, 3), (1, 4), (2, 2), (2, 6)]


In [92]:
for i, query in enumerate(queries, start=1):
    dict = vectorial_model(query, files, Tokenize=True, PorterStemmer=True, SP=True, cosine=False, jaccard=False)
    relevant_docs = [doc_id for (q_id, doc_id) in judgments if q_id == i]
    selected_docs = list(dict.keys())
    selected_relevant_docs = [doc for doc in relevant_docs if doc in selected_docs]
    # Metrics evaluation
    precision_value = precision(selected_relevant_docs, selected_docs)
    precision_5 = precision(selected_relevant_docs, selected_docs, 5)
    precision_10 = precision(selected_relevant_docs, selected_docs, 10)
    recall_value = recall(selected_relevant_docs, selected_docs)
    f_score_value = f_score(precision_value, recall_value) 
    
    print(f"Query {i} Metrics:")
    print(f"P@5: {precision_5}")
    print(f"P@10: {precision_10}")
    print(f"Recall: {recall_value}")
    print(f"F-Score: {f_score_value}")
    print()

TypeError: vectorial_model() got multiple values for argument 'Tokenize'