In [None]:
import csv
from flask import Flask, render_template, request
import math
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
import re

In [None]:
# Preprocessing: strip punctuation, tokenize, drop Arabic stopwords, then stem.
def _arabic_stopwords():
    """Return the Arabic stop-word set, loading it from NLTK only on first use.

    The set is stored on the function object so that the NLTK corpus lookup
    happens once instead of on every ``preprocess`` call (``preprocess`` runs
    once per document while the index is being built).
    """
    cached = getattr(_arabic_stopwords, "_cached", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('arabic'))
        _arabic_stopwords._cached = cached
    return cached


def preprocess(query):
    """Normalise ``query`` into a list of stemmed tokens.

    Steps, in order:
      1. remove every character that is neither a word character nor
         whitespace (i.e. punctuation),
      2. lower-case and tokenize with ``word_tokenize``,
      3. drop tokens found in NLTK's Arabic stop-word list,
      4. stem the remaining tokens with ``ISRIStemmer``.

    Args:
        query: Text to normalise (a search query or a document body).

    Returns:
        list[str]: Stemmed tokens in their original order; repeated tokens
        are kept.
    """
    stemmer = ISRIStemmer()
    stopwords = _arabic_stopwords()
    query = re.sub(r'[^\w\s]', '', query)
    tokens = word_tokenize(query.lower())
    # Stop words are filtered on the surface form, i.e. before stemming.
    return [stemmer.stem(token) for token in tokens if token not in stopwords]

# Compute cosine similarity
def cosine_similarity(query_vector, document_vector):
    """Return the cosine of the angle between two sparse term vectors.

    Args:
        query_vector: Mapping of token -> numeric weight for the query.
        document_vector: Mapping of token -> numeric weight for the document.

    Returns:
        float: dot(q, d) / (|q| * |d|), or ``0.0`` when either vector has
        zero length (including when it is empty).
    """
    # Only tokens present in both vectors contribute to the dot product.
    shared_tokens = set(query_vector.keys()) & set(document_vector.keys())
    numerator = sum(query_vector[token] * document_vector[token] for token in shared_tokens)
    query_norm = math.sqrt(sum(val ** 2 for val in query_vector.values()))
    doc_norm = math.sqrt(sum(val ** 2 for val in document_vector.values()))
    denominator = query_norm * doc_norm
    if not denominator:
        # Avoid dividing by zero for empty / all-zero vectors.
        return 0.0
    return float(numerator) / denominator

# Build inverted index
def build_inverted_index(documents):
    """Map every preprocessed token to the ids of the documents containing it.

    Args:
        documents: Mapping of doc_id -> raw document text.

    Returns:
        defaultdict(list): token -> list of doc ids, in the iteration order
        of ``documents``. Each doc id appears at most once per token, so
        consumers that iterate a posting list visit each document once no
        matter how often the token occurs in it.
    """
    inverted_index = defaultdict(list)
    for doc_id, text in documents.items():
        # dict.fromkeys collapses repeated tokens while keeping first-seen order.
        for token in dict.fromkeys(preprocess(text)):
            inverted_index[token].append(doc_id)
    return inverted_index

# Compute TF-IDF
def TF(word, document):
    """Return the log-scaled term frequency of ``word`` in ``document``.

    The document is lower-cased and tokenized with ``word_tokenize``;
    ``word`` is compared as given, so it should already be lower-case.

    Args:
        word: Token to count.
        document: Raw document text.

    Returns:
        float: ``log10(count / total_tokens + 1)``; ``0.0`` when the document
        yields no tokens (instead of dividing by zero).
    """
    words = word_tokenize(document.lower())
    if not words:
        return 0.0
    tf = words.count(word) / len(words)
    return math.log10(tf + 1)

def IDF(word, corpus):
    """Return the smoothed inverse document frequency of ``word``.

    Args:
        word: Token to look up (compared as given; string documents are
            lower-cased before tokenizing).
        corpus: Either a mapping of doc_id -> document or an iterable of
            documents. A document may be a raw string (tokenized with
            ``word_tokenize``) or an already-tokenized container.

    Returns:
        float: ``log10(N / (df + 1))`` where ``N`` is the number of documents
        and ``df`` the number of documents containing ``word``. The ``+ 1``
        means a word present in every document gets a slightly negative
        value. Returns ``0.0`` for an empty corpus.
    """
    # For a mapping the documents are the values, not the keys.
    docs = list(corpus.values()) if hasattr(corpus, "values") else list(corpus)
    total_documents = len(docs)
    if total_documents == 0:
        # log10(0) is undefined; an empty corpus carries no information.
        return 0.0

    document_count = 0
    for doc in docs:
        # Match whole tokens: `word in some_string` would be a substring test.
        tokens = word_tokenize(doc.lower()) if isinstance(doc, str) else doc
        if word in tokens:
            document_count += 1
    return math.log10(total_documents / (document_count + 1))

def compute_tfidf(corpus):
    """Compute a TF-IDF weight for every (document, token) pair in ``corpus``.

    Documents are tokenized with ``preprocess`` so that the tokens match the
    ones stored in the inverted index and the ones produced for queries. The
    weighting uses the same formulas as ``TF`` and ``IDF``:
    ``log10(count / total + 1) * log10(N / (df + 1))``.

    Args:
        corpus: Mapping of doc_id -> raw text. A plain iterable of texts is
            also accepted, in which case the 0-based position is used as the
            doc id.

    Returns:
        dict: ``{(doc_id, token): weight}`` — the key shape that
        ``retrieve_and_rank_tfidf`` and ``retrieve_and_rank_cosine`` look up
        via ``tfidf.get((doc_id, token), 0)``. Empty for an empty corpus.
    """
    if hasattr(corpus, "items"):
        doc_items = list(corpus.items())
    else:
        doc_items = list(enumerate(corpus))

    # Tokenize each document exactly once and keep its term counts.
    term_counts = {doc_id: Counter(preprocess(text)) for doc_id, text in doc_items}

    # Document frequency: in how many documents does each token occur?
    document_frequency = Counter()
    for counts in term_counts.values():
        document_frequency.update(counts.keys())

    total_documents = len(term_counts)
    tfidf_scores = {}
    for doc_id, counts in term_counts.items():
        total_terms = sum(counts.values())
        for token, count in counts.items():
            tf = math.log10(count / total_terms + 1)
            idf = math.log10(total_documents / (document_frequency[token] + 1))
            tfidf_scores[(doc_id, token)] = tf * idf
    return tfidf_scores

# Retrieve and rank documents based on cosine similarity
def retrieve_and_rank_cosine(query, documents, tfidf, inverted_index):
    """Rank documents by cosine similarity between the query and each document.

    The query vector holds raw term counts of the preprocessed query. Each
    candidate document is represented by its *full* TF-IDF vector (all of its
    entries in ``tfidf``), so the document weights influence the ranking.
    Comparing against a one-term vector would make the weights cancel out.

    Args:
        query: Raw query text.
        documents: Mapping of doc_id -> text. Unused here; kept so both
            ranking functions share one signature.
        tfidf: Mapping of ``(doc_id, token)`` -> weight.
        inverted_index: Mapping of token -> iterable of doc ids.

    Returns:
        list[tuple]: ``(doc_id, score)`` pairs for every document containing
        at least one query token, sorted by score in descending order.
    """
    query_vector = Counter(preprocess(query))

    # Candidate documents: anything listed under at least one query token.
    # `.get` avoids inserting empty posting lists into a defaultdict index and
    # also works with a plain dict.
    candidate_vectors = {}
    for token in query_vector:
        for doc_id in inverted_index.get(token, ()):
            candidate_vectors.setdefault(doc_id, {})

    # Collect each candidate's complete weight vector in one pass over tfidf.
    if candidate_vectors:
        for (doc_id, token), weight in tfidf.items():
            if doc_id in candidate_vectors:
                candidate_vectors[doc_id][token] = weight

    scores = {
        doc_id: cosine_similarity(query_vector, vector)
        for doc_id, vector in candidate_vectors.items()
    }
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

# Retrieve and rank documents based on TF-IDF
def retrieve_and_rank_tfidf(query, documents, tfidf, inverted_index):
    """Rank documents by the sum of their TF-IDF weights for the query tokens.

    A token that occurs several times in the query contributes once per
    occurrence.

    Args:
        query: Raw query text.
        documents: Mapping of doc_id -> text. Unused here; kept so both
            ranking functions share one signature.
        tfidf: Mapping of ``(doc_id, token)`` -> weight; missing pairs count
            as 0.
        inverted_index: Mapping of token -> iterable of doc ids.

    Returns:
        list[tuple]: ``(doc_id, score)`` pairs for every document containing
        at least one query token, sorted by score in descending order.
    """
    scores = defaultdict(int)
    for token in preprocess(query):
        # `.get` keeps unseen tokens from being inserted into a defaultdict
        # index (and from raising KeyError on a plain dict); dict.fromkeys
        # makes sure a doc id listed twice is only counted once.
        for doc_id in dict.fromkeys(inverted_index.get(token, ())):
            scores[doc_id] += tfidf.get((doc_id, token), 0)
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Load the corpus: one document per CSV row, keyed by its 1-based row number
# (counted after the skipped first row).
filename = 'queries.csv'
documents = {}
# newline='' hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing newlines parse correctly.
with open(filename, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    # Skip the first row (presumably a header); the default keeps an empty
    # file from raising StopIteration.
    next(reader, None)
    for row_id, row in enumerate(reader, start=1):
        document = ' '.join(row)  # Join all elements in the row into a single string
        documents[row_id] = document

app = Flask(__name__)

# Home page
@app.route('/')
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

# Search results page
def _search_structures():
    """Return ``(inverted_index, tfidf)`` for ``documents``, built on first use.

    The pair is cached on the function object together with the ``documents``
    object it was built from, and rebuilt if ``documents`` is rebound to a
    different object (e.g. the loading cell is re-run). In-place edits to the
    same dict are not detected.
    """
    cached = getattr(_search_structures, "_cached", None)
    if cached is None or cached[0] is not documents:
        cached = (documents, build_inverted_index(documents), compute_tfidf(documents))
        _search_structures._cached = cached
    return cached[1], cached[2]


@app.route('/search', methods=['POST'])
def search():
    """Handle a search form submission and render both rankings.

    Reads the ``query`` form field, ranks the corpus with the TF-IDF-sum and
    cosine-similarity rankers, and renders ``search_results.html`` with the
    query, both ranked lists, and the documents.
    """
    query = request.form['query']
    # Index and weights depend only on the corpus, so they are built once
    # instead of on every request.
    inverted_index, tfidf = _search_structures()
    tfidf_ranked_docs = retrieve_and_rank_tfidf(query, documents, tfidf, inverted_index)
    cosine_ranked_docs = retrieve_and_rank_cosine(query, documents, tfidf, inverted_index)
    return render_template(
        'search_results.html',
        query=query,
        tfidf_ranked_docs=tfidf_ranked_docs,
        cosine_ranked_docs=cosine_ranked_docs,
        documents=documents,
    )

# Start the Flask server via app.run() only when this code runs as the main
# module, not when it is imported.
if __name__ == '__main__':
    app.run()