In [25]:
import csv
from flask import Flask, render_template, request
import math
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import word_tokenize
import re

In [26]:
# Load the English stopword list from NLTK into a set.
# NOTE(review): the original comment here said "Arabic" while the code requests
# 'english' — confirm which language is actually intended.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Preprocessing
def preprocess(query):
    """Normalise raw text into a list of lowercase, stopword-free tokens.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), the remainder is lowercased and split
    with ``word_tokenize``, and tokens present in the module-level
    ``stopwords`` set are discarded.

    Args:
        query: Text to normalise (a search query or a document body).

    Returns:
        list: The surviving tokens, in their original order.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', query)
    lowered = without_punctuation.lower()

    kept = []
    for word in word_tokenize(lowered):
        if word in stopwords:
            continue
        kept.append(word)
    return kept

# Build inverted index
def build_inverted_index(documents):
    """Map every token to the ids of the documents that contain it.

    Each document id is recorded at most once per token, so the length of a
    posting list equals the token's document frequency and a consumer that
    walks a posting list visits every matching document exactly once.

    Args:
        documents: Mapping of document id -> raw document text.

    Returns:
        defaultdict(list): token -> list of document ids, in the iteration
        order of ``documents``. Looking up an unknown token yields ``[]``.
    """
    inverted_index = defaultdict(list)
    for doc_id, text in documents.items():
        # dict.fromkeys de-duplicates the tokens while keeping first-occurrence
        # order, so a term repeated inside one document adds a single posting.
        for token in dict.fromkeys(preprocess(text)):
            inverted_index[token].append(doc_id)
    return inverted_index

# Compute TF-IDF
def compute_tfidf(documents, inverted_index):
    """Compute a TF-IDF weight for every (document, term) pair.

    The weight is ``(freq / len(tokens)) * log(N / df)`` where ``freq`` is the
    number of times the term occurs in the document, ``len(tokens)`` is the
    document's token count after preprocessing, ``N`` is the number of
    documents and ``df`` is the number of distinct documents listed for the
    term in ``inverted_index``.

    Args:
        documents: Mapping of document id -> raw document text.
        inverted_index: Mapping of token -> iterable of document ids. Repeated
            ids in a posting list are tolerated and counted once.

    Returns:
        dict: ``(doc_id, token) -> weight`` for every term of every document.
        Documents that preprocess to no tokens contribute no entries.
    """
    tfidf = {}
    document_count = len(documents)

    for doc_id, text in documents.items():
        tokens = preprocess(text)
        token_total = len(tokens)

        for token, freq in Counter(tokens).items():
            # Document frequency is the number of DISTINCT documents; a posting
            # list may repeat an id once per occurrence, which would otherwise
            # inflate df and can even push the logarithm below zero.
            # .get avoids inserting keys into a defaultdict index, and the
            # floor of 1 reflects that this very document contains the term
            # even if the supplied index does not list it.
            doc_freq = max(len(set(inverted_index.get(token, ()))), 1)
            tfidf[doc_id, token] = (freq / token_total) * math.log(document_count / doc_freq)

    return tfidf

# Compute cosine similarity
def cosine_similarity(query_vector, document_vector):
    """Return the cosine of the angle between two sparse term vectors.

    Args:
        query_vector: Mapping of term -> weight for the query.
        document_vector: Mapping of term -> weight for the document. Any
            mapping works; terms it does not contain count as weight 0.

    Returns:
        The dot product divided by the product of the two Euclidean norms, or
        ``0`` when either vector has zero norm (including empty vectors).
    """
    # .get makes a missing term contribute 0 instead of raising KeyError when
    # the document vector is a plain dict rather than a Counter.
    dot_product = sum(
        weight * document_vector.get(token, 0)
        for token, weight in query_vector.items()
    )
    query_norm = math.sqrt(sum(val ** 2 for val in query_vector.values()))
    doc_norm = math.sqrt(sum(val ** 2 for val in document_vector.values()))

    # A zero-length vector has no direction; report no similarity.
    if query_norm == 0 or doc_norm == 0:
        return 0
    return dot_product / (query_norm * doc_norm)

# Retrieve and rank documents based on TF-IDF
def retrieve_and_rank_tfidf(query, documents, tfidf, inverted_index, top_k=3):
    """Rank documents by the sum of their TF-IDF weights for the query terms.

    Args:
        query: Raw query string; it is normalised with ``preprocess``.
        documents: Not used by this function; kept so the signature matches
            the other ranking function.
        tfidf: Mapping of ``(doc_id, token) -> weight``.
        inverted_index: Mapping of token -> iterable of document ids.
        top_k: Maximum number of results to return (default 3). ``None``
            returns every matching document.

    Returns:
        list: ``(doc_id, score)`` tuples sorted by descending score. A query
        term that occurs several times in the query contributes once per
        occurrence.
    """
    scores = defaultdict(int)

    for token in preprocess(query):
        # .get keeps unknown query terms from being inserted into a
        # defaultdict index; dict.fromkeys visits each document once even if
        # the posting list repeats its id, so a weight is never added twice.
        for doc_id in dict.fromkeys(inverted_index.get(token, ())):
            scores[doc_id] += tfidf.get((doc_id, token), 0)

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_docs[:top_k]

# Retrieve and rank documents based on cosine similarity
def retrieve_and_rank_cosine(query, documents, tfidf, inverted_index, top_k=None):
    """Rank documents by cosine similarity between the query and each document.

    The query is represented by its raw term counts and every candidate
    document by its complete TF-IDF vector, so the document's weights and
    length both influence the score. Candidates are the documents listed in
    ``inverted_index`` for at least one query term.

    Args:
        query: Raw query string; it is normalised with ``preprocess``.
        documents: Not used by this function; kept so the signature matches
            the other ranking function.
        tfidf: Mapping of ``(doc_id, token) -> weight``.
        inverted_index: Mapping of token -> iterable of document ids.
        top_k: Maximum number of results to return. The default ``None``
            returns every candidate.

    Returns:
        list: ``(doc_id, score)`` tuples sorted by descending score.
    """
    query_vector = Counter(preprocess(query))

    # Collect candidates in first-seen order. .get keeps unknown query terms
    # from being inserted into a defaultdict index.
    candidates = dict.fromkeys(
        doc_id
        for token in query_vector
        for doc_id in inverted_index.get(token, ())
    )
    if not candidates:
        return []

    # Assemble the full TF-IDF vector of each candidate. Scoring against a
    # one-term vector would cancel the weight's magnitude out of the cosine.
    # Counter is used so that looking up an absent term yields 0.
    document_vectors = {doc_id: Counter() for doc_id in candidates}
    for (doc_id, token), weight in tfidf.items():
        if doc_id in document_vectors:
            document_vectors[doc_id][token] = weight

    scores = {
        doc_id: cosine_similarity(query_vector, vector)
        for doc_id, vector in document_vectors.items()
    }
    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_docs[:top_k]

In [27]:
filename = 'queries2.csv'
documents = {}
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed by the reader instead of being translated by open().
with open(filename, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration and simply leaves `documents` empty.
    next(reader, None)
    # Document ids are the 1-based positions of the data rows.
    for row_id, row in enumerate(reader, start=1):
        # All columns of a row are joined with spaces into one document string.
        documents[row_id] = ' '.join(row)

app = Flask(__name__)

# Home page
@app.route('/')
def home():
    """Handle requests to ``/`` by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

# Search results page
@app.route('/search', methods=['POST'])
def search():
    """Handle a POSTed search form and render both rankings.

    Reads the ``query`` form field, ranks the module-level ``documents`` with
    the TF-IDF and cosine rankers, and renders ``search_results.html`` with
    the query, both result lists and the documents mapping.
    """
    query = request.form['query']

    # Note: the index and the TF-IDF table are rebuilt from `documents` on
    # every request.
    index = build_inverted_index(documents)
    weights = compute_tfidf(documents, index)

    by_tfidf = retrieve_and_rank_tfidf(query, documents, weights, index)
    by_cosine = retrieve_and_rank_cosine(query, documents, weights, index)

    template_context = {
        'query': query,
        'tfidf_ranked_docs': by_tfidf,
        'cosine_ranked_docs': by_cosine,
        'documents': documents,
    }
    return render_template('search_results.html', **template_context)

# Start the Flask server with its default settings, but only when this code is
# executed as the main module (not when it is imported).
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [09/May/2024 20:56:47] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [09/May/2024 20:56:51] "POST /search HTTP/1.1" 200 -
127.0.0.1 - - [09/May/2024 20:57:53] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [09/May/2024 20:57:57] "POST /search HTTP/1.1" 200 -
