# Packages

In [122]:
import math
from flask import Flask, render_template, request, redirect, url_for
import json
import os
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import cv2
from collections import defaultdict

# Pre-processing

In [123]:
# Initialize Porter Stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenization and Lowercasing
    tokens = word_tokenize(text.lower())
    # Remove punctuation, stopwords, and perform stemming
    processed_tokens = []
    for token in tokens:
        # Remove punctuation and check if token is not empty after stripping
        token = token.strip(string.punctuation)
        if token != '' and len(token) >= 2:
            # Perform stemming and filter out stopwords
            stemmed_token = stemmer.stem(token)
            if stemmed_token not in stop_words:
                processed_tokens.append(stemmed_token)
    return processed_tokens  # return as list

# Inverted Index

In [124]:
def get_static_path(file_name):
    # Assuming notebook is in the same directory as the static folder
    notebook_dir = os.getcwd()
    static_folder = os.path.join(notebook_dir, 'static')
    return os.path.join(static_folder, file_name)

In [125]:
def load_inverted_index(file_path):
    inverted_index = {}
    with open(file_path, "r", encoding='utf-8') as file:
        for line in file:
            # Split the line into term and postings
            term, postings_str = line.strip().split(":", 1)
            # Convert postings string to list of dictionaries
            postings = eval(postings_str)
            # Create a dictionary entry for the term
            inverted_index[term] = postings
    return inverted_index

# Assuming your inverted index file is located in a 'static' folder in the same directory as your script
# Construct the file path dynamically using the get_static_path function
inverted_index_path = get_static_path('updated_inverted_index.txt')
inverted_index = load_inverted_index(inverted_index_path)

# Ranking: BM-25

In [126]:
def idf(term, N, doc_freq):
    return math.log((N - doc_freq + 0.5) / (doc_freq + 0.5) + 1)

def compute_bm25(inverted_index, query_terms, k1=1.5, b=0.75):
    bm25_scores = []
    N = len(inverted_index)  # Total number of images
    total_text_length = sum(sum(posting['term_frequency'] for posting in postings) for postings in inverted_index.values())
    avgdl = total_text_length / N  # Average document length
    
    for term in query_terms:
        idf_val = idf(term, N, len(inverted_index.get(term, [])))
        if idf_val == 0:
            continue  # Skip terms with IDF of 0
        for posting in inverted_index.get(term, []):
            doc_id = posting['image']
            doc_len = sum(posting['term_frequency'] for posting in inverted_index[term])  # Assuming all terms contribute to the document length
            # Calculate BM25 term score
            term_score = idf_val * (posting['term_frequency'] * (k1 + 1)) / (posting['term_frequency'] + k1 * (1 - b + b * (doc_len / avgdl)))
            bm25_scores.append((term, doc_id, term_score))
    
    return bm25_scores

def rank_bm25(query, inverted_index, k1=1.5, b=0.75):
    query_terms = preprocess(query)  # Assuming the query is already preprocessed
    bm25_scores = compute_bm25(inverted_index, query_terms, k1, b)
    # Filter images that don't contain all query terms
    relevant_docs = set(posting[1] for posting in bm25_scores)
    for term in query_terms:
        if term in inverted_index:
            relevant_docs.intersection_update(posting['image'] for posting in inverted_index[term])
    ranked_docs = [posting for posting in bm25_scores if posting[1] in relevant_docs]
    return ranked_docs


# Remove Duplicates - OpenCV

In [127]:
#  Compare images using OpenCV feature extraction
def compute_image_features(image_path):
    try:
        image_path = get_static_path(image_path)
        img = cv2.imread(image_path)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Use a feature extraction technique like ORB
        orb = cv2.ORB_create()
        keypoints, descriptors = orb.detectAndCompute(gray, None)
        return descriptors
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

#  Group similar images together
def find_similar_images(inverted_index):
    similar_images = defaultdict(list)
    for keyword, image_info_list in inverted_index.items():
        for image_info in image_info_list:
            image_path = image_info['positions']
            features = compute_image_features(image_path)
            if features is not None:
                similar_images[keyword].append({'image': image_info['image'], 'features': features})
    return similar_images

#Identify representative images from each group
def identify_representative_images(similar_images):
    representative_images = {}
    for keyword, images_info in similar_images.items():
        representative_images[keyword] = []
        # Choose a representative image based on a criteria, e.g., most features
        representative_image = max(images_info, key=lambda x: len(x['features']))
        representative_images[keyword].append(representative_image)
    return representative_images


# Retrieval: Return Images by User Query

In [156]:
def filter_images_by_query(query_tokens, selected_country, inverted_index, file_path, run_name):
   
    #static_folder = r"C:\Users\MR962RZ\OneDrive - EY\Documents\EY\NUIG Masters\DCU modules\Mechanics of Search\Assignment 2\static"
    # Define the path to the images metadata file
    metadata_file = get_static_path("textual_surrogate2.txt")
    # Initialize a dictionary to store image metadata
    image_metadata = {}
    # Read the metadata file and parse its contents as JSON
    with open(metadata_file, "r", encoding='utf-8') as file:
        data = json.load(file)
        
        # Calculate BM25 scores for images
        bm25_scores = rank_bm25(' '.join(query_tokens), inverted_index)
        write_results(bm25_scores, file_path, run_name)
        # Get the image metadata based on the ranked image IDs
        for term, image_id, score in bm25_scores:
            image_entry = next((entry for entry in data if entry['id'] == image_id), None)
            if image_entry:
                image_metadata[image_id] = image_entry
                image_metadata[image_id]['original_caption'] = image_entry['original_caption']
                image_metadata[image_id]['original_country'] = image_entry['original_country']
              #  image_entry['original_caption'] = image_entry['caption']
                
    # Step 6: Filter out duplicate images based on image features
    def filter_duplicates(image_metadata):
        unique_image_metadata = []
        seen_features = set()
        for image in image_metadata.values():
            features = compute_image_features(image['image_path'])
            if features is not None:
                features_tuple = tuple(tuple(row) for row in features)  # Convert NumPy array to tuple of tuples
                hash_value = hash(features_tuple)
                if hash_value not in seen_features:
                    unique_image_metadata.append(image)
                    seen_features.add(hash_value)
        return unique_image_metadata

    # Filter out duplicate images
    unique_image_metadata = filter_duplicates(image_metadata)

    return unique_image_metadata

# Output Model Results per Query

In [157]:
def write_results(results, file_path, run_name):
    print("Results:", results)  # Add this line to inspect the results
    with open(file_path, 'w') as f:
        rank = 1  # Initialize rank counter
        for result in results:
            if result[2] != 0.0:  # Exclude results with score 0.0
                query_id =  str(result[0])
                document_id =  str(result[1])
                score = result[2]
                # Write in TREC format: <query_id> <Q0> <doc_id> <rank> <score> <run_id>
                f.write(f"{query_id} Q0 {document_id} {rank} {score} {run_name}\n")
                rank += 1  # Increment rank for the next document



# Web Application - Flask

In [158]:
# Parse the textual_surrogate2.txt file to extract unique country values
textual_surrogate_file= get_static_path("original_textual_surrogate.txt")
with open(textual_surrogate_file, 'r') as f:
    data = json.load(f)

# Extract unique country values
unique_countries = sorted(set(entry['country'] for entry in data))

# Modify the image entries to include original captions
for entry in data:
    entry['original_caption'] = entry['caption']

In [None]:

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('combined.html', countries = unique_countries)

@app.route('/search-results', methods=['POST'])
def handle_form():
    if request.method == 'POST':
        # Get the search keywords and selected country from the form
        search_keywords = request.form.get('message')
        selected_country = request.form.get('country')

        # Combine search keywords and selected country if both are provided
        if search_keywords and selected_country:
            search_query = f"{search_keywords} {selected_country}"
        else:
            # Use either search keywords or selected country if one of them is provided
            search_query = search_keywords or selected_country

        # Preprocess the combined search query
        processed_query = preprocess(search_query)

        # Filter images based on the processed query
        file_path = get_static_path('results.txt')
        image_metadata = filter_images_by_query(processed_query, selected_country, inverted_index, file_path, run_name=search_query)
        num_images = len(image_metadata)

        # Render the gallery page with the filtered images
        return render_template('combined.html', country=search_query, num_images=num_images, images=image_metadata, countries=unique_countries, show_dynamic=True)
    else:
        # If the method is not POST, render the gallery page without the dynamic content
        return render_template('combined.html', show_dynamic=False, countries=unique_countries)


app.run(debug=True, port=8080, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:8080
Press CTRL+C to quit
127.0.0.1 - - [03/May/2024 15:55:19] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/bootstrap/css/bootstrap.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/bootstrap-icons/bootstrap-icons.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/swiper/swiper-bundle.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/glightbox/css/glightbox.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/aos/aos.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/css/main.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/bootstrap/js/bootstrap.bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/swiper/swiper-bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:19] "GET /static/vendor/glightbox/js/glightbox.min.js 

Results: [('robin', 'a2244995-131d-4d5e-9c8c-ec217a03b049', 2.440009675367606), ('robin', 'e8eede5f-7b6b-4e59-bbed-c4f373a478da', 3.987016329034715), ('robin', 'c7e034b8-3b9a-43c1-abf4-a24a2fa4b8e1', 2.440009675367606), ('robin', '791f4467-217c-4fab-b33f-af4e4ff158c2', 2.440009675367606), ('robin', 'ccb49917-7103-485a-8418-a128d7f63d1d', 2.440009675367606), ('robin', 'e15a884f-90f9-4a08-939d-e04a574df2c1', 2.440009675367606), ('robin', '439ba49d-db1e-40ae-bdb0-e861dadfdb88', 2.440009675367606), ('robin', '5477687b-5d1a-4a63-80e2-2aa4caf80d0c', 2.440009675367606), ('robin', 'ab5ea243-6b03-4bf4-a615-bb339343b4b4', 2.440009675367606), ('robin', '0d895b27-dfee-4470-9191-362a4f86ba0c', 3.987016329034715), ('robin', 'a1118364-eaf1-4aab-b80c-a0a36e51d6fd', 2.440009675367606), ('robin', '83167374-0e96-46b4-86c8-0295d8b51b98', 2.440009675367606), ('robin', 'a4478de6-d87c-44f6-85c2-379366a79ebf', 2.440009675367606), ('robin', '4a67b191-1965-4de5-bcd1-31ecb8032626', 2.440009675367606), ('robin', 

127.0.0.1 - - [03/May/2024 15:55:27] "POST /search-results HTTP/1.1" 200 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/bootstrap/css/bootstrap.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/bootstrap-icons/bootstrap-icons.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/swiper/swiper-bundle.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/glightbox/css/glightbox.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/aos/aos.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/css/main.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/bootstrap/js/bootstrap.bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/swiper/swiper-bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:27] "GET /static/vendor/glightbox/js/glightbox.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024

Results: [('robin', '07f2a06c-060f-4db9-abaa-9bd12c5b2b88', 3.987016329034715), ('ireland', '07f2a06c-060f-4db9-abaa-9bd12c5b2b88', 1.5265863814452612)]


" 304 -
127.0.0.1 - - [03/May/2024 15:55:38] "GET /static/css/main.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:38] "GET /static/vendor/bootstrap/js/bootstrap.bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:38] "GET /static/vendor/swiper/swiper-bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:38] "GET /static/vendor/glightbox/js/glightbox.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:38] "GET /static/vendor/aos/aos.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:38] "GET /static/js/main.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:48] "POST /search-results HTTP/1.1" 200 -
127.0.0.1 - - [03/May/2024 15:55:48] "GET /static/vendor/bootstrap/css/bootstrap.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:48] "GET /static/vendor/bootstrap-icons/bootstrap-icons.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:48] "GET /static/vendor/swiper/swiper-bundle.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:55:48] "GET /stat

Results: [('owl', '1a12ddc4-9ecc-4c28-aeef-5b1599024064', 0.97988313958459), ('owl', '30beeca7-d037-4a8a-abe7-6560c1ba5e27', 0.97988313958459), ('ireland', '1a12ddc4-9ecc-4c28-aeef-5b1599024064', 1.5265863814452612), ('ireland', '30beeca7-d037-4a8a-abe7-6560c1ba5e27', 1.5265863814452612)]
Results: [('duck', 'a595a315-15fb-4e49-b50b-e443f954e3f8', 0.08874669653359799), ('duck', '558293dd-60db-4a9c-8187-43592fef366a', 0.08874669653359799), ('duck', 'd9716e2c-159f-42b6-a486-6816a3751004', 0.1741200635078951), ('duck', '142750ab-006e-40f4-a330-5f99a1d3fb4a', 0.08874669653359799), ('duck', 'd6b0ba30-843c-49a6-bbdf-55dd2cfb24a5', 0.08874669653359799), ('duck', '98be8389-0c9b-4238-b322-9f13532a6fbd', 0.08874669653359799), ('duck', 'ae74c165-6f1b-4b51-8447-3e3c92d1f1cb', 0.1741200635078951), ('duck', 'dbab38dc-f999-40fe-b422-639913d7ea6c', 0.08874669653359799), ('duck', '05bd9432-6ed0-4834-b02d-1fd1d89954d3', 0.08874669653359799), ('ireland', 'a595a315-15fb-4e49-b50b-e443f954e3f8', 1.526586381

127.0.0.1 - - [03/May/2024 15:56:06] "POST /search-results HTTP/1.1" 200 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/bootstrap/css/bootstrap.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/bootstrap-icons/bootstrap-icons.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/swiper/swiper-bundle.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/glightbox/css/glightbox.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/aos/aos.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/bootstrap/js/bootstrap.bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/css/main.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/swiper/swiper-bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:06] "GET /static/vendor/glightbox/js/glightbox.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024

Results: [('bird', 'e655d0c9-88c0-401e-8d90-2c235c52e420', 0.02547840820577644), ('bird', 'a01c35aa-c6d8-483b-8bb8-7ad6a6bf9c08', 0.02547840820577644), ('bird', 'c49de302-2878-4b71-9894-4f657ddafde1', 0.02547840820577644), ('bird', '2a2e00e8-b851-433b-8db9-92477646f2b2', 0.02547840820577644), ('bird', 'd09695c9-99a9-4dfc-8fb4-41ef965d06ab', 0.02547840820577644), ('bird', 'b7d0c2c2-ddb5-4ebf-a72d-a16f25bc419b', 0.02547840820577644), ('bird', '795c7595-701e-499e-a4ae-fbde2a2de9cc', 0.02547840820577644), ('bird', '290295e6-56dc-4f22-8cf0-34d8f3eac506', 0.02547840820577644), ('bird', 'f9c3072d-c0ff-4b53-a320-335490fefa8a', 0.02547840820577644), ('bird', '60f0b9f9-baee-455e-9f50-f5828a3be8d5', 0.02547840820577644), ('bird', 'f0c4edfc-bb7b-4a39-b0e9-d6e4654ed633', 0.02547840820577644), ('bird', 'b6c72815-d51b-493a-a5d5-6012e94b27f0', 0.02547840820577644), ('bird', 'a3435421-65bf-4512-b9c2-9d9821cbefd8', 0.02547840820577644), ('bird', '7f890eb7-8ab4-4886-b524-d9cd2d690e96', 0.0254784082057764

Error processing image C:\Users\MR962RZ\OneDrive - EY\Documents\EY\NUIG Masters\DCU modules\Mechanics of Search\Assignment 2\static\images\3f9f7ac2-aff8-406f-aa67-77b9b494218e.jpg: OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'

Error processing image C:\Users\MR962RZ\OneDrive - EY\Documents\EY\NUIG Masters\DCU modules\Mechanics of Search\Assignment 2\static\images\4ecd600e-b16b-4c91-95db-75a48c376622.jpg: OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'



127.0.0.1 - - [03/May/2024 15:56:41] "POST /search-results HTTP/1.1" 200 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/bootstrap/css/bootstrap.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/bootstrap-icons/bootstrap-icons.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/swiper/swiper-bundle.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/glightbox/css/glightbox.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/css/main.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/aos/aos.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/bootstrap/js/bootstrap.bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/swiper/swiper-bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 15:56:41] "GET /static/vendor/glightbox/js/glightbox.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024

Results: [('bird', 'a6b8da71-947a-45a4-9a69-78cdab7f9a10', 0.050425684553674875), ('bird', '6f668a88-a6d0-4239-bd5a-f6b6d89cc1c3', 0.02547840820577644), ('bird', 'f54311dc-e83e-4562-90cb-ec931141e745', 0.02547840820577644), ('prey', 'a6b8da71-947a-45a4-9a69-78cdab7f9a10', 0.29750617213272484), ('prey', '6f668a88-a6d0-4239-bd5a-f6b6d89cc1c3', 0.29750617213272484), ('prey', 'f54311dc-e83e-4562-90cb-ec931141e745', 0.29750617213272484), ('ireland', 'a6b8da71-947a-45a4-9a69-78cdab7f9a10', 1.5265863814452612), ('ireland', '6f668a88-a6d0-4239-bd5a-f6b6d89cc1c3', 1.5265863814452612), ('ireland', 'f54311dc-e83e-4562-90cb-ec931141e745', 1.5265863814452612)]


127.0.0.1 - - [03/May/2024 16:00:47] "POST /search-results HTTP/1.1" 200 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/bootstrap/css/bootstrap.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/bootstrap-icons/bootstrap-icons.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/swiper/swiper-bundle.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/glightbox/css/glightbox.min.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/aos/aos.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/css/main.css HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/bootstrap/js/bootstrap.bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/swiper/swiper-bundle.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024 16:00:47] "GET /static/vendor/glightbox/js/glightbox.min.js HTTP/1.1" 304 -
127.0.0.1 - - [03/May/2024