<a href="https://colab.research.google.com/github/moses-crasto/Mechanics-of-Search/blob/main/Mos_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pexels scraper

In [None]:
import argparse
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrap_pexels(
    chromedriver_path=r"chromedriver_win32\chromedriver.exe",
    links_file="links.csv",
    category="animals",
    max_posts=250,
    scroll_coef=0.4,
    max_idle_scrolls=50,
):
    """Scrape image URLs and alt texts from a Pexels search page into a CSV.

    Opens ``https://www.pexels.com/search/<category>/`` in Chrome, scrolls the
    page step by step, collects the ``src`` and ``alt`` of the first ``<img>``
    in every media card, and appends the unique results to ``links_file``.

    Args:
        chromedriver_path: Kept for backward compatibility; it is not used,
            the driver is created with Selenium's default driver lookup.
        links_file: CSV file the rows are appended to. A header row is
            written only when the file is empty.
        category: Search term placed in the Pexels search URL.
        max_posts: Stop once this many unique images have been collected.
        scroll_coef: Fraction of the initial document height scrolled per step.
        max_idle_scrolls: Give up after this many consecutive scroll steps
            that produced no new image, so the loop cannot spin forever when
            the page has fewer than ``max_posts`` results.

    Raises:
        selenium.common.exceptions.TimeoutException: If no media card shows
            up within 10 seconds (nothing is saved in that case).
    """
    # Use Chrome options
    chrome_options = Options()
    chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
    browser = webdriver.Chrome(options=chrome_options)

    data = []
    unique_image_sources = set()  # image URLs already collected
    try:
        browser.get(f"https://www.pexels.com/search/{category}/")
        scroll_step = (
            browser.execute_script("return document.body.scrollHeight") * scroll_coef
        )
        scroll_to = scroll_step

        def scroll_down():
            # scrollTo takes (x, y): keep x at 0 and only advance vertically.
            nonlocal scroll_to
            browser.execute_script(f"window.scrollTo(0, {scroll_to});")
            scroll_to += scroll_step

        scroll_down()

        idle_scrolls = 0
        while len(data) < max_posts and idle_scrolls < max_idle_scrolls:
            # Wait for the cards to be present
            elements = WebDriverWait(browser, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "MediaCard_card__6_MG7"))
            )
            found_new = False
            for el in elements:
                # find_elements returns [] instead of raising for cards
                # that carry no <img> (e.g. placeholders).
                images = el.find_elements(By.TAG_NAME, "img")
                if not images:
                    continue
                img_element = images[0]

                img_src = img_element.get_attribute("src")
                # Skip images without a URL and ones already collected.
                if not img_src or img_src in unique_image_sources:
                    continue

                data.append((img_src, img_element.get_attribute("alt")))
                unique_image_sources.add(img_src)
                found_new = True
                if len(data) >= max_posts:
                    break

            idle_scrolls = 0 if found_new else idle_scrolls + 1

            scroll_down()
            time.sleep(0.2)  # give lazy-loaded cards a moment to appear
    finally:
        # Always release the browser process, even when scraping fails.
        browser.quit()

    print("Saving links: ", links_file)
    # Append so repeated runs accumulate; UTF-8 keeps non-ASCII alt texts
    # writable regardless of the platform's default encoding.
    with open(links_file, "a", newline='', encoding="utf-8") as fout:
        writer = csv.writer(fout)
        # Write header only if the file is empty
        if fout.tell() == 0:
            writer.writerow(["Image Source", "Alt Text"])
        writer.writerows(data)
    print("Done")

if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to scrap_pexels.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--links_file",
        default="links.csv",
        help="CSV file used to store links to posts",
    )
    cli.add_argument(
        "-c",
        "--category",
        choices=['animals', 'places', 'sports', 'food', 'rainbow', 'holiday', 'vehicles', 'indoor', 'man', 'woman'],
        help="Category (animals, places, sports, food, rainbow, holiday, vehicles, indoor, man, woman)",
    )
    cli.add_argument(
        "--max_posts", default=250, type=int, help="Maximum number of posts to be scraped"
    )
    options = cli.parse_args()

    # A category is mandatory; error() prints the usage text and exits.
    if options.category is None:
        cli.error("Please provide a category.")

    # Option names match the keyword parameters of scrap_pexels.
    scrap_pexels(**vars(options))

# Annotation

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests
from io import BytesIO
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_en_stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used by the text-preprocessing code in this notebook.
nltk.download('punkt')  # tokenizer data (unpacked under tokenizers/punkt), needed by word_tokenize
nltk.download('stopwords')  # stop-word corpus read via stopwords.words('english')
nltk.download('wordnet')  # WordNet corpus, presumably what WordNetLemmatizer relies on

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the CSV file
# It must contain an 'Image Source' column: the annotation step below reads it.
csv_file_path = '/content/drive/MyDrive/datasets/links.csv'  # Update with your file path
df = pd.read_csv(csv_file_path)

# Load the pre-trained ResNet50 model
# include_top=True keeps the classification head, whose output is what
# decode_predictions turns into class names in get_annotation.
resnet_model = ResNet50(weights='imagenet', include_top=True)

# Define a function to preprocess the image
def preprocess_image(image_url, timeout=10):
    """Download an image and turn it into a ResNet50 input batch.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot hang the whole annotation run.

    Returns:
        The image resized to 224x224, passed through ``preprocess_input`` and
        given a leading batch axis, or ``None`` when the image could not be
        downloaded or decoded (the error is printed).
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for non-200 status codes
    except requests.exceptions.RequestException as e:
        print(f"Error downloading image from {image_url}: {e}")
        return None  # Handle download errors gracefully

    try:
        with Image.open(BytesIO(response.content)) as raw:
            # Force three channels: grayscale, palette and RGBA images would
            # otherwise produce arrays the model cannot consume.
            rgb = raw.convert("RGB").resize((224, 224))
            img_array = np.array(rgb)
    except OSError as e:
        # Pillow reports unreadable or truncated image data as OSError subclasses.
        print(f"Error decoding image from {image_url}: {e}")
        return None

    img_array = preprocess_input(img_array)  # Apply ResNet specific preprocessing
    return np.expand_dims(img_array, axis=0)  # Add batch dimension

# Define a function to get annotations for an image
def get_annotation(image_url):
    """Return the top-5 ResNet50 class names for an image as one string.

    The names are joined with ", ". Returns ``None`` when
    ``preprocess_image`` could not produce an input for the URL.
    """
    batch = preprocess_image(image_url)
    if batch is None:
        return None

    scores = resnet_model.predict(batch)
    # decode_predictions yields one list per batch item; only one image here.
    best_five = decode_predictions(scores, top=5)[0]
    labels = []
    for _, label, _ in best_five:
        labels.append(label)
    return ', '.join(labels)

# Annotate images and update CSV file
# get_annotation runs once per row (one download plus one model prediction);
# rows whose image could not be processed get None in the new column.
df['Annotation'] = df['Image Source'].apply(get_annotation)

# Save the updated DataFrame to CSV
# This overwrites the input file in place, now with the extra 'Annotation' column.
df.to_csv(csv_file_path, index=False)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/imagenet_class_index.json


# VSM

In [None]:
# English stop words as a set, so the per-token membership test in
# preprocess_text is a constant-time lookup.
stop_words = set(nltk_en_stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Normalise a piece of text for TF-IDF indexing.

    Lower-cases the text, replaces every character that is not a letter,
    digit or whitespace with a space, tokenises it, drops English stop words
    and lemmatises what is left. Returns the tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    lemmatizer = nltk.WordNetLemmatizer()
    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue  # remove stop words
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Load data
df = pd.read_csv("/content/drive/MyDrive/datasets/links.csv")
# Each document is the alt text followed by the ResNet annotation. Missing
# cells are read back as NaN; blank them first, otherwise astype(str) would
# inject the literal token "nan" into the indexed text.
text_data = df[['Alt Text', 'Annotation']].fillna('').apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Preprocess text
preprocessed_text = [preprocess_text(item) for item in text_data]

# Fit and transform the data with TfidfVectorizer
tfidf_vectors = vectorizer.fit_transform(preprocessed_text)

# Store TF-IDF matrix as a NumPy array
# (dense copy of the sparse matrix; the fitted vectorizer itself is not
# saved, so queries must use the in-memory `vectorizer` from this cell)
np.save("/content/drive/MyDrive/datasets/tfidf_matrix.npy", tfidf_vectors.toarray())

In [None]:
# Read back the dense TF-IDF document matrix saved by the indexing step
tfidf_matrix = np.load("/content/drive/MyDrive/datasets/tfidf_matrix.npy")

# Push the query through the same cleaning and the already-fitted vectorizer
query_text = "elephant"
preprocessed_query = preprocess_text(query_text)
query_vector = vectorizer.transform([preprocessed_query])

# A single query gives a single row of scores, one entry per document
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

# Document positions ordered from most to least similar
ranked_indices = np.argsort(similarity_scores[0])[::-1]

# Print the details of the best matches
top_k = 10  # how many documents to show
for idx in ranked_indices[:top_k]:
    row = df.iloc[idx]
    alt_text, annotation, image_url = row['Alt Text'], row['Annotation'], row['Image Source']
    similarity_score = similarity_scores[0][idx]
    print(
        f"Alt Text: {alt_text}",
        f"Annotation: {annotation}",
        f"Image URL: {image_url}",
        f"Similarity Score: {similarity_score}\n",
        sep="\n",
    )

Alt Text: Free 1 Elephant Beside on Baby Elephant Stock Photo
Annotation: African_elephant, tusker, Indian_elephant, warthog, Arabian_camel
Image URL: https://images.pexels.com/photos/66898/elephant-cub-tsavo-kenya-66898.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
Similarity Score: 0.8182450983737064

Alt Text: Free Gray Elephant Stock Photo
Annotation: African_elephant, tusker, Indian_elephant, hartebeest, zebra
Image URL: https://images.pexels.com/photos/59840/elephant-baby-safari-elephants-africa-59840.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
Similarity Score: 0.7830139351584846

Alt Text: Free Grayscale Photo of 2 Elephants Stock Photo
Annotation: African_elephant, tusker, Indian_elephant, triceratops, warthog
Image URL: https://images.pexels.com/photos/33394/elephant-ivory-animals-africa.jpg?auto=compress&cs=tinysrgb&dpr=1&w=500
Similarity Score: 0.7635549482424364

Alt Text: Free Photo of Elephants on Grass Stock Photo
Annotation: tusker, Indian_elephant, African_elephant, water

# app.py

In [None]:
from flask import Flask, render_template, request
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.tokenize import word_tokenize

# Flask application object; the route decorators below register views on it.
app = Flask(__name__)

# Load the DataFrame with image information
# The path is relative, so it is resolved against the process's working
# directory. The code below reads the 'Alt Text', 'Annotation' and
# 'Image Source' columns.
df = pd.read_csv("links.csv")

# Initialize TF-IDF vectorizer
# It is fitted once at import time (below) and reused for every query.
vectorizer = TfidfVectorizer()

# Load English stopwords
# NOTE(review): nothing visible in this file downloads the NLTK data, so the
# 'stopwords' corpus presumably has to be installed beforehand - confirm.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Clean a string the same way for indexed documents and for queries.

    Steps: lower-case, turn anything that is not a letter, digit or
    whitespace into a space, tokenise, discard English stop words, lemmatise.
    The surviving tokens are returned joined by single spaces.
    """
    lowered = text.lower()
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    content_words = filter(
        lambda token: token not in stop_words,  # remove stop words
        word_tokenize(alphanumeric_only),
    )
    lemmatizer = nltk.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, list(content_words)))

# Fit TF-IDF vectorizer
# Each document is the alt text followed by the annotation. Missing cells are
# read from the CSV as NaN; blank them first, otherwise astype(str) would
# index the literal token "nan" for those rows.
text_data = df[['Alt Text', 'Annotation']].fillna('').apply(lambda row: ' '.join(row.astype(str)), axis=1)
preprocessed_text = [preprocess_text(item) for item in text_data]
# Sparse document-term matrix, one row per CSV row; search() scores against it.
tfidf_vectors = vectorizer.fit_transform(preprocessed_text)

@app.route('/')
def index():
    """Serve the search page with no query and no results."""
    return render_template('index.html')

@app.route('/search', methods=['POST'])
def search():
    """Rank the indexed images against the submitted query and render them.

    Reads the ``query`` form field, vectorises it with the fitted TF-IDF
    vectorizer and scores it against every document by cosine similarity.
    Up to 10 best matches are passed to ``index.html`` as ``results``, each a
    dict with ``alt_text``, ``annotation``, ``image_url`` and
    ``similarity_score``. Documents that share no term with the query are
    left out, so a query that matches nothing produces an empty result list
    instead of ten arbitrary images.
    """
    query = request.form['query']
    preprocessed_query = preprocess_text(query)
    query_vector = vectorizer.transform([preprocessed_query])
    # One query -> one row of scores, one entry per document.
    scores = cosine_similarity(query_vector, tfidf_vectors)[0]
    ranked_indices = np.argsort(scores)[::-1]  # best match first
    top_k = 10  # Number of top-ranked documents to retrieve
    top_results = []
    for idx in ranked_indices[:top_k]:
        similarity_score = float(scores[idx])  # plain float, not a numpy scalar
        if similarity_score <= 0:
            # Scores are in descending order, so nothing after this matches.
            break
        row = df.iloc[idx]
        top_results.append({
            'alt_text': row['Alt Text'],
            'annotation': row['Annotation'],
            'image_url': row['Image Source'],
            'similarity_score': similarity_score,
        })
    return render_template('index.html', query=query, results=top_results)

if __name__ == '__main__':
    # Start Flask's built-in development server when the file is run directly.
    # NOTE(review): debug=True is meant for local development only; per the
    # Flask docs it should not be enabled on a publicly reachable deployment.
    app.run(debug=True)


# index.html

In [None]:
<!DOCTYPE html>
<html class="scroll-smooth" lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Image Search Engine</title>
    <!-- Tailwind CSS v2 stylesheet from the unpkg CDN; the utility classes used throughout the body come from it. -->
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet">
</head>

<body class="p-16 bg-blue-50">
    <header class="w-full p-8 flex justify-center">
        <style type="text/css">
            @import url("https://fonts.googleapis.com/css?family=Inconsolata|Roboto+Mono|Ubuntu+Mono|Cutive+Mono");
            pre{
            display: inline-block;
            font-size: 7px;
            text-align: center;
            font-family: Ubuntu Mono, monospace !important;
            text-rendering: optimizeSpeed;
            }
        </style>
        <pre style="font-size: 7px; font-family: &quot;Ubuntu Mono&quot;; background-color: rgb(255, 255, 255); color: rgb(0, 0, 0); line-height: 7px;">
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
        $$$$$$$$$$.........$$$$$$$$$$$$$$V........$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$..............$$$$$$$$$$
        $$$$$$$$$$.........$$$$$$$$$$$$$$.........$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$:...................$$$$$$$
        $$$$$$$$$$..........$$$$$$$$$$$$..........$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$.......................$$$$$
        $$$$$$$$$...........$$$$$$$$$$$$..........V$$$$$$$$$$$$$$$$$$$$$$$$$$$$........................$$$$$
        $$$$$$$$$............$$$$$$$$$$............$$$$$$$$$$$$$$$$$$$$$$$$$$$$..........V$$..........$$$$$$
        $$$$$$$$$.............$$$$$$$$V............$$$$$$$$$$$$$$$$$$$$$$$$$$$$.........$$$$$$$....V$$$$$$$$
        $$$$$$$$:.............$$$$$$$$.............$$$$$$$$$$$$$$$$$$$$$$$$$$$$...........V$$$$$$$$$$$$$$$$$
        $$$$$$$$...............$$$$$$...............$$$$$$$$$$$$$$$$$$$$$$$$$$$$.............$$$$$$$$$$$$$$$
        $$$$$$$$...............$$$$$$...............$$$$$$$$$$$$$$$$$$$$$$$$$$$$$..............$$$$$$$$$$$$$
        $$$$$$$$................$$$$................$$$$$$$$$..........$$$$$$$$$$$:...............$$$$$$$$$$
        $$$$$$$........$.........$$.........V........$$$$$................$$$$$$$$$$$...............$$$$$$$$
        $$$$$$$........$$........$$........$$........$$$....................$$$$$$$$$$$...............$$$$$$
        $$$$$$$........$$:................$$$........$$......................$$$$$$$$$$$$..............$$$$$
        $$$$$$........$$$$................$$$................................:$$$$$$$$$$$$$$............$$$$
        $$$$$$........$$$$$..............$$$$$................................$$$$$..$$$$$$$$$..........$$$$
        $$$$$$........$$$$$..............$$$$$................................$:......$$$$$$$$$.........$$$$
        $$$$$........$$$$$$$............$$$$$$..........................................:$$$$V..........$$$$
        $$$$$........$$$$$$$$..........$$$$$$$$.........................................................$$$$
        $$$$$........$$$$$$$$..........$$$$$$$$........$....................$$$........................$$$$$
        $$$$........:$$$$$$$$$........$$$$$$$$$........$$$................:$$$$$.....................$$$$$$$
        $$$$........$$$$$$$$$$........$$$$$$$$$$.......$$$$$$...............$$$$$$$................$$$$$$$$$
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$........$$$$$$$$$$:....V$$$$$$$$$$$$$$
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$.......:$$$$$$$$$$$$$$$$$$$$$$$$$$$$
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$........$$$$$$$$$$$$$$$$$$$$$$$$$$$
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$........$$$$$$$$$$$$$$$$$$$$$$$$$
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$.......$$$$$$$$$$$$$$$$$$$$$$$$$
        $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$...$$$$$$$$$$$$$$$$$$$$$$$$$$$
        </pre>
    </header>
    <main class="px-8">
        {# Search form: posts the "query" field to the /search endpoint. #}
        <form action="/search" method="POST" class="flex justify-center items-center">
            <input type="text" name="query" placeholder="Enter your query" required
                class="w-4/12 italic px-2 py-2 transition ease-in-out border-2 border-gray-800 rounded-lg focus:outline-none focus:border-blue-700 focus:ring-2 focus:ring-blue-700">
            {# The alt text gives the icon-only submit button an accessible name. #}
            <button type="submit"><img class="w-16 h-16 rounded-full" src="http://www.endlessicons.com/wp-content/uploads/2012/12/search-icon.png" alt="Search"></button>
        </form>
        {# Results section: rendered only when a query was submitted. #}
        {% if query %}
        <h2 class="italic m-4 text-lg">Showing results for <a class="underline">{{ query }}</a></h2>
        <div class="results grid grid-cols-3 gap-4">
            {% for result in results %}
            <div class="result row-auto">
                <img class="rounded-lg shadow-gray-800 hover:shadow-2xl transition ease-in-out" src="{{ result.image_url }}" alt="{{ result.alt_text }}">
            </div>
            {% else %}
            {# Shown when the result list is empty, so the page is not left blank. #}
            <p class="col-span-3 italic">No matching images found.</p>
            {% endfor %}
        </div>
        {% endif %}
    </main>
</body>

</html>