## Stage I: Image Collection via Web Scraping

In the first stage of the project, the goal was to collect images from websites, with a special focus on identifying images that could correspond to company logos.

In [None]:
import os
import requests
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import warnings

In [None]:
# The scraping requests in this notebook are sent with verify=False; without
# this call urllib3 would emit an InsecureRequestWarning for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Hide UserWarning messages raised by the libraries used during scraping.
warnings.simplefilter("ignore", UserWarning)

# Folder that receives every image saved by the scraper.
dataset_path = './dataset/'

if os.path.exists(dataset_path):
    print(f"The folder '{dataset_path}' already exists.")
else:
    os.makedirs(dataset_path)

# Opening in append mode creates logs.txt when it is missing and leaves any
# existing content untouched.
with open("logs.txt", "a") as logs_file:
    pass

The `get_logo_from_html(domain)` function was used to extract images from the webpages of the specified domains.

### 1. Accessing URLs
The function attempts to access multiple variations of URLs for each domain to ensure that a functional URL is found:
- `https://www.{domain}`
- `http://www.{domain}`
- `https://{domain}`
- `http://{domain}`

### 2. Searching for Relevant Images
After obtaining the HTML page, the function uses **BeautifulSoup** to parse the content and search for all `<img>` tags, which contain the sources of the images. In this way, all images from the website are obtained. However, since not all images are automatically logos, a filter is applied based on the following conditions to save only the images that appear to be logos:

- **Searching within the `<header>` tag**:
  - An `<img>` is considered relevant only if it is located inside a `<header>` tag (the visible top section of a page, where logos are typically placed) **and** at least one of the following holds: its `alt` attribute contains the word "logo", or its file name contains the word "logo" or "favicon".

- **`<link>` tags with `rel='icon'` or `rel='shortcut icon'` attributes**:
  - These tags are used to specify the site favicons, which are usually logos. These images are also added to the list of relevant images.

- **Manually adding `/favicon.ico`**:
  - By default, `/favicon.ico` is added to the list of images, as this is the standard location for favicons used by many sites.

- **Checking Image Dimensions**:
  - The dimensions of these images must be at most 512px in both width and height (any image wider or taller than 512px is discarded), as logos typically have small dimensions.

### 3. Saving Images to the Dataset
Finally, only the images from the website that meet the conditions listed above are saved to the dataset.

In [None]:
def save_image(url, file_name, dataset_path):
    """Download one image and store it as a PNG if it passes the logo filters.

    The image is saved to ``<dataset_path>/<file_name>.png`` only when the
    download succeeds, the response declares a supported image media type,
    Pillow can decode it, and it is at most 512px wide and 512px high.

    Args:
        url: Absolute URL of the image to download.
        file_name: Output file name without extension.
        dataset_path: Directory the PNG is written to.

    Returns:
        Always ``None``. Failures are appended to ``logs.txt`` instead of
        being raised; failed ``favicon.ico`` downloads are not logged.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36 Edge/112.0.1722.64',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        try:
            response = requests.get(url, headers=headers, timeout=10, verify=False)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # '/favicon.ico' is probed speculatively for every domain, so a
            # miss there is expected and would only clutter the log.
            if 'favicon.ico' not in url:
                with open("logs.txt", "a") as logs_file:
                    logs_file.write(f"Error accessing {url}: {e}\n")
            return None

        valid_types = {
            'image/png', 'image/jpeg', 'image/jpg', 'image/webp',
            'image/gif', 'image/x-icon', 'image/vnd.microsoft.icon',
        }
        # The header may carry parameters ("image/png; charset=binary") and
        # media types are case-insensitive, so compare only the normalized
        # bare type instead of the raw header value.
        content_type = response.headers.get('Content-Type', '')
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type not in valid_types:
            return None

        image_data = BytesIO(response.content)
        try:
            image = Image.open(image_data)
            image.verify()
            # An image object cannot be used after verify(); rewind the
            # buffer and open it again to get a usable one.
            image_data.seek(0)
            image = Image.open(image_data)
        except UnidentifiedImageError:
            return None

        width, height = image.size
        if width > 512 or height > 512:
            return None

        image = image.convert("RGBA")
        file_path = os.path.join(dataset_path, file_name + ".png")
        image.save(file_path, format="PNG")

    except Exception as e:
        # Best-effort boundary: a single bad image must not stop the scrape.
        with open("logs.txt", "a") as logs_file:
            logs_file.write(f"Error downloading the image from {url}: {e}\n")

In [None]:
def get_logo_from_html(domain):
    """Fetch a domain's home page and collect URLs of logo-like images.

    The URL variants ``https://www.``, ``http://www.``, ``https://`` and
    ``http://`` are tried in that order. A variant that raises a request
    error or answers with an error status is skipped and the next one is
    tried. A 403 response is accepted so that the caller can still probe
    ``/favicon.ico`` on that host.

    Args:
        domain: Bare domain name, e.g. ``"example.com"``.

    Returns:
        A tuple ``(base_url, image_urls)``. ``base_url`` is the URL variant
        that answered; ``image_urls`` is a de-duplicated list, in discovery
        order, of possibly relative image URLs: ``<img>`` tags inside
        ``<header>`` whose alt text or file name mentions "logo"/"favicon",
        ``<link rel="icon">`` / ``<link rel="shortcut icon">`` targets, and
        ``/favicon.ico``. On failure ``(None, [])`` is returned and the
        error is appended to ``logs.txt``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36 Edge/112.0.1722.64',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept': 'image/x-icon, image/*;q=0.8, */*;q=0.5'
        }

        urls = [
            f'https://www.{domain}',
            f'http://www.{domain}',
            f'https://{domain}',
            f'http://{domain}'
        ]

        response = None
        correctUrl = ''
        last_error = None
        for url in urls:
            headers['Referer'] = url
            try:
                candidate = requests.get(url, verify=False, headers=headers, timeout=10)
                # 403 is tolerated on purpose; every other error status
                # makes this variant count as failed.
                if candidate.status_code != 403:
                    candidate.raise_for_status()
            except requests.exceptions.RequestException as e:
                # Remember why this variant failed and fall through to the
                # next one instead of giving up on the whole domain.
                last_error = e
                continue

            response = candidate
            correctUrl = url
            break

        if response is None:
            with open("logs.txt", "a") as logs_file:
                logs_file.write(f"All requests failed for {domain}: {last_error}\n")
            return None, []

        try:
            soup = BeautifulSoup(response.text, 'html.parser')

            found_images = []
            for image in soup.find_all('img'):
                image_url = image.get('src')
                alt_text = image.get('alt', '').lower()
                file_name = os.path.basename(image_url).lower() if image_url else ''

                # Only header images that advertise themselves as a logo.
                if image.find_parent('header') and (
                    'logo' in alt_text or
                    'logo' in file_name or
                    'favicon' in file_name
                ):
                    found_images.append(image_url)

            link_tags = soup.find_all('link', rel='icon') + soup.find_all('link', rel='shortcut icon')
            for link in link_tags:
                found_images.append(link.get('href'))

            # Conventional favicon location, probed even if not declared.
            found_images.append('/favicon.ico')

            # Drop empty entries and duplicates while keeping discovery
            # order, so the caller's per-image numbering is reproducible.
            found_images = list(dict.fromkeys(filter(None, found_images)))
            return correctUrl, found_images

        except Exception as e:
            with open("logs.txt", "a") as logs_file:
                logs_file.write(f"Parsing error for {domain}: {e}\n")
            return None, []
    except Exception as e:
        with open("logs.txt", "a") as logs_file:
            logs_file.write(f"Error processing domain {domain}: {e}\n")
        return None, []

The process was repeated for each domain in the dataset to collect as many images that appeared to be logos as possible.

In [None]:
def scrape_from_domains(domains):
    """Scrape and save logo candidates for every domain in ``domains``.

    For each domain, the candidate image URLs returned by
    ``get_logo_from_html`` are resolved against the page URL and passed to
    ``save_image``, which writes into the module-level ``dataset_path``.
    Output files are named ``<domain>_<domain index>_<image index>``.
    An exception raised while handling a domain is appended to ``logs.txt``
    and the loop moves on to the next domain.

    Args:
        domains: Sized iterable of domain name strings.
    """
    total = len(domains)

    print(f"Starting to process {total} domains...")

    for index, domain in enumerate(domains):
        base_name = domain + '_' + str(index)

        # Progress message on every 100th domain, starting with the first.
        if not index % 100:
            print(f"Processing {index + 1}/{total} domains...")

        try:
            page_url, candidates = get_logo_from_html(domain)

            for position, candidate in enumerate(candidates):
                # Candidates may be relative; make them absolute first.
                absolute_url = urljoin(page_url, candidate)
                save_image(absolute_url, f"{base_name}_{position}", dataset_path)
        except Exception as error:
            with open("logs.txt", "a") as log:
                log.write(f"Error processing domain {domain}: {error}\n")

    print("Processing complete. All domains have been processed.")

# Load the input table and run the scraper over its 'domain' column.
df = pd.read_parquet(
    "logos.snappy.parquet",
    engine="pyarrow",
)

domains = df.loc[:, "domain"]

scrape_from_domains(domains)