# Image Data Extraction

__Goal:__ Download before/after photos from orthodontic websites to use as images for braces prediction.

## 0 Set Up

### 0.1 Imports

In [1]:
# import utility libraries
import os
import re
import shutil
import time

# import environment variables
from dotenv import load_dotenv
from environments import environment as env

# import scraping libraries
from bs4 import BeautifulSoup
import requests

# import google colab
from google.colab import files

### 0.2 Directories

Create directories to store raw data in Google Colab

In [2]:
# create the raw-data directory; -p also creates `data/` and does not fail if the path already exists
!mkdir -p data/raw

In [4]:
# base directory (relative to the working directory) that the per-site download folders are joined onto
RAW_DATA_DIRPATH = 'data/raw'

### 0.3 Custom Utilities

Creating a `ToothScraper` class to support image scraping (site urls are stored in environment.py)

In [None]:
from environment.environment import SCRAPE_URLS

In [None]:
class ToothScraper:
  '''Web scraper that collects image URLs from a single page and downloads them.

  Typical use: construct the scraper, call `fetch_img_urls()` to populate
  `img_urls`, then call `download_images()` to write the files to `dest_dir`.
  '''

  # seconds to wait on a single HTTP request before giving up
  REQUEST_TIMEOUT = 30

  def __init__(self, site_url: str, dest_dir: str, imgs_selector: str = 'img', img_url_parser = lambda img: img['src']):
    '''Store the scraping configuration; no network access happens here.

    Args:
      site_url: URL of the page to scrape.
      dest_dir: directory the images are written to (created on download if missing).
      imgs_selector: CSS selector passed to `BeautifulSoup.select` to pick the image elements.
      img_url_parser: callable that maps one selected element to an image URL;
        falsy results (None / '') are dropped.
    '''
    self.site_url = site_url
    self.dest_dir = dest_dir
    self.imgs_selector = imgs_selector
    self.img_url_parser = img_url_parser

    # filled by fetch_img_urls(); starts empty so download_images() is a no-op before then
    self.img_urls = []

  @staticmethod
  def download_image(url: str, fpath: str, timeout: float = REQUEST_TIMEOUT):
    '''Download `url` and write the response body to `fpath`.

    Args:
      url: address of the image.
      fpath: path of the file to write (its directory must already exist).
      timeout: seconds to wait for the server before giving up.

    Returns:
      0 on success, 1 if the request failed or did not answer with HTTP 200.
    '''
    try:
      # the context manager releases the connection even if writing fails
      with requests.get(url, stream = True, timeout = timeout) as res:

        # check if the download was successful
        if res.status_code != 200:
          print(f'Unable to fetch image: {url}')
          return 1

        # res.raw is the undecoded stream; ask urllib3 to undo gzip/deflate
        # so the saved file is the actual image
        res.raw.decode_content = True

        # save the image to the file path
        with open(fpath, 'wb') as f:
          shutil.copyfileobj(res.raw, f)

    except requests.RequestException as err:
      # connection problems / timeouts: report and let the caller carry on
      print(f'Unable to fetch image: {url} ({err})')
      return 1

    return 0

  def fetch_img_urls(self):
    '''Fetch `site_url`, parse it and store the image URLs in `self.img_urls`.

    Also keeps the raw response in `self.page` and the parsed document in `self.soup`.
    '''

    # fetch the main page
    self.page = requests.get(self.site_url, timeout = self.REQUEST_TIMEOUT)
    if self.page.status_code != 200:
      # keep going (the body is still parsed), but make the failure visible
      print(f'Unexpected status {self.page.status_code} for page: {self.site_url}')

    # parse the page
    self.soup = BeautifulSoup(self.page.content, 'html.parser')

    # select all images
    img_tags = self.soup.select(self.imgs_selector)

    # extract the image urls and filter out empty urls
    parsed_urls = (self.img_url_parser(img) for img in img_tags)
    self.img_urls = [url for url in parsed_urls if url]

  def download_images(self, delay: float = 0.25):
    '''Download every URL in `self.img_urls` into `dest_dir` as `raw_<index><ext>`.

    URLs without a file extension and failed downloads are reported and skipped.

    Args:
      delay: seconds to sleep after each successful download, to avoid getting blocked.
    '''
    from urllib.parse import urlsplit

    # make sure the destination exists before the first open(..., 'wb')
    if self.dest_dir:
      os.makedirs(self.dest_dir, exist_ok = True)

    for i, url in enumerate(self.img_urls):

      # take the extension from the URL path only, so a query string or
      # fragment (e.g. "photo.jpg?v=2") does not end up in the file name
      file_ending = os.path.splitext(urlsplit(url).path)[1]
      if not file_ending:
        print(f'Unable to parse file ending: {url}')
        continue

      img_rename = f'raw_{i}{file_ending}'
      img_dest = os.path.join(self.dest_dir, img_rename)

      # download_image returns 0 on success, non-zero on failure
      ret = self.download_image(url, img_dest)
      if ret:
        print(f'Unable to download file: {url}')
        continue

      # sleep to avoid getting blocked
      time.sleep(delay)

## 1 Scrape Data

### 1.1 Scrape the data from first URL (doing this 1-by-1 on the first pass)

In [None]:
# grab the config entry for the first site from SCRAPE_URLS
# (indexed below with the 'site_url' and 'storage_url' keys)
url_1_obj = SCRAPE_URLS[0]

# initialize the destination to save the images from the first download
# TODO: automatically generate destination path
dest = os.path.join(RAW_DATA_DIRPATH, '1')

Define the site-specific `img_url_parser` (Note: the sites follow varying conventions for where they store images and how the images are referenced in the HTML)

Initialize the `tooth_scraper` for the first URL

In [None]:
# define the img_url parser for first url
def img_url_parser(img_tag):
    '''Build the download URL for one selected `<img>` tag of the first site.

    The tag is rendered to a string and its `src="..."` value is captured;
    only values made of letters, digits and `/ _ - .` match the pattern.

    Returns:
      `url_1_obj['storage_url']` followed by the captured `src` value,
      or None when the tag has no matching `src`.
    '''
    img_url_re = re.compile(r'src=\"([a-zA-Z/0-9_\-\.]+)\"')
    re_search = img_url_re.search(str(img_tag))
    storage_url = url_1_obj['storage_url']

    if re_search:
        # group(1) is the captured path as a str; groups() would return a
        # tuple, which cannot be concatenated to the storage URL
        return storage_url + re_search.group(1)

    return None

# build the scraper for the first site; the selector picks <img> tags nested inside <li> elements
tooth_scraper = ToothScraper(
    site_url=url_1_obj['site_url'],
    dest_dir=dest,
    imgs_selector='li img',
    img_url_parser=img_url_parser,
)

Scrape the web page to get the storage URLs of each image

Download each image from the storage URL location

In [None]:
# scrape the webpage for the image storage locations (populates tooth_scraper.img_urls)
tooth_scraper.fetch_img_urls()

# download each collected image into the scraper's destination directory
tooth_scraper.download_images()

Zip and save the data to google colab

In [None]:
# the images were written to data/raw/1 (RAW_DATA_DIRPATH joined with '1'), so archive that folder;
# data/1 is created first because zip does not create the archive's parent directory
!mkdir -p data/1 && zip -r data/1/raw.zip data/raw/1

### 1.2 Scrape data from the second URL

In [None]:
# grab the config entry for the second site from SCRAPE_URLS
# (indexed below with the 'storage_url' key)
url_2_obj = SCRAPE_URLS[1]

# initialize the destination to save the images from the second download
# TODO: automatically generate destination path
dest = os.path.join(RAW_DATA_DIRPATH, '2')

# define the img_url parser for second url
def img_url_parser(img):
    '''Build the download URL for one selected link element of the second site.

    The element is rendered to a string and the `href="..."` value that is
    directly followed by a `title` attribute is captured; only values made
    of letters, digits and `/ _ - .` match the pattern.

    Returns:
      `url_2_obj['storage_url']` followed by the captured `href` value,
      or None when the element has no matching `href`.
    '''
    img_url_re = re.compile(r'href=\"([a-zA-Z/0-9_\-\.]+)\" title')
    re_search = img_url_re.search(str(img))
    storage_url = url_2_obj['storage_url']

    if re_search:
        # group(1) is the captured path as a str; groups() would return a
        # tuple, which cannot be concatenated to the storage URL
        return storage_url + re_search.group(1)

    return None

# create the tooth scraper for second URL
# (pass the page address from the config entry, not the whole entry; the
# selector picks the <a> links inside <div id="links">)
tooth_scraper = ToothScraper(url_2_obj['site_url'], dest, imgs_selector='div#links a', img_url_parser=img_url_parser)

In [None]:
# scrape the webpage for the image storage locations (populates tooth_scraper.img_urls)
tooth_scraper.fetch_img_urls()

# download each collected image into the scraper's destination directory
tooth_scraper.download_images()

Zip and save the data to google colab

In [None]:
# the images were written to data/raw/2 (RAW_DATA_DIRPATH joined with '2'), so archive that folder;
# data/2 is created first because zip does not create the archive's parent directory
!mkdir -p data/2 && zip -r data/2/raw.zip data/raw/2

## 2 Save Locally

### 2.1 Download zipped files

In the Google Colab UI, download the zipped data files for subsequent training.