<a href="https://colab.research.google.com/github/piaizv/Colaboratory/blob/main/scrapy_faces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creación del proyecto

In [None]:
# instalación de Scrapy
!pip install Scrapy

In [None]:
# Create the Scrapy project "project_faces" inside the ./scrapy directory.
!scrapy startproject project_faces scrapy

In [None]:
# Generate the skeleton of the FindFaces spider for https://www.investigart.com/.
!cd scrapy/project_faces && scrapy genspider FindFaces https://www.investigart.com/

## Edición de scrapy/project_faces/settings.py



*   USER_AGENT
*   ROBOTSTXT_OBEY
*   DEFAULT_REQUEST_HEADERS



In [None]:
%%writefile scrapy/project_faces/settings.py

# Scrapy settings for the project_faces project.

# Name of the bot implemented by this project.
BOT_NAME = 'project_faces'

# Modules where Scrapy looks for spiders, and where genspider creates new ones.
SPIDER_MODULES = ['project_faces.spiders']
NEWSPIDER_MODULE = 'project_faces.spiders'

# Present the crawler as a desktop Chrome browser instead of Scrapy's default user agent.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'

# robots.txt rules are not consulted before requesting pages.
# NOTE(review): confirm that crawling the target site this way is acceptable.
ROBOTSTXT_OBEY = False

# Default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
   'Accept-Language': 'es',  # ask for Spanish-language content
}

# Select the request fingerprinting implementation identified as '2.7'.
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
# Run Twisted on the asyncio-based selector reactor.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

## Ejecución del spider

### No se debe ejecutar hasta que se haya implementado el spider (sección «Implementación del spider», más abajo).

In [None]:
# Run the FindFaces spider without requesting an output file.
!cd scrapy/project_faces && scrapy crawl FindFaces

In [None]:
# Run the spider and export the scraped items to faces.json.
# -O overwrites the file; -o appends on every re-run, which leaves invalid JSON
# that json.load() cannot read.
!cd scrapy/project_faces && scrapy crawl FindFaces -O faces.json

# Implementación del spider

## Versión inicial

In [None]:
%%writefile scrapy/project_faces/spiders/FindFaces.py
import scrapy

class FindfacesSpider(scrapy.Spider):
    """Skeleton spider for www.investigart.com; parse() is still a placeholder."""

    # Spider name, as used by `scrapy crawl FindFaces`.
    name = 'FindFaces'
    # Domain the spider is meant to stay on.
    allowed_domains = ['www.investigart.com']
    # URL(s) requested when the crawl starts.
    start_urls = ['https://www.investigart.com/']

    def parse(self, response):
        """Default callback for the start_urls responses; does nothing yet."""
        pass

## Versión modificada

In [None]:
%%writefile scrapy/project_faces/spiders/FindFaces.py
import scrapy

class FindfacesSpider(scrapy.Spider):
    """Collect one image URL per blog post of www.investigart.com.

    ``parse`` walks the paginated post listing (at most ``max_pages`` listing
    pages) and schedules one request per post; ``parse_post`` yields a
    ``{'href': ..., 'img': ...}`` item for every post page.
    """

    name = 'FindFaces'
    allowed_domains = ['www.investigart.com']
    start_urls = ['https://www.investigart.com/']
    # Number of listing pages requested so far (the start URL counts as 1).
    pages = 1
    # Upper bound on the number of listing pages to crawl.
    max_pages = 3

    def parse(self, response):
        """Yield one request per post of a listing page, then follow pagination.

        Posts without a link are skipped: passing ``None`` to ``scrapy.Request``
        raises and would abort the rest of the page, pagination included.
        """
        posts = response.css('.et_pb_salvattore_content > .et_pb_post')
        print('numero de entradas:', len(posts))
        for post in posts:
            href = post.css('.et_pb_image_container > a::attr(href)').get()
            print('href', href)
            if not href:
                continue
            # urljoin leaves absolute URLs untouched and resolves relative ones,
            # which scrapy.Request would otherwise reject ("Missing scheme").
            post_url = response.urljoin(href)
            yield scrapy.Request(post_url, callback=self.parse_post, meta={'href': post_url})

        next_href = response.css('.nextpostslink').css('a::attr(href)').get()
        if next_href and self.pages < self.max_pages:
            self.pages += 1
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_post(self, response):
        """Yield the post URL and the src of the first matching image.

        The image is the first ``<img>`` that is a following sibling of an
        element whose class contains ``post-meta``; ``img`` is ``None`` when
        no such image exists.
        """
        img = response.xpath(
            '//*[contains(@class, "post-meta")]/following-sibling::img'
        ).css('img::attr(src)').get()
        yield {
            'href': response.meta.get('href'),
            # Resolve relative src values so consumers always get a fetchable URL.
            'img': response.urljoin(img) if img else img,
        }

## Procesando las caras de las imágenes

In [58]:
from io import BytesIO
import cv2
from PIL import Image
import json
import numpy as np
import requests

In [None]:
# Load the scraped items and download every referenced image as a BGR array,
# the channel order OpenCV works with.
# Entries without an image are skipped, so `images` can be shorter than `data`.
json_file = 'scrapy/project_faces/faces.json'
with open(json_file, 'r') as f:
  data = json.load(f)

images = []
for item in data:
  url = item.get('img')
  if not url:
    # The spider emits img=None for posts without an image, and
    # requests.get(None) would raise; report the entry and move on.
    print('skipping entry without image:', item.get('href'))
    continue
  # The timeout keeps a stalled server from hanging the cell; raise_for_status
  # surfaces HTTP errors instead of an opaque image-decoding failure.
  reply = requests.get(url, timeout=30)
  reply.raise_for_status()
  # convert('RGB') normalises palette, grayscale, CMYK and alpha images to
  # three channels, the layout COLOR_RGB2BGR expects.
  img = Image.open(BytesIO(reply.content)).convert('RGB')
  img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
  images.append(img)

In [None]:
# Run the frontal-face Haar cascade shipped with OpenCV over every image.
cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascade_path)

faces_coords = []  # one detectMultiScale result per image, in the order of `images`
face_count = 0     # running total of detections across all images
for img in images:
  # The detector is fed a grayscale copy of the BGR image.
  gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  faces = faceCascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=8, minSize=(30, 30))
  found = len(faces)
  faces_coords.append(faces)
  print("Found {0} Faces!".format(found))
  face_count += found