
# Pipeline Web Scrape New Product Images

This notebook collects all product codes from the silver layer and uses Google Images, through the Serper API, to collect product images.


## Required Libraries

In [0]:

import re
import requests

from pathlib import Path
from datetime import datetime
from pyspark.sql.connect.dataframe import DataFrame

from src.common import (
    read_sql_template,
    table_exists, 
    get_tz,    
    use_schema_and_create_if_not_exists,
)



## Define Unity Catalog

In [0]:

# Unity Catalog location used throughout this notebook: the pair is passed to the
# schema bootstrap helper, to the table DDL, and interpolated into the final INSERT.
CATALOG = "precos_pmc"
SCHEMA = "silver"



In [0]:
# Point the session at CATALOG.SCHEMA. Judging by the helper's name it also creates
# the schema when missing — implementation lives in src.common, confirm there.
use_schema_and_create_if_not_exists(spark, catalog=CATALOG, schema=SCHEMA)



## Helper functions


In [0]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry policy for the image-search calls: at most 3 retries, only for
# rate-limit (429) and server-side (5xx) responses. backoff_factor scales the
# exponential sleep urllib3 applies between attempts. POST is listed explicitly
# because urllib3's default set of retryable methods does not include it.
retry_strategy = Retry(
    total=3,
    backoff_factor=0.1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["POST"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# Shared session: every https:// request made through it goes via the retrying adapter.
sess = requests.Session()
sess.mount("https://", adapter)



In [0]:

def query_serpapi_google_images(
    session: "requests.Session",
    creds: dict,
    query: str,
    timeout: float = 30.0,
) -> "dict | None":
    """Run one Google Images search through the Serper endpoint.

    Despite the ``serpapi`` in the name, the request goes to
    ``https://google.serper.dev/images`` with Brazilian locale settings
    (``gl=br``, ``hl=pt-br``).

    Args:
        session: Session used to send the POST (any retry policy mounted on it
            applies).
        creds: Mapping holding the API key under ``'api_key'``.
        query: Free-text search query.
        timeout: Seconds passed to ``session.post`` as ``timeout``. Without a
            timeout a stalled connection would block the whole run forever.
            It applies per attempt, so retries can make the total wait longer.

    Returns:
        The decoded JSON body on success, or ``None`` when the request fails
        (the error is printed, not raised, so the caller can skip the item).
    """
    url = "https://google.serper.dev/images"

    payload = {
        "q": query,
        "gl": "br",
        "hl": "pt-br",
    }
    headers = {
        'X-API-KEY': creds['api_key'],
        'Content-Type': 'application/json',
    }

    try:
        response = session.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, HTTP error statuses and exhausted retries.
        print(f"Request failed after all attempts: {e}")
        return None

    except Exception as e:
        # Deliberate best-effort: one bad item must not abort the whole batch.
        print(f"Unexpected error: {e}")
        return None


In [0]:

def get_serpapi_creds():
    """Return the search API credentials as ``{"api_key": <secret>}``.

    The key is read from the ``precos-pmc-serpapi`` secret scope (key
    ``serpapi_api_key``) through ``dbutils.secrets``.
    """
    api_key = dbutils.secrets.get("precos-pmc-serpapi", "serpapi_api_key")
    return {"api_key": api_key}




In [0]:

def retrieve_desired_content(cd_produto: int, response_json: dict) -> "dict | None":
    """Pick the first image result that has a thumbnail URL.

    Args:
        cd_produto: Product code the search was made for; copied into the result.
        response_json: Decoded search response; image results are read from
            its ``'images'`` key.

    Returns:
        A dict with ``cd_produto``, ``nm_url_thumbnail``, ``nm_source_title``,
        ``nm_source_name`` and ``nm_source_link`` for the first usable image,
        or ``None`` when the response has no usable image. The title, source
        and link values are ``None`` when the image entry lacks them.
    """
    images = response_json.get('images') if isinstance(response_json, dict) else None

    # A missing key, a JSON null or any non-list value all mean "no results";
    # iterating over them would otherwise raise.
    if not isinstance(images, (list, tuple)):
        print(f"There's no image on {response_json}")
        return None

    for image in images:
        # Skip malformed entries instead of failing on .get().
        url_thumbnail = image.get('thumbnailUrl', '') if isinstance(image, dict) else ''

        if not url_thumbnail:
            print(f"There's no thumbnail on {image}")
            continue

        return {
            'cd_produto': cd_produto,
            'nm_url_thumbnail': url_thumbnail,
            'nm_source_title': image.get('title'),
            'nm_source_name': image.get('source'),
            'nm_source_link': image.get('link'),
        }

    # Every entry was skipped (or the list was empty).
    return None


In [0]:

def grant_table_is_created(catalog: str, schema: str):
    """Make sure the ``produtos_imagens`` Delta table exists in ``catalog.schema``.

    Issues a ``CREATE TABLE IF NOT EXISTS`` through the notebook's ``spark``
    session and returns whatever ``spark.sql`` returns for that statement.
    """
    table_name = f"{catalog}.{schema}.produtos_imagens"

    ddl = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            cd_produto BIGINT PRIMARY KEY,
            nm_url_thumbnail STRING,
            nm_source_title STRING,
            nm_source_name STRING,
            nm_source_link STRING,
            dt_ultima_atualizacao TIMESTAMP
        )
        USING DELTA
        PARTITIONED BY 
        (
            dt_ultima_atualizacao
        )
    """
    return spark.sql(ddl)
    



In [0]:

def get_candidates(sql_template_file: str) -> list[dict]:
    """Load a SQL template, run it, and return the collected result rows.

    The template is read with ``read_sql_template`` and executed on the
    notebook's ``spark`` session; the whole result is pulled to the driver
    with ``collect()``.
    """
    return spark.sql(read_sql_template(sql_template_file)).collect()


## Execution Flow

In [0]:

# Fetch the API key once and make sure the destination table exists before any
# scraping starts, so a missing secret or DDL problem fails fast.
creds = get_serpapi_creds()
grant_table_is_created(CATALOG, SCHEMA)


In [0]:


# Products to search images for. Note: despite the df_ prefix this is the
# collected result (get_candidates calls .collect()), not a Spark DataFrame,
# which is why len() works on it below.
df_candidates = get_candidates(sql_template_file=Path('src') / 'silver_list_image_update_candidates.sql')
print('Got : ', len(df_candidates))

In [0]:


# One search per candidate; keep the first usable image of each successful search.
# Failures are logged and skipped so a single bad product does not stop the run.
new_product_images = []

for i, row in enumerate(df_candidates):
    cd_produto = row['cd_produto']
    query = row['query']

    response_json = query_serpapi_google_images(sess, creds, query)
    if response_json is None:
        print(f"Error on query {query}")
        continue

    desired_content = retrieve_desired_content(cd_produto, response_json)
    if not (desired_content and isinstance(desired_content, dict)):
        print(f"Error recieving desired content for {query}")
        continue

    # Only records carrying a truthy product code are kept.
    if desired_content.get('cd_produto'):
        new_product_images.append(desired_content)
    else:
        print(f"Error on desired content {desired_content}")

print('Got: ', len(new_product_images))

In [0]:
tz = get_tz()
# Wall-clock string in the project's time zone. It is cast to TIMESTAMP by Spark
# at insert time, exactly like the string literal the INSERT used to embed.
now = datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')


def _to_text(value):
    """Return ``value`` as ``str``, keeping ``None`` as ``None`` (stored as NULL)."""
    return None if value is None else str(value)


# Scraped titles/URLs are untrusted text, so they are passed to Spark as data
# (a staged DataFrame) rather than spliced into the SQL string. This removes the
# quoting/injection problems, keeps titles intact (no quote stripping), and stores
# missing fields as NULL.
rows_to_insert = [
    (
        int(item['cd_produto']),
        _to_text(item.get('nm_url_thumbnail')),
        _to_text(item.get('nm_source_title')),
        _to_text(item.get('nm_source_name')),
        _to_text(item.get('nm_source_link')),
        now,
    )
    for item in new_product_images
    if isinstance(item, dict) and item.get('cd_produto')
]

if not rows_to_insert:
    # An INSERT with an empty VALUES list is a syntax error; nothing to do.
    print('No new product images to insert.')
else:
    staged_images = spark.createDataFrame(
        rows_to_insert,
        schema=(
            'cd_produto BIGINT, nm_url_thumbnail STRING, nm_source_title STRING, '
            'nm_source_name STRING, nm_source_link STRING, dt_ultima_atualizacao STRING'
        ),
    )
    staged_images.createOrReplaceTempView('new_product_images_staging')

    # NOTE(review): this appends rows. If the candidate query can return products
    # that already have a row in produtos_imagens, a MERGE would be needed to
    # avoid duplicates — confirm against the SQL template.
    spark.sql(f"""
        INSERT INTO {CATALOG}.{SCHEMA}.produtos_imagens
            (
                cd_produto,
                nm_url_thumbnail,
                nm_source_title,
                nm_source_name,
                nm_source_link,
                dt_ultima_atualizacao
            )
        SELECT
            cd_produto,
            nm_url_thumbnail,
            nm_source_title,
            nm_source_name,
            nm_source_link,
            CAST(dt_ultima_atualizacao AS TIMESTAMP)
        FROM new_product_images_staging
    """)
