In [140]:
import requests
from bs4 import BeautifulSoup
import json
import warnings
warnings.filterwarnings('ignore')

In [141]:
import pandas as pd
from matplotlib import pyplot as plt

In [142]:
# Candidate User-Agent header values for outgoing HTTP requests.
# The visible request helpers read only the first entry (index 0).
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36',
    'Chrome/79.0.3945.130',
]

In [143]:
def search_watch(brand, model, page_number = 1):
    """Fetch one page of search results from chrono24.com.br for a watch.

    Parameters
    ----------
    brand : str
        Brand name. Spaces are removed to build the URL path segment.
    model : str
        Model name. Joined to ``brand`` with a space to build the search query.
    page_number : int, default 1
        Value sent as the ``showpage`` query parameter.

    Returns
    -------
    requests.Response or None
        The response when the server answers with status 200. For any other
        status a message is printed and ``None`` is returned.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, or when the server does not answer within the
        timeout.
    """
    # Imported locally so this cell does not depend on another import cell.
    from urllib.parse import quote_plus

    model_brand = brand + ' ' + model
    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as '&' or '#' that would otherwise corrupt the query string.
    url = 'https://www.chrono24.com.br/{}/index.htm?dosearch=true&pageSize=120&query={}&showpage={}'.format(
        brand.replace(' ', ''), quote_plus(model_brand), page_number)
    headers = {
        'User-Agent': user_agent_list[0],
        'Accept-Language': 'en-US,en;q=0.9'}

    # requests has no default timeout; without one a stalled server blocks the
    # kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 200:
        return response

    print(f"Failed to retrieve content. Status code: {response.status_code}")
    return None

In [144]:
def extract_html_element(response, element):
    """Return the content of the first ``<script>`` tag with the given ``type``.

    Parameters
    ----------
    response : object with a ``text`` attribute, or None
        The HTTP response whose body is parsed as HTML. ``None`` is tolerated
        so a failed fetch does not surface here as an ``AttributeError``.
    element : str
        Value of the ``type`` attribute to look for,
        e.g. ``'application/ld+json'``.

    Returns
    -------
    str or None
        The ``.string`` of the matching script tag. ``None`` (after printing a
        message) when there is no response or no matching tag.
    """
    if response is None:
        print("No response to parse.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    script_element = soup.find('script', {'type': element})
    if script_element:
        return script_element.string

    print("Script element not found.")
    return None


In [145]:
def convert_string_to_json(string):
    """Parse a JSON document, reporting failures instead of raising.

    Parameters
    ----------
    string : str, bytes, bytearray or None
        The JSON text. ``None`` (or any other unsupported type) is treated as
        a decoding failure rather than raising ``TypeError``.

    Returns
    -------
    object or None
        The decoded Python object, or ``None`` after printing the error when
        the input cannot be decoded.
    """
    try:
        return json.loads(string)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers non-text input such as None.
        print(f"Error decoding JSON: {e}")
        return None


In [146]:
def format_search_results_to_dataframe(json_data):
    """Turn the offers found in decoded search-result JSON into a price-sorted frame.

    Offers are read from ``json_data['@graph'][1]['offers']``. Each offer's
    ``'image'`` entry is discarded. Only rows that have a price, whose
    availability ends in ``'InStock'`` (the part after the last ``'/'``) and
    whose ``'@type'`` is ``'Offer'`` are kept.

    Parameters
    ----------
    json_data : dict
        Decoded JSON with the structure described above.

    Returns
    -------
    pandas.DataFrame
        One row per retained offer, without the ``'@type'`` and
        ``'availability'`` columns, ``price`` cast to ``int``, sorted by
        ascending price with a fresh 0..n-1 index. An empty frame is returned
        when there are no offers at all.
    """
    # Build all records first and create the frame once; concatenating inside
    # the loop copies the accumulated frame on every iteration. Filtering the
    # key also tolerates offers that have no 'image' entry.
    records = [
        {key: value for key, value in offer.items() if key != 'image'}
        for offer in json_data['@graph'][1]['offers']
    ]
    if not records:
        return pd.DataFrame()

    offer_frame = pd.DataFrame(records)

    # Availability is compared on its last path segment only.
    availability = offer_frame['availability'].map(lambda value: str(value).split('/')[-1])
    keep = (
        offer_frame['price'].notna()
        & (availability == 'InStock')
        & (offer_frame['@type'] == 'Offer')
    )

    return (
        offer_frame.loc[keep]
        .drop(columns=['@type', 'availability'])
        .assign(price=lambda frame: frame['price'].astype(int))
        .sort_values('price')
        .reset_index(drop=True)
    )

In [147]:
def watch_search(brand, model, page_number = 1):
    """Run the full search pipeline for a watch and return its offers frame.

    Fetches the search page, pulls the ``application/ld+json`` script content
    out of it, decodes that JSON and formats the offers into a DataFrame.
    """
    page = search_watch(brand, model, page_number)
    ld_json_text = extract_html_element(page, element = 'application/ld+json')
    return format_search_results_to_dataframe(convert_string_to_json(ld_json_text))

In [148]:
def get_offer_information(url):
    """Scrape the attribute table of a single offer page into a one-row frame.

    Every ``<tr>`` with exactly two ``<td>`` cells is read as a label/value
    pair: the label is the text of the ``<strong>`` tag in the first cell and
    the value is the text of the second cell. Rows whose first cell has no
    ``<strong>`` tag are skipped.

    Parameters
    ----------
    url : str
        Address of the offer page.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with one column per label found, plus a
        ``'url'`` column holding ``url``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, or when the server does not answer within the
        timeout.
    """
    headers = {
        'User-Agent': user_agent_list[0],
        'Accept-Language': 'en-US,en;q=0.9'}

    # requests has no default timeout; without one a stalled server blocks the
    # kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    data_dict = {}
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            continue
        label = cells[0].strong
        if label is None:
            # Two-cell row without a <strong> label: not an attribute row.
            continue
        data_dict[label.text.strip()] = cells[1].text.strip()

    out = pd.DataFrame(data_dict, index = [0])
    out['url'] = url
    return out

In [None]:
# NOTE(review): `res` is not defined in the visible cells -- presumably the
# frame returned by watch_search(...); confirm it exists on a fresh kernel.
# Collect the per-offer frames first and concatenate once, instead of growing
# `infos` inside the loop (each concat copies everything accumulated so far).
offer_frames = [get_offer_information(url) for url in res.url]
infos = pd.concat(offer_frames, axis=0) if offer_frames else pd.DataFrame()

In [None]:
# Number of missing values in each column of `infos`.
infos.isna().sum()

Código do anúncio           0
Marca                       0
Modelo                      0
Número de referência        2
Movimento                   5
Material da caixa           4
Material da bracelete       7
Ano de fabrico              0
Estado                      0
Conteúdo fornecido          0
Género                      8
Localização                 0
Preço                       0
Disponibilidade             0
Diâmetro                   16
Material da luneta         23
Mostrador                   5
Cor da bracelete           21
Material do fecho          20
url                         0
Calibre/Movimento          28
Calibre de base            58
Número de rubis            34
Largura                    58
Fecho                      20
Código do comerciante      35
Altura                     57
Estanqueidade              33
Vidro                      20
Algarismos do mostrador    23
Largura de aperto          59
Frequência                 58
Reserva de corda           35
dtype: int