# In [10]:
import json
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def export_to_parquet(df: pd.DataFrame, output_path: str) -> None:
    """Write ``df`` to ``output_path`` as a Parquet file, omitting the index.

    Args:
        df: Frame to serialize.
        output_path: Destination path passed straight to ``DataFrame.to_parquet``.
    """
    write_options = {"index": False}
    df.to_parquet(output_path, **write_options)

# Absolute path to the ChromeDriver executable handed to init_driver().
# NOTE(review): machine-specific Windows path — adjust when running elsewhere.
chromedriver_path = (
    "C:\\Users\\josef\\projects\\data-engineer-test-suzano"
    "\\plugins\\chromedriver.exe"
)


def init_driver(chromedriver_path: str) -> webdriver.Chrome:
    """Create a headless Chrome WebDriver backed by the given ChromeDriver binary.

    Args:
        chromedriver_path: Filesystem path of the ChromeDriver executable.

    Returns:
        A ``webdriver.Chrome`` instance configured with the flags below.
    """
    # Command-line switches, applied in this order.
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--window-size=1920,1080',
        '--lang=pt-BR',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # NOTE(review): presumably set so the session looks less like an
    # automated browser to the target site — confirm intent.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    return webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=chrome_options,
    )

def fetch_usd_cny_data(driver: webdriver.Chrome, start_date: str = "1991-01-01") -> pd.DataFrame:
    """Download the historical chart series and return it as a DataFrame.

    The driver loads the investing.com chart endpoint, and the JSON payload is
    read from the ``<pre>`` element of the resulting page source.

    Args:
        driver: Selenium Chrome driver used to load the endpoint.
        start_date: Earliest date (inclusive) to keep; any value accepted by
            ``pd.Timestamp``. Defaults to ``"1991-01-01"``.

    Returns:
        A DataFrame with the columns ``date``, ``close``, ``open``, ``high``,
        ``low`` and ``volume``, restricted to rows dated on or after
        ``start_date``. The row index is not reset after filtering.

    Raises:
        ValueError: If the page has no ``<pre>`` element, or if its text is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # NOTE(review): instrument id 2111 is presumably USD/CNY, going by the
    # function name — confirm against the API.
    url = "https://api.investing.com/api/financialdata/2111/historical/chart/?interval=P1M&period=MAX&pointscount=160"
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    pre_tag = soup.find("pre")
    if pre_tag is None:
        raise ValueError(
            f"No <pre> element found in the page loaded from {url}; "
            "the endpoint may have returned an error or challenge page instead of JSON."
        )

    payload = json.loads(pre_tag.text)
    rows = payload.get("data", [])

    # NOTE(review): the column order is assumed to match the endpoint's row
    # layout; the meaning of the seventh value is not known — confirm.
    frame = pd.DataFrame(
        rows,
        columns=["timestamp", "close", "open", "high", "low", "volume", "unknown"],
    )
    # Timestamps are interpreted as epoch milliseconds.
    frame["date"] = pd.to_datetime(frame["timestamp"], unit="ms")

    # Filter and project in one step rather than dropping columns in place on
    # a filtered frame; "timestamp" and "unknown" are simply not selected.
    recent = frame["date"] >= pd.Timestamp(start_date)
    return frame.loc[recent, ["date", "close", "open", "high", "low", "volume"]]

# Scrape the series, then persist it as Parquet.
driver = init_driver(chromedriver_path)
try:
    # Both names are kept bound to the same frame, as before.
    df = df_usd_cny = fetch_usd_cny_data(driver)
finally:
    # Always shut the browser down so a failed fetch does not leave a
    # headless Chrome process running.
    driver.quit()

# NOTE(review): machine-specific Windows output path — adjust when running elsewhere.
export_to_parquet(df, "C:\\Users\\josef\\projects\\data-engineer-test-suzano\\include\\datasets\\usd_cny.parquet")

