In [78]:
#Selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Rest
from bs4 import BeautifulSoup
import time
import pandas as pd
import datetime
import h5py
import numpy as np

# Global settings for the driver: one Options object shared by the whole notebook
chrome_options = Options()
chrome_options.add_experimental_option("detach", True) # "detach" keeps the browser open until it is closed manually

In [79]:
def launch_chrome(url):
    """Start a Chrome session with an up-to-date driver and open ``url``.
    ---------
    Parameters:
    url = url defined by user

    Returns:
    The Chrome WebDriver instance. The same object is stored in the global
    ``driver`` because the scraping helpers read the session from there.
    """

    global driver
    # ChromeDriverManager().install() returns the path of a chromedriver
    # binary, which Service then launches.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        # Apply the notebook-wide options; without this argument the
        # "detach" setting configured on chrome_options is never used.
        options=chrome_options,
    )
    driver.get(url)
    return driver


def yahoo_header():
    """Collect the wanted column names from finance.yahoo.com/cryptocurrencies.

    Reads every header cell of the results table, keeps the ones whose text is
    in the list of wanted columns (in page order) and puts "Date" in front.
    The result is stored in the global ``yahoo_cols`` and also returned.
    """

    global yahoo_cols
    wanted = ["Name", "Price (Intraday)", "Market Cap", "Circulating Supply"]

    # Every <th> of the table head, located by XPATH
    header_cells = driver.find_elements(
        By.XPATH, '//*[@id="scr-res-table"]/div[1]/table/thead/tr/th'
    )

    # "Date" is not a column on the page; it always leads the list
    selected = ["Date"]
    for cell in header_cells:
        if cell.text in wanted:
            selected.append(cell.text)

    yahoo_cols = selected
    return yahoo_cols


def yahoo_body(
    hdf_path="/Users/emre/Documents/GitHub/crypto_scraping_project/hdf5_data.h5",
):
    """Scrape the rows of the table on finance.yahoo.com/cryptocurrencies.
    ---------
    Parameters:
    hdf_path = file from which previously stored rows are loaded; defaults to
               the project's HDF5 file

    Returns:
    A list with the previously stored records (if they could be read) followed
    by one ``(date, name, price, market cap, circulating supply)`` tuple per
    table row. The same list is stored in the global ``cryptocurrencies_list``.
    """

    global cryptocurrencies_list
    try:
        stored = pd.read_hdf(hdf_path)
        cryptocurrencies_list = list(stored.to_records(index=False))
    except Exception:
        # Best effort: a missing or unreadable file just means there is no
        # history yet. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit through.
        cryptocurrencies_list = []

    row_xpath = '//*[@id="scr-res-table"]/div[1]/table/tbody/tr'

    # Number of rows currently shown in the table
    row_count = len(driver.find_elements(By.XPATH, row_xpath))

    # One date stamp for the whole scrape. "%m/%d/%y" is the portable
    # spelling of the platform-specific "%D".
    date = datetime.datetime.now().strftime("%m/%d/%y")

    # XPath positions are 1-based, so the last row has index row_count and the
    # range must include it.
    for row_n in range(1, row_count + 1):
        # td[2] = Name, td[3] = Price (Intraday), td[6] = Market Cap,
        # td[10] = Circulating Supply
        name, price, marketcap, circ_supply = (
            driver.find_element(
                By.XPATH, f"{row_xpath}[{row_n}]/td[{col_n}]"
            ).text
            for col_n in (2, 3, 6, 10)
        )

        # Tuple as preparation for the HDF5 file
        cryptocurrencies_list.append((date, name, price, marketcap, circ_supply))

    return cryptocurrencies_list


def create_hdf_file(
    groupname,
    filename,
    data_name,
    username,
    path="/Users/emre/Documents/GitHub/crypto_scraping_project/hdf5_data.h5",
):
    """A method which creates a hdf file with one group and one dataset
    ---------
    Parameters:
    groupname = name of the group created in the file
    filename = name of the dataset created inside that group
    data_name = data written to the dataset (assumed two-dimensional, since
                maxshape has two entries)
    username = value stored in the dataset's "USER" attribute
    path = location of the HDF5 file; defaults to the project's data file

    The file is opened in "w" mode, so an existing file at ``path`` is
    replaced.
    """

    with h5py.File(path, "w") as hdf:
        group = hdf.create_group(groupname)  # Name of Group
        dataset = group.create_dataset(
            filename,  # name of dataset
            data=data_name,  # name of data
            # Both axes unlimited so the dataset can be resized and appended
            # to later
            maxshape=(None, None),
            chunks=True,  # resizing requires chunked storage
        )
        # Attributes belong to the dataset object, not to its name string
        dataset.attrs["USER"] = username  # Add metadata


In [80]:
# Open the Yahoo Finance crypto page; launch_chrome also stores the session in the global `driver`
launch_chrome('https://finance.yahoo.com/cryptocurrencies/')

<selenium.webdriver.chrome.webdriver.WebDriver (session="99488bf2fee3b66478c9f772582b9ff7")>

In [81]:
# Fill the global `yahoo_cols` with the table headers; the trailing `;` suppresses the cell output
yahoo_header();

In [82]:
# Fill the global `cryptocurrencies_list` with the table rows; the trailing `;` suppresses the cell output
yahoo_body();

In [83]:
# Create the HDF5 file for the Yahoo data inside a context manager, so the file is closed when the block ends.
# Needs `cryptocurrencies_list` (set by yahoo_body) and `yahoo_cols` (set by yahoo_header) to exist already.
# Mode 'w' creates the file from scratch, so an existing file at this path is replaced.

with h5py.File('/Users/emre/Documents/GitHub/crypto_scraping_project/hdf5_data.h5', 'w') as hdf:
    yahoo_group = hdf.create_group('crypto_prices') # Name of Group
    yahoo_prices = yahoo_group.create_dataset(
        'yahoo_prices', # name of dataset
        data=cryptocurrencies_list, # scraped rows, one tuple per table row
        maxshape=(None,len(yahoo_cols)), # unlimited rows, one column per header, so rows can be appended later
        chunks=True, # chunked storage, needed for resizing
    )
    yahoo_prices.attrs['USER'] = 'Emre Erturk' # Add metadata

In [37]:
# Append the rows in `cryptocurrencies_list` to the existing 'yahoo_prices' dataset.
with h5py.File('/Users/emre/Documents/GitHub/crypto_scraping_project/hdf5_data.h5', 'a') as hdf:
    yahoo_dataset = hdf['crypto_prices']['yahoo_prices']
    mask_lsit = np.array(cryptocurrencies_list) # Convert list of tuples to Numpy array (name kept in case other cells use it)
    # Remember how many rows exist before growing the dataset
    old_row_count = yahoo_dataset.shape[0]
    # Grow the first axis by the number of new rows
    yahoo_dataset.resize(old_row_count + mask_lsit.shape[0], axis=0)
    # The new rows start where the old data ended, i.e. at old_row_count
    # (not at the number of new rows)
    yahoo_dataset[old_row_count:] = mask_lsit

TypeError: Only chunked datasets can be resized

In [89]:
# Creating new dataset in existing subgroup
# Mode 'r+' opens the existing file for reading and writing; the 'crypto_prices' group must already be there.
# NOTE(review): create_dataset presumably fails if 'bitcoin' already exists, so this cell is probably not re-runnable — confirm.

with h5py.File('/Users/emre/Documents/GitHub/crypto_scraping_project/hdf5_data.h5', 'r+') as hdf:
    print(list(hdf.keys()))
    group = hdf['/crypto_prices']
    group.create_dataset(
        'bitcoin', # name of dataset
        data= cryptocurrencies_list, # scraped rows, one tuple per table row
        maxshape= (None,len(yahoo_cols)), # unlimited rows, one column per header, so rows can be appended later
        chunks= True, # chunked storage, needed for resizing
    )

['crypto_prices']


NameError: name 'mask' is not defined