In [63]:
# Selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


# Rest
from bs4 import BeautifulSoup
import time
import pandas as pd
import datetime
import h5py
import numpy as np

# Browser options shared across the notebook
chrome_options = Options()
# "detach" keeps the Chrome window open until it is terminated manually
chrome_options.add_experimental_option("detach", True)


In [64]:
def launch_chrome(url, options=None):
    """Start a new Chrome session and open ``url`` in it.

    Parameters
    ----------
    url : str
        Address to load once the browser has started.
    options : selenium.webdriver.chrome.options.Options, optional
        Browser options forwarded to ``webdriver.Chrome``. The default
        ``None`` starts Chrome without custom options, exactly as this
        function did before the parameter existed. Options configured
        elsewhere in the notebook are only applied when passed here.

    Returns
    -------
    selenium.webdriver.Chrome
        The newly created driver. It is also stored in the module-level
        name ``driver``; any driver previously stored there is replaced
        but NOT closed, so callers are responsible for calling ``quit()``.
    """

    global driver
    # Installing the driver through the manager makes sure it is up to date
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    return driver


In [65]:
# CoinMarketCap URL slugs of the coins to scrape
top_crypto = [
    "bitcoin",
    "ethereum",
    "tether",
    "usd-coin",
    "bnb",
    "xrp",
    "binance-usd",
    "cardano",
    "solana",
    "dogecoin",
    "polkadot-new",
    "polygon",
    "tron",
]

# Clickable part of the cookie banner; the banner blocks the "Social Stats" tab
COOKIE_BANNER_XPATH = '//*[@id="cmc-cookie-policy-banner"]/div[2]'
# Container holding the sub-menu buttons of the project-info page
MENU_XPATH = '//*[@id="__next"]/div/div[1]/div[2]/div/div[3]/div[2]/div/div[1]'
# Table whose rows hold the statistics; the value of row N is at div[N]/div[2]
STATS_TABLE_XPATH = (
    '//*[@id="__next"]/div/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/div/div[2]'
)
# Order of the rows in the statistics table (and of the values in each result row)
STAT_NAMES = (
    "github_commits",
    "github_stars",
    "github_forks",
    "github_contributors",
    "github_followers",
    "twitter_followers",
    "reddit_members",
)


def _open_social_stats(driver):
    """Dismiss the cookie banner and click the "Social Stats" tab.

    Not all pages have the additional sub-menu layer. When the cookie banner,
    the sub-menu or the tab cannot be found, the page is left as it is.
    """
    try:
        # The cookie element blocks the selection of the target tab
        driver.find_element(By.XPATH, COOKIE_BANNER_XPATH).click()

        # Depending on the information available there are several menu
        # entries; their count determines which entry is "Social Stats".
        menus = driver.find_elements(By.XPATH, MENU_XPATH)
        if not menus:
            # No sub-menu on this page: nothing to click. (Previously this
            # case used an unbound or stale button count.)
            return
        number_of_buttons = len(menus[-1].text.split("\n"))

        # 2, 3 or 4 entries -> the last entry; any other count -> the fifth
        tab_index = number_of_buttons if number_of_buttons in (2, 3, 4) else 5
        tab_xpath = f"{MENU_XPATH}/div[{tab_index}]"
        element = driver.find_element(By.XPATH, tab_xpath)

        # Give the tab up to 5 seconds to become clickable, then click it
        WebDriverWait(driver, timeout=5).until(
            EC.element_to_be_clickable((By.XPATH, tab_xpath))
        )
        element.click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "selected"))
        )
    except NoSuchElementException:
        # Deliberate best effort: pages without the dynamic component already
        # show the statistics, so a missing element is not an error here.
        pass


crypto_project_info = []

for crypto in top_crypto:
    driver = None
    try:
        driver = launch_chrome(
            f"https://coinmarketcap.com/currencies/{crypto}/project-info/"
        )
        driver.implicitly_wait(10)  # seconds
        driver.maximize_window()

        _open_social_stats(driver)

        # Read the value cell of every statistics row, in STAT_NAMES order
        raw_values = [
            driver.find_element(
                By.XPATH, f"{STATS_TABLE_XPATH}/div[{row}]/div[2]"
            ).text
            for row in range(1, len(STAT_NAMES) + 1)
        ]

        # Drop thousands separators so the values can be converted later
        crypto_project_info.append([value.replace(",", "") for value in raw_values])

    # Report which coin's page was missing an expected element
    except NoSuchElementException as err:
        raise NoSuchElementException(f"{crypto.title()}") from err

    # Close this coin's browser whether or not scraping succeeded, so no
    # Chrome instance is left running
    finally:
        if driver is not None:
            driver.quit()

In [125]:
# Turn every scraped value into an integer; the placeholder '--' counts as 0
mask = [
    [int(0 if value == '--' else value) for value in row]
    for row in crypto_project_info
]

crypto_project_info = mask

In [126]:
# Display the converted table: one row per entry of top_crypto, columns in scrape order
# (GitHub commits, stars, forks, contributors, followers, Twitter followers, Reddit members)
crypto_project_info

[[35628, 66489, 33099, 878, 3929, 0, 4634477],
 [13723, 39735, 15128, 834, 2181, 2910074, 1502623],
 [0, 0, 0, 0, 0, 299197, 0],
 [836, 358, 216, 17, 24, 0, 0],
 [1, 3750, 2045, 30, 506, 9571719, 876708],
 [12706, 4233, 1406, 97, 506, 2572297, 353502],
 [8, 95, 74, 4, 23, 9571719, 0],
 [0, 0, 0, 0, 0, 1342400, 697509],
 [0, 0, 0, 0, 0, 2055475, 153985],
 [14211, 14164, 2621, 272, 853, 3459279, 2342604],
 [3570, 6316, 1391, 212, 482, 1352935, 40634],
 [1859, 601, 308, 16, 68, 1686578, 51886],
 [17087, 3151, 1146, 179, 351, 3356959, 125083]]

In [127]:
# Store the scraped table as a new dataset in the project's HDF5 file.
# 'r+' opens the file for reading and writing.
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
with h5py.File('/Users/emre/Documents/GitHub/crypto_scraping_project/hdf5_data.h5', 'r+') as hdf:
    # Width of one row, i.e. the number of statistics stored per coin
    stats_per_coin = len(crypto_project_info[0])
    hdf.create_dataset(
        'crypto_project_info_2',  # name of dataset
        data=crypto_project_info,  # one row per coin
        # Set up for resizing and appending later: the row axis is unlimited,
        # the column axis is fixed at the number of statistics per coin
        # (this previously used the number of rows by mistake).
        maxshape=(None, stats_per_coin),
        chunks=True,  # let h5py choose the chunk layout
    )