# SCRAPING SET UP

In [1]:
# Import necessary modules
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from datetime import datetime
import pandas as pd
import inspect
import random
import time
import re

# FUNCTIONS

# Sleep
def fSleep(x, y):
    """Pause for a random number of seconds drawn uniformly between x and y.

    Callers in this file pass the lower bound first (e.g. ``fSleep(3, 5)``);
    both bounds are handed to ``random.uniform`` unchanged.
    """
    pause = random.uniform(x, y)
    time.sleep(pause)

# 5etools Start
def startScraping(url):
    """Open ``url`` in Chrome, apply the "all sources" filter and return the driver.

    Args:
        url: Address of the listing page to open.

    Returns:
        A ``(driver, list_parent)`` tuple: the running Chrome WebDriver and the
        XPath string of the list container used by the callers to count rows.

    Raises:
        Any exception raised while loading the page or clicking the filter
        buttons. The browser is shut down before the exception propagates so
        a failed start does not leave a Chrome window behind.
    """
    # VARIABLES

    # Buttons (absolute XPaths of the filter dialog controls)
    button_filter = '//*[@id="filter-search-group"]/button[1]'
    button_all_sources = '/html/body/div[7]/div/div[2]/div[1]/div[1]/div[2]/div[2]/div[2]/button[1]'
    button_save_filters = '/html/body/div[7]/div/div[1]/div[2]/div[2]/button[3]'

    # Lists
    list_parent = '//*[@id="listcontainer"]/div[4]'

    # SCRAPING

    # Instantiate an Options object
    option = webdriver.ChromeOptions()
    # Remove the navigator.webdriver flag
    option.add_argument('--disable-blink-features=AutomationControlled')
    # Change the resolution of the browser
    option.add_argument("window-size=1920,1080")
    # Adjust the user agent
    option.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
    # Open the browser
    driver = webdriver.Chrome(executable_path='Z:\\chromedriver.exe', options=option)

    try:
        # Open the specified URL and give the page time to load
        driver.get(url)
        fSleep(10.0, 15.0)

        # Open the filter menu, select all sources, then save the filter
        # settings, pausing after each click to avoid errors
        for xpath in (button_filter, button_all_sources, button_save_filters):
            driver.find_element_by_xpath(xpath).click()
            fSleep(3, 5)
    except BaseException:
        # The caller never receives the driver on failure (including Ctrl+C
        # during the waits), so it has to be released here
        driver.quit()
        raise

    # Print noting success
    print('Filters set to all sources')
    # Return the driver to work with
    return driver, list_parent

# BACKGROUNDS

In [2]:
# Set up the columns of the exported DataFrame
backgroundColumns = [
    'NAME', 'SOURCE', 'PAGE_NUMBER', 'SKILL_PROFICIENCIES', 'LANGUAGES',
    'LANGUAGES_COUNT', 'TOOL_PROFICIENCIES', 'EQUIPMENT', 'PERSONALITY_TRAIT',
    'IDEAL', 'BOND', 'FLAW',
]
# Properties to look for in the background's bullet list
propList = ['Skill Proficiencies', 'Languages', 'Equipment', 'Tool Proficiencies']
# Words in the languages text that indicate how many extra languages are granted
languageCounts = ((('One ', 'one '), 1), (('Two ', 'two '), 2), (('Three ', 'three '), 3))
# Roll number at the start of a table row; anchored so digits inside the
# sentence itself are left alone
rollNumber = re.compile(r'^\d+\s+')


def scrapeRollTable(driver, tableIndex, keyword):
    """Return the rows of one rolling table joined by '-----', or None.

    Looks at ``table[tableIndex]`` inside ``div[2]`` .. ``div[6]`` of the
    content cell and uses the first one whose header text contains
    ``keyword``. Positions where the table is missing are skipped. A table
    whose header does not contain ``keyword`` is never returned.
    """
    for num in range(2, 6 + 1):
        xTable = f'//*[@id="pagecontent"]/tr[4]/td/div/div[{num}]/table[{tableIndex}]'
        try:
            tableBody = driver.find_element_by_xpath(f'{xTable}/tbody').text
            tableTitle = driver.find_element_by_xpath(f'{xTable}/thead').text
        except Exception:
            # No such table at this position; try the next div
            continue
        if keyword in tableTitle:
            # Strip the roll number from every row and combine into one string
            return "-----".join(rollNumber.sub("", line) for line in tableBody.split("\n"))
    return None


# Open 5etools.com and set up the filters
driver, list_parent = startScraping("https://5e.tools/backgrounds.html#acolyte_phb")

# Scraped rows are collected here and turned into a DataFrame once at the end
backgroundRows = []

try:
    # Count the number of rows
    myList = driver.find_element_by_xpath(list_parent)
    vRowCount = len(myList.find_elements_by_xpath("./*"))

    # Loop through all the rows and scrape their data
    for row in range(1, vRowCount + 1):
        print(f'Scraping item {row} of {vRowCount}')

        # Set up the row based on its index
        xRow = f'//*[@id="listcontainer"]/div[4]/div[{row}]'
        # Click into the row
        driver.find_element_by_xpath(xRow).click()
        # Sleep
        fSleep(0.5, 1)

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # SCRAPING
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # ROW NAME: name, source and page number are on separate lines
        tName = driver.find_element_by_xpath('//*[@id="pagecontent"]/tr[2]').text
        workingName = tName.split("\n")
        finalName = workingName[0]
        finalSource = workingName[1]
        finalPageNumber = workingName[2]

        # ROW BACKGROUND PROPERTIES
        dictProperties = dict.fromkeys(propList, '')
        try:
            properties = driver.find_element_by_xpath('//*[@id="pagecontent"]/tr[4]/td/div/ul').text.split("\n")
        except Exception:
            # No property list on this page; every property stays empty
            properties = []
        for i in propList:
            for z in properties:
                if i in z:
                    dictProperties[i] = z.replace(f'{i} ', "")

        # Extract number of extra languages
        languagesText = dictProperties['Languages']
        languagesExtra = next(
            (count for words, count in languageCounts
             if any(word in languagesText for word in words)),
            None,
        )

        # ROLLING TABLES (personality traits, ideals, bonds, flaws)
        finalTrait = scrapeRollTable(driver, 1, 'Trait')
        finalIdeal = scrapeRollTable(driver, 2, 'Ideal')
        finalBond = scrapeRollTable(driver, 3, 'Bond')
        finalFlaw = scrapeRollTable(driver, 4, 'Flaw')

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # DATAFRAME
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~

        backgroundRows.append({
            'NAME': finalName,
            'SOURCE': finalSource,
            'PAGE_NUMBER': finalPageNumber,
            'SKILL_PROFICIENCIES': dictProperties['Skill Proficiencies'],
            'LANGUAGES': dictProperties['Languages'],
            'LANGUAGES_COUNT': languagesExtra,
            'TOOL_PROFICIENCIES': dictProperties['Tool Proficiencies'],
            'EQUIPMENT': dictProperties['Equipment'],
            'PERSONALITY_TRAIT': finalTrait,
            'IDEAL': finalIdeal,
            'BOND': finalBond,
            'FLAW': finalFlaw,
        })
finally:
    # Shut the browser down even when the run fails or is interrupted
    driver.quit()
    # Keep whatever was scraped available; dtype=object keeps the language
    # counts as ints/None instead of converting them to floats
    dfBackgrounds = pd.DataFrame(backgroundRows, columns=backgroundColumns, dtype=object)

# Export to XLSX (only reached when every row was scraped)
dfBackgrounds.to_excel('backgrounds.xlsx')

Filters set to all sources
Scraping item 1 of 99
Scraping item 2 of 99
Scraping item 3 of 99
Scraping item 4 of 99


KeyboardInterrupt: 


# ITEMS

# 

## Scraping

In [None]:
# Set up a DataFrame (not created yet in this cell)

# Open 5etools.com and set up the filters for the items page
driver, list_parent = startScraping('https://5e.tools/items.html#battleaxe%20armblade_erlw')

try:
    # BASIC VARIABLES

    # Mundane table
    tableMundane = '//*[@id="listcontainer"]/div[1]/div[4]'
    # Magic table
    tableMagic = '//*[@id="listcontainer"]/div[2]/div[3]'

    # Count the number of rows in Mundane
    mundaneList = driver.find_element_by_xpath(tableMundane)
    # 367 Mundane Items
    vMundaneCount = len(mundaneList.find_elements_by_xpath("./*"))
    print(f'Mundane Items: {vMundaneCount}')

    # Count the number of rows in Magic
    magicList = driver.find_element_by_xpath(tableMagic)
    # 1014 Magic Items
    vMagicCount = len(magicList.find_elements_by_xpath("./*"))
    print(f'Magic Items: {vMagicCount}')

    # TABLE VARIABLES

    # ROW 2: name, source and page number are on separate lines
    row2 = driver.find_element_by_xpath('//*[@id="pagecontent"]/tr[2]').text
    workingName = row2.split("\n")
    finalName = workingName[0]
    finalSource = workingName[1]
    finalPageNumber = workingName[2]

    # ROW 3: "<tier>, <rarity> (<attunement>)"
    row3 = driver.find_element_by_xpath('//*[@id="pagecontent"]/tr[3]').text
    row3 = row3.split(" (")
    rarity = row3[0].split(", ")
    # Attunement requirements; None when the row has no parenthesised part,
    # so a value from a previous run is never carried over
    finalAttunement = row3[1].replace(")", "") if len(row3) > 1 else None
    # Rarity tier
    finalTier = rarity[0]
    # Rarity; None when the row has no ", " separator
    finalRarity = rarity[1] if len(rarity) > 1 else None
finally:
    # ----------------
    # CLOSE THE DRIVER
    # ----------------
    # Runs on failure too, so the browser is never left open
    driver.quit()

# BESTIARY

## Scraping

In [None]:
# Set up a DataFrame (not created yet in this cell)

# Open 5etools.com and set up the filters for the bestiary
driver, list_parent = startScraping('https://5e.tools/bestiary.html#aarakocra_mm')

try:
    # Count the number of rows
    myList = driver.find_element_by_xpath(list_parent)
    # 1630 Bestiary entries
    vRowCount = len(myList.find_elements_by_xpath("./*"))
    # Print
    print(vRowCount)
finally:
    # Shut the browser down even if the list lookup fails
    driver.quit()

# SPELLS

## Scraping (DONE)

In [None]:
# Set up the columns of the exported DataFrame
# NOTE: 'COMPONENETS_COST' is misspelled; it is left unchanged because it is
# the header written to SPELLS.csv
spellColumns = [
    'NAME', 'SOURCE', 'PAGE_NUMBER', 'LEVEL', 'SCHOOL', 'CASTING_TIME', 'RANGE',
    'COMPONENTS', 'COMPONENTS_MATERIALS', 'COMPONENETS_COST', 'DURATION',
    'CLASSES', 'SUBCLASSES',
]
# A price inside a material description: digits, one whitespace, then a
# two-letter currency ending in "p" (e.g. "100 gp")
costPattern = re.compile(r"\d+\s[cesgp]p")

# Open 5etools.com and set up the filters
driver, list_parent = startScraping("https://5e.tools/spells.html#abi-dalzim's%20horrid%20wilting_xge")

# VARIABLES

# Name
xName = '//*[@id="pagecontent"]/tr[2]'
# Level & School
xLevelSchool = '//*[@id="pagecontent"]/tr[3]'
# Casting Time
xCastingTime = '//*[@id="pagecontent"]/tr[4]'
# Range
xRange = '//*[@id="pagecontent"]/tr[5]'
# Components
xComponents = '//*[@id="pagecontent"]/tr[6]'
# Duration
xDuration = '//*[@id="pagecontent"]/tr[7]'
# Description
xDescription = '//*[@id="pagecontent"]/tr[9]'
# Classes
xClasses = '//*[@id="pagecontent"]/tr[10]/td/div[1]'
# Subclasses
xSubClasses = '//*[@id="pagecontent"]/tr[10]/td/div[2]'

# Scraped rows are collected here and turned into a DataFrame once at the end
spellRows = []

try:
    # Count the number of rows
    myList = driver.find_element_by_xpath(list_parent)
    vRowCount = len(myList.find_elements_by_xpath("./*"))

    # Loop through all the rows and scrape their data
    for row in range(1, vRowCount + 1):
        print(f'Scraping item {row} of {vRowCount}')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # SCRAPING
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # Set up the row based on its index
        xRow = f'//*[@id="listcontainer"]/div[4]/div[{row}]'
        # Click into the row
        driver.find_element_by_xpath(xRow).click()
        # Sleep
        fSleep(0.5, 1)

        # ROW 2: name, source and page number are on separate lines
        workingName = driver.find_element_by_xpath(xName).text.split("\n")
        finalName = workingName[0]
        finalSource = workingName[1]
        finalPageNumber = workingName[2]

        # ROW 3: level and school are the first two words
        workingLevelSchool = driver.find_element_by_xpath(xLevelSchool).text.split(" ")
        finalLevel = workingLevelSchool[0]
        finalSchool = workingLevelSchool[1]
        # NOTE(review): assumes cantrips are rendered as "<School> cantrip",
        # i.e. with the two words in the opposite order - confirm on the page
        if finalSchool.lower() == 'cantrip':
            finalLevel, finalSchool = finalSchool, finalLevel

        # ROW 4: casting time follows the "label: " prefix
        finalCastingTime = driver.find_element_by_xpath(xCastingTime).text.split(": ")[1]

        # ROW 5: range follows the "label: " prefix
        finalRange = driver.find_element_by_xpath(xRange).text.split(": ")[1]

        # Components & Cost: "label: V, S, M (materials)"
        tComponents = driver.find_element_by_xpath(xComponents).text.replace(")", "")
        workingComponents = tComponents.split(": ")[1].split(" (")
        # Components required
        finalComponentsRequired = workingComponents[0]
        # Material components; None when there is no parenthesised part
        finalComponentsMaterial = workingComponents[1] if len(workingComponents) > 1 else None

        # Description (scraped but not stored in the DataFrame)
        tDescription = driver.find_element_by_xpath(xDescription).text

        # Classes; None when the element or the "label: " prefix is missing.
        # `except Exception` (not a bare except) so Ctrl+C still stops the run
        try:
            finalClasses = driver.find_element_by_xpath(xClasses).text.split(": ")[1]
        except Exception:
            finalClasses = None

        # Subclasses; same handling as classes
        try:
            finalSubClasses = driver.find_element_by_xpath(xSubClasses).text.split(": ")[1]
        except Exception:
            finalSubClasses = None

        # Currency and Cost: every price mentioned in the materials, joined
        # by ", ". Thousands separators are removed first ("1,000 gp")
        if finalComponentsMaterial is None:
            finalCost = None
        else:
            lMatches = costPattern.findall(finalComponentsMaterial.replace(",", ""))
            finalCost = ', '.join(lMatches) if lMatches else None

        # ROW 7: duration follows the "label: " prefix
        finalDuration = driver.find_element_by_xpath(xDuration).text.split(": ")[1]

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # DATAFRAME
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~

        spellRows.append({
            'NAME': finalName,
            'SOURCE': finalSource,
            'PAGE_NUMBER': finalPageNumber,
            'LEVEL': finalLevel,
            'SCHOOL': finalSchool,
            'CASTING_TIME': finalCastingTime,
            'RANGE': finalRange,
            'COMPONENTS': finalComponentsRequired,
            'COMPONENTS_MATERIALS': finalComponentsMaterial,
            'COMPONENETS_COST': finalCost,
            'DURATION': finalDuration,
            'CLASSES': finalClasses,
            'SUBCLASSES': finalSubClasses,
        })
finally:
    try:
        # Export to CSV. Done here so rows scraped before a failure or an
        # interrupt are still saved; skipped when nothing was scraped so an
        # existing SPELLS.csv is not overwritten with an empty file
        dfSpells = pd.DataFrame(spellRows, columns=spellColumns)
        if spellRows:
            dfSpells.to_csv('SPELLS.csv')
    finally:
        # Shut the browser down no matter what happened above
        driver.quit()