In [21]:
# Import libraries for crawling
import requests
from bs4 import BeautifulSoup
import numpy as np

# Import PySpark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql import Window
## Create the Spark session
# Build the session first: when a SparkContext already exists, the builder
# reuses it as-is, so the app name would not be applied to a context that was
# created beforehand with default settings.
spark = SparkSession.builder.appName('Dota2 Crawler').getOrCreate()
# Expose the session's underlying context under the usual `sc` name
sc = spark.sparkContext

In [28]:
# Build the two schemas and the empty dataframe that accumulates the dataset
emptyRDD = spark.sparkContext.emptyRDD()

## Schema of the per-page dataframes created inside the scraping loop
schema = StructType([
    StructField(column, dtype(), True)
    for column, dtype in (
        ('type', StringType),
        ('counter', StringType),
        ('srcline', IntegerType),
    )
])

## Schema of the main dataframe: the per-page columns preceded by hero and segment
main_schema = StructType([
    StructField(column, dtype(), True)
    for column, dtype in (
        ('hero', StringType),
        ('segment', StringType),
        ('type', StringType),
        ('counter', StringType),
        ('srcline', IntegerType),
    )
])

# Start from an empty dataframe so every page can be appended with unionByName
df = spark.createDataFrame(emptyRDD, schema=main_schema)

In [29]:
# get all heroes
# The timeout keeps a stalled connection from hanging the cell, and
# raise_for_status stops an HTTP error page from being parsed as the hero list
page_hero = requests.get('https://dota2.fandom.com/wiki/Category:Heroes', timeout=30)
page_hero.raise_for_status()
soup_hero = BeautifulSoup(page_hero.text, 'html.parser')

# find() returns None when the container is missing; fail with a clear message
content_hero = soup_hero.find(class_="mw-content-ltr")
if content_hero is None:
    raise RuntimeError('Could not find the "mw-content-ltr" container on the heroes category page')

heroes = []
for i in content_hero.find_all("a"):
    # .get() avoids a KeyError on anchors that carry no href attribute
    link_href = i.get('href')
    link_text = i.get_text().strip()
    # Keep only links that point outside the Category namespace and have a
    # visible name; an empty name would produce a broken hero URL later on
    if link_href and '/Category:' not in link_href and link_text:
        heroes.append(link_text)

In [30]:
for hero in heroes:
    print('Scraping hero page: '+hero)
    # Fetch the hero's Counters page. The timeout keeps one stalled request
    # from hanging the whole crawl, and raise_for_status stops an HTTP error
    # page from being parsed as if it were the real content.
    page = requests.get(f'https://dota2.fandom.com/wiki/{hero}/Counters', timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Collect the three groups of tags the rest of the loop works on
    hero_name_list = soup.find_all(style='font-size:12pt;display:inline;')
    header_list = soup.find_all(class_='mw-headline')
    item_list = soup.find_all(class_='image-link')

    # function that looks up the source line of a section header
    def rangeSrc(title,segment=None):
        """Return the ``srcline`` of the header row whose text matches ``title``.

        The lookup runs against ``df_headers``, the header dataframe of the
        page currently being processed.

        Args:
            title: Header text to look for. The comparison is case-insensitive
                on both sides.
            segment: Optional value that the row's ``segment`` column must
                equal. Only usable once ``df_headers`` has a ``segment`` column.

        Returns:
            The ``srcline`` value of the first matching row.

        Raises:
            IndexError: If no header row matches, with a message naming the
                header that was looked for.
        """
        # Lower-case both sides so callers may pass the header as it is displayed
        condition = F.lower(F.col('counter')) == title.lower()
        if segment is not None:
            condition = condition & (F.col('segment') == segment)

        matches = df_headers.filter(condition).select('srcline').collect()
        if not matches:
            # Same exception type as indexing an empty result, but with context
            raise IndexError(f'No header matching {title!r} (segment={segment!r}) was found')
        return matches[0]['srcline']

    # user-defined function that turns scraped tags into row tuples
    def dotaDict(type,scrape_list):
        """Build ``(type, text, source line)`` rows from a list of scraped tags.

        Args:
            type: Label stored in the first position of every row.
            scrape_list: Iterable of tags; each must provide ``get_text()``
                and a ``sourceline`` attribute.

        Returns:
            A list of ``(type, text, sourceline)`` tuples, one per tag and in
            the same order. A list is returned rather than a one-shot ``zip``
            iterator so the rows can be inspected or reused safely.
        """
        return [(type, h.get_text(), h.sourceline) for h in scrape_list]

    # build the (type, text, srcline) rows for each group of tags
    ## header
    headers = dotaDict('header',header_list)
    ## heroes
    # Deliberately NOT named `heroes`: that name holds the list of hero names
    # this loop iterates over, and rebinding it here would leave the list
    # replaced after the run, so the cell could not be re-run.
    hero_rows = dotaDict('hero',hero_name_list)
    ## items
    items = dotaDict('items/skills',item_list)

    # one dataframe per group, all sharing the per-page schema
    df_heroes = spark.createDataFrame(data=hero_rows,schema=schema)
    df_items = spark.createDataFrame(data=items,schema=schema)
    df_headers = spark.createDataFrame(data=headers,schema=schema)

    ## Create another column for segment
    # source line at which each of the three sections starts on this page
    bad_against, good_against, works_well = [
        rangeSrc(section_title)
        for section_title in ('bad against...', 'good against...', 'works well with...')
    ]

    # map a source line to the section of the page it falls in
    def segment(srcline):
        """Return the section label for ``srcline``, or None if it precedes all sections.

        The section boundaries are the ``bad_against``, ``good_against`` and
        ``works_well`` source lines computed for the current page.
        """
        if bad_against <= srcline < good_against:
            return 'Bad against...'
        if good_against <= srcline < works_well:
            return 'Good against...'
        if srcline >= works_well:
            return 'Works well with...'
        # Lines before the first section get no label
        return None
    segmentUDF = F.udf(segment, StringType())

    # label every row of the three per-page dataframes with its section
    df_heroes, df_items, df_headers = [
        page_frame.withColumn('segment', segmentUDF('srcline'))
        for page_frame in (df_heroes, df_items, df_headers)
    ]

    # ## Categorize items from skills in Others
    # # create threshold for others
    # bad_others = rangeSrc('Others','Bad against...')
    # bad_items = rangeSrc('Items','Bad against...')
    # good_others = rangeSrc('Others','Good against...')
    # good_items = rangeSrc('Items','Good against...')
    # works_others = rangeSrc('Others','Works well with...')
    # works_items = rangeSrc('Items','Works well with...')

    # # function for determining if the hero or item is a counter or not
    # def segment2(srcline):
    #     if (srcline >= bad_others and srcline < bad_items) or (srcline >= good_others and srcline < good_items) or (srcline >= works_others and srcline < works_items):
    #         return 'others'
    #     elif (srcline >= bad_against and srcline < bad_others) or (srcline >= good_against and srcline < good_others) or (srcline >= works_well and srcline < works_others):
    #         return 'hero_others'
    #     elif (srcline >= bad_items and srcline < good_against) or (srcline >= good_items and srcline < works_well) or (srcline >= works_items):
    #         return 'item'
    # segment2UDF = F.udf(lambda z: segment2(z), StringType())

    # correct type
    # df_heroes2 = df_heroes
    # df_items2 = df_items.withColumn('type', segment2UDF('srcline'))

    # stack hero rows and item rows, tag them with the current hero, and put
    # the columns in the order of the main dataframe
    df_conso = (
        df_heroes.unionByName(df_items)
        .withColumn('hero', F.lit(hero))
        .select('hero', 'segment', 'type', 'counter', 'srcline')
    )

    # append this page's rows to the running result
    df = df.unionByName(df_conso)
    print('Scraping done: '+hero)

# Number the rows: order by a monotonically increasing helper column, then
# keep `id` followed by the original columns (the helper column is dropped)
window = Window.orderBy(F.col('monotonically_increasing_id'))
cols = ['id', *df.columns]
df = (
    df.withColumn("monotonically_increasing_id", F.monotonically_increasing_id())
    .withColumn('id', F.row_number().over(window))
    .select(cols)
)

Scraping hero page: Abaddon
Scraping done: Abaddon
Scraping hero page: Alchemist
Scraping done: Alchemist
Scraping hero page: Ancient Apparition
Scraping done: Ancient Apparition
Scraping hero page: Anti-Mage
Scraping done: Anti-Mage
Scraping hero page: Arc Warden
Scraping done: Arc Warden
Scraping hero page: Axe
Scraping done: Axe
Scraping hero page: Bane
Scraping done: Bane
Scraping hero page: Batrider
Scraping done: Batrider
Scraping hero page: Beastmaster
Scraping done: Beastmaster
Scraping hero page: Bloodseeker
Scraping done: Bloodseeker
Scraping hero page: Bounty Hunter
Scraping done: Bounty Hunter
Scraping hero page: Brewmaster
Scraping done: Brewmaster
Scraping hero page: Bristleback
Scraping done: Bristleback
Scraping hero page: Broodmother
Scraping done: Broodmother
Scraping hero page: Centaur Warrunner
Scraping done: Centaur Warrunner
Scraping hero page: Chaos Knight
Scraping done: Chaos Knight
Scraping hero page: Chen
Scraping done: Chen
Scraping hero page: Clinkz
Scraping

In [31]:
# Print the column names, types and nullability of the consolidated dataframe
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- hero: string (nullable = true)
 |-- segment: string (nullable = true)
 |-- type: string (nullable = true)
 |-- counter: string (nullable = true)
 |-- srcline: integer (nullable = true)



In [33]:
# Rename the columns one at a time, in order (old name -> new name).
# Note that `id` is the row number assigned after scraping, one per row.
for old_name, new_name in (
    ('id', 'heroID'),
    ('hero', 'heroName'),
    ('segment', 'heroSegment'),
    ('type', 'heroType'),
    ('counter', 'heroCounter'),
    ('srcline', 'htmlCodeLine'),
):
    df = df.withColumnRenamed(old_name, new_name)

In [34]:
# Print the schema again to check the column names after the rename
df.printSchema()

root
 |-- heroID: integer (nullable = false)
 |-- heroName: string (nullable = true)
 |-- heroSegment: string (nullable = true)
 |-- heroType: string (nullable = true)
 |-- heroCounter: string (nullable = true)
 |-- htmlCodeLine: integer (nullable = true)



In [35]:
# Display the schema as a StructType object rather than as a printed tree
df.schema

StructType([StructField('heroID', IntegerType(), False), StructField('heroName', StringType(), True), StructField('heroSegment', StringType(), True), StructField('heroType', StringType(), True), StructField('heroCounter', StringType(), True), StructField('htmlCodeLine', IntegerType(), True)])