In [1]:
# Import libraries for crawling
import requests
from bs4 import BeautifulSoup
import numpy as np

# Import PySpark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
import pyspark.sql.functions as F

## Creates spark session
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName('Dota2 Crawler').getOrCreate()

23/04/10 12:17:33 WARN Utils: Your hostname, DESKTOP-L7KNHUE resolves to a loopback address: 127.0.1.1; using 192.168.238.57 instead (on interface eth0)
23/04/10 12:17:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/10 12:17:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [71]:
# Creates dataframe with schema for the dataset
emptyRDD = spark.sparkContext.emptyRDD()

## Create schema for in loops
schema = StructType([
  StructField('type', StringType(), True),
  StructField('counter', StringType(), True),
  StructField('srcline', IntegerType(), True)
  ])

## Create schema for main dataframe
main_schema = StructType([
  StructField('hero', StringType(), True),
  StructField('segment', StringType(), True),
  StructField('type', StringType(), True),
  StructField('counter', StringType(), True),
  StructField('srcline', IntegerType(), True)
  ])

df = spark.createDataFrame(emptyRDD,schema=main_schema)

In [134]:
# get all heroes
page_hero = requests.get('https://dota2.fandom.com/wiki/Category:Heroes')
soup_hero = BeautifulSoup(page_hero.text, 'html.parser')
heroes = []
for i in soup_hero.find(class_="mw-content-ltr").find_all("a"):
    if '/Category:' not in i['href']:
        heroes.append(i.get_text())
    else:
        continue

In [136]:
for hero in heroes:
    print('Scraping hero page: '+hero)
    # initiate beautiful soup scraping on a page
    page = requests.get(f'https://dota2.fandom.com/wiki/{hero}/Counters')
    soup = BeautifulSoup(page.text, 'html.parser')

    # Pull all text from body div
    hero_name_list = soup.find_all(style='font-size:12pt;display:inline;')
    header_list = soup.find_all(class_='mw-headline')
    item_list = soup.find_all(class_='image-link')

    # function that outputs a range to categorize the type
    def rangeSrc(title,segment=None):
        if segment == None:
            return df_headers.filter(F.lower(F.col('counter'))==title).select('srcline').collect()[0]['srcline']
        else:
            return df_headers.filter((F.lower(F.col('counter'))==title) & (F.col('segment')==segment)).select('srcline').collect()[0]['srcline']

    # user-defined function for creating nested dictionary
    def dotaDict(type,scrape_list):
        text = []
        srcline = []
        types = []
        for h in scrape_list:
            types.append(type)
            text.append(h.get_text())
            srcline.append(h.sourceline)
        arr = zip(types,text,srcline)
        return arr

    # created nested dictionary
    ## header
    headers = dotaDict('header',header_list)
    ## heroes
    heroes = dotaDict('hero',hero_name_list)
    ## items
    items = dotaDict('items/skills',item_list)

    df_heroes = spark.createDataFrame(data=heroes,schema=schema)
    df_items = spark.createDataFrame(data=items,schema=schema)
    df_headers = spark.createDataFrame(data=headers,schema=schema)

    ## Create another column for segment
    # create threshold for segment
    bad_against = rangeSrc('bad against...')
    good_against = rangeSrc('good against...')
    works_well = rangeSrc('works well with...')

    # function for determining if the hero or item is a counter or not
    def segment(srcline):
        if srcline >= bad_against and srcline < good_against:
            return 'Bad against...'
        elif srcline >= good_against and srcline < works_well:
            return 'Good against...'
        elif srcline >= works_well:
            return 'Works well with...'
        else:
            pass
    segmentUDF = F.udf(lambda z: segment(z), StringType())

    # add segment
    df_heroes = df_heroes.withColumn('segment', segmentUDF('srcline'))
    df_items = df_items.withColumn('segment', segmentUDF('srcline'))
    df_headers = df_headers.withColumn('segment', segmentUDF('srcline'))

    # ## Categorize items from skills in Others
    # # create threshold for others
    # bad_others = rangeSrc('Others','Bad against...')
    # bad_items = rangeSrc('Items','Bad against...')
    # good_others = rangeSrc('Others','Good against...')
    # good_items = rangeSrc('Items','Good against...')
    # works_others = rangeSrc('Others','Works well with...')
    # works_items = rangeSrc('Items','Works well with...')

    # # function for determining if the hero or item is a counter or not
    # def segment2(srcline):
    #     if (srcline >= bad_others and srcline < bad_items) or (srcline >= good_others and srcline < good_items) or (srcline >= works_others and srcline < works_items):
    #         return 'others'
    #     elif (srcline >= bad_against and srcline < bad_others) or (srcline >= good_against and srcline < good_others) or (srcline >= works_well and srcline < works_others):
    #         return 'hero_others'
    #     elif (srcline >= bad_items and srcline < good_against) or (srcline >= good_items and srcline < works_well) or (srcline >= works_items):
    #         return 'item'
    # segment2UDF = F.udf(lambda z: segment2(z), StringType())

    # correct type
    # df_heroes2 = df_heroes
    # df_items2 = df_items.withColumn('type', segment2UDF('srcline'))

    # join 2 dfs
    df_conso = df_heroes.unionByName(df_items)
    df_conso = df_conso.withColumn('hero',F.lit(hero)).select('hero','segment','type','counter','srcline')

    df = df.unionByName(df_conso)
    print('Scraping done: '+hero)

Abaddon
Abaddondone
Alchemist
Alchemistdone
Ancient Apparition
Ancient Apparitiondone
Anti-Mage
Anti-Magedone
Arc Warden
Arc Wardendone
Axe
Axedone
Bane
Banedone
Batrider
Batriderdone
Beastmaster
Beastmasterdone
Bloodseeker
Bloodseekerdone
Bounty Hunter
Bounty Hunterdone
Brewmaster
Brewmasterdone
Bristleback
Bristlebackdone
Broodmother
Broodmotherdone
Centaur Warrunner
Centaur Warrunnerdone
Chaos Knight
Chaos Knightdone
Chen
Chendone
Clinkz
Clinkzdone
Clockwerk
Clockwerkdone
Crystal Maiden
Crystal Maidendone
Dark Seer
Dark Seerdone
Dark Willow
Dark Willowdone
Dawnbreaker
Dawnbreakerdone
Dazzle
Dazzledone
Death Prophet
Death Prophetdone
Disruptor
Disruptordone
Doom
Doomdone
Dragon Knight
Dragon Knightdone
Drow Ranger
Drow Rangerdone
Earth Spirit
Earth Spiritdone
Earthshaker
Earthshakerdone
Elder Titan
Elder Titandone
Ember Spirit
Ember Spiritdone
Enchantress
Enchantressdone
Enigma
Enigmadone
Faceless Void
Faceless Voiddone
Grimstroke
Grimstrokedone
Gyrocopter
Gyrocopterdone
Hoodwink
Hoo

In [140]:

df.printSchema()

root
 |-- hero: string (nullable = true)
 |-- segment: string (nullable = true)
 |-- type: string (nullable = true)
 |-- counter: string (nullable = true)
 |-- srcline: integer (nullable = true)



In [143]:
df.write.option("header",True) \
        .partitionBy("hero") \
        .mode("overwrite") \
        .parquet("s3://pavluff-dev-web-scrapes-public/dota2counterCrawler/")

Py4JJavaError: An error occurred while calling o31358.parquet.
: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "s3"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWritingFileFormat(DataSource.scala:461)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWriting(DataSource.scala:558)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:793)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
