In [1]:
from pyspark.sql import SparkSession

ss = SparkSession.builder.\
     master('spark://spark-master:7077').\
     appName('week8lecture').getOrCreate()

In [2]:
df = ss.read.json('hdfs://namenode/Users/vagrant/structured-2018-01-14-neworleans/*.json')

In [4]:
df.count() #how many json files in the directory. it's a function

280

In [5]:
df.columns

['duration_ms',
 'end_time_s',
 'events',
 'hp_hill_names',
 'hp_hill_rotations',
 'id',
 'map',
 'mode',
 'platform',
 'players',
 'rounds',
 'series_id',
 'start_time_s',
 'teams',
 'title']

In [6]:
#select a few columns
df.select('map','mode','title').show(5)

+---------------+---------+-----+
|            map|     mode|title|
+---------------+---------+-----+
|Ardennes Forest|Hardpoint|  ww2|
|   London Docks|Hardpoint|  ww2|
|   London Docks|Hardpoint|  ww2|
|Ardennes Forest|Hardpoint|  ww2|
|Ardennes Forest|Hardpoint|  ww2|
+---------------+---------+-----+
only showing top 5 rows



In [7]:
df.select('map','mode','title').limit(5).show()

+---------------+---------+-----+
|            map|     mode|title|
+---------------+---------+-----+
|Ardennes Forest|Hardpoint|  ww2|
|   London Docks|Hardpoint|  ww2|
|   London Docks|Hardpoint|  ww2|
|Ardennes Forest|Hardpoint|  ww2|
|Ardennes Forest|Hardpoint|  ww2|
+---------------+---------+-----+



In [8]:
# how to reference columns
df.mode
# or
df['mode']

Column<b'mode'>

In [11]:
# average (min, max etc.)
# always need group by even you are not group by anything
df.groupBy().avg('duration_ms').show()

+-----------------+
| avg(duration_ms)|
+-----------------+
|608682.1428571428|
+-----------------+



In [12]:
df.groupBy().agg({'duration_ms':'avg'}).show()

+-----------------+
| avg(duration_ms)|
+-----------------+
|608682.1428571428|
+-----------------+



In [14]:
import pyspark.sql.functions as F

df.groupBy().agg(F.avg('duration_ms')).show()

+-----------------+
| avg(duration_ms)|
+-----------------+
|608682.1428571428|
+-----------------+



In [15]:
# group by
df.groupBy('mode').agg(F.avg('duration_ms')).show()

+----------------+-----------------+
|            mode| avg(duration_ms)|
+----------------+-----------------+
|       Hardpoint|636487.1794871795|
|Capture The Flag|639394.3661971831|
|Search & Destroy|549619.5652173914|
+----------------+-----------------+



In [16]:
df1 = df.withColumn('duration_s',df.duration_ms/1000)

In [17]:
df1.select('duration_s').show(5)

+----------+
|duration_s|
+----------+
|     765.0|
|     817.0|
|     776.0|
|     786.0|
|     777.0|
+----------+
only showing top 5 rows



In [20]:
df.select('teams').take(1) # a row of rows

[Row(teams=[Row(is_victor=False, name='ENIGMA6', round_scores=[24, 16, 21, 20, 31, 13, 18, 25, 4, 7, 21, 7, 0], score=207, side='home'), Row(is_victor=True, name='LUMINOSITY GAMING', round_scores=[17, 19, 27, 9, 5, 9, 20, 8, 32, 39, 8, 14, 13], score=220, side='away')])]

In [28]:
# explode
# toke my original table and explode it into two tables
# alias, change column name to team
teams_df = df.select('id',F.explode('teams').alias('team'))

In [29]:
teams_df.show(5)

+--------------------+--------------------+
|                  id|                team|
+--------------------+--------------------+
|64d15a2d-2f3c-5a2...|[false, ENIGMA6, ...|
|64d15a2d-2f3c-5a2...|[true, LUMINOSITY...|
|1111848b-1bfb-5d6...|[false, LUMINOSIT...|
|1111848b-1bfb-5d6...|[true, TEAM KALIB...|
|1b615383-6e9e-589...|[true, MINDFREAK,...|
+--------------------+--------------------+
only showing top 5 rows



In [31]:
teams_df.schema

StructType(List(StructField(id,StringType,true),StructField(team,StructType(List(StructField(is_victor,BooleanType,true),StructField(name,StringType,true),StructField(round_scores,ArrayType(LongType,true),true),StructField(score,LongType,true),StructField(side,StringType,true))),true)))

In [32]:
teams_df.dtypes

[('id', 'string'),
 ('team',
  'struct<is_victor:boolean,name:string,round_scores:array<bigint>,score:bigint,side:string>')]

In [33]:
teams_df.take(1)

[Row(id='64d15a2d-2f3c-5a28-844e-3d903d3cb9bc', team=Row(is_victor=False, name='ENIGMA6', round_scores=[24, 16, 21, 20, 31, 13, 18, 25, 4, 7, 21, 7, 0], score=207, side='home'))]

In [36]:
# want to get the name out. and change the column name to 'team name'
teams_df.select('id',teams_df['team.name'].alias('team name')).show()

+--------------------+-----------------+
|                  id|        team name|
+--------------------+-----------------+
|64d15a2d-2f3c-5a2...|          ENIGMA6|
|64d15a2d-2f3c-5a2...|LUMINOSITY GAMING|
|1111848b-1bfb-5d6...|LUMINOSITY GAMING|
|1111848b-1bfb-5d6...|     TEAM KALIBER|
|1b615383-6e9e-589...|        MINDFREAK|
|1b615383-6e9e-589...| INCONTROL GAMING|
|9cbc243f-ed67-565...|      RISE NATION|
|9cbc243f-ed67-565...|           SPLYCE|
|c1119828-62d8-512...|        FAZE CLAN|
|c1119828-62d8-512...|     TEAM KALIBER|
|09fd18b5-fc13-5ff...|      TEAM ENVYUS|
|09fd18b5-fc13-5ff...|    EVIL GENIUSES|
|a6183191-2641-57d...|LUMINOSITY GAMING|
|a6183191-2641-57d...|        FAZE CLAN|
|7873bd99-b6f8-579...| LIGHTNING PANDAS|
|7873bd99-b6f8-579...|              EZG|
|ba212c3d-3aa9-5d9...|           SPLYCE|
|ba212c3d-3aa9-5d9...| LIGHTNING PANDAS|
|c6634763-98f5-52e...|      RED RESERVE|
|c6634763-98f5-52e...|        FAZE CLAN|
+--------------------+-----------------+
only showing top

In [37]:
# rename the column another way
teams_df.select('id','team.name').withColumnRenamed('name','team_name').show()

+--------------------+-----------------+
|                  id|        team_name|
+--------------------+-----------------+
|64d15a2d-2f3c-5a2...|          ENIGMA6|
|64d15a2d-2f3c-5a2...|LUMINOSITY GAMING|
|1111848b-1bfb-5d6...|LUMINOSITY GAMING|
|1111848b-1bfb-5d6...|     TEAM KALIBER|
|1b615383-6e9e-589...|        MINDFREAK|
|1b615383-6e9e-589...| INCONTROL GAMING|
|9cbc243f-ed67-565...|      RISE NATION|
|9cbc243f-ed67-565...|           SPLYCE|
|c1119828-62d8-512...|        FAZE CLAN|
|c1119828-62d8-512...|     TEAM KALIBER|
|09fd18b5-fc13-5ff...|      TEAM ENVYUS|
|09fd18b5-fc13-5ff...|    EVIL GENIUSES|
|a6183191-2641-57d...|LUMINOSITY GAMING|
|a6183191-2641-57d...|        FAZE CLAN|
|7873bd99-b6f8-579...| LIGHTNING PANDAS|
|7873bd99-b6f8-579...|              EZG|
|ba212c3d-3aa9-5d9...|           SPLYCE|
|ba212c3d-3aa9-5d9...| LIGHTNING PANDAS|
|c6634763-98f5-52e...|      RED RESERVE|
|c6634763-98f5-52e...|        FAZE CLAN|
+--------------------+-----------------+
only showing top

In [38]:
# explode the playes
players_df = df.select('id',F.explode('players').alias('player'))
players_df.show()

+--------------------+--------------------+
|                  id|              player|
+--------------------+--------------------+
|64d15a2d-2f3c-5a2...|[3, 1, 0, 1, 0, 0...|
|64d15a2d-2f3c-5a2...|[2, 0, 0, 0, 0, 0...|
|64d15a2d-2f3c-5a2...|[6, 3, 0, 3, 1, 0...|
|64d15a2d-2f3c-5a2...|[4, 2, 0, 2, 0, 1...|
|64d15a2d-2f3c-5a2...|[6, 0, 0, 1, 0, 0...|
|64d15a2d-2f3c-5a2...|[4, 3, 1, 1, 0, 1...|
|64d15a2d-2f3c-5a2...|[5, 1, 0, 0, 0, 0...|
|64d15a2d-2f3c-5a2...|[4, 1, 0, 0, 0, 0...|
|1111848b-1bfb-5d6...|[6, 0, 0, 1, 0, 0...|
|1111848b-1bfb-5d6...|[7, 0, 0, 0, 1, 0...|
|1111848b-1bfb-5d6...|[4, 2, 1, 2, 1, 2...|
|1111848b-1bfb-5d6...|[4, 0, 0, 0, 0, 0...|
|1111848b-1bfb-5d6...|[6, 1, 0, 3, 0, 0...|
|1111848b-1bfb-5d6...|[5, 1, 0, 0, 0, 0...|
|1111848b-1bfb-5d6...|[2, 0, 0, 0, 1, 0...|
|1111848b-1bfb-5d6...|[6, 0, 0, 0, 1, 0...|
|1b615383-6e9e-589...|[4, 1, 0, 0, 0, 1...|
|1b615383-6e9e-589...|[3, 0, 0, 0, 0, 0...|
|1b615383-6e9e-589...|[4, 1, 0, 1, 0, 0...|
|1b615383-6e9e-589...|[4, 0, 0, 

In [39]:
players_df.take(1)

[Row(id='64d15a2d-2f3c-5a28-844e-3d903d3cb9bc', player=Row(2piece=3, 3piece=1, 4piece=0, 4streak=1, 5streak=0, 6streak=0, 7streak=0, 8+streak=0, accuracy=24.6, assists=14, avg_time_per_life_s=17.1, ctf_captures=None, ctf_defends=None, ctf_flag_carry_time_s=None, ctf_kill_carriers=None, ctf_pickups=None, ctf_returns=None, deaths=39, deaths_per_10min=30.6, fave_division='Airborne', fave_scorestreaks=['Fighter Pilot', 'Glide Bomb', 'Artillery Barrage'], fave_training='Hunker', fave_weapon='PPSh-41', headshots=2, hits=146, hp_captures=0, hp_defends=0, hp_hill_time_s=48, kd=0.9, kills=35, kills_per_10min=27.5, name='BLAZT', num_lives=40, scorestreaks_assists=0, scorestreaks_deployed=0, scorestreaks_earned=0, scorestreaks_kills=0, scorestreaks_used=0, shots=593, snd_1kill_round=None, snd_2kill_round=None, snd_3kill_round=None, snd_4kill_round=None, snd_defuses=None, snd_firstbloods=None, snd_firstdeaths=None, snd_pickups=None, snd_plants=None, snd_rounds=None, snd_sneak_defuses=None, snd_sur

In [40]:
# join team and palyer table

joined_table = players_df.join(teams_df,players_df.id == teams_df.id)

In [41]:
joined_table.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                  id|              player|                  id|                team|
+--------------------+--------------------+--------------------+--------------------+
|64d15a2d-2f3c-5a2...|[3, 1, 0, 1, 0, 0...|64d15a2d-2f3c-5a2...|[true, LUMINOSITY...|
|64d15a2d-2f3c-5a2...|[3, 1, 0, 1, 0, 0...|64d15a2d-2f3c-5a2...|[false, ENIGMA6, ...|
|64d15a2d-2f3c-5a2...|[2, 0, 0, 0, 0, 0...|64d15a2d-2f3c-5a2...|[true, LUMINOSITY...|
|64d15a2d-2f3c-5a2...|[2, 0, 0, 0, 0, 0...|64d15a2d-2f3c-5a2...|[false, ENIGMA6, ...|
|64d15a2d-2f3c-5a2...|[6, 3, 0, 3, 1, 0...|64d15a2d-2f3c-5a2...|[true, LUMINOSITY...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [42]:
# show distinct value
df.select('mode').distinct().show()

+----------------+
|            mode|
+----------------+
|       Hardpoint|
|Capture The Flag|
|Search & Destroy|
+----------------+



In [43]:
# filter
snd_df = df.filter(df.mode == 'Search & Destroy')

In [44]:
snd_df.count()

92