In [1]:
# -------------------------------------------------------------------------
# MODIFY WITH CARE
# Standard libraries to be used in AWS Glue jobs
# -------------------------------------------------------------------------

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, array, ArrayType, DateType
from pyspark.sql import Row, Column
import datetime
import json
import boto3
import time
import logging
import calendar
import uuid
from dateutil import relativedelta
from dateutil.relativedelta import relativedelta
from awsglue.utils import getResolvedOptions

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
16,application_1563280589119_0017,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
BATCH_MODE=False

if BATCH_MODE:
    args = getResolvedOptions(sys.argv, ['JOB_NAME','ARG_JOB_DATE'])
    JOB_DATE=args['ARG_JOB_DATE']
else:
    JOB_DATE='20190708' 

S3PATHREAD="s3://ds-operations-111-raw/worldcup/"+JOB_DATE+"/"
S3PATHWRITE="s3://ds-operations-111-curated"

PARTITION='dt='+JOB_DATE
CATALOG_TABLE_LIST=[]

CURATED_DATABASE='curated_worldcup'

BOWLER_TBL='bowlers'

CRAWLER_NAME='worldcup'
CRAWLER_ARN='arn:aws:iam::956630041263:role/glue-role'

In [3]:
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
client = boto3.client('glue', region_name='us-east-1')

In [4]:
def read_s3_file(spark, type, path, delimiter='|', header='true'):
    if (type == 'CSV'):
        return spark.read.format("com.databricks.spark.csv").option("header", header).option("delimiter", delimiter).load(path)
    if (type == 'PARQUET'):
        return spark.read.parquet(path)
    
def write_s3_file(df, table_location, table, partition=None, format='PARQUET', delimiter='\t', coalesce=1, header=False):
    if format == 'PARQUET':
        df.write.parquet(table_location+'/'+table+'/'+partition, mode = "overwrite")
    if format == 'CSV':
        df.coalesce(coalesce).write.option("delimiter", delimiter).option("quote", "\"").option("quoteAll", "true").csv(table_location +'/' + partition)
        
def append_path_to_list(list, location, table_name):
    list.append({'Path': location + '/' + table_name})
    
def delete_catalog_table(client, database, table):
    try:
        response = client.delete_table(DatabaseName=database,Name=table)
    except Exception as e:
        print(table+' does not exist in glue catalog')
        
def create_crawler(client, crawler_name, iam_role_arn, database_name):
    return client.create_crawler(
        Name=crawler_name,
        Role=iam_role_arn,
        DatabaseName=database_name,
        Targets={
            'S3Targets':[
                {'Path':'s3://bucket/placeholder'}
            ]}
        )
    
def update_crawler(client, crawler_name, s3targets):
    client.update_crawler(
        Name=crawler_name,
        Targets = {'S3Targets':s3targets}
            
    )
    
def start_crawler(client, crawler_name):
    print(crawler_name + ' started.')
    
    # Getting PRE-RUN READY status.
    while(True):
        time.sleep(1)
        response = client.get_crawler(
                        Name=crawler_name
                   )
        
        if response['Crawler']['State'] == 'READY':
            print(response['Crawler']['State'])
            break
            
    client.start_crawler(
        Name=crawler_name
    )
    
    # Getting RUNNING status for stdout.            
    while(True):
        time.sleep(15)
        response = client.get_crawler(
                        Name=crawler_name
                   )
        
        if response['Crawler']['State'] == 'RUNNING':
            print(response['Crawler']['State'])
            break
        
    # Getting STOPPING status for stdout.
    while(True):
        time.sleep(1)
        response = client.get_crawler(
                        Name=crawler_name
                   )
        
        if response['Crawler']['State'] == 'STOPPING':
            print(response['Crawler']['State'])
            break
    
   # Getting READY status.
    while(True):
        time.sleep(1)
        response = client.get_crawler(
                        Name=crawler_name
                   )
        
        if response['Crawler']['State'] == 'READY':
            print(response['Crawler']['State'])
            break

def delete_crawler(client, crawler_name):
    # Getting READY status before deleting making sure it won't delete a running crawler.
    while(True):
        time.sleep(1)
        response = client.get_crawler(
                        Name=crawler_name
                   )
        
        if response['Crawler']['State'] == 'READY':
            print(response['Crawler']['State'])
            break
    
    client.delete_crawler(
        Name=crawler_name
    )
    
    print(crawler_name + ' deleted.')

In [5]:
bowler_df=read_s3_file(spark, 'CSV', S3PATHREAD+"Bowler_data.csv", delimiter=',',header='true')
#bowler_df.show()

In [6]:
bowler_df=bowler_df \
          .withColumnRenamed('_c0','ID') \
          .withColumn('Overs', f.col('Overs').cast(IntegerType())) \
          .withColumn('Mdns', f.col('Mdns').cast(IntegerType())) \
          .withColumn('Runs', f.col('Runs').cast(IntegerType())) \
          .withColumn('Wkts', f.col('Wkts').cast(IntegerType())) \
          .withColumn('Econ', f.col('Econ').cast(IntegerType())) \
          .withColumn('Ave', f.col('Ave').cast(IntegerType())) \
          .withColumn('Player_ID', f.col('Player_ID').cast(IntegerType()))

#bowler_df.show()

In [7]:
bowler_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Overs: integer (nullable = true)
 |-- Mdns: integer (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- Wkts: integer (nullable = true)
 |-- Econ: integer (nullable = true)
 |-- Ave: integer (nullable = true)
 |-- SR: string (nullable = true)
 |-- Opposition: string (nullable = true)
 |-- Ground: string (nullable = true)
 |-- Start Date: string (nullable = true)
 |-- Match_ID: string (nullable = true)
 |-- Bowler: string (nullable = true)
 |-- Player_ID: integer (nullable = true)

In [8]:
bowler_df.registerTempTable('bowlers')

In [9]:
# Write Results to 
bowler_df = spark.sql(" SELECT `ID`,`Overs`,`Mdns`,`Runs`,`Wkts`,`Econ`,`Ave`,`SR`, " \
                      " SUBSTR(`Opposition`, 2,20) as `Opposition`, " \
                      " `Ground`, TO_DATE(`Start Date`, 'dd MMM yyyy') AS Start_Date, " \
                      " SUBSTR(Match_ID, 6,10) AS Match_Number,`Bowler`,`Player_ID` " \
                      " FROM bowlers")
bowler_df.count()

5118

In [10]:
write_s3_file(bowler_df, S3PATHWRITE, BOWLER_TBL, PARTITION)
append_path_to_list(CATALOG_TABLE_LIST, S3PATHWRITE, BOWLER_TBL) 
print(CATALOG_TABLE_LIST)

[{'Path': 's3://ds-operations-111-curated/bowlers'}]

In [11]:
# Filtering using Dataframe commands
bowler_df.filter(bowler_df.Player_ID == 49619).count()

0

In [12]:
# Filtering using Spark SQL commands
sqlContext.sql("SELECT * FROM bowlers where Player_ID = 49619").count()

0

In [13]:
# Average overs did a bowler bowl
avg_bowler_df = bowler_df.groupBy(bowler_df.Player_ID).avg('Overs')

In [14]:
avg_bowler_df.show()

+---------+------------------+
|Player_ID|        avg(Overs)|
+---------+------------------+
|   326434|              null|
|   311592| 8.253333333333334|
|   325012| 6.225806451612903|
|   297433|              null|
|   379504| 8.590909090909092|
|    25913| 8.158878504672897|
|    19264|             7.875|
|   793463|  8.62962962962963|
|   318339|           5.34375|
|   351588|             8.525|
|   272279|              9.48|
|   267192| 4.805555555555555|
|     8917| 8.101123595505618|
|   330902| 8.355555555555556|
|   272477|               9.0|
|    34102|2.5789473684210527|
|     5334|2.3529411764705883|
|   419873|              null|
|   440970|               1.5|
|   261354| 8.555555555555555|
+---------+------------------+
only showing top 20 rows

In [15]:
avg_bowler_df = spark.sql("SELECT Player_ID, avg(Overs) FROM bowlers GROUP BY Player_ID")

In [16]:
avg_bowler_df.show()

+---------+------------------+
|Player_ID|        avg(Overs)|
+---------+------------------+
|   326434|              null|
|   311592| 8.253333333333334|
|   325012| 6.225806451612903|
|   297433|              null|
|   379504| 8.590909090909092|
|    25913| 8.158878504672897|
|    19264|             7.875|
|   793463|  8.62962962962963|
|   318339|           5.34375|
|   351588|             8.525|
|   272279|              9.48|
|   267192| 4.805555555555555|
|     8917| 8.101123595505618|
|   330902| 8.355555555555556|
|   272477|               9.0|
|    34102|2.5789473684210527|
|     5334|2.3529411764705883|
|   419873|              null|
|   440970|               1.5|
|   261354| 8.555555555555555|
+---------+------------------+
only showing top 20 rows

In [17]:
# Bowlers with lowest Economy Rate per over
econ_bowler_df = spark.sql("SELECT Player_ID, (SELECT min(Econ) FROM bowlers) as Econ " \
                            " FROM bowlers WHERE Econ=(SELECT min(Econ) FROM bowlers)")
econ_bowler_df.show()

+---------+----+
|Player_ID|Econ|
+---------+----+
|   311592|   0|
|   320652|   0|
+---------+----+

In [18]:
# Match Numbers played in Port of Spain
econ_bowler_df = spark.sql("SELECT DISTINCT SUBSTR(Match_ID, 6,10) AS Match_Number " \
                            " FROM bowlers WHERE Ground='Port of Spain'")
econ_bowler_df.show()

+------------+
|Match_Number|
+------------+
|        2546|
|        3387|
|        3895|
|        3159|
|        2382|
|        2550|
|        3160|
|        3388|
|        2542|
|        2538|
|        3383|
|        2554|
|        3896|
|        2381|
+------------+

In [19]:
# -------------------------------------------------------------------------
# Script to delete previously catalogued tables
# -------------------------------------------------------------------------

delete_catalog_table(client, CURATED_DATABASE, BOWLER_TBL) 

bowlers does not exist in glue catalog

In [21]:
# -------------------------------------------------------------------------
# Crawl tables
# -------------------------------------------------------------------------
create_crawler(client, CRAWLER_NAME, CRAWLER_ARN, CURATED_DATABASE)
update_crawler(client, CRAWLER_NAME, CATALOG_TABLE_LIST)
start_crawler(client,  CRAWLER_NAME)
delete_crawler(client, CRAWLER_NAME)

worldcup started.
READY
RUNNING
STOPPING
READY
READY
worldcup deleted.