# In [11]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame


def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    """Run a Spark SQL query over a set of frames and wrap the result.

    Each frame in ``mapping`` is converted with ``toDF()`` and registered as a
    temporary view under its alias, so ``query`` can refer to it by that name.

    Args:
        glueContext: Glue context; supplies the Spark session that runs the
            query and is passed on to ``DynamicFrame.fromDF``.
        query: SQL text to execute.
        mapping: Dict of view alias -> frame exposing ``toDF()``.
        transformation_ctx: Passed through to ``DynamicFrame.fromDF``.

    Returns:
        A DynamicFrame built from the query result.
    """
    # Take the session from the context that was handed in rather than from a
    # module-level ``spark`` global, so the helper works regardless of what
    # the surrounding script has (or has not yet) defined.
    session = glueContext.spark_session
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    return DynamicFrame.fromDF(session.sql(query), glueContext, transformation_ctx)


# Supply a placeholder job name only when none was passed in. Appending
# unconditionally would add a second --JOB_NAME next to a real one and would
# grow sys.argv again every time this cell is re-run.
if not any(
    arg == "--JOB_NAME" or arg.startswith("--JOB_NAME=") for arg in sys.argv
):
    sys.argv += ["--JOB_NAME", "TEST"]
args = getResolvedOptions(sys.argv, ["JOB_NAME"])

# Build the Spark/Glue contexts and initialise the job under the resolved name.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node S3 bucket
# Load the CSV data found under s3://lmu-football/ (recursing into prefixes)
# into a DynamicFrame; the files are read with a header row.
source_csv_format = {
    "quoteChar": '"',
    "withHeader": True,
    "separator": ",",
    "optimizePerformance": False,
}
source_location = {"paths": ["s3://lmu-football/"], "recurse": True}
S3bucket_node1 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=source_location,
    format="csv",
    format_options=source_csv_format,
    transformation_ctx="S3bucket_node1",
)

# Script generated for node ApplyMapping
# Every column keeps its own name and is mapped string -> string, so the
# (source, source_type, target, target_type) tuples are derived from the
# column names instead of being spelled out one by one.
passthrough_columns = [
    "Country",
    "League",
    "Season",
    "Date",
    "Time",
    "Home",
    "Away",
    "HG",
    "AG",
    "Res",
    "PH",
    "PD",
    "PA",
    "MaxH",
    "MaxD",
    "MaxA",
    "AvgH",
    "AvgD",
    "AvgA",
]
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[
        (column, "string", column, "string") for column in passthrough_columns
    ],
    transformation_ctx="ApplyMapping_node2",
)

# Script generated for node SQL Query
# Pass-through query: selects everything from the mapped frame, which is
# exposed to Spark SQL under the view name "myDataSource".
SqlQuery0 = """
select * from myDataSource
"""
sql_views = {"myDataSource": ApplyMapping_node2}
SQLQuery_node1692017719988 = sparkSqlQuery(
    glueContext,
    SqlQuery0,
    sql_views,
    "SQLQuery_node1692017719988",
)

# Script generated for node S3 bucket
# Write the query result as CSV under s3://lmu-data/bronze/football/ with
# catalog updates enabled for table "football_test" in database "football",
# then commit the job.
sink_settings = {
    "connection_type": "s3",
    "path": "s3://lmu-data/bronze/football/",
    "partitionKeys": [],
    "compression": "snappy",
    "enableUpdateCatalog": True,
    "updateBehavior": "LOG",
    "transformation_ctx": "S3bucket_node3",
}
S3bucket_node3 = glueContext.getSink(**sink_settings)
S3bucket_node3.setCatalogInfo(
    catalogTableName="football_test",
    catalogDatabase="football",
)
S3bucket_node3.setFormat("csv")
S3bucket_node3.writeFrame(SQLQuery_node1692017719988)
job.commit()

# Notebook cell output: FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…