# 1. Importing dependencies

In [None]:
import sys
import boto3
import logging
from awsglue.job import Job
from awsglue.transforms import *
from pyspark import SparkContext
from pyspark.sql import SparkSession
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from awsglue.dynamicframe import DynamicFrame

In [None]:
# Configure the root logger (getLogger() with no name) to emit INFO and above.
logger = logging.getLogger()
logger.setLevel("INFO")

## 1.1 Creating GlueContext and loading data

First, I create a Spark session; it is wrapped in a GlueContext below so the data can be loaded as a Glue DynamicFrame.

In [None]:
# Local Spark session using every available core; reuses an existing session
# if one is already running.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("trips_data")
    .getOrCreate()
)

In [None]:
# Bare expression: when run as a notebook cell this displays the session object.
spark

In [None]:
# Wrap the session's SparkContext in a GlueContext; the DynamicFrame read and
# write calls later in this notebook go through it.
glue_context = GlueContext(
    spark.sparkContext,
)

In [None]:
# Job parameters: [JOB_NAME]
# Resolve JOB_NAME from the command-line arguments, then initialise the Glue
# job with that name and the full resolved argument mapping.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])

job = Job(glue_context)
job.init(args["JOB_NAME"], args)

In [None]:
# Disabled alternative: read the same parquet files with plain Spark instead
# of a Glue DynamicFrame. Kept for reference only.
#df = spark.read.parquet("s3://903442739132-source-bucket-trips-data-01/*/*")

In [None]:
# Load every parquet object two levels below the source bucket root into a
# Glue DynamicFrame.
df_dynamic = glue_context.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": ["s3://903442739132-source-bucket-trips-data-01/*/*"],
    },
    format="parquet",
    transformation_ctx="dynamic_frame0",
)

In [None]:
# printSchema() writes the schema tree to stdout and returns None, so embedding
# the call in the log message only ever logged "printSchema: None".
# Print the tree for the notebook output, and log a real string form of the
# schema (taken from the equivalent Spark DataFrame) separately.
df_dynamic.printSchema()
logger.info('printSchema: %s', df_dynamic.toDF().schema.simpleString())

# 2. Separating files by type of license

In [None]:
from pyspark.sql.functions import col

In [None]:
# Keep only records whose license number is exactly "HV0003" (the variable
# name suggests this is the Uber license -- confirm against the data dictionary).
# Equality is required here: `value in "HV0003"` is a substring test, which
# also accepts "", "HV", "0003", etc. and raises TypeError on a null value.
df_uber = df_dynamic.filter(f=lambda x: x["hvfhs_license_num"] == "HV0003")

In [None]:
# S3 location (with trailing wildcard) for the 2021 partition of the
# per-license bucket.
connection_options = dict(
    path="s3://903442739132-type-of-license-bucket/year=2021/*",
)

In [None]:
# Write the filtered frame to the per-license bucket as comma-separated CSV
# under the year=2021 prefix.
glue_context.write_dynamic_frame.from_options(
    frame=df_uber,
    connection_type="s3",
    connection_options={"path": "s3://903442739132-type-of-license-bucket/year=2021/"},
    format="csv",
    format_options={"separator": ","},
)