# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

#args = getResolvedOptions(sys.argv, ['JOB_NAME'])
  
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession handle; later cells read data through `spark`.
spark = glueContext.spark_session
job = Job(glueContext)
# job.init stays disabled: it needs args['JOB_NAME'], and the getResolvedOptions
# call that would populate `args` is commented out above.
#job.init(args['JOB_NAME'])

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::883375387566:role/glue-full-access
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: b7603d1f-bcd9-41ae-a8a3-d5b3d001a97f
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
Waiting for session b7603d1f-bcd9-

#### Read source


In [2]:
# Source S3 bucket name and the key prefix under which objects are listed.
BUCKET = 'sorel-20m'
PREFIX = '09-DEC-2020/binaries'
# s3:// URI root of the source bucket.
BUCKET_PATH = f's3://{BUCKET}'




In [3]:
import boto3
from typing import List

# Leading characters an object name must start with; appended to the key prefix
# to narrow the listing. The '0000' variant is kept commented out for reference.
#filter_chars = '0000'
filter_chars = '00'

# boto3 S3 resource handle used by get_all_filepaths below.
s3 = boto3.resource('s3')

def get_all_filepaths(filter_chars: str, bucket: str, prefix: str) -> List[str]:
    """Return the s3:// URIs of every object in `bucket` under `prefix`/`filter_chars`.

    Args:
        filter_chars: Leading characters of the object name; appended to
            `prefix` (after a '/') to form the listing prefix.
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix, without a trailing slash.

    Returns:
        One 's3://<bucket>/<key>' string per matching object.
    """
    key_prefix = f'{prefix}/{filter_chars}'
    matching = s3.Bucket(bucket).objects.filter(Prefix=key_prefix)
    # Build each URI from the `bucket` argument rather than the module-level
    # BUCKET_PATH, so the paths are correct for whichever bucket was listed.
    return [f's3://{bucket}/{obj.key}' for obj in matching]

# File1_node = glueContext.create_dynamic_frame.from_options(
#    format_options={"quoteChar": '"', "withHeader": True, "separator": ","},
#    connection_type="s3",
#    format="binary",
#    connection_options={
#        "paths": get_all_filepaths(filter_chars, BUCKET, PREFIX)
#    }
# )

# Full list of s3:// paths to load; consumed by the spark.read call in the next cell.
paths = get_all_filepaths(filter_chars, BUCKET, PREFIX)

#print(len(paths), paths[:10])
#df= File1_node.toDF()




In [4]:
# Load every listed object through Spark's binaryFile source.
# NOTE(review): "wholeFile" does not appear among the options documented for the
# binaryFile source — confirm it has any effect here, otherwise drop it.
df = spark.read.format("binaryFile").option("wholeFile","true").load(paths)
#print(df.count())




#### Write to destination

In [5]:
# Keep only the file path column; this is the frame written to the destination.
df_out = df.select('Path')
#df_out.show()




In [5]:
#df_out = df_out.withColumn('partition_prefix', df['Path'].substr(37, 3))

In [6]:
# Alternative write variants are kept commented out for reference.
#df_out.write.mode("overwrite").csv('s3://sorel-20m-demo/tmp/app_output')
#df_out.write.mode("overwrite").option('groupFiles','inPartition').option('groupSize', '1048576').csv('s3://sorel-20m-demo/tmp/app_output')
# Active variant: reduce to 10 partitions before writing so the output is a small
# number of CSV part files; mode("overwrite") replaces any existing output.
df_out.coalesce(10).write.mode("overwrite").csv('s3://sorel-20m-demo/tmp/app_output')
#df_out.write.mode("overwrite").partitionBy('partition_prefix').csv('s3://sorel-20m-demo/tmp/app_output')




In [7]:
# Partition count of df_out itself; the coalesce(10) in the write cell is not
# assigned back, so it does not change this number.
df_out.rdd.getNumPartitions()

1485


In [31]:
#df_out


Row(Path='s3://sorel-20m/09-DEC-2020/binaries/005dc6bfc14a63342c4d061cd0d61726da729936700306e1f10097774137912e', partition_prefix='00')


In [6]:
%list_sessions

The first 3 sessions are:
6605dd26-cea8-4576-97e6-49ad61c37721
6a10f511-0aac-4df8-9720-468dab47bffe
7d9c1595-db11-41e7-9983-6214f01bf55d


In [12]:
%stop_session 

There is no current session.
