# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2
%additional_python_modules faker
# s3://essaimdev/glue_extra_requirements/deps.zip

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.2 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Additional python modules to be included:
faker
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Session ID: ccf6ded2-1cb7-4d36-a1c5-f2118763c0df
Applying the following default arguments:
--glue_kernel_version 1.0.2
--enable-glue-datacatalog true
--additional-python-modules faker
Waiting for session ccf6ded2-1cb7-4d36-a1c5-f2118763c0df to get into 

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [7]:
from awsglue.dynamicframe import DynamicFrameCollection
from awsglue.dynamicframe import DynamicFrame

dyf = glueContext.create_dynamic_frame.from_catalog(database='bank_statement', table_name='csv')
dyf.printSchema()
dfc = DynamicFrameCollection({'frame0': 'xx' }, glueContext)

root
|-- rivino: long
|-- kirjauspäivä: string
|-- arvopäivä: string
|-- viite/viesti: string
|-- kpl: string
|-- määrä eur: double
|-- kirjaussaldo eur: double
|-- tila: string
|-- arkistointitunnus: string
|-- määrä eur (saldo): double


#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [7]:
df = dyf.toDF()
df.show()
pdf = df.toPandas()
print(pdf)

+------+------------+----------+--------------------+---+---------+----------------+----------+--------------------+-----------------+
|rivino|kirjauspäivä| arvopäivä|        viite/viesti|kpl|määrä eur|kirjaussaldo eur|      tila|   arkistointitunnus|määrä eur (saldo)|
+------+------------+----------+--------------------+---+---------+----------------+----------+--------------------+-----------------+
|     1|  02.01.2023|02.01.2023|       Mirjami Autio|   | -15000.0|         1712.98|Toteutunut|32MD      2594365362|          30393.4|
|     2|  04.01.2023|04.01.2023|RF57200126320778 ...|   |    -62.0|          403.81|Toteutunut|  814697    00332320|         75126.36|
|     3|  12.01.2023|12.01.2023|RF57200126320778 ...|   |     -2.6|           479.6|Toteutunut|32MD      2603436178|         51099.42|
|     4|  12.01.2023|12.01.2023|Quality-focused r...|   | -5562.56|         2293.51|Toteutunut|4963      7263012527|          5836.29|
|     5|  12.01.2023|12.01.2023|RF57200126320778 ...|  

#### Example: Visualize data with matplotlib


In [19]:
from awsglue.dynamicframe import DynamicFrameCollection
from awsglue.dynamicframe import DynamicFrame

dyf = glueContext.create_dynamic_frame.from_catalog(database='bank_statement', table_name='csv')
dyf.printSchema()
dfc = DynamicFrameCollection({'frame0': dyf }, glueContext)

def MyTransform (glueContext, dfc) -> DynamicFrameCollection:
    logger = glueContext.get_logger()
    frame_name = list(dfc.keys())[0]
    logger.info('frame_name ' + frame_name)
    print('frame_name ' + frame_name)
    
    dyf = dfc.select(frame_name)
    
    #import numpy as np
    from random import randint
    from faker import Faker
    
    fake = Faker("fi-FI")
    
    viiteset = []
    for _ in range(5):
        viiteset.extend(
            (
                    fake.company(),
                    fake.name(),
                    fake.catch_phrase(),
                    "600109898403 OmaVero FI5689199710000724",
                    "RF57200126320778 Verohallinto FI5689199710000724",
                )
            )
            
    def mask(rec):
        rec['määrä eur'] = randint(-10000,10000)/100 # fake.pyfloat(5, 2, True)
        rec['kirjaussaldo eur'] = randint(-10000,10000)/100
        rec['viite/viesti'] = viiteset[ randint(0, 4 )] #np.random.choice(viiteset)
        return rec
    print(dyf)
    masked_dynamicframe = dyf.map( f = mask ) # Map.apply(frame=dyf, f=mask)
    return (DynamicFrameCollection({frame_name: masked_dynamicframe}, glueContext))

MyTransform(glueContext, dfc)

root
|-- rivino: long
|-- kirjauspäivä: string
|-- arvopäivä: string
|-- viite/viesti: string
|-- kpl: string
|-- määrä eur: double
|-- kirjaussaldo eur: double
|-- tila: string
|-- arkistointitunnus: string
|-- määrä eur (saldo): double

frame_name frame0
<awsglue.dynamicframe.DynamicFrame object at 0x7f7b31e4c760>
<awsglue.dynamicframe.DynamicFrameCollection object at 0x7f7b31e4c8e0>


#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog
