In [1]:
# enable pyspark
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf
# ref: https://towardsai.net/p/programming/pyspark-aws-s3-read-write-operations
#spark configuration
conf = SparkConf().set('spark.executor.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true'). \
 set('spark.driver.extraJavaOptions','-Dcom.amazonaws.services.s3.enableV4=true'). \
 setAppName('pyspark_aw_glue').setMaster('local[*]')

sc=SparkContext(conf=conf)
sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')

# read aws credentials
import configparser
config = configparser.ConfigParser()
config.read_file(open(r'C:\Users\padma\.aws\credentials'))

accessKeyId= config['default']['AWS_ACCESS_KEY_ID']
secretAccessKey= config['default']['AWS_SECRET_ACCESS_KEY']

hadoopConf = sc._jsc.hadoopConfiguration()
hadoopConf.set('fs.s3a.access.key', accessKeyId)
hadoopConf.set('fs.s3a.secret.key', secretAccessKey)
hadoopConf.set('fs.s3a.endpoint', 's3.amazonaws.com')
hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

print(sc)
from pyspark.sql import SparkSession
spark=SparkSession(sc)

<SparkContext master=local[*] appName=pyspark_aw_glue>


### Create glue catalog via boto3
https://stackoverflow.com/questions/58329935/how-to-create-a-data-catalog-in-amazon-glue-externally]

#### Create sample_db database

In [17]:
from pprint import pprint
import boto3

client = boto3.client('glue')
response = client.create_database(
    DatabaseInput={
        'Name': 'sample_db',  # Required
        'Description': 'Database created with boto3 API',
        'Parameters': {
            'my_param_1': 'my_param_value_1'
        },
    }
)
pprint(response)


AlreadyExistsException: An error occurred (AlreadyExistsException) when calling the CreateDatabase operation: Database already exists.

#### Delete sample_db database in glue catalog

In [None]:
from pprint import pprint
import boto3

#client = boto3.client('glue')
#response = client.delete_database(
#    #CatalogId='string',
#    Name='sample_db'
#)
pprint(response)

#### Reload the aws_glue package wiithout restarting the kernel

In [3]:
import importlib
from glue_helper import aws_glue
importlib.reload(aws_glue)

<module 'glue_helper.aws_glue' from 'C:\\Users\\padma\\github\\sparksql-awsglue\\aws-glue\\glue_helper\\aws_glue.py'>

In [4]:
from glue_helper import aws_glue

In [None]:
# aws_glue.create_table_helper()

In [None]:
# run this evety time you create the table or add a partition, 
#spark.sql('MSCK REPAIR TABLE sample_db.airlines')

In [6]:
aws_glue.get_current_schema_partition('sample_db')

FLIGHTS airlines date flight_number 


In [None]:
# tried to fix table creates via aws_glue.create_table_helper(), but did not work
#response = aws_glue.apply_msck_repair('sample_db', 'airlines')
#print(response)

#### Getting  a small data set and doing some experiments with dataframe

In [8]:
flightsPath = "datasets/flights.csv"
flights = spark.read\
                .format("csv")\
                .option("header", "true")\
                .load(flightsPath)

import pyspark.sql.functions as F
flights= flights.withColumn("date1", F.expr("replace(date, '-', '')"))

from pyspark.sql.types import IntegerType
flights = flights.withColumn("date1",flights["date1"].cast(IntegerType()))
flights.select("date1").dtypes

flights_19690_20304 = flights.filter(flights['airlines'].isin(["19690", "20304"])) 
flights_19690_20304 = flights_19690_20304.filter(flights_19690_20304['date1'].isin(["20140401", "20140402"]))
flights_19690_20304 = flights_19690_20304.drop("date")
flights_19690_20304 = flights_19690_20304.withColumnRenamed("date1","date") 
#flights_19690_20304.count()
#flights_19690_20304.show()
flights_sml = flights_19690_20304.limit(10)
# small data set
flights_sml.show()

+--------+-------------+------+-----------+---------+---------------+-------+-------------+--------+--------+--------+
|airlines|flight_number|origin|destination|departure|departure_delay|arrival|arrival_delay|air_time|distance|    date|
+--------+-------------+------+-----------+---------+---------------+-------+-------------+--------+--------+--------+
|   19690|            1|   LAX|        HNL|     0834|          -6.00|   1204|        44.00|  360.00| 2556.00|20140401|
|   19690|            2|   HNL|        LAX|     1342|          -3.00|   2151|       -19.00|  279.00| 2556.00|20140401|
|   19690|            3|   LAX|        HNL|     0952|          -8.00|   1251|        21.00|  339.00| 2556.00|20140401|
|   19690|            4|   HNL|        LAX|     2215|           0.00|   0615|       -20.00|  273.00| 2556.00|20140401|
|   19690|            7|   LAS|        HNL|     0848|         -12.00|   1219|         4.00|  376.00| 2762.00|20140401|
|   19690|            8|   HNL|        LAS|     

#### Version and location of python exe  and othes used in this jupyter notebook

In [None]:
from platform import python_version
print(python_version())

#####################
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

# pyspark jar location i think is
# C:\Users\padma\spark\spark-sql\tools\spark-3.1.2-bin-hadoop2.7\jars


#### Create parquet to file in local file system

In [None]:
flights_sml.count()
flights_sml.write.mode("overwrite").partitionBy('airlines', 'date', 'flight_number').parquet("output/flights")

#### Create parquet with partitons file in S3 Bucket

In [9]:
# writing parquet file to S3, without partitions
flights_sml.write.mode("overwrite").parquet("s3a://pp-database/tables/flights")

In [36]:
#writing parquet file to s3 with partitions
aws_glue.write_parquet_to_s3(flights_sml, 's3a://pp-database/tables/flights',
                            ['airlines', 'date', 'flight_number'])

# create table in glue catalog
columns_types= {'origin': 'string', 'destination': 'string', \
                'departure': 'string', 'departure_delay': 'string',\
                'arrival': 'string', 'arrival_delay': 'string',\
                'air_time': 'string', 'distance': 'string' }

partitions_types = { 'airlines': 'string',\
                    'date': 'int',\
                    'flight_number': 'string' }

aws_glue.create_glue_table('sample_db', 'flights', 's3://pp-database/tables/flights', columns_types, partitions_types)


#### Read back the recently created parquet file from S3

In [3]:
flights_read_from_s3=spark.read.parquet('s3a://pp-database/tables/flights',header=True,inferSchema=True)
flights_read_from_s3.show(5)

flights_read_from_s3_filter_by_airline=spark.read.parquet('s3a://pp-database/tables/flights/airlines=19690',header=True,inferSchema=True)
flights_read_from_s3_filter_by_airline.show(5)


KeyboardInterrupt: 

# Notes 
##### discard below

In [None]:
# parse aws credentials
# import configparser

# config = configparser.ConfigParser()
# config.read_file(open(r'C:\Users\padma\.aws\credentials'))

#os.environ["AWS_ACCESS_KEY_ID"]= config['default']['AWS_ACCESS_KEY_ID']
#os.environ["AWS_SECRET_ACCESS_KEY"]= config['default']['AWS_SECRET_ACCESS_KEY']
#os.environ["SPARK_HOME"]=r"C:\Users\padma\spark\spark-sql\tools\spark-3.1.2-bin-hadoop2.7"
#os.environ["HADOOP_HOME"]=r"C:\Users\padma\spark\spark-sql\tools\spark-3.1.2-bin-hadoop2.7\hadoop"

#import sys
#sys.path.append(r"C:\Users\padma\spark\spark-sql\tools\spark-3.1.2-bin-hadoop2.7\hadoopbin")

#.config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.4") \
#.config("spark.jars.packages", "com.amazonaws:aws-java-sdk-pom:1.7.4.2") \
#.config("spark.hadoop.fs.s3a.awsAccessKeyId", os.environ['AWS_ACCESS_KEY_ID']) \
#.config("spark.hadoop.fs.s3a.awsSecretAccessKey", os.environ['AWS_SECRET_ACCESS_KEY']) \

# '''
# fs.s3a.access.key your access key
# fs.s3a.secret.key your secret key
# ("fs.s3a.endpoint", "s3.amazonaws.com")
# .config("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")\
# '''
# spark = SparkSession \
#     .builder \
#     .config("fs.s3a.endpoint", "s3.amazonaws.com")\
#     .config("com.amazonaws.services.s3a.enableV4", "true")\
#     .config("spark.hadoop.fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")\
#     .config("fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID']) \
#     .config("fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY']) \
#     .appName("Analyzing airline data") \
#     .getOrCreate()
# '''
# spark = SparkSession \
# .builder \
# .config("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")\
# .config("com.amazonaws.services.s3a.enableV4", "true")\
# .config("spark.hadoop.fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")\
# .config("fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID']) \
# .config("fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY']) \
# .getOrCreate()
# '''

# sc = spark.sparkContext
# print(sc)
# sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")