# Imports

In [1]:
import os
from pyspark import SparkContext
import findspark
import pandas as pd

References used for accessing S3 from a local PySpark / Jupyter setup:

- https://medium.com/@kashif.sohail/how-to-read-compressed-csv-files-from-s3-using-local-pyspark-and-jupyter-notebook-b30e50c41b95
- http://bartek-blog.github.io/python/spark/2019/04/22/how-to-access-s3-from-pyspark.html

### Configuring credentials

In [2]:
import configparser

# Location of the credentials file. The default is the original local path;
# set the DBUSER_CONFIG_PATH environment variable to use a different file
# without editing the notebook.
config_path = os.environ.get(
    'DBUSER_CONFIG_PATH',
    '/Users/paulogier/81-GithubPackages/Udacity-Data-Engineer-NanoDegree/P4-Data_Lake_with_Spark/p4src/etl/dbuser_config.cfg',
)

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing.
# Fail here with a clear message instead of a NoSectionError further down.
if not config.read(config_path):
    raise FileNotFoundError("AWS credentials config file not found: " + config_path)

# Copy the KEY / SECRET options of the [AWS] section into environment
# variables so that later cells can read them with os.environ.
os.environ['AWS_KEY_ID'] = config.get("AWS", "KEY")
os.environ['AWS_SECRET'] = config.get("AWS", 'SECRET')

In [8]:
from pyspark.sql import SparkSession

# Submit arguments must be in the environment before the session is created:
# they request the hadoop-aws 2.7.3 package for the pyspark shell.
os.environ.update(
    {'PYSPARK_SUBMIT_ARGS': "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"}
)

# Create (or reuse) the session named 'S3CSVRead'.
session_builder = SparkSession.builder.appName('S3CSVRead')
spark = session_builder.getOrCreate()

# Set the filesystem implementation property for the s3a scheme.
# NOTE(review): the value is the s3native class, not an S3A class; the
# credential properties set in the next cell appear to rely on this, so
# confirm before changing either one.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")

In [9]:
# Pass the AWS credentials exported by the configuration cell to the Hadoop
# configuration: each Hadoop property is paired with the environment variable
# that holds its value.
hadoop_conf = spark._jsc.hadoopConfiguration()
credential_properties = (
    ("fs.s3a.awsAccessKeyId", 'AWS_KEY_ID'),
    ("fs.s3a.awsSecretAccessKey", 'AWS_SECRET'),
)
for property_name, env_name in credential_properties:
    hadoop_conf.set(property_name, os.environ.get(env_name))


### Read single file

In [17]:
# Single CSV object on S3, addressed through the s3a scheme.
mycsv = "s3a://dendpaulogieruswest2/sampledata/titanic-data.csv"

# Read with the "header" option set to "true", then preview five rows.
csv_reader = spark.read.options(header="true")
df = csv_reader.csv(mycsv)
df.show(n=5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+

### Read multiple files

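The `multiLine` option is used below; see this discussion of the corrupt-record error when reading a JSON file into Spark: https://stackoverflow.com/questions/35409539/corrupt-record-error-when-reading-a-json-file-into-spark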

In [25]:
# Wildcard path: every object whose key starts with "connectionssample".
myprefix = "s3a://dendpaulogieruswest2/sampledata/connectionssample*"

# Read all matching JSON files with the multiLine option enabled.
json_reader = spark.read.option("multiLine", True)
df = json_reader.json(myprefix)

# Report the total row count, then preview five rows.
row_count = df.count()
print(row_count)
df.show(n=5)

2000
+--------------------+----------+------+---+-------------+----------+
|               email|first_name|gender| id|   ip_address| last_name|
+--------------------+----------+------+---+-------------+----------+
|rjosskoviz0@forbe...|   Randolf|  Male|  1| 51.159.229.1| Josskoviz|
|coloughlin1@huged...|    Camala|Female|  2| 145.22.1.196|O'Loughlin|
|abofield2@statcou...|      Arie|  Male|  3| 56.82.90.115|   Bofield|
|mhenkens3@utexas.edu|      Mark|  Male|  4| 88.25.101.43|   Henkens|
|atilliard4@nature...|    Amelia|Female|  5|23.17.182.175|  Tilliard|
+--------------------+----------+------+---+-------------+----------+
only showing top 5 rows



#### With a schema

In [24]:
from pyspark.sql import types as T

# Explicit schema for the JSON files: column name -> Spark type, in column
# order. Only "id" is an integer; every other column is a string.
column_types = [
    ("email", T.StringType()),
    ("first_name", T.StringType()),
    ("gender", T.StringType()),
    ("id", T.IntegerType()),
    ("ip_address", T.StringType()),
    ("last_name", T.StringType()),
]
s = T.StructType([T.StructField(name, dtype) for name, dtype in column_types])

# Same multi-file read as before, but with the schema supplied up front.
json_reader = spark.read.options(multiLine=True)
df = json_reader.json(myprefix, schema=s)

# Report the total row count, then preview five rows.
row_count = df.count()
print(row_count)
df.show(n=5)

2000
+--------------------+----------+------+---+-------------+----------+
|               email|first_name|gender| id|   ip_address| last_name|
+--------------------+----------+------+---+-------------+----------+
|rjosskoviz0@forbe...|   Randolf|  Male|  1| 51.159.229.1| Josskoviz|
|coloughlin1@huged...|    Camala|Female|  2| 145.22.1.196|O'Loughlin|
|abofield2@statcou...|      Arie|  Male|  3| 56.82.90.115|   Bofield|
|mhenkens3@utexas.edu|      Mark|  Male|  4| 88.25.101.43|   Henkens|
|atilliard4@nature...|    Amelia|Female|  5|23.17.182.175|  Tilliard|
+--------------------+----------+------+---+-------------+----------+
only showing top 5 rows

