In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('DataFrames').getOrCreate()
sc =spark.sparkContext

- In memory and temporary
- have aggregated functions
- schema enforcement
- each column has a definate data type

In [2]:
df = spark.read.option('delimiter',",").csv('./data/SalesAnalysis.csv',header=True,inferSchema = True)
df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|     11.95|04/29/19 13:03|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|         

In [3]:
df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [4]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType
in_file_schema = StructType([
    StructField("Order ID",IntegerType(),True),
    StructField("Product",StringType(),True),
    StructField("Quantity Ordered",IntegerType(),True),
    StructField("Price Each",IntegerType(),True),
    StructField("Order Date",IntegerType(),True),
    StructField("Purchase Address",StringType(),True)
])

In [5]:
df = spark.read.csv('./data/SalesAnalysis.csv',header=True,inferSchema = True,schema=in_file_schema)
df.show()

+--------+--------------------+----------------+----------+----------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+----------+--------------------+
|  176558|USB-C Charging Cable|               2|      NULL|      NULL|917 1st St, Dalla...|
|    NULL|                NULL|            NULL|      NULL|      NULL|                NULL|
|  176559|Bose SoundSport H...|               1|      NULL|      NULL|682 Chestnut St, ...|
|  176560|        Google Phone|               1|       600|      NULL|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|      NULL|      NULL|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|      NULL|      NULL|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|      NULL|      NULL|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|               1|      NULL|      NULL|668 Center

In [6]:
df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: integer (nullable = true)
 |-- Order Date: integer (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [7]:
df = spark.read.parquet('./data/transactions.parquet')
df.show()

+------------------+------+----------+
|    account_number|amount|  datetime|
+------------------+------+----------+
|GSHP69974285041169| -9034|2022-09-08|
|LVTT31407534039456| -2952|2022-04-02|
|RJSO40371783238228|  3405|2022-05-02|
|UOIH17449189991636| -4049|2022-10-07|
|FBXP58394333953676| -4157|2022-06-22|
|QGOZ77832068336993|   927|2022-08-30|
|GYUV35083145696797|  1141|2022-09-16|
|QTDR82759027697693| -4007|2022-05-24|
|JUHT34822041952317| -9365|2022-05-06|
|VTDD61848346661625|  2876|2022-02-26|
|SMHV03707065634664|   688|2022-10-04|
|OZKO18032288972878| -4953|2022-01-11|
|UTNV10141922412942| -6721|2022-09-22|
|LUWG85866520744371|  8803|2022-06-29|
|QJBG57598735642013|   367|2022-10-04|
|WZIW26836360947357| -1489|2022-04-21|
|FSXA86319779917884| -1240|2022-08-25|
|NCPS87127445310431|  1947|2022-02-02|
|PZMT98263913720790| -2758|2022-08-02|
|SGTO99755660979428|  3530|2022-03-17|
+------------------+------+----------+
only showing top 20 rows



In [8]:
df.printSchema()

root
 |-- account_number: string (nullable = true)
 |-- amount: long (nullable = true)
 |-- datetime: date (nullable = true)



In [10]:
df = spark.read.json('./data/example_2.json',multiLine=True)
df.show()

+--------------------+
|                quiz|
+--------------------+
|{{{12, [10, 11, 1...|
+--------------------+



In [11]:
df.printSchema()

root
 |-- quiz: struct (nullable = true)
 |    |-- maths: struct (nullable = true)
 |    |    |-- q1: struct (nullable = true)
 |    |    |    |-- answer: string (nullable = true)
 |    |    |    |-- options: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- question: string (nullable = true)
 |    |    |-- q2: struct (nullable = true)
 |    |    |    |-- answer: string (nullable = true)
 |    |    |    |-- options: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- question: string (nullable = true)
 |    |-- sport: struct (nullable = true)
 |    |    |-- q1: struct (nullable = true)
 |    |    |    |-- answer: string (nullable = true)
 |    |    |    |-- options: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- question: string (nullable = true)

