## PySpark Fundamentals

In [1]:
# pyspark is imported as a whole so its version can be inspected below;
# Row and SparkSession come from the SQL / DataFrame API.
import pyspark
from pyspark.sql import Row, SparkSession
# Bare last expression: the notebook echoes the installed PySpark version.
pyspark.__version__

'3.1.2'

In [2]:
# Entry point for the DataFrame API. getOrCreate() hands back the session
# registered for this application if there is one, otherwise it starts it.
spark = (
    SparkSession.builder
    .appName("Spark Intro")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

#### Reading in Data

In [3]:
# Read the CSV with the reader's default options. No header option is set, so
# the columns get the generic names _c0, _c1, ... and the file's header line is
# loaded as an ordinary data row (visible in the output of the next cell).
df = spark.read.csv('sales.csv')

#### Display the dataset

In [4]:
# Print the first 10 rows as a text table. Because the file was read without
# the header option, the header line appears as the first of those rows.
df.show(10)

+----------+------------+----------------+---------------+
|       _c0|         _c1|             _c2|            _c3|
+----------+------------+----------------+---------------+
|Order Date|Total Profit|         Country|      Item Type|
|2012-07-27|     3839.13|    South Africa|         Fruits|
|2013-09-14|   338631.84|         Morocco|        Clothes|
|2015-05-15|     20592.0|Papua New Guinea|           Meat|
|2017-05-17|    41273.28|        Djibouti|        Clothes|
|2016-10-26|    62217.18|        Slovakia|      Beverages|
|2011-11-07|     3323.39|       Sri Lanka|         Fruits|
|2013-01-18|     9349.02|     Seychelles |      Beverages|
|2016-11-30|    23114.16|        Tanzania|      Beverages|
|2017-03-23|    113120.0|           Ghana|Office Supplies|
+----------+------------+----------------+---------------+
only showing top 10 rows



In [5]:
# Alternative to show(): head(10) returns the first 10 rows as a list of Row
# objects rather than printing a formatted table.
df.head(10)

[Row(_c0='Order Date', _c1='Total Profit', _c2='Country', _c3='Item Type'),
 Row(_c0='2012-07-27', _c1='3839.13', _c2='South Africa', _c3='Fruits'),
 Row(_c0='2013-09-14', _c1='338631.84', _c2='Morocco', _c3='Clothes'),
 Row(_c0='2015-05-15', _c1='20592.0', _c2='Papua New Guinea', _c3='Meat'),
 Row(_c0='2017-05-17', _c1='41273.28', _c2='Djibouti', _c3='Clothes'),
 Row(_c0='2016-10-26', _c1='62217.18', _c2='Slovakia', _c3='Beverages'),
 Row(_c0='2011-11-07', _c1='3323.39', _c2='Sri Lanka', _c3='Fruits'),
 Row(_c0='2013-01-18', _c1='9349.02', _c2='Seychelles ', _c3='Beverages'),
 Row(_c0='2016-11-30', _c1='23114.16', _c2='Tanzania', _c3='Beverages'),
 Row(_c0='2017-03-23', _c1='113120.0', _c2='Ghana', _c3='Office Supplies')]

#### Read in the data again, this time using the header row as column names instead of the default `_c0`, `_c1`, etc.

In [6]:
# Re-read the same file, telling the reader that the first line holds the
# column names so they replace the generic _c0, _c1, ... labels.
df_new = (
    spark.read
    .option('header', 'true')
    .csv('sales.csv')
)
# Preview the first 10 rows, now with proper column headings.
df_new.show(10)
# Confirm that the reader returns a Spark DataFrame object.
print(f"This df_new object is of type: {type(df_new)}")

+----------+------------+----------------+---------------+
|Order Date|Total Profit|         Country|      Item Type|
+----------+------------+----------------+---------------+
|2012-07-27|     3839.13|    South Africa|         Fruits|
|2013-09-14|   338631.84|         Morocco|        Clothes|
|2015-05-15|     20592.0|Papua New Guinea|           Meat|
|2017-05-17|    41273.28|        Djibouti|        Clothes|
|2016-10-26|    62217.18|        Slovakia|      Beverages|
|2011-11-07|     3323.39|       Sri Lanka|         Fruits|
|2013-01-18|     9349.02|     Seychelles |      Beverages|
|2016-11-30|    23114.16|        Tanzania|      Beverages|
|2017-03-23|    113120.0|           Ghana|Office Supplies|
|2016-05-23|  1350622.16|        Tanzania|      Cosmetics|
+----------+------------+----------------+---------------+
only showing top 10 rows

This df_new object is of type: <class 'pyspark.sql.dataframe.DataFrame'>


#### Get info on our dataset (similar to `df.info()` in pandas)

In [7]:
# Print each column's name, type and nullability as a tree. Every column is
# reported as a string here - presumably because the CSV was read without
# schema inference or an explicit schema (confirm against the reader options).
df_new.printSchema()

root
 |-- Order Date: string (nullable = true)
 |-- Total Profit: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Item Type: string (nullable = true)

