In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession

# Create the entry point to the PySpark application.
# `spark` is our handle to the Spark engine (it is not a database connection).
# The call chain is wrapped in parentheses rather than continued with
# backslashes: a backslash followed by trailing whitespace is a SyntaxError.
spark = (
    SparkSession.builder
    .master("local")  # URL of a remote Spark master, or "local" to run Spark locally
    .appName("MyFirstSparkApplication")
    .getOrCreate()  # reuse an already running session if there is one
)

<h4>Resilient Distributed Dataset (RDD)</h4>
<ul>
    <li>Fault tolerant</li>
    <li>Resilient</li>
    <li>Immutable</li>
    <li>Partitioned</li>
</ul>

<h4>Data Upload</h4>

In [None]:
# Build a small in-memory menu; each tuple is (name, price, ingredients)
data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
]

# Turn the local Python list into an RDD
rdd = spark.sparkContext.parallelize(data)

In [None]:
# Load a text file into an RDD (the path is relative to the working directory)
rdd_2 = spark.sparkContext.textFile("menu.txt")

<h4>Dataframe Creation</h4>

In [None]:
# Sample menu rows; each tuple is (pizza name, price, list of ingredients)
df_data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Diavola", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("Prosciutto", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Speck & Brie", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("Tonno & Cipolle", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
    ("Fries", 3.95, ["Potatoes"]),
]

# Column names, listed in the same order as the tuple fields
columns = ["Pizza Name", "Price", "Ingredients"]

# Create the DataFrame from the rows and the column names
df = spark.createDataFrame(df_data, schema=columns)

In [None]:
# Show the first rows of the DataFrame (show() displays at most 20 rows by default)
df.show()

In [None]:
# Load a DataFrame from a semicolon-delimited text file whose first line is the header.
# A dedicated name is used so the in-memory `df` built from `df_data` is not
# overwritten, which keeps the earlier cells re-runnable with the same result.
df_csv = spark.read.option("header", True).option("delimiter", ";").csv("menu_csv.txt")

# Print the detected schema
# NOTE(review): "inferSchema" is not enabled here, so presumably every column is
# read as a string — confirm that this is the intended demonstration.
df_csv.printSchema()

# Display the first rows of the loaded file
df_csv.show()

<h4>Dataframes from RDDs</h4>

In [None]:
# Transform the RDD into a DataFrame.
# No column names are given here, so Spark falls back to default names (_1, _2, ...).
df_from_rdd = rdd.toDF()

# Print the schema of the DataFrame
df_from_rdd.printSchema()

In [None]:
# Same conversion as before, but this time the column names are given explicitly
columns = ["Pizza Name", "Price", "Ingredients"]
df_from_rdd = rdd.toDF(schema=columns)

# The schema now carries the chosen column names
df_from_rdd.printSchema()

In [None]:
# Alternative route: create the DataFrame through the session, then name its columns
df_2_from_rdd = spark.createDataFrame(rdd).toDF(*columns)

# Print the schema of the DataFrame created in this cell, not of `df_from_rdd`
df_2_from_rdd.printSchema()

<h4>Custom Dataframe</h4>

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType

# Describe every column explicitly as StructField(name, type, nullable).
# No line-continuation backslashes are needed inside the brackets.
schema = StructType([
    StructField("Pizza Name", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("Ingredients", ArrayType(StringType()), True),
])

# Build the DataFrame with the explicit schema instead of letting Spark infer it
df = spark.createDataFrame(df_data, schema=schema)
df.printSchema()
df.show(truncate=False)

<h4>Organizing Data</h4>

In [None]:
# Sort the rows by a single field (ascending order is the default)
df.sort("Price").show(truncate = False)

In [None]:
from pyspark.sql.functions import col
# Sort by several fields: first by price, then by pizza name
df.sort(col("Price"), col("Pizza Name")).show(truncate = False) # df.Price can be used in place of col("Price")

In [None]:
# Sorting using orderBy
df.orderBy(col("Price"), col("Pizza Name")).show(truncate = False) # attribute access (df.<name>) cannot be written for "Pizza Name": the name contains a space
# so the column is referenced with the function col("Pizza Name") instead

In [None]:
# Make the sort direction explicit for each column (this works the same way with orderBy)
df.sort(col("Price").asc(), col("Pizza Name").desc()).show(truncate = False)

In [None]:
# We could also sort with a raw SQL query.
# No spoilers: we'll see how to use raw SQL later on.

<h4>Explode Arrays in Individual Rows</h4>

In [None]:
from pyspark.sql.functions import explode  # similar to $unwind in MongoDB

# explode splits an array column into several rows,
# each row holding a single value of the array
exploded_df = df.select(
    col("Pizza Name"),
    df.Price,
    explode(df.Ingredients),
)

# Inspect the resulting schema and rows
exploded_df.printSchema()
exploded_df.show(truncate = False)

In [None]:
# How can we rename a column?
# withColumnRenamed returns a new DataFrame, so keep that result first and print
# its schema in a separate statement. Chaining .printSchema() onto the assignment
# would store None in `exploded_df`, because printSchema only prints.
exploded_df = exploded_df.withColumnRenamed("col", "Ingredient")
exploded_df.printSchema()