In [None]:
from pyspark.sql import SparkSession

In [None]:
import os

In [None]:
# Point SPARK_HOME at the Spark distribution bundled with the installed pyspark
# package. Deriving the location from the package itself (instead of one
# developer's absolute miniconda path) keeps the notebook runnable on any
# machine, user account, or environment.
import pyspark

os.environ['SPARK_HOME'] = os.path.dirname(os.path.abspath(pyspark.__file__))

In [None]:
# Build the notebook's SparkSession, reusing an already-running one if present.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

# Creating DataFrames

In [None]:
# Load the sample people JSON file into a DataFrame and print its rows.
df = spark.read.format("json").load("examples/src/main/resources/people.json")
df.show()

# Untyped Dataset Operations (aka DataFrame Operations)

In [None]:
# Print the column names and types of the people DataFrame.
df.printSchema()

In [None]:
# Project only the "name" column and print it.
df.select("name").show()

In [None]:
# Show every name alongside the age incremented by one.
df.select(df.name, df.age + 1).show()

In [None]:
# Keep only the people older than 21 and print them.
df.where(df.age > 21).show()

In [None]:
# Count how many people share each age and print the result.
(
    df.groupBy("age")
    .count()
    .show()
)

# Running SQL Queries Programmatically

In [None]:
# Register the DataFrame as the temporary view "people" so it can be queried
# with SQL through spark.sql.
df.createOrReplaceTempView("people")

In [None]:
# Query the "people" view with SQL; the result is itself a DataFrame.
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

# Global Temporary View

In [None]:
# Register the DataFrame as the global temporary view "people", which is
# addressed through the `global_temp` database. The "OrReplace" variant keeps
# this cell re-runnable: plain createGlobalTempView raises once a view with the
# same name already exists in the running Spark application.
df.createOrReplaceGlobalTempView("people")

In [None]:
# Global temporary views must be qualified with the `global_temp` database.
spark.sql("SELECT * FROM global_temp.people").show()

# Inferring the Schema Using Reflection
Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. Rows are constructed by passing a list of key/value pairs as kwargs to the Row class. The keys of this list define the column names of the table, and the types are inferred by sampling the whole dataset, similar to the inference that is performed on JSON files.

In [None]:
from pyspark.sql import Row
# SparkContext behind the session; used below for the RDD API (sc.textFile).
sc = spark.sparkContext

In [None]:
# Load the people text file as an RDD of lines (conversion to Row objects
# happens in the following cells).
lines = sc.textFile("examples/src/main/resources/people.txt")

In [None]:
# Split every comma-separated line into its list of fields.
parts = lines.map(lambda line: line.split(","))

In [None]:
# Turn each field list into a Row with a string name and an integer age.
people = parts.map(lambda cols: Row(name=cols[0], age=int(cols[1])))

In [None]:
# Bring the parsed Row objects back to the driver to inspect them. Acceptable
# for this small sample file; avoid collect() on large datasets.
people.collect()

In [None]:
# Infer the schema from the Row objects and register the DataFrame as a table.
# NOTE: this replaces the "people" temp view registered earlier from the JSON
# DataFrame, so earlier SQL cells re-run after this point read this data instead.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

In [None]:
# SQL can be run over DataFrames that have been registered as a table.
# Select the names of people aged 13 to 19 inclusive.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

In [None]:
# SQL query results are DataFrames; `.rdd` exposes their rows as an RDD of Row
# objects, which is mapped to display strings and collected to the driver.
teenNames = teenagers.rdd.map(lambda row: "Name: " + row.name).collect()
for name in teenNames:
    print(name)
# Expected output:
# Name: Justin

# Programmatically Specifying the Schema

When a dictionary of kwargs cannot be defined ahead of time (for example, the structure of records is encoded in a string, or a text dataset will be parsed and fields will be projected differently for different users), a DataFrame can be created programmatically with three steps.

1) Create an RDD of tuples or lists from the original RDD.
2) Create the schema, represented by a StructType, matching the structure of the tuples or lists in the RDD created in step 1.
3) Apply the schema to the RDD via the createDataFrame method provided by SparkSession.

For example:

In [None]:
# Import the data types used to build an explicit schema
from pyspark.sql.types import StringType, StructType, StructField

# SparkContext behind the session; used below for the RDD API (sc.textFile).
sc = spark.sparkContext

In [None]:
# Read the people text file and split each comma-separated line into fields.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda line: line.split(","))

In [None]:
# Build a (name, age) tuple per record; the age stays a string with
# surrounding whitespace removed.
people = parts.map(lambda cols: (cols[0], cols[1].strip()))

In [None]:
# The schema is encoded in a string: whitespace-separated column names,
# in the same order as the tuple fields.
schemaString = "name age"

In [None]:
# One nullable string column for each name listed in schemaString.
fields = [
    StructField(column, StringType(), nullable=True)
    for column in schemaString.split()
]
schema = StructType(fields)

In [None]:
# Create the DataFrame from the tuple RDD using the explicit schema.
schemaPeople = spark.createDataFrame(people, schema=schema)

In [None]:
# Create (or replace) the temporary view "people" from the explicitly-typed
# DataFrame so it can be queried with SQL.
schemaPeople.createOrReplaceTempView("people")

In [None]:
# SQL can be run over DataFrames that have been registered as a table.
# Select just the name column from the "people" view.
results = spark.sql("SELECT name FROM people")

In [None]:
# Print the names returned by the query.
results.show()

# Generic Load/Save Functions

In [None]:
# Parquet: no format is passed, so Spark's default data source is used for both
# the read and the write.
df = spark.read.load("examples/src/main/resources/users.parquet")
# mode="overwrite" keeps the cell re-runnable: the default save mode raises
# when the output path already exists from a previous run.
df.select("name", "favorite_color").write.save("namesAndFavColors.parquet", mode="overwrite")

In [None]:
# Print the users DataFrame loaded from Parquet.
df.show()

In [None]:
# JSON to Parquet: read with an explicit source format, write with an explicit
# target format.
df = spark.read.load("examples/src/main/resources/people.json", format="json")
# mode="overwrite" keeps the cell re-runnable: the default save mode raises
# when the output path already exists from a previous run.
df.select("name", "age").write.save("namesAndAges.parquet", format="parquet", mode="overwrite")

In [None]:
# Print the people DataFrame loaded from JSON.
df.show()

In [None]:
# CSV: semicolon-separated file with a header row; column types are inferred.
df = (
    spark.read.format("csv")
    .options(sep=";", inferSchema="true", header="true")
    .load("examples/src/main/resources/people.csv")
)

In [None]:
# Print the people DataFrame loaded from CSV.
df.show()

In [None]:
# ORC with bloom filter and dictionary encoding
df = spark.read.orc("examples/src/main/resources/users.orc")
# Overwrite any output left by a previous run; the default save mode raises
# when the target path already exists, which would break re-running this cell.
(df.write.format("orc")
    .mode("overwrite")
    .option("orc.bloom.filter.columns", "favorite_color")
    .option("orc.dictionary.key.threshold", "1.0")
    .option("orc.column.encoding.direct", "name")
    .save("users_with_options.orc"))

In [None]:
# Print the users DataFrame loaded from ORC.
df.show()

In [None]:
# Parquet with bloom filter and dictionary encoding
df = spark.read.parquet("examples/src/main/resources/users.parquet")
# Overwrite any output left by a previous run; the default save mode raises
# when the target path already exists, which would break re-running this cell.
(df.write.format("parquet")
    .mode("overwrite")
    .option("parquet.bloom.filter.enabled#favorite_color", "true")
    .option("parquet.bloom.filter.expected.ndv#favorite_color", "1000000")
    .option("parquet.enable.dictionary", "true")
    .option("parquet.page.write-checksum.enabled", "false")
    .save("users_with_options.parquet"))

In [None]:
# Print the users DataFrame loaded from Parquet.
df.show()

## Run SQL on files directly

In [None]:
# Query the Parquet file directly with SQL (format.`path` syntax) instead of
# loading it through the read API first.
df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")

In [None]:
# Print the rows returned by the direct-on-file query.
df.show()

In [None]:
# Test out google sheets extraction: load every Parquet file matching the
# Investments-* glob in ../output.
# NOTE(review): presumably these files are written by a separate Google Sheets
# extraction step not shown in this notebook; confirm.
df = spark.read.parquet('../output/Investments-*.parquet')

In [None]:
# Summary statistics for the investments data. describe() only builds a lazy
# DataFrame (the cell would display just its repr), so show() is required to
# actually compute and print the statistics.
df.describe().show()

In [None]:
# Register the investments DataFrame as the temporary view "investments" so it
# can be queried with SQL.
df.createOrReplaceTempView("investments")

In [None]:
# Cast the raw investment columns to proper types. Amount and price are cast to
# DOUBLE rather than FLOAT: Spark SQL's FLOAT is a 4-byte single-precision
# number (about 7 significant digits), which silently rounds monetary values.
df_converted = spark.sql("""
select
    investment_instrument
    ,cast(investment_date as date) as investment_date
    ,cast(amount as double) as amount
    ,cast(price as double) as price
from investments
""")

In [None]:
# Print the type-converted investments.
df_converted.show()