In [None]:
%matplotlib inline
import matplotlib
import seaborn as sns
# Double the resolution of rendered/saved figures.  On matplotlib >= 2.0 the
# default 'savefig.dpi' is the string 'figure' (meaning "use figure.dpi"), so
# resolve it to a number first -- otherwise 2 * 'figure' produces the invalid
# value 'figurefigure'.
# NOTE: this reads the current setting, so re-running the cell doubles again.
_base_dpi = matplotlib.rcParams['savefig.dpi']
if _base_dpi == 'figure':
    _base_dpi = matplotlib.rcParams['figure.dpi']
matplotlib.rcParams['savefig.dpi'] = 2 * _base_dpi

In [None]:
import random
import toolz

In [None]:
from pyspark import SparkContext
# Start a Spark driver in local mode using every available core ("local[*]");
# "demo" is the application name.
sc = SparkContext("local[*]", "demo")
# Parenthesised form prints the same thing on Python 2 and also runs on Python 3.
print(sc.version)  # should be >= 1.5.1 for distributed matrices

In [None]:
# needed to convert RDDs into DataFrames
from pyspark.sql import SQLContext
# Import the public `udf` factory instead of aliasing the UserDefinedFunction
# class to the same name; it is called the same way: udf(func, returnType).
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
# Entry point for DataFrame and SQL functionality, bound to the SparkContext.
sqlContext = SQLContext(sc)

# DataFrames

* Immutable, like RDDs
* Lineage is remembered, like RDDs (resiliency)
* Lazy execution, like RDDs
* So why do we care?


DataFrames are an abstraction that lets us think of data in a familiar form (pandas DataFrame, R data.frame, SQL table, etc.).

We can use a similar API to RDDs!

We also get access to SQL-like optimizations and cost analysis, because the data is held in a columnar format.

What about type safety?

What are these UDF things?

In [None]:
# Build an RDD of 10,000 (x, y) pairs, each coordinate drawn independently
# with random.random().  `range` (rather than the Python-2-only `xrange`)
# keeps the cell runnable on Python 3 too.  The input element is ignored
# (hence `_`); it only determines how many rows are produced.
# NOTE: the generator is not seeded, so the values differ between runs.
data = sc.parallelize(range(1, 10001)) \
         .map(lambda _: (random.random(), random.random()))

In [None]:
# Convert the RDD of 2-tuples into a DataFrame.  No column names are supplied,
# so the columns come out as "_1" and "_2" (renamed in a later cell).
# toDF() is only usable on RDDs once a SQLContext has been created.
df = data.toDF()
# Note: this isn't always easy!

In [None]:
# Print the column names and types of the DataFrame as a tree.
df.printSchema()

In [None]:
# Give the auto-generated columns meaningful names, then persist the result
# in Parquet format under the path "demo".
df = df.withColumnRenamed("_1", "x").withColumnRenamed("_2", "y")
# No save mode is specified here, so the writer's default mode applies.
df.write.save("demo", format="parquet")
# Equivalent shorthand: df.write.parquet("demo")

Try rerunning the above cell.

Save modes:
* error (the default: fail if the target already exists)
* append
* overwrite
* ignore (i.e. CREATE TABLE IF NOT EXISTS)

In [None]:
# "ignore" mode: if "demo" already exists the write is silently skipped
# instead of raising an error.
df.write.mode("ignore").parquet("demo")

In [None]:
# Load the saved Parquet data back as a new DataFrame (`dfp` = Parquet-backed),
# to compare against the RDD-backed `df` below.
dfp = sqlContext.read.parquet("demo")

In [None]:
# Show summary statistics (count, mean, stddev, min, max) for column "x".
dfp.describe("x").show()

In [None]:
# Keep only the rows with x < 0.5.  This only defines the query (lazy);
# nothing is computed yet.
filtered_dfp = dfp.filter(dfp["x"] < 0.5)

In [None]:
# count() is an action, so this is where the filter is actually executed.
filtered_dfp.count()

In [None]:
# extended=True: print the logical plans as well as the physical plan
# for the Parquet-backed query.
filtered_dfp.explain(True)

In [None]:
# Same x < 0.5 filter, but on the RDD-backed DataFrame `df` rather than the
# Parquet-backed `dfp`, so the two query plans can be compared.
filtered_df = df.filter(df["x"] < 0.5)

In [None]:
# extended=True: print the logical plans as well as the physical plan
# for the RDD-backed query.
filtered_df.explain(True)

In [None]:
# Two chained filters on the RDD-backed DataFrame: x < 0.5, then y < 0.5.
filtered_df = (
    df
    .filter(df["x"] < 0.5)
    .filter(df["y"] < 0.5)
)

In [None]:
# Print the logical and physical plans for the two-filter query on `df`;
# check how the optimizer treats the two separate filter() calls.
filtered_df.explain(True)

In [None]:
# Two chained filters on the Parquet-backed DataFrame: x < 0.5, then y < 0.5.
filtered_dfp = (
    dfp
    .filter(dfp["x"] < 0.5)
    .filter(dfp["y"] < 0.5)
)

In [None]:
# Print the logical and physical plans for the two-filter query on `dfp`;
# compare with the plan printed for the RDD-backed `df`.
filtered_dfp.explain(True)

### Catalyst optimizer

Manipulating trees based on rules.
The introductory [blog post](https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html) has good pictures.

### Project Tungsten

* Memory management and GC (better than the JVM)
* Cache-aware computation
* Codegen (compile queries into Java bytecode)

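Cache-aware computation example (sorting records by key):
* Case 1: pointer -> (key, value)
* Case 2: (key prefix, pointer) -> (key, value)

Where do we have to look to find the keys for sort purposes? In case 1 every comparison has to follow a pointer; in case 2 the key prefix sits right next to the pointer, so most comparisons never leave the cache.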

[More](https://databricks.com/blog/2015/04/28/project-tungsten-bringing-spark-closer-to-bare-metal.html)

### DataFrame performance and tuning

See [here](http://spark.apache.org/docs/latest/sql-programming-guide.html#performance-tuning) for details.

## SQL

In [None]:
# Run SQL directly against the Parquet files at path "demo" (no table
# registration needed) and show the first three rows.
# NOTE(review): querying a file path like this appears to need Spark 1.6+ -- confirm.
sqlContext.sql("select * from parquet.`demo` limit 3").show()

In [None]:
# Requires Hive to permanently store tables
# Register `df` under the name "nums" so that SQL queries can refer to it.
# NOTE(review): registerTempTable is deprecated from Spark 2.0 onwards
# (createOrReplaceTempView replaces it); kept as-is for Spark 1.x.
df.registerTempTable('nums')  # This is NOT the same as a temp table in SQL proper
# The result of a SQL query is itself a DataFrame.
sql_df = sqlContext.sql("select x, y from nums where y > 0.9 limit 3")
sql_df.show()

In [None]:
# A query written in SQL has logical/physical plans just like one built with
# the DataFrame API; print them (extended=True includes the logical plans).
sql_df.explain(True)

*Reminder:* Check the UI for tables in memory.

*Reminder:* A number of interactive tutorials are available on the Databricks [community cloud](https://community.cloud.databricks.com). I highly recommend creating an account and checking out the guide.

This is also a good place to learn about connecting to databases like Cassandra or using the JDBC protocol.

## Adding columns and functions

Because DataFrames are immutable, adding new information means appending columns to an existing DataFrame.

In [None]:
@toolz.curry
def prediction(threshold, val):
    """Binary label for ``val``: 1.0 if strictly above ``threshold``, else 0.0.

    The function is curried, so calling it with only ``threshold`` returns a
    one-argument function of ``val`` with the threshold fixed.
    """
    return 1.0 if val > threshold else 0.0

In [None]:
# Fix the threshold via currying, then wrap each resulting one-argument
# function as a UDF returning a double: threshold 0.5 for x, 0.9 for y.
x_labelizer = udf(prediction(0.5), DoubleType())
y_labelizer = udf(prediction(0.9), DoubleType())

In [None]:
# Append one label column per coordinate; `dfp` itself is left untouched and
# a new DataFrame is returned.
new_df = (
    dfp
    .withColumn("x_label", x_labelizer("x"))
    .withColumn("y_label", y_labelizer("y"))
)

In [None]:
# Display the first rows of the DataFrame, including the two new label columns.
new_df.show()

## Type safety and Datasets

In [None]:
# Drop from the DataFrame down to its underlying RDD and look at one element.
rdd = new_df.rdd
# Parenthesised form prints the same thing on Python 2 and also runs on Python 3.
print(rdd.take(1))

In [None]:
# Positional access into the first row: index 3 is the fourth field
# (presumably "y_label", given the column order x, y, x_label, y_label).
# Nothing checks that index or its type before the code runs.
rdd.take(1)[0][3]

*Note:* The type-safety benefits of datasets don't matter much in Python yet.

*Copyright &copy; 2016 The Data Incubator.  All rights reserved.*