## [API](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## SparkSession

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide session. The shuffle-partition setting
# determines how many partitions shuffle-based operations produce later on.
spark = (
    SparkSession.builder
    .appName('Apache Spark test')
    .config("spark.sql.shuffle.partitions", "500")
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so Jupyter renders it.
spark

## Data 
[download data](https://github.com/databricks/Spark-The-Definitive-Guide)

In [3]:
# Load the 2010 flight summary (Parquet carries its own schema).
# NOTE(review): this is a machine-specific absolute path; adjust it to where
# the "Spark: The Definitive Guide" data was downloaded.
flights_2010_parquet = "/Users/sg0218817/Downloads/Spark-The-Definitive-Guide-master/data/flight-data/parquet/2010-summary.parquet"
df = spark.read.parquet(flights_2010_parquet)

In [4]:
# Print the first 10 rows without truncating long cell values.
df.show(n=10, truncate=False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [5]:
# Row count of the DataFrame loaded from the 2010 Parquet file.
df.count()

255

## Schema

In [6]:
# Print the column names, types and nullability as a tree.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [7]:
# describe() returns another DataFrame (a `summary` column plus one string
# column per input column), so this cell only displays that DataFrame's repr.
# Append .show() to print the actual summary statistics.
df.describe()

DataFrame[summary: string, DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: string]

In [8]:
# The schema as a StructType object (the programmatic form of printSchema()).
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [9]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Explicit schema for the JSON source: every field is nullable, and `count`
# is read as a 32-bit integer.
my_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("DEST_COUNTRY_NAME", StringType()),
        ("ORIGIN_COUNTRY_NAME", StringType()),
        ("count", IntegerType()),
    )
])

# Note: this rebinds `df` from the 2010 Parquet data to the 2015 JSON data.
df = (
    spark.read
    .format("json")
    .schema(my_schema)
    .load("/Users/sg0218817/Downloads/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")
)

df.show(n=10, truncate=False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
|United States    |Singapore          |1    |
|United States    |Grenada            |62   |
|Costa Rica       |United States      |588  |
|Senegal          |United States      |40   |
|Moldova          |United States      |1    |
+-----------------+-------------------+-----+
only showing top 10 rows



## Columns

In [10]:
from pyspark.sql.functions import col, expr, lit

# Boolean column derived from a SQL expression over `count`.
expr_met_column = expr("(((count + 5) * 200) - 6) < 225 * count")
new_df = df.withColumn('exprMet', expr_met_column)
new_df.show(n=10, truncate=False)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+-----------------+-------------------+-----+-------+
|United States    |Romania            |15   |false  |
|United States    |Croatia            |1    |false  |
|United States    |Ireland            |344  |true   |
|Egypt            |United States      |15   |false  |
|United States    |India              |62   |true   |
|United States    |Singapore          |1    |false  |
|United States    |Grenada            |62   |true   |
|Costa Rica       |United States      |588  |true   |
|Senegal          |United States      |40   |true   |
|Moldova          |United States      |1    |false  |
+-----------------+-------------------+-----+-------+
only showing top 10 rows



In [11]:
# Column names as a plain Python list, including the added `exprMet`.
new_df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count', 'exprMet']

In [12]:
# Register new_df under the name `df_table` so later cells can query it
# through spark.sql(...).
new_df.createOrReplaceTempView('df_table')

## Rows

In [13]:
# Fetch the first row as a Row object.
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [14]:
from pyspark.sql import Row

# A Row built from positional values only: it has no field names, so values
# are accessed by index.
my_row = Row('Poland', 'Norway', 10)

In [15]:
# Positional access to the first value of the Row.
my_row[0]

'Poland'

## DataFrame

In [18]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
from pyspark.sql import Row

# Schema for a hand-built DataFrame. This rebinds `my_schema` (previously the
# flight-data schema); `names` is declared non-nullable.
my_schema = StructType([
    StructField("some", StringType(), True),
    StructField("col", StringType(), True),
    StructField("names", LongType(), False)
])

# One row matching the schema positionally; `col` is left null. This rebinds
# `my_row` from the earlier cell.
my_row = Row("Hello", None, 1)
# NOTE(review): DataFrame creation and display were left disabled by the
# author; the reason is not recorded in the notebook.
# my_df = spark.createDataFrame([my_row], my_schema)

# my_df.show()

## Select

In [19]:
# Projection written in SQL against the `df_table` temp view.
select_sql = "SELECT DEST_COUNTRY_NAME, count * 10, 23 AS TwentyThree FROM df_table"
spark.sql(select_sql).show(n=2)

+-----------------+------------+-----------+
|DEST_COUNTRY_NAME|(count * 10)|TwentyThree|
+-----------------+------------+-----------+
|    United States|         150|         23|
|    United States|          10|         23|
+-----------------+------------+-----------+
only showing top 2 rows



In [20]:
# Same projection through the DataFrame API: a column name, a SQL expression
# and an aliased literal.
projection = [
    'DEST_COUNTRY_NAME',
    expr('count * 10 AS countTimes10'),
    lit(23).alias('TwentyThree'),
]
new_df.select(*projection).show(n=2)

+-----------------+------------+-----------+
|DEST_COUNTRY_NAME|countTimes10|TwentyThree|
+-----------------+------------+-----------+
|    United States|         150|         23|
|    United States|          10|         23|
+-----------------+------------+-----------+
only showing top 2 rows



In [21]:
# selectExpr takes SQL expression strings directly.
new_df.selectExpr(
    'DEST_COUNTRY_NAME AS destination',
    'count * 10 AS c10',
).show(n=2)

+-------------+---+
|  destination|c10|
+-------------+---+
|United States|150|
|United States| 10|
+-------------+---+
only showing top 2 rows



In [22]:
# Keep every existing column and append a boolean flag for domestic flights.
new_df.selectExpr(
    '*',
    '(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS withinCountry',
).show(n=2)

+-----------------+-------------------+-----+-------+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|withinCountry|
+-----------------+-------------------+-----+-------+-------------+
|    United States|            Romania|   15|  false|        false|
|    United States|            Croatia|    1|  false|        false|
+-----------------+-------------------+-----+-------+-------------+
only showing top 2 rows



In [23]:
# Aggregate expressions in selectExpr collapse the DataFrame to a single row.
aggregate_exprs = ('AVG(count)', 'COUNT(DISTINCT(DEST_COUNTRY_NAME))')
new_df.selectExpr(*aggregate_exprs).show()

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



## Add/Rename/Remove Columns

In [24]:
# Add two columns, rename the country columns, cast `count`, drop `exprMet`.
(
    new_df
    .withColumn("numberOne", lit(1))
    .withColumn("withinCountry", expr('DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME'))
    .withColumnRenamed("DEST_COUNTRY_NAME", 'dest')
    .withColumnRenamed("ORIGIN_COUNTRY_NAME", 'origin')
    .withColumn("count", col('count').cast('integer'))
    .drop('exprMet')
    .show(n=2)
)

+-------------+-------+-----+---------+-------------+
|         dest| origin|count|numberOne|withinCountry|
+-------------+-------+-----+---------+-------------+
|United States|Romania|   15|        1|        false|
|United States|Croatia|    1|        1|        false|
+-------------+-------+-----+---------+-------------+
only showing top 2 rows



## Filtering

In [25]:
# Filter using a SQL predicate string.
busy_routes = new_df.filter('count > 100')
busy_routes.show(n=2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+-----------------+-------------------+-----+-------+
|    United States|            Ireland|  344|   true|
|       Costa Rica|      United States|  588|   true|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [26]:
# Same filter expressed as a Column predicate.
over_100 = col('count') > 100
new_df.filter(over_100).show(n=2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+-----------------+-------------------+-----+-------+
|    United States|            Ireland|  344|   true|
|       Costa Rica|      United States|  588|   true|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [27]:
# Chained where() calls are combined with AND.
(
    new_df
    .where(col('count') > 100)
    .where(col('count') < 400)
    .show(n=2)
)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+-----------------+-------------------+-----+-------+
|    United States|            Ireland|  344|   true|
|    United States|       Sint Maarten|  325|   true|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



## Distinct

In [28]:
# Number of unique (destination, origin) pairs.
country_pairs = new_df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME')
country_pairs.distinct().count()

256

In [29]:
# The same distinct-pair count written in SQL against `df_table`.
distinct_pairs_sql = 'SELECT COUNT(DISTINCT(ORIGIN_COUNTRY_NAME, DEST_COUNTRY_NAME)) AS count FROM df_table'
spark.sql(distinct_pairs_sql).show()

+-----+
|count|
+-----+
|  256|
+-----+



## Sample

In [30]:
# Sample roughly 5% of the rows without replacement, using a fixed seed.
new_df.sample(withReplacement=False, fraction=0.05, seed=1).show()

+--------------------+-------------------+-----+-------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+--------------------+-------------------+-----+-------+
|       United States|            Ireland|  344|   true|
|             Bolivia|      United States|   30|  false|
|    Marshall Islands|      United States|   42|   true|
|       United States|             Angola|   13|  false|
|       United States|             Cyprus|    1|  false|
|       United States|              Samoa|   25|  false|
|       United States|        The Bahamas|  986|   true|
|              Cyprus|      United States|    1|  false|
|Saint Kitts and N...|      United States|  139|   true|
|    Papua New Guinea|      United States|    3|  false|
|               Ghana|      United States|   18|  false|
|              Poland|      United States|   32|  false|
|            Bulgaria|      United States|    3|  false|
+--------------------+-------------------+-----+-------+



In [31]:
# Split the rows into two DataFrames with roughly 25% / 75% of the data.
# The weights are probabilities, so the resulting sizes are approximate.
# A fixed seed keeps the split (and every cell that uses `dfs`) reproducible
# across Restart & Run All; the value matches the seed used for sample() above.
dfs = new_df.randomSplit([0.25, 0.75], seed=1)

In [32]:
# Row counts of the full DataFrame and of each split, on a single line.
print(*(frame.count() for frame in (new_df, dfs[0], dfs[1])))

256 52 204


## Union

In [33]:
# Stack the two splits back together; union() matches columns by position.
recombined = dfs[0].union(dfs[1])
recombined.show(n=2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+-----------------+-------------------+-----+-------+
|         Anguilla|      United States|   41|   true|
|            Aruba|      United States|  346|   true|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



## Sorting

In [34]:
# Sort by origin ascending, then destination descending, and show both keys.
(
    new_df
    .sort('ORIGIN_COUNTRY_NAME', col('DEST_COUNTRY_NAME').desc())
    .select('ORIGIN_COUNTRY_NAME', 'DEST_COUNTRY_NAME')
    .show(n=2)
)

+-------------------+-----------------+
|ORIGIN_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-------------------+-----------------+
|             Angola|    United States|
|           Anguilla|    United States|
+-------------------+-----------------+
only showing top 2 rows



In [35]:
# Sort rows inside each partition only (no global ordering across partitions).
sorted_per_partition = new_df.sortWithinPartitions('ORIGIN_COUNTRY_NAME')
sorted_per_partition.show(n=2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|exprMet|
+-----------------+-------------------+-----+-------+
|    United States|             Angola|   13|  false|
|    United States|           Anguilla|   38|  false|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



## Repartition vs. Coalesce

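`repartition` performs a full shuffle of the data and creates roughly equal-sized partitions; it can either increase or decrease the number of partitions. `coalesce` combines existing partitions to avoid a full shuffle, so it can only reduce the number of partitions, never increase it.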

In [36]:
# Current partition count of the DataFrame's underlying RDD.
new_df.rdd.getNumPartitions()

1

In [37]:
# Repartition to an explicit number of partitions (triggers a full shuffle).
rep_df = new_df.repartition(5)
rep_df.rdd.getNumPartitions()

# Optional: writing the result would produce one output file per partition.
# rep_df.write.json('output')

5

In [38]:
# Repartition by column value with no explicit partition count: the count
# then comes from spark.sql.shuffle.partitions, which this notebook sets to
# "500" when the session is created.
rep_df = new_df.repartition('DEST_COUNTRY_NAME')
rep_df.rdd.getNumPartitions()

# Optional: writing the result would produce one output file per partition.
# rep_df.write.json('output')

500

In [39]:
# coalesce() only merges existing partitions; it cannot create more. new_df
# has a single partition, so asking for 2 still leaves 1.
coalesce_df = new_df.coalesce(2)
coalesce_df.rdd.getNumPartitions()

1

## Collect

In [40]:
# Three ways to bring rows to the driver. Only the last expression's value is
# displayed by the notebook; the first two results are discarded.
new_df.collect()          # every row as a list of Row objects
new_df.take(5)            # the first 5 rows as a list
new_df.toLocalIterator()  # an iterator over all rows

<itertools.chain at 0x109c50c10>

## SQL

In [41]:
# Register df (the 2015 JSON data, without `exprMet`) as temp view `my_data`.
df.createOrReplaceTempView('my_data')

In [43]:
# Top routes with more than 100 flights, largest first.
# ORDER BY guarantees a total ordering of the result. SORT BY only orders rows
# within each partition, so it yields a globally sorted top-10 only when the
# data happens to sit in a single partition.
spark.sql("""
    SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count
    FROM my_data
    WHERE count > 100
    ORDER BY count DESC
    """) \
    .show(10, False)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count |
+-----------------+-------------------+------+
|United States    |United States      |370002|
|United States    |Canada             |8483  |
|Canada           |United States      |8399  |
|United States    |Mexico             |7187  |
|Mexico           |United States      |7140  |
|United Kingdom   |United States      |2025  |
|United States    |United Kingdom     |1970  |
|Japan            |United States      |1548  |
|United States    |Japan              |1496  |
|Germany          |United States      |1468  |
+-----------------+-------------------+------+
only showing top 10 rows

