In [1]:
# findspark is initialised here, before pyspark is imported in a later cell.
# NOTE(review): per the findspark docs, init() makes the local Spark
# installation importable -- confirm SPARK_HOME is set on the target machine.
import findspark
findspark.init()

In [2]:
# The value returned by find() is not displayed: it is not the last
# expression of the cell and is not bound to a name.
# NOTE(review): presumably meant as a sanity check of the detected Spark home.
findspark.find()
import pyspark

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [4]:
import numpy as np
import pandas as pd

# Examining the `SparkContext`

In [19]:
# Reuse the already-running SparkContext when there is one instead of always
# constructing a new one: calling SparkContext() a second time in the same
# kernel (e.g. when this cell is re-run) raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [20]:
# Sanity check: show the SparkContext itself and, on the next line,
# the Spark version it reports.
print(sc, sc.version, sep="\n")

<SparkContext master=local[*] appName=pyspark-shell>
3.0.1


# Creating a `SparkSession`

In [21]:
# Obtain the SparkSession via the builder's getOrCreate() and bind it to `spark`.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [22]:
# Print the session object to confirm that `spark` is bound.
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001A9DC33BF08>


# Viewing tables

In [23]:
# Load the flights CSV; header=True takes column names from the first row.
csv_reader = spark.read
flights = csv_reader.csv(path='flights_small.csv', header=True)

In [25]:
# Register the DataFrame as the temporary view 'flights' so it can be queried
# with spark.sql(). createOrReplaceTempView() returns None, so its result is
# deliberately not assigned (assigning it only stored None in a stray `name`
# attribute on the DataFrame).
flights.createOrReplaceTempView('flights')

In [26]:
# Display the tables currently registered in the session catalog.
catalog = spark.catalog
catalog.listTables()

[Table(name='flights', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

# Are you query-ious?

In [27]:
# Use the standard SELECT ... FROM clause order. Spark also accepts the
# Hive-style FROM-first form, but that form is not portable to other SQL
# engines; both select the same rows.
query = "SELECT * FROM flights LIMIT 10"

# get the first 10 rows of flights
flights10 = spark.sql(query)

# print the result as a text table
flights10.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
|2014|    1| 15|    1037|        7|    1

# Pandafy A Spark DataFrame

In [28]:
# Count flights per (origin, dest) pair; the adjacent literals are joined
# into a single SQL string.
query = (
    "SELECT origin, dest, COUNT(*) as N "
    "FROM flights "
    "GROUP BY origin, dest"
)

# execute against the 'flights' temp view
flight_counts = spark.sql(query)

# print the aggregated rows as a text table
flight_counts.show()

+------+----+---+
|origin|dest|  N|
+------+----+---+
|   SEA| RNO|  8|
|   SEA| DTW| 98|
|   SEA| CLE|  2|
|   SEA| LAX|450|
|   PDX| SEA|144|
|   SEA| BLI|  5|
|   PDX| IAH| 57|
|   PDX| PHX|209|
|   SEA| SLC|225|
|   SEA| SBA| 23|
|   SEA| BWI| 29|
|   PDX| IAD| 23|
|   PDX| SFO|305|
|   SEA| KOA| 40|
|   PDX| MCI| 15|
|   SEA| SJC|213|
|   SEA| ABQ| 43|
|   SEA| SAT| 18|
|   PDX| ONT| 57|
|   SEA| LAS|364|
+------+----+---+
only showing top 20 rows



In [29]:
# Materialise the Spark result as a pandas DataFrame ...
pd_counts = flight_counts.toPandas()
# ... and preview its first five rows.
pd_counts.head(n=5)

Unnamed: 0,origin,dest,N
0,SEA,RNO,8
1,SEA,DTW,98
2,SEA,CLE,2
3,SEA,LAX,450
4,PDX,SEA,144


# Put some Spark in your data

In [30]:
# create pd_temp: ten uniform random floats in one column.
# A locally seeded RandomState keeps the data identical across
# "Restart & Run All" without touching NumPy's global random state
# (42 is an arbitrary fixed seed).
pd_temp = pd.DataFrame(np.random.RandomState(42).random_sample(10))

# create spark_temp from pd_temp
spark_temp = spark.createDataFrame(pd_temp)

# examine the tables in the catalog; spark_temp has not been registered
# as a view at this point
print(spark.catalog.listTables())

[Table(name='flights', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [31]:
# add spark_temp to the catalog as the temporary view 'temp'.
# createOrReplaceTempView() returns None, so its result is deliberately not
# assigned (assigning it only stored None in a stray `name` attribute).
spark_temp.createOrReplaceTempView('temp')

# the catalog listing now includes 'temp'
print(spark.catalog.listTables())

[Table(name='flights', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


# Dropping the middle man

In [32]:
# Read the airports CSV straight into a Spark DataFrame (header row supplies
# the column names), then print it as a text table.
airports = spark.read.csv(path='airports.csv', header=True)
airports.show()

+---+--------------------+----------------+-----------------+----+---+---+
|faa|                name|             lat|              lon| alt| tz|dst|
+---+--------------------+----------------+-----------------+----+---+---+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|
|0S9|Jefferson County ...|      48.0538086|     -122.8106436| 108| -8|  A|
|0W3|Harford County Ai...