In [1]:
import findspark

In [2]:
import os

# Point findspark at the Spark installation. Prefer the conventional
# SPARK_HOME environment variable so the notebook is portable across
# machines; fall back to the previous hard-coded location when it is
# unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "/usr/local/spark")

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build (or reuse) the Spark session for this notebook; start-up can be slow locally
spark = (
    SparkSession.builder
    .appName("Operations")
    .getOrCreate()
)

In [5]:
import os

In [6]:
# Check where PYTHON_SPARK points; os.getenv returns None if the variable is unset
os.getenv('PYTHON_SPARK')

'/home/pasharma/workspace/CodingChills/PySpark'

In [7]:
# os.getenv() returns None when PYTHON_SPARK is unset, which makes
# os.path.join() fail with an opaque TypeError. Fall back to the current
# working directory so the notebook still resolves ./Data in that case.
project_root = os.environ.get('PYTHON_SPARK') or os.getcwd()
data_dir = os.path.join(project_root, 'Data')

In [13]:
# Other reader options are discussed later.
# This dataset is from Spark's examples.

# Reading might be a little slow on a local machine
df = spark.read.json(os.path.join(data_dir,'people.json'))

In [14]:
# Print the DataFrame's rows as a text table
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [15]:
df.printSchema()
# Similar in spirit to pandas' df.info(): lists each column with its data type and nullability

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [18]:
# describe() only returns a new DataFrame (summary, age, name); chain .show() to print its rows
df.describe()

DataFrame[summary: string, age: string, name: string]

In [16]:
# Column names as a plain Python list
df.columns

['age', 'name']

Some data formats make it easy to infer the schema (for example tabular formats such as CSV, which we will show later). 
However, you often have to set the schema yourself when the .read method you are using doesn't have an inferSchema option built in.
Spark has all the tools you need for this; it just requires a very specific structure:

In [19]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType

In [20]:
# Describe every column with a StructField(name, dataType, nullable):
#   name     -- string, name of the field
#   dataType -- DataType instance of the field
#   nullable -- boolean, whether the field can be null (None) or not
data_schema = [
    StructField(name="age", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
]

In [21]:
# Wrap the list of fields in a StructType to obtain the complete schema
final_struc = StructType(data_schema)

In [23]:
# Re-read the same file, applying the explicit schema instead of inferring one
df = (
    spark.read
    .schema(final_struc)
    .json(os.path.join(data_dir, 'people.json'))
)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



## Grabbing Data: Selecting Columns

In [24]:
# Indexing with a column name gives a Column object, not the data itself
df['age']

Column<b'age'>

In [25]:
# Confirm the type of a single indexed column
type(df['age'])

pyspark.sql.column.Column

In [26]:
# select() gives back a DataFrame containing only the requested column
df.select('age')

DataFrame[age: int]

In [28]:
# Display the values of the selected column
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [27]:
# Unlike df['age'] (a Column), select() yields a DataFrame
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [30]:
# Several columns can be selected at once, in whatever order is requested
df.select('name', 'age').show()

+-------+----+
|   name| age|
+-------+----+
|Michael|null|
|   Andy|  30|
| Justin|  19|
+-------+----+



In [31]:
# head(n) returns the first n rows as a list of Row objects
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

## Creating new columns

In [32]:
# withColumn returns a new DataFrame; here the added column is a simple copy of 'age'
df.withColumn('newage',df['age'])

DataFrame[age: int, name: string, newage: int]

In [34]:
# Display the DataFrame that includes the copied column
df.withColumn('newage',df['age']).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    30|
|  19| Justin|    19|
+----+-------+------+



In [33]:
# But the original DataFrame remains intact
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [36]:
# Simple rename of a column (applies to the returned DataFrame)
df.withColumnRenamed('age','newage').show()

+------+-------+
|newage|   name|
+------+-------+
|  null|Michael|
|    30|   Andy|
|    19| Justin|
+------+-------+



In [37]:
# New columns can be computed from existing ones; a null age stays null
df.withColumn('doubleage',df['age']*2).show()

+----+-------+---------+
| age|   name|doubleage|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



In [39]:
# The original remains intact: there is no in-place modification here
df.show(1)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
+----+-------+
only showing top 1 row



In [40]:
# Reassign df to keep the new column; dividing age by 2 gives a fractional result
df = df.withColumn('half_age',df['age']/2)

In [41]:
# df now includes the half_age column
df.show()

+----+-------+--------+
| age|   name|half_age|
+----+-------+--------+
|null|Michael|    null|
|  30|   Andy|    15.0|
|  19| Justin|     9.5|
+----+-------+--------+



### Using SQL

To use SQL queries directly with the dataframe, you will need to register it to a temporary view:

In [42]:
# Register the DataFrame as a SQL temporary view named "people"
df.createOrReplaceTempView("people")

In [44]:
# Query the temporary view with plain SQL
spark.sql("SELECT * FROM people").show()

+----+-------+--------+
| age|   name|half_age|
+----+-------+--------+
|null|Michael|    null|
|  30|   Andy|    15.0|
|  19| Justin|     9.5|
+----+-------+--------+



In [45]:
# A WHERE clause restricts the rows returned by the query
spark.sql("SELECT * FROM people WHERE age=30").show()

+---+----+--------+
|age|name|half_age|
+---+----+--------+
| 30|Andy|    15.0|
+---+----+--------+



## Basic Operations


In [46]:
# CSV example: treat the first line as the header and let Spark infer the column types
df = spark.read.csv(
    os.path.join(data_dir, 'appl_stock.csv'),
    header=True,
    inferSchema=True,
)

In [47]:
df.show(2)
# Comparable to pandas' df.head(2): displays only the first two rows

+-------------------+----------+----------+------------------+----------+---------+------------------+
|               Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+-------------------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [48]:
# Check the column types produced by inferSchema
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



## Filtering Data

A large part of working with DataFrames is the ability to quickly filter out data based on conditions. Spark DataFrames are built on top of the Spark SQL platform, which means that if you already know SQL, you can quickly and easily grab that data using SQL commands, or using the DataFrame methods.

In [49]:
# Filter using a SQL-style condition string
df.filter("Close<500").show(2)

+-------------------+----------+----------+------------------+----------+---------+------------------+
|               Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+-------------------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [52]:
# SQL-style condition string combined with .select() to keep only some columns
df.filter("Close<500").select(['Date','Open','Close']).show(2)

+-------------------+----------+----------+
|               Date|      Open|     Close|
+-------------------+----------+----------+
|2010-01-04 00:00:00|213.429998|214.009998|
|2010-01-05 00:00:00|214.599998|214.379993|
+-------------------+----------+----------+
only showing top 2 rows



In [53]:
# The same kind of filter written the Python way, as a Column comparison
df.filter(df["Close"] < 200).select(['Date','Open','Close']).show(2)

+-------------------+------------------+----------+
|               Date|              Open|     Close|
+-------------------+------------------+----------+
|2010-01-22 00:00:00|206.78000600000001|    197.75|
|2010-01-28 00:00:00|        204.930004|199.289995|
+-------------------+------------------+----------+
only showing top 2 rows



In [54]:
# Python's `and` needs a single bool from each Column, which Spark refuses to
# provide. Catch the resulting ValueError and print it, so the message is still
# shown (make sure to read it!) without aborting a "Restart & Run All".
try:
    df.filter(df["Close"] < 200 and df['Open'] > 200).show()
except ValueError as err:
    print("ValueError:", err)
# Similar to pandas: combine conditions with & / | / ~ rather than and / or / not

ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [56]:
# Use & for "and", with each comparison wrapped in its own parentheses
df.filter((df["Close"] < 200) & (df['Open'] > 200)).show()

+-------------------+------------------+----------+----------+----------+---------+------------------+
|               Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+-------------------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28 00:00:00|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
|2010-01-29 00:00:00|        201.079996|202.199995|190.250002|192.060003|311488100|         24.883208|
+-------------------+------------------+----------+----------+----------+---------+------------------+



In [57]:
# Use | for "or", again with each comparison in parentheses
df.filter( (df["Close"] < 200) | (df['Open'] > 200) ).show(2)

+-------------------+----------+----------+------------------+----------+---------+------------------+
|               Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+-------------------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [58]:
# collect() returns the matching rows as Python objects (a list of Row)
df.filter(df["Low"] == 197.16).collect()

[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]

In [59]:
# Keep the collected rows in a variable for further inspection
result = df.filter(df["Low"] == 197.16).collect()

In [62]:
# Note that each element of the collected list is a Row object
type(result[0])

pyspark.sql.types.Row

In [63]:
# A Row can be converted to a plain dictionary keyed by column name
result[0].asDict()

{'Date': datetime.datetime(2010, 1, 22, 0, 0),
 'Open': 206.78000600000001,
 'High': 207.499996,
 'Low': 197.16,
 'Close': 197.75,
 'Volume': 220441900,
 'Adj Close': 25.620401}

In [64]:
# Iterating over a Row yields its values one at a time, in column order
for value in result[0]:
    print(value)

2010-01-22 00:00:00
206.78000600000001
207.499996
197.16
197.75
220441900
25.620401
