In [1]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [5]:
# Load the JSON file into a DataFrame; no schema is given, so Spark infers the column types.
df = spark.read.json('people.json')

In [8]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [9]:
# Return every row as a Python list of Row objects.
df.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [10]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [12]:
# The column names form a regular Python list, so they can be indexed.
df.columns[0]

'age'

In [13]:
# List of all column names.
df.columns

['age', 'name']

In [16]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [19]:
from pyspark.sql.types import (StructField, StringType, 
                               IntegerType, StructType)

In [31]:
# Define the schema explicitly -- an option when schema inference picks the wrong types.
# Both columns are declared nullable.
data_schema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [32]:
# Wrap the list of fields in a StructType, the object the reader accepts as a schema.
final_struc = StructType(data_schema)

In [33]:
# Re-read the same file, this time applying the explicit schema instead of inferring types.
# Note: this rebinds `df`.
df = spark.read.json('people.json', schema=final_struc)

In [34]:
df.printSchema()  # age is now an integer (it was inferred as long before)

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [37]:
type(df['age'])  # Indexing by column name returns a Column object, not a DataFrame

pyspark.sql.column.Column

In [38]:
df.select('age')  # select() returns a DataFrame containing only the 'age' column

DataFrame[age: int]

In [39]:
# Display the contents of the single-column DataFrame.
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [41]:
# Confirm that select() yields a DataFrame.
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [44]:
df.head(2)  # A list containing the first two Row objects

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [45]:
# Index into the returned list to get the first Row.
df.head(2)[0]

Row(age=None, name='Michael')

In [47]:
type(df.head(2)[0])  # Each element of the list is a single Row object

pyspark.sql.types.Row

In [51]:
# Select several columns at once and display them.
df.select(['age', 'name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [56]:
# Derive a new column from an existing one and display the result.
(
    df
    .withColumn('double_age', df['age'] * 2)
    .show()
)

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [58]:
# The previous withColumn call did not modify df in place; its result was never
# assigned, so df still has only the original columns.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [60]:
# Rename a column: 'age' is displayed as 'my_new_age'. The result is not assigned,
# so df itself keeps the original column name.
df.withColumnRenamed('age','my_new_age').show()

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [61]:
# Register the DataFrame as a temporary view named 'people' so it can be queried with SQL.
df.createOrReplaceTempView('people')

In [65]:
# Query the 'people' temp view with SQL. Identifiers are written exactly as they
# were registered (view 'people', column 'age') so the query also resolves when
# spark.sql.caseSensitive is enabled.
results = spark.sql("SELECT * FROM people WHERE age < 25")

In [66]:
# Display the rows returned by the SQL query.
results.show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+



In [73]:
# Case-insensitive name match. The string literal uses single quotes (standard SQL);
# double quotes are treated as an identifier when ANSI double-quoted identifiers
# are enabled, which would make the query fail.
spark.sql("select * from people where upper(name) = 'MICHAEL'").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
+----+-------+



In [77]:
# Load the stock CSV: the first line supplies the column names and the column
# types are inferred. Note: this rebinds `df`, replacing the people DataFrame.
df = spark.read.csv(
    'appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [83]:
# Preview the first five rows.
df.show(5)

+----------+----------+----------+------------------+------------------+---------+------------------+
|      Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+----------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows



In [80]:
# Inspect the inferred column types (in the output below, Date came back as a string).
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [82]:
# Fetch the first three rows as a list and take the first Row.
df.head(3)[0]

Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

In [89]:
# Filter with a SQL expression string, then keep only two columns.
df.filter('Close < 500').select('Open', 'Close').show(5)

+----------+------------------+
|      Open|             Close|
+----------+------------------+
|213.429998|        214.009998|
|214.599998|        214.379993|
|214.379993|        210.969995|
|    211.75|            210.58|
|210.299994|211.98000499999998|
+----------+------------------+
only showing top 5 rows



In [91]:
# Filter with a Column comparison rather than a SQL string, then keep one column.
df.filter(df['Close'] < 500).select('Volume').show(5)

+---------+
|   Volume|
+---------+
|123432400|
|150476200|
|138040000|
|119282800|
|111902700|
+---------+
only showing top 5 rows



In [94]:
# Filter on two conditions combined with `&`. Each comparison is wrapped in its
# own parentheses because `&` binds more tightly than `<` and `>` in Python.
df.filter(
    (df['Close'] < 200)
    & (df['Open'] > 200)
).show()

+----------+------------------+----------+----------+----------+---------+------------------+
|      Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+----------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
|2010-01-29|        201.079996|202.199995|190.250002|192.060003|311488100|         24.883208|
+----------+------------------+----------+----------+----------+---------+------------------+



In [105]:
# Instead of only displaying the result, collect() returns the matching rows as a
# Python list so they can be used in ordinary code.
# NOTE(review): this is an exact equality test on a double column -- confirm that
# exact matching (rather than a tolerance) is intended.
results = df.filter(df['Low'] == 197.16).collect()

In [106]:
# The collected result: a list of Row objects.
results

[Row(Date='2010-01-22', Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]

In [107]:
# Take the first Row from the collected list.
row = results[0]

In [108]:
# Convert the Row to a plain Python dictionary keyed by column name.
row.asDict()

{'Date': '2010-01-22',
 'Open': 206.78000600000001,
 'High': 207.499996,
 'Low': 197.16,
 'Close': 197.75,
 'Volume': 220441900,
 'Adj Close': 25.620401}

In [104]:
# Look up a single value in the dictionary by its column name.
row.asDict()['Volume']

220441900

In [109]:
# The dictionary's keys are the column names.
row.asDict().keys()

dict_keys(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'])