In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session for this notebook (or reuse one that is already
# running); the application is registered under the name 'operations'.
spark = (
    SparkSession.builder
    .appName('operations')
    .getOrCreate()
)

In [4]:
# Load the stock CSV into a DataFrame. The first line of the file supplies
# the column names, and Spark infers each column's type from the data.
df = spark.read.csv(
    'appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the column names and the types that were inferred while reading the CSV.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [8]:
# Fetch the first three rows to the driver and display only the first one.
df.take(3)[0]

Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

In [20]:
# Approach 1: register the DataFrame as a temporary view, then query that
# view with plain SQL text.
# NOTE(review): the view is called 'people' although it holds stock data;
# consider a more descriptive name if nothing else relies on it.
df.createOrReplaceTempView('people')
high_below_200_query = "SELECT * FROM people WHERE High < 200"
result_1 = spark.sql(high_below_200_query)
result_1.show(2)

+--------------------+------------------+----------+------------------+----------+---------+------------------+
|                Date|              Open|      High|               Low|     Close|   Volume|         Adj Close|
+--------------------+------------------+----------+------------------+----------+---------+------------------+
|2010-02-01 00:00:...|192.36999699999998|     196.0|191.29999899999999|194.729998|187469100|         25.229131|
|2010-02-02 00:00:...|        195.909998|196.319994|193.37999299999998|195.859997|174585600|25.375532999999997|
+--------------------+------------------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [19]:
# Approach 2: pass the SQL condition as a string straight to filter().
# show() only prints and returns None, so the filtered DataFrame is stored
# first and displayed in a separate step; result_2 stays usable afterwards.
result_2 = df.filter("High < 200")
result_2.show(2)

+--------------------+------------------+----------+------------------+----------+---------+------------------+
|                Date|              Open|      High|               Low|     Close|   Volume|         Adj Close|
+--------------------+------------------+----------+------------------+----------+---------+------------------+
|2010-02-01 00:00:...|192.36999699999998|     196.0|191.29999899999999|194.729998|187469100|         25.229131|
|2010-02-02 00:00:...|        195.909998|196.319994|193.37999299999998|195.859997|174585600|25.375532999999997|
+--------------------+------------------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [23]:
# Filter with a SQL-style condition, then keep only the Date and Volume columns.
# show() only prints and returns None, so the DataFrame is assigned first and
# displayed separately; result_3 then holds the selection rather than None.
result_3 = df.filter("High<200").select(['Date','Volume'])
result_3.show()

+--------------------+---------+
|                Date|   Volume|
+--------------------+---------+
|2010-02-01 00:00:...|187469100|
|2010-02-02 00:00:...|174585600|
|2010-02-04 00:00:...|189413000|
|2010-02-05 00:00:...|212576700|
|2010-02-08 00:00:...|119567700|
|2010-02-09 00:00:...|158221700|
|2010-02-10 00:00:...| 92590400|
|2010-02-11 00:00:...|137586400|
|2014-06-09 00:00:...| 75415000|
|2014-06-10 00:00:...| 62777000|
|2014-06-11 00:00:...| 45681000|
|2014-06-12 00:00:...| 54749000|
|2014-06-13 00:00:...| 54525000|
|2014-06-16 00:00:...| 35561000|
|2014-06-17 00:00:...| 29726000|
|2014-06-18 00:00:...| 33514000|
|2014-06-19 00:00:...| 35528000|
|2014-06-20 00:00:...|100898000|
|2014-06-23 00:00:...| 43694000|
|2014-06-24 00:00:...| 39036000|
+--------------------+---------+
only showing top 20 rows



In [26]:
# Same kind of filter, written with a Column expression instead of a SQL
# string; print the Volume of the first five matching rows.
(
    df
    .filter(df['Close'] < 500)
    .select('Volume')
    .show(5)
)

+---------+
|   Volume|
+---------+
|123432400|
|150476200|
|138040000|
|119282800|
|111902700|
+---------+
only showing top 5 rows



In [30]:
# Combining several conditions in one filter.
# Two forms that fail (as noted when this cell was first written):
#   1. joining the comparisons with Python's `and`, e.g.
#        df.filter(df['Close'] < 200 and df['Open'] > 500)
#   2. using `&` without parentheses, e.g.
#        df.filter(df['Close'] < 200 & df['Open'] > 500)
#      `&` binds tighter than `<` and `>`, so each comparison needs its
#      own parentheses.
# Working form: build each condition separately, combine them with `&`,
# and negate with `~`.
close_below_200 = df['Close'] < 200
open_above_200 = df['Open'] > 200
df.filter(close_below_200 & ~open_above_200).show()

+--------------------+------------------+------------------+------------------+------------------+---------+------------------+
|                Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+--------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-02-01 00:00:...|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|
|2010-02-02 00:00:...|        195.909998|        196.319994|193.37999299999998|        195.859997|174585600|25.375532999999997|
|2010-02-03 00:00:...|        195.169994|        200.200003|        194.420004|        199.229994|153832000|25.812148999999998|
|2010-02-04 00:00:...|        196.730003|        198.370001|        191.570005|        192.050003|189413000|         24.881912|
|2010-02-05 00:00:...|192.63000300000002|             196.0|        190.850002|        195.460001|212576

In [33]:
# collect() brings the matching rows back to the driver as a Python list of
# Row objects, so they can be used with ordinary Python code afterwards.
# NOTE(review): this is an exact equality test on a double column; it only
# matches rows whose stored value is exactly 197.16.
low_equals_target = df['Low'] == 197.16
result = df.filter(low_equals_target).collect()

In [40]:
# Display the collected rows (a Python list of Row objects).
result

[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]

In [43]:
# Read the opening price from the first collected row by column name rather
# than by position, so the lookup does not depend on column order.
print('Open price is:', result[0]['Open'])

Open price is: 206.78000600000001


In [45]:
# A collected Row can be converted to a plain Python dictionary keyed by
# column name.
row = result[0]
row.asDict()

{'Adj Close': 25.620401,
 'Close': 197.75,
 'Date': datetime.datetime(2010, 1, 22, 0, 0),
 'High': 207.499996,
 'Low': 197.16,
 'Open': 206.78000600000001,
 'Volume': 220441900}

In [46]:
# Look up a single field by column name in the dictionary form of the row.
row.asDict()['Volume']

220441900